Parent Directory
|
Revision Log
Revision 1.1 - (view) (download) (as text)
1 : | wilke | 1.1 | #! /usr/bin/perl |
2 : | use strict; | ||
3 : | use warnings; | ||
4 : | use LWP::Simple; | ||
5 : | use Getopt::Long; | ||
6 : | use Bio::DB::RefSeq; | ||
7 : | use Bio::SeqIO; | ||
8 : | |||
9 : | # read in parameters | ||
10 : | my $key = ''; | ||
11 : | my $contig_id = ''; | ||
12 : | my $destination = "/tmp/"; | ||
13 : | my $file_type = "genbank"; | ||
14 : | my $tax_id = ""; | ||
15 : | my $project = "", | ||
16 : | my $org_name = ""; | ||
17 : | my $plasmid = 0; | ||
18 : | my $file = ''; | ||
19 : | |||
20 : | GetOptions ( 'key=s' => \$key , | ||
21 : | 'contigID=s' => \$contig_id, | ||
22 : | 'destination=s' => \$destination, | ||
23 : | 'type=s' => \$file_type, | ||
24 : | 'taxonomyID=s' => \$tax_id, | ||
25 : | 'organism=s' => \$org_name, | ||
26 : | 'plasmid' => \$plasmid, | ||
27 : | 'file=s' => \$file, | ||
28 : | 'project=s' => \$project, | ||
29 : | ); | ||
30 : | |||
31 : | |||
32 : | |||
33 : | if ($tax_id){ | ||
34 : | |||
35 : | print "Output file = " , get_entries_for_tax_id($tax_id); | ||
36 : | |||
37 : | } | ||
38 : | if ($project) { | ||
39 : | print "Output file = " , get_entries_for_project_id($project); | ||
40 : | } | ||
41 : | if($contig_id){ | ||
42 : | |||
43 : | $contig_id =~ s /\s*,\s*/,/g; | ||
44 : | my @ids = split "," , $contig_id ; | ||
45 : | |||
46 : | # connection to NCBI | ||
47 : | my $db = new Bio::DB::RefSeq; | ||
48 : | |||
49 : | # define output | ||
50 : | my $seqout = Bio::SeqIO->new( | ||
51 : | -file => ">/tmp/out.gbff", | ||
52 : | -format => 'Genbank', | ||
53 : | ); | ||
54 : | |||
55 : | # get data for ids | ||
56 : | my $seqio = $db->get_Stream_by_id( \@ids ); | ||
57 : | while( my $seq = $seqio->next_seq ) { | ||
58 : | print "seqid is ", $seq->id, "\n"; | ||
59 : | print $seq->get_dates , "\t" , $seq->division , "\n"; | ||
60 : | # write data to output file | ||
61 : | $seqout->write_seq($seq); | ||
62 : | } | ||
63 : | |||
64 : | # most of the time RefSeq_ID eq RefSeq acc | ||
65 : | # my $seq = $db->get_Seq_by_id($contig_id); # RefSeq ID | ||
66 : | # print "accession is ", $seq->accession_number, "\n"; | ||
67 : | |||
68 : | |||
69 : | |||
70 : | } | ||
71 : | |||
72 : | |||
73 : | |||
74 : | |||
75 : | |||
76 : | # | ||
77 : | # Functions | ||
78 : | # | ||
79 : | |||
80 : | |||
81 : | sub get_entries_for_project_id{ | ||
82 : | my ($project) = @_; | ||
83 : | |||
84 : | |||
85 : | my $url = "http://www.ncbi.nlm.nih.gov/sites/entrez?Db=genomeprj&Cmd=Retrieve&list_uids="; | ||
86 : | my $search_result = get($url.$project); | ||
87 : | |||
88 : | my $url_seq_overview = "http://www.ncbi.nlm.nih.gov/"; | ||
89 : | |||
90 : | |||
91 : | # print $search_result; | ||
92 : | |||
93 : | my @lines = split ( "\n" , $search_result); | ||
94 : | |||
95 : | my $nr_seq = 0; | ||
96 : | my $nr_proj = 0; | ||
97 : | my $url_seq = ""; | ||
98 : | my $url_proj = ""; | ||
99 : | my $genome_name = ""; | ||
100 : | |||
101 : | my $found_genome_information_table = 0; | ||
102 : | |||
103 : | my $next = ""; | ||
104 : | my $id_list = ""; | ||
105 : | foreach my $line ( @lines ){ | ||
106 : | |||
107 : | if ($line =~/Genome information:/){ | ||
108 : | $found_genome_information_table = 1; | ||
109 : | next; # skip table line | ||
110 : | }; | ||
111 : | |||
112 : | $found_genome_information_table = 0 if ( $found_genome_information_table and $line =~ /<\/table>/); | ||
113 : | $id_list .= $line if ( $found_genome_information_table ); # collect id entries | ||
114 : | |||
115 : | print $line , "\n" if ( $found_genome_information_table ); | ||
116 : | |||
117 : | |||
118 : | } | ||
119 : | |||
120 : | my @ids; | ||
121 : | my @blocks = split "<\/tr>" , $id_list ; | ||
122 : | foreach my $block (@blocks){ | ||
123 : | my @local_ids = $block =~/([^>]+)<\/a><\/td>/gc ; | ||
124 : | print join "\t" , @local_ids , "\n"; | ||
125 : | push @ids , $local_ids[0]; | ||
126 : | } | ||
127 : | |||
128 : | |||
129 : | my $query = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=".join ("," , @ids) ."&rettype=gb" ; | ||
130 : | # print $query , "\n"; | ||
131 : | my $file = get($query); | ||
132 : | |||
133 : | # print $file; | ||
134 : | |||
135 : | open(TMP , ">/tmp/$project.gbff") or die "Can't open /tmp/$project.gbff"; | ||
136 : | print TMP $file ; | ||
137 : | close(TMP); | ||
138 : | |||
139 : | my $result = check_contigs("/tmp/$project.gbff"); | ||
140 : | |||
141 : | return " /tmp/$project.gbff" , "\n"; | ||
142 : | } | ||
143 : | |||
144 : | |||
145 : | sub get_entries_for_tax_id{ | ||
146 : | my ($tax_id) = @_; | ||
147 : | |||
148 : | my $url = "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id="; | ||
149 : | my $search_result = get($url.$tax_id); | ||
150 : | |||
151 : | my $url_seq_overview = "http://www.ncbi.nlm.nih.gov/"; | ||
152 : | |||
153 : | |||
154 : | # print $search_result; | ||
155 : | |||
156 : | my @lines = split ( "\n" , $search_result); | ||
157 : | |||
158 : | my $nr_seq = 0; | ||
159 : | my $nr_proj = 0; | ||
160 : | my $url_seq = ""; | ||
161 : | my $url_proj = ""; | ||
162 : | my $genome_name = ""; | ||
163 : | |||
164 : | my $next = ""; | ||
165 : | foreach my $line ( @lines ){ | ||
166 : | |||
167 : | if ( $next eq "Sequences"){ | ||
168 : | ($url_seq) = $line =~ m/href=\"([^\"]*)\"/; | ||
169 : | ($nr_seq) = $line =~ m/>(\d*)<\/font/; | ||
170 : | |||
171 : | print STDERR "Genome Sequences: $nr_seq\n"; | ||
172 : | $url_seq =~s/&/&/g; | ||
173 : | # print "URL:\t$url_seq\n"; | ||
174 : | |||
175 : | $next = ""; | ||
176 : | } | ||
177 : | elsif ( $next eq "Projects"){ | ||
178 : | ($url_proj) = $line =~ m/href=\"([^\"]*)\"/; | ||
179 : | ($nr_proj) = $line =~ m/>(\d*)<\/font/; | ||
180 : | |||
181 : | |||
182 : | print STDERR "Genome Projects: $nr_proj \n"; | ||
183 : | |||
184 : | |||
185 : | $next = ""; | ||
186 : | } | ||
187 : | |||
188 : | if ( $line =~ /<title>Taxonomy browser/ ){ | ||
189 : | # print STDERR $line,"\n"; | ||
190 : | } | ||
191 : | if ( $line =~ m/<title>Taxonomy browser/){ | ||
192 : | # print STDERR $line,"\n"; | ||
193 : | |||
194 : | } | ||
195 : | if ( $line =~ /<title>Taxonomy browser\s*\(([^()]+)\)\<\/title\>/ ) { | ||
196 : | $genome_name = $1; | ||
197 : | print STDERR "Genome Name = $1\n"; | ||
198 : | } | ||
199 : | |||
200 : | if ($line =~ m/(Genome[\w;&]+Sequence)/){ | ||
201 : | $next = "Sequences"; | ||
202 : | } | ||
203 : | elsif ($line =~ m/(Genome[\w;&]+Projects)/){ | ||
204 : | $next = "Projects"; | ||
205 : | } | ||
206 : | |||
207 : | } | ||
208 : | |||
209 : | |||
210 : | my $page = get($url_seq_overview.$url_seq); | ||
211 : | |||
212 : | #print "\n $url_seq_overview$url_seq\n"; | ||
213 : | |||
214 : | my (@ids) = $page =~m/www.ncbi.nlm.nih.gov\/sites\/entrez\?Db=genome\&\;Cmd=ShowDetailView\&\;TermToSearch=\d+\">(\w+)<\/a>/gc; | ||
215 : | #<a href="http://www.ncbi.nlm.nih.gov/sites/entrez?Db=genome&Cmd=ShowDetailView&TermToSearch=19221">AC_000091</a> | ||
216 : | # print @ids , "\n"; | ||
217 : | |||
218 : | |||
219 : | my $query = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=".join ("," , @ids) ."&rettype=gb" ; | ||
220 : | # print $query , "\n"; | ||
221 : | my $file = get($query); | ||
222 : | |||
223 : | # print $file; | ||
224 : | |||
225 : | open(TMP , ">/tmp/$tax_id.gbff") or die "Can't open /tmp/$tax_id.gbff"; | ||
226 : | print TMP $file ; | ||
227 : | close(TMP); | ||
228 : | |||
229 : | my $result = check_contigs("/tmp/$tax_id.gbff"); | ||
230 : | |||
231 : | return " /tmp/$tax_id.gbff"; | ||
232 : | } | ||
233 : | |||
234 : | |||
235 : | sub check_contigs{ | ||
236 : | |||
237 : | my ($file) = @_; | ||
238 : | |||
239 : | my $seqio_object = Bio::SeqIO->new( | ||
240 : | -file => $file , | ||
241 : | -format => "genbank", | ||
242 : | ); | ||
243 : | |||
244 : | while ( my $seq = $seqio_object->next_seq ) { | ||
245 : | |||
246 : | print $seq->accession_number , "\n"; | ||
247 : | print $seq->get_dates , "\t" , $seq->division , "\n"; | ||
248 : | # print $seq->seq , "\n"; | ||
249 : | } | ||
250 : | |||
251 : | } |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |