[Bio] / FigKernelScripts / embl2gff.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/embl2gff.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.20 - (view) (download) (as text)

1 : olson 1.20 #
2 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
3 :     # for Interpretations of Genomes. All Rights Reserved.
4 :     #
5 :     # This file is part of the SEED Toolkit.
6 :     #
7 :     # The SEED Toolkit is free software. You can redistribute
8 :     # it and/or modify it under the terms of the SEED Toolkit
9 :     # Public License.
10 :     #
11 :     # You should have received a copy of the SEED Toolkit Public License
12 :     # along with this program; if not write to the University of Chicago
13 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14 :     # Genomes at veronika@thefig.info or download a copy from
15 :     # http://www.theseed.org/LICENSE.TXT.
16 :     #
17 :    
18 : efrank 1.1 #__perl__
19 :    
20 : efrank 1.2 use URI::Escape;
21 : efrank 1.5 use Data::Dumper;
22 : efrank 1.2
my $usage= "Usage: embl2gff -taxId N -orgVersion N [-extra fileName] [-ensemblVer verString] emblFile1.dat [emblFile2.dat ...]\n";

##
# cmd line args
##

# -taxId and -orgVersion are mandatory: -1 is the "not supplied" sentinel
# that is tested once the parsing loop has finished.  -extra and
# -ensemblVer fall back to the defaults given here.
my ($taxId, $orgVerNo, $extraFile,$ensemblVer)=(-1, -1,"extra.txt", "Ensembl-31");
my @fileNames;

# Each recognised switch maps to the variable it fills and to the text that
# introduces the confirmation message printed after the value is stored.
my %switchSpec = (
    "-orgVersion" => [ \$orgVerNo,   "Set org version to" ],
    "-taxId"      => [ \$taxId,      "Taxon ID set to" ],
    "-extra"      => [ \$extraFile,  "Set extras file to" ],
    "-ensemblVer" => [ \$ensemblVer, "Set Ensembl version to" ],
);

# Anything that is not a known switch is taken to be an input file name.
while (@ARGV) {
    my $t = shift @ARGV;

    if (my $spec = $switchSpec{$t}) {
        my ($target, $label) = @$spec;

        # A switch given as the very last argument has no value to consume;
        # refuse it instead of silently storing undef.
        die "Option $t requires a value.\n$usage" unless @ARGV;

        $$target = shift @ARGV;
        print "$label $$target\n";
    } else {
        push( @fileNames, $t );
    }
}

if ($orgVerNo eq "-1") { die $usage; }
if ($taxId eq "-1") { die $usage; }

# With no input files there is nothing to convert.
if (!@fileNames) { die $usage; }

print "Files to process:", Dumper(\@fileNames),"\n";
52 :    
53 : efrank 1.8 ##
54 : efrank 1.16 # Genes can be split over clones. Those clones can end up in
55 :     # two EMBL entries. In that case, the EMBL file enters the
56 :     # transcripts *twice*, duplicating the translation and we end
57 :     # up duplicating the transcript. When this happens, the location
58 :     # info includes a colon (:) as part of scoping information to
59 :     # point to the other clone. We can't just skip anything with a
60 :     # :, so when we hit a colon in the location info, we put the transcript
61 :     # ID into this hash (colonHash) and let it go through. If we hit
62 :     # another colon, we look in the hash, and if the id is already there,
63 :     # we skip it
64 :     #
65 :     ##
66 :    
67 :     my %colonHash; # transcript id => contig (AC) on which do_file first saw it; do_file records every transcript id here, whether or not its location contained a colon
68 :     my $hasColon = 0; # NOTE(review): not referenced anywhere else in the visible file
69 :     my %tscriptHash; # NOTE(review): not referenced anywhere else in the visible file
70 :    
71 :     ##
72 : efrank 1.8 # Initialize a hash that remaps tag names in db_xrefs. A
73 :     # Xref not in this list will be dropped.
74 :     ##
75 :    
# Maps the tag of an EMBL db_xref ("tag:value") to the tag written to the
# GFF output.  A tag that has no entry here is dropped by the db_xref
# handling in do_file.  The tag list comes from scanning the Ensembl 31
# release.
my %dbXrefMap = (

    # Tags that are passed through under their own name.
    ( map { ($_ => $_) }
        # human and common to many
        'EntrezGene', 'UniGene',
        'Uniprot/SPTREMBL', 'Uniprot/SWISSPROT',
        'EMBL', 'protein_id',
        'MIM', 'GO', 'IPI', 'PDB',
        # Anopheles related
        'Anopheles_symbol',
        'Celera_Gene', 'Celera_Pep', 'Celera_Trans',
        'prediction_SPTREMBL',
        # chimp
        'Ens_Hs_transcript', 'Ens_Hs_translation',
        # fly
        'FlyBaseName_translations', 'FlyBaseORFNames', 'FlyBaseSynonyms',
        'drosophila_translation_id',
        # mouse - are these only in STS?
        'MGD', 'Whitehead-MRC_RH',
        # rat - some are mostly STS
        'RGD', 'RGD_NUM', 'RH', 'RH_map.2.2',
        # tetraodon (green puffer)
        'Genoscope_annotated_gene', 'Genoscope_pred_gene',
        'Genoscope_pred_transcript',
        # zebra fish
        'ZFIN', 'ZFIN_ID',
    ),

    # Tags that are renamed on the way out.
    # human and common to many
    'RefSeq_dna'               => 'NCBI_NM',
    'RefSeq_dna_predicted'     => 'NCBI_NM',
    'RefSeq_peptide'           => 'NCBI_NP',
    'RefSeq_peptide_predicted' => 'NCBI_NP',
    'HUGO'                     => 'HGNC',
    # worm
    'wormbase_transcript'      => 'WP',
    'wormpep_id'               => 'WP',
    # fly
    'flybase_polypeptide_id'   => 'FB',
    'flybase_transcript_id'    => 'FB',
    # yeast
    'SGD'                      => 'SGD_LOCUS',
);
132 :    
133 : mkubal 1.4
134 : efrank 1.9
135 :     ##
136 :     # load up additional alias and attribute
137 :     # info that was parsed separately
138 :     ##
139 :    
# Refuse to start when an extras file was named but cannot be read.
# The earlier form, `! $extraFile eq ""`, parsed as `(!$extraFile) eq ""`
# because unary ! binds tighter than eq; it only gave the intended answer
# by coincidence and treated a file literally named "0" as "no file".
if ( $extraFile ne "" && ! -r $extraFile ) {
    die "File with pre-parsed information, $extraFile, not found.\n";
}

# load_extra receives the name in two-argument-open form (leading "<").
my $extra_info=load_extra("<$extraFile");
145 : efrank 1.13 #print &Dumper($extra_info),"\n";
146 : efrank 1.9
147 :    
148 :     # Watch out- this prototype has global vars, like below, that are
149 :     # changed in do_file() and also in the foreach file loop below, coupled
150 :     # with state in the parser so that you get one big snarled ball of
151 :     # interdependent goo. For heaven's sake.
152 :    
153 :    
# File-level state shared with do_file():
#   $taxonomy     - accumulated text of the OC lines; appended to by do_file
#   $write_header - true while the GFF header for the current output file
#                   still has to be written; cleared by do_file once written
my $taxonomy = "";
my $write_header= 1;

my $file_counter = 0;       # number of input files processed so far
my $out_file_counter = 0;   # index of the current Sample_*_N.gff output set
my $peg_counter = 0;        # running CDS number, threaded through do_file
my $thresh= 800*1024*1024;  # start a new output set after this many bytes
my $bytes = 10 * $thresh; #force fake over high water mark 1st time to open files

foreach my $file (@fileNames)
{

    print "Doing file $file_counter ($file).\n";
    print " peg_counter=$peg_counter\n";

    if ($bytes > $thresh)
    {
        print " $bytes above high-water of $thresh. Starting new output file.\n";

        # Finish the previous output set before moving on to the next one.
        if ($file_counter > 0) {
            print " Concat'd files\n";
            _concat_sample_parts($out_file_counter);
        }

        $out_file_counter = $out_file_counter + 1;

        # OUTPUT1..OUTPUT4 are the bareword handles do_file prints to.
        open(OUTPUT1,">Sample_header_$out_file_counter.gff")
            or die "Cannot write Sample_header_$out_file_counter.gff: $!\n";
        open(OUTPUT2,">Sample_body_$out_file_counter.gff")
            or die "Cannot write Sample_body_$out_file_counter.gff: $!\n";
        open(OUTPUT3,">Sample_seqs_$out_file_counter.gff")
            or die "Cannot write Sample_seqs_$out_file_counter.gff: $!\n";
        open(OUTPUT4,">Sample_contigs_$out_file_counter.gff")
            or die "Cannot write Sample_contigs_$out_file_counter.gff: $!\n";
        $bytes = 0;
        $write_header = 1;
    }

    (my $thisBytes, $peg_counter) = do_file( $file, $peg_counter, 1);
    $write_header = 0;
    $bytes = $bytes + $thisBytes;
    print " Wrote $thisBytes bytes. Now total=$bytes\n";

    $file_counter = $file_counter + 1;
}


# Assemble the last output set; skipped when no output file was ever opened.
_concat_sample_parts($out_file_counter) if $out_file_counter > 0;

exit(0);


# _concat_sample_parts($n)
#
# Closes the four part files of output set $n and joins them, in the order
# header, body, seqs, contigs, into Sample_$n.gff using the external "cat"
# command.  The handles are closed explicitly so that everything printed to
# them is on disk before "cat" reads the files.  Problems are reported with
# warn rather than die so that the remaining input is still processed.
sub _concat_sample_parts {
    my ($n) = @_;

    foreach my $part (\*OUTPUT1, \*OUTPUT2, \*OUTPUT3, \*OUTPUT4) {
        next unless defined fileno($part);
        close($part) or warn "Could not close part file of output set $n: $!\n";
    }

    # The file names are built from an integer counter only, so handing the
    # command line to the shell (needed for the redirection) is safe.
    my $cmd = "cat Sample_header_$n.gff Sample_body_$n.gff Sample_seqs_$n.gff Sample_contigs_$n.gff > Sample_$n.gff";
    system($cmd) == 0
        or warn "Concatenation into Sample_$n.gff failed (status $?)\n";

    return;
}
200 :    
201 :    
# do_file($file, $peg_counter, $debug)
#
# Reads one EMBL flat file line by line and writes its contents to the four
# already-open output handles:
#   OUTPUT1 - the "#gff-version" / "#seed" header lines (only while the
#             file-level $write_header flag is set and once a taxon db_xref
#             has been seen)
#   OUTPUT2 - "##sequence-region" lines and one "cds" row per CDS feature
#   OUTPUT3 - ">pro.N" records holding the CDS translations
#   OUTPUT4 - ">contig" records holding the lines that follow an SQ line
#
# Parameters:
#   $file        - path of the EMBL file to read
#   $peg_counter - number of the last CDS emitted so far; incremented for
#                  every CDS feature found here
#   $debug       - when true, progress details are printed to STDOUT
#
# Returns the list ($bytesOut, $peg_counter): the number of characters
# written to the four handles and the updated CDS counter.
# Dies when $file cannot be opened.
#
# State: besides its arguments the sub reads the file-level $write_header,
# $taxonomy, $orgVerNo, $ensemblVer, %dbXrefMap, %colonHash and $extra_info,
# and it keeps its per-feature state in undeclared package globals
# ($contig, $name, $taxon_id, $gene, $tscript, $col9, $translation, ...).
# $taxon_id is assigned only while $write_header is true, so files processed
# after the header has been written reuse the value from the earlier file.
sub do_file {

    my($file, $peg_counter, $debug) = @_;

    my $bytesOut = 0;

    $new_entry = 0;           #says when we move to new CDS
    $biggest = 0;             #used to find start/end of gene
    $smallest = 100000000;    #used to find start/end of gene
    $record_translation = 0;
    $last_seq_region_name = "";
    $record_next_line = 0;
    my $doing_tax;

    #the exact value of this is used by embl2gff_addmd5
    my $checksumPlaceholder = "checksum_placeholder_xxxxxxxxxxx";

    # Three-argument open: the name is always treated as a plain path.
    open(INPUT, '<', $file) or die "Cannot open EMBL file $file: $!\n";
    while ($_ =<INPUT>)
    {
        if ($write_header)
        {
            if ($_ =~ /AC\s+(.*)(\s+)$/){$contig = $1;}
            if ($_ =~ /OS\s+(.*)/ ){$name = $1;}
            if ($_ =~ /OC\s+(.*)/ )
            {
                my $ocText = $1;
                # Taxonomy is collected only from the first run of OC lines
                # met while $taxonomy is still empty; the XX line that
                # follows ends the collection.
                if ($taxonomy eq "") {$doing_tax=1;}
                if ($doing_tax) {$taxonomy = $taxonomy.$ocText;}
            }
            if ($_ =~ /XX/ )
            {
                if ($doing_tax) {$doing_tax=0;}
            }

            if ($_ =~ /FT\s+source\s+(\d+)..(\d+)/ ){$start = $1; $stop =$2}
            if ($_ =~ /db_xref="taxon:(.*)"/ )
            {
                $taxon_id = $1;

                my @headerLines = (
                    "#gff-version 3\n",
                    "#seed\tgenome_id\t$taxon_id.$orgVerNo\n",
                    "#seed\ttaxon_id\t$taxon_id\n",
                    "#seed\tname\t$name\n",
                    "#seed\ttaxonomy\t$taxonomy\n",
                    "#seed\tgenome_md5\t$checksumPlaceholder\n",
                    "#seed\tproject\t$ensemblVer\n",
                );
                foreach my $headerLine (@headerLines) {
                    $bytesOut = $bytesOut + length($headerLine);
                    print OUTPUT1 $headerLine;
                }
                $write_header=0;
            }
        }


        if ($_ =~ /^AC\s+(.*)/ ){$contig = $1;}
        if ($_ =~ /FT\s+source\s+(\d+)..(\d+)/ )
        {
            $start = $1; $stop =$2;
            my $seq_region_name = "##sequence-region\t$contig\t$start\t$stop\n";
            # Emit each distinct region line once per run of identical lines.
            if ($seq_region_name ne $last_seq_region_name)
            {
                $bytesOut = $bytesOut + length($seq_region_name);
                print OUTPUT2 $seq_region_name;
                $last_seq_region_name = $seq_region_name;
            }
        }


        # A CDS feature line starts a new entry.  Features whose location
        # contains a colon (split over clones) are NOT skipped here; the
        # duplicates are caught further down via the transcript id.
        if ($_ =~/FT\s+CDS\s+/)
        {
            if ($record_translation){
                print "ERROR- new CDS before wrote translation $gene $tscript\n";
            }

            $strand ="";
            $new_entry = 1;
            $peg_counter = $peg_counter + 1;
            my $figXref = "FIG_ID:fig|$taxon_id.$orgVerNo.peg.$peg_counter";

            # $col9 collects the Alias list; the Dbxref and Ontology_term
            # lists are built separately and appended when the row is written.
            $col9 = "ID=cds."."$peg_counter;Alias=";
            $col9Sep = "";
            $col9Ont = "";
            $col9OntSep = "";
            $col9Xref = uri_escape($figXref);
            $col9XrefSep =",";
            $prot_id = "pro.".$peg_counter;

            if ($debug) { print "\n\n"; }
        }

        if($new_entry)
        {
            # Track the overall extent of the feature across all the
            # location segments seen for it.
            if($_ =~ /(complement\()?(\d+)\..*\.(\d+)/)
            {
                my ($isComplement, $from, $to) = ($1, $2, $3);
                my ($bigger, $smaller);
                if($isComplement){$strand ="-";} else{$strand = "+";}
                if ($from < $to){ $bigger = $to; $smaller = $from}
                else { $bigger = $from; $smaller = $to}
                if($bigger > $biggest){$biggest = $bigger};
                if($smaller < $smallest){$smallest = $smaller};
            }


            if ($_ =~/gene="(.*)"/)
            {
                if ($debug) { print "$peg_counter\n"; }
                if ($debug) { print "$contig\n"; }
                $gene = $1;
                $col9 = $col9.$col9Sep.uri_escape("EnsemblGene:$gene");
                $col9Sep = ",";
                if ($debug) { print "GENE:$gene\n"; };
            }

            if ($_ =~/protein_id="(.*)"/ )
            {
                my $protId = $1;
                $col9 = $col9.$col9Sep.uri_escape("EnsemblProtein:$protId");
                $col9Sep = ",";
                if ($debug) { print "ProteinId:$protId\n"; }
            }

            if ($_ =~/"transcript_id=(.*)"/ )
            {
                $tscript = $1;
                if ($colonHash{$tscript}) {
                    #already handled this one. bail out
                    my $oldContig=$colonHash{$tscript};
                    if ($contig eq $oldContig) {
                        print "Repeated occurance for $tscript. AC=$contig. Old=$oldContig\n";
                    } else {
                        print "Repeated occurance for $tscript. DIFFER AC=$contig. Old=$oldContig\n";
                    }
                    # Undo the numbering of the abandoned CDS and forget the
                    # coordinates gathered for it; otherwise they would widen
                    # the extent reported for the next CDS.
                    $peg_counter = $peg_counter - 1;
                    $new_entry=0;
                    $biggest = 0; $smallest = 100000000;
                    next;
                } else {
                    print "First occurance for $tscript. AC=$contig.\n";
                    $colonHash{$tscript}=$contig;
                }

                $col9 = $col9.$col9Sep.uri_escape("EnsemblTranscript:$tscript");
                $col9Sep = ",";
                if ($debug) { print "TSCRIPT:$tscript\n"; }
            }


            # handle the db_xrefs

            if ($_ =~/db_xref="(.*)"/)
            {
                my @xrefParts = split(":",$1);
                my $oldTag = $xrefParts[0];
                my $newTag = $dbXrefMap{$oldTag};
                # Tags without a mapping are dropped.
                if ($newTag) {
                    my $newValue = join( ":", @xrefParts[1..$#xrefParts]);
                    #GO is goofy because they did GO:GO:1234
                    if ($oldTag eq 'GO') {
                        my $escaped = uri_escape("$newValue");
                        $col9Ont = $col9Ont.$col9OntSep.$escaped;
                        $col9OntSep=",";
                    } else {
                        my $escaped = uri_escape("$newTag:$newValue");

                        $col9 = $col9.$col9Sep.$escaped;
                        $col9Sep = ",";

                        $col9Xref = $col9Xref.$col9XrefSep.$escaped;
                        $col9XrefSep = ",";
                    }
                }
            }


            # First line of a translation: keep what follows the opening
            # quote and start collecting continuation lines.
            if ($_ =~ /FT\s+\/translation="(\w+)/)
            {
                $translation = $1;
                $record_translation = 1;
            }

            if ($record_translation)
            {
                if ($_ =~ /FT\s+\/translation="([\*\w]+)\"$/)
                {
                    #trans all in one line. already caught the
                    #translation above so don't need to append
                    #but do need to finalize so set to 0:
                    $record_translation = 0;
                }
                # Continuation line that does not end in the closing quote.
                #(pattern chosen to prevent duplication of last line of seq)
                if ($_ =~ /FT\s+([\*\w]+)[^\"]$/ ) {
                    $translation = $translation.$1;
                }
                # Last continuation line: ends in the closing quote.
                if ($_ =~/FT\s+([\*\w]+)\"$/ )
                {
                    $translation = $translation.$1;
                    $record_translation = 0;
                }

                # Closing quote alone on its own line.
                if ($_ =~/FT\s+\"$/ )
                {
                    $record_translation = 0;
                }


                # The translation is complete: write the feature out.
                if (!$record_translation)
                {
                    #add extra info per gene

                    if ($gene && $extra_info->{$gene} )
                    {
                        foreach my $extra (@{$extra_info->{$gene}})
                        {
                            $col9 = $col9.$col9Sep.uri_escape("$extra");
                            $col9Sep = ",";
                        }
                    }

                    #
                    #add extra info per transcript

                    if ($tscript && $extra_info->{$tscript})
                    {
                        foreach my $extra (@{$extra_info->{$tscript}} )
                        {
                            $col9 = $col9.$col9Sep.uri_escape("$extra");
                            $col9Sep = ",";
                        }
                    }

                    #look for a function

                    # Starts out empty so that "Note=" is well defined when
                    # no function is known for the transcript.
                    my $function = "";
                    if ($tscript && $extra_info->{$tscript."_function"})
                    {
                        #there should only be one entry.
                        foreach my $fn (@{$extra_info->{$tscript."_function"}} ) {
                            print " found function [$fn]\n";
                            $function=uri_escape($fn);
                        }
                    }


                    $col9 = $col9.";Dbxref=".$col9Xref.";Ontology_term=".$col9Ont.";Note=".$function.";translation_id=$prot_id;";
                    if ($debug) { print "COLLATE\t$peg_counter\t$gene\t$tscript\n" };
                    if ($debug) { print "col9 = [$gene] $col9\n"; }

                    my $cdsRow = "$contig\tEnsembl\tcds\t$smallest\t$biggest\t.\t$strand\t.\t$col9\n";
                    $bytesOut = $bytesOut + length($cdsRow);
                    print OUTPUT2 $cdsRow;

                    # Reset the per-feature state for the next CDS.
                    $new_entry = 0;
                    $biggest = 0; $smallest = 100000000;
                    $record_translation = 0;
                    if ($debug) { print "final:$translation\n"; }
                    $translation =~ s/\s//g;

                    my $proteinRecord = ">$prot_id\n$translation\n";
                    $bytesOut = $bytesOut + length($proteinRecord);
                    print OUTPUT3 $proteinRecord;
                    $translation="";
                    $gene="";
                    $tscript="";
                }
            } #record translation


        } # new entry

        # Lines after an SQ line: glue up to six word groups together,
        # strip the digits and write the result to the contig file.  The
        # first line without any word character ends the sequence.
        if($record_next_line)
        {
            if($_ =~ /(\w+)(\s+)?(\w+)?(\s+)?(\w+)?(\s+)?(\w+)?(\s+)?(\w+)?(\s+)?(\w+)?.*/)
            {
                # Groups that did not take part in the match are undef and
                # contribute nothing to the concatenation.
                my $joined = $1.$3.$5.$7.$9.$11;
                if($joined =~/(\w+[^0-9])(\d+)$/)
                {
                    my $seq = $1;
                    $seq =~ s/[0-9]//g;
                    $bytesOut=$bytesOut + length("$seq\n");
                    print OUTPUT4 "$seq\n";
                }
                else
                {
                    $joined =~ s/[0-9]//g;
                    $bytesOut = $bytesOut + length( "$joined\n" );
                    print OUTPUT4 "$joined\n";
                }
            }
            else
            {
                $record_next_line = 0;
            }
        }
        if ($_ =~ /^SQ.*/)
        {
            $bytesOut = $bytesOut + length( ">$contig\n");
            print OUTPUT4 ">$contig\n";
            $record_next_line = 1;
        }

    } #while input
    close(INPUT);

    return $bytesOut, $peg_counter;

} #end do_file
523 : efrank 1.1
524 :    
sub load_extra {
    # some info is not in the embl file. load a file with extra information to
    # add to the alias and attribute info that's been parsed already. The format
    # of each line is
    #
    #    key   Alias|Attribute   text
    #
    # where the pieces are tab separated. key is typically an ensembl gene id
    # or ensembl transcript id depending upon whether the extra info is to be
    # associated per gene or per transcript.
    #
    # Parameter: $fname - name of the file, optionally carrying the leading
    #            "<" of a two-argument open (that is how this script calls it).
    #
    # Returns a reference to a hash from key to a list of the texts found for
    # that key, in file order. Lines without a key or without a third field
    # are ignored. When no file name is given, or the file cannot be opened
    # (a warning is printed in that case), the hash is empty.
    #

    my($fname) = @_;
    my %extra_info;

    # Peel off the read-mode marker so that the rest can be opened as a
    # plain path with a three-argument open.
    my $path = defined $fname ? $fname : "";
    $path =~ s/^\s*<\s*//;

    # No extras file configured: nothing to load.
    return \%extra_info if $path eq "";

    my $in;
    if (!open($in, '<', $path)) {
        warn "load_extra: cannot open $path: $!\n";
        return \%extra_info;
    }

    while (my $line = <$in>)
    {
        chomp $line;
        my @fields = split(/\t/, $line);

        # Field 1 (Alias|Attribute) is not used; only key and text matter.
        my ($key, $text) = @fields[0, 2];
        next unless defined $key && $key ne "" && defined $text;

        push( @{$extra_info{$key}}, $text);
    }
    close($in);

    return \%extra_info;
}
557 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3