[Bio] / FigKernelScripts / make_fam_tabs.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/make_fam_tabs.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (view) (download) (as text)

1 : olson 1.4 #
2 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
3 :     # for Interpretations of Genomes. All Rights Reserved.
4 :     #
5 :     # This file is part of the SEED Toolkit.
6 :     #
7 :     # The SEED Toolkit is free software. You can redistribute
8 :     # it and/or modify it under the terms of the SEED Toolkit
9 :     # Public License.
10 :     #
11 :     # You should have received a copy of the SEED Toolkit Public License
12 :     # along with this program; if not write to the University of Chicago
13 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14 :     # Genomes at veronika@thefig.info or download a copy from
15 :     # http://www.theseed.org/LICENSE.TXT.
16 :     #
17 :    
18 : overbeek 1.1
use strict;
use warnings;

# make_fam_tabs -- build the tab-delimited lookup tables for protein families.
#
# Usage: make_fam_tabs Sources Synonyms Dir
#
#   Sources   directory holding one subdirectory per family source.  Each
#             subdirectory must contain an id.map file (family <TAB> local
#             protein id) and may contain a family.funcs file
#             (family <TAB> function).  Entries starting with "." and entries
#             that are not directories are ignored.
#   Synonyms  file with one set of tab-separated protein ids per line.
#   Dir       existing directory that receives the output tables.
#
# Output files (all are truncated and rewritten):
#   Dir/localid_cid        protein id <TAB> cid
#   Dir/localfam_cid       family     <TAB> cid
#   Dir/localfam_function  family     <TAB> function
#
# A "cid" is a positive integer.  All ids on one Synonyms line share one cid
# (the line number); an id that first appears in an id.map file gets the next
# unused cid.
#
# Dies if an argument is missing, if an output file, the Sources directory,
# the Synonyms file or any id.map cannot be opened, or if an output file
# cannot be closed cleanly.  Malformed input lines are reported on STDERR and
# skipped.

my $usage = "usage: make_fam_tabs Sources Synonyms Dir";

my $sources = shift @ARGV;
my $syn     = shift @ARGV;
my $dir     = shift @ARGV;
($sources && $syn && $dir) or die $usage;

# Three-argument open with lexical handles: file names are taken literally,
# so odd characters in a path cannot change the open mode.
open(my $id_cid_fh, '>', "$dir/localid_cid")
    or die "could not open $dir/localid_cid: $!";
open(my $fam_cid_fh, '>', "$dir/localfam_cid")
    or die "could not open $dir/localfam_cid: $!";
open(my $fam_func_fh, '>', "$dir/localfam_function")
    or die "could not open $dir/localfam_function: $!";

opendir(my $sources_dh, $sources) or die "could not open $sources: $!";
my @specific = grep { !/^\./ } readdir($sources_dh);
closedir($sources_dh);

# Pass 1: every id on one line of the synonyms file shares that line's cid.
my %to_cid;
my $cid = 1;

open(my $syn_fh, '<', $syn) or die "could not open $syn: $!";
while (defined(my $line = <$syn_fh>))
{
    chomp $line;
    foreach my $prot (split(/\t/, $line))
    {
        unless ($prot)
        {
            print STDERR "No protein id (|$prot|) in $syn at $line\n";
            next;
        }
        print {$id_cid_fh} "$prot\t$cid\n";
        $to_cid{$prot} = $cid;
    }
    # The counter advances once per line, including lines with no usable id.
    $cid++;
}
close($syn_fh);

# Pass 2: walk each source directory.
foreach my $source (@specific)
{
    # don't try and process it if it is not a directory!
    next unless (-d "$sources/$source");

    my $id_map = "$sources/$source/id.map";
    open(my $idmap_fh, '<', $id_map) or die "could not open $id_map: $!";
    while (defined(my $line = <$idmap_fh>))
    {
        chomp $line;
        my ($fam, $lid) = split(/\t/, $line);

        # Reject a blank id BEFORE touching %to_cid.  Recording it first
        # would bind the blank key to the current cid without consuming that
        # number, so the next new protein would be handed the same cid and
        # later blank rows would attach their family to that protein.
        unless ($lid)
        {
            $lid = '' unless defined $lid;
            print STDERR "No protein id (|$lid|) in $id_map at $line\n";
            next;
        }

        if (!$to_cid{$lid})
        {
            # we don't have a synonym for this protein, so give it a
            # unique id of its own and record it in the id table
            $to_cid{$lid} = $cid;
            print {$id_cid_fh} "$lid\t$cid\n";
            $cid++;
        }
        print {$fam_cid_fh} "$fam\t$to_cid{$lid}\n";
    }
    close($idmap_fh);

    # family.funcs is optional: a source without one is silently skipped.
    my $fam_funcs = "$sources/$source/family.funcs";
    if (open(my $func_fh, '<', $fam_funcs))
    {
        while (defined(my $line = <$func_fh>))
        {
            # ignore blank lines, comment lines starting with #, and the
            # stray line starting with | that ends some files
            next if ($line =~ /^\s+$/ || $line =~ /^\s*\#/ || $line =~ /^\|/);
            chomp $line;

            if ($line !~ /\t/)
            {
                print STDERR "Not enough columns in $fam_funcs at $line\n";
                next;
            }
            my @cols = split(/\t/, $line);
            if (@cols > 2)
            {
                print STDERR "Too many columns in $fam_funcs at $line\n";
                next;
            }

            # split drops trailing empty fields, so "family<TAB>" yields a
            # single column and $func is undefined here.
            my ($fam, $func) = @cols;

            # trim leading and trailing spaces
            for ($fam, $func)
            {
                next unless defined $_;
                s/^\s+//;
                s/\s+$//;
            }

            # define the function if it is unknown; this has to be decided
            # after trimming so that a whitespace-only column counts as
            # missing too
            $func = "unknown" if (!defined($func) || $func eq '');

            print {$fam_func_fh} "$fam\t$func\n";
        }
        close($func_fh);
    }
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close($id_cid_fh)   or die "could not close $dir/localid_cid: $!";
close($fam_cid_fh)  or die "could not close $dir/localfam_cid: $!";
close($fam_func_fh) or die "could not close $dir/localfam_function: $!";

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3