[Bio] / FigKernelPackages / raelib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/raelib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.17, Thu May 26 18:15:59 2005 UTC revision 1.18, Fri Jun 10 15:36:37 2005 UTC
# Line 73  Line 73 
73    
74  =head2 pirsfcorrespondence  =head2 pirsfcorrespondence
75    
76   Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written.  Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written. This is so that we can easily go back and forth.
  This is so that we can easily go back and forth.  
77    
78   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat   The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat
79    
# Line 83  Line 82 
82     to      : file to write information to     to      : file to write information to
83     verbose : report on progress     verbose : report on progress
84    
85    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
86    
87   Returns the number of lines in the pirsinfo file that were read.   Returns the number of lines in the pirsinfo file that were read.
88    
89  =cut  =cut
# Line 93  Line 94 
94    print STDERR "File $from does not exist as called in $0\n";    print STDERR "File $from does not exist as called in $0\n";
95    return 0;    return 0;
96   }   }
97     if ($from =~ /\.gz$/) {
98      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
99     }
100     else {
101   open (IN, $from) || die "Can't open $from";   open (IN, $from) || die "Can't open $from";
102     }
103   open (OUT, ">$to") || die "Can't write to $to";   open (OUT, ">$to") || die "Can't write to $to";
104   my $linecount;   my $linecount;
105   while (<IN>) {   while (<IN>) {
# Line 121  Line 127 
127     to      : file to write information to     to      : file to write information to
128     verbose : report on progress     verbose : report on progress
129    
130   Returns the number of lines in the pirsinfo file that were read.  Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
131    
132     Returns the number of lines in the uniprotkb file that were read.
133    
134  =cut  =cut
135    
# Line 131  Line 139 
139    print STDERR "File $from does not exist as called in $0\n";    print STDERR "File $from does not exist as called in $0\n";
140    return 0;    return 0;
141   }   }
142     if ($from =~ /\.gz$/) {
143      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
144     }
145     else {
146   open (IN, $from) || die "Can't open $from";   open (IN, $from) || die "Can't open $from";
147     }
148   open (OUT, ">$to") || die "Can't write to $to";   open (OUT, ">$to") || die "Can't write to $to";
149   my $linecount;   my $linecount;
150   while (<IN>) {   while (<IN>) {
# Line 151  Line 164 
164   return $linecount;   return $linecount;
165  }  }
166    
167    =head2 prositecorrespondence
168    
169    Generate a correspondence table between prosite and seed using sp id's and seed ids.
170    
171    The SwissProt prosite file is from ftp://ca.expasy.org/databases/prosite/release_with_updates/prosite.dat and is in horrible swiss prot format, so we'll parse out those things that we need and put them in the file
172    
173    The output file will have the following columns:
174    
175    prosite family accession number, prosite family name, family type, swiss-prot protein id, fig protein id.
176    
177    The family type is one of rule, pattern, or matrix. Right now (Prosite Release 19.2 of 24-May-2005) there are 4 rules, 1322 patterns, and 521 matrices.
178    
179     This method takes three arguments:
180       from    : prosite file
181       to      : file to write information to
182       verbose : report on progress
183    
184    Note that if the from filename ends in .gz it assumed to be a gzipped file and will be opened accordingly.
185    
186     Returns the number of lines in the prosite file that were read.
187    
188    =cut
189    
190    sub prositecorrespondence {
191     my ($self, $from, $to, $verbose)=@_;
192     unless (-e $from) {
193      print STDERR "File $from does not exist as called in $0\n";
194      return 0;
195     }
196     if ($from =~ /\.gz$/) {
197      open(IN, "|gunzip -c $from") || die "Can't open $from using a gunzip pipe";
198     }
199     else {
200      open (IN, $from) || die "Can't open $from";
201     }
202     open (OUT, ">$to") || die "Can't write to $to";
203     my $linecount;
204     my ($famac, $famname, $famtype)=('','','');
205     while (<IN>) {
206      chomp;
207      $linecount++;
208      if ($verbose && !($linecount % 10000))  {print STDERR "Parsed $linecount lines\n"}
209      if (m#//#) {($famac, $famname, $famtype)=('','',''); next}
210      elsif (m/^ID\s*(.*?);\s*(\S+)/) {($famname, $famtype)=($1, $2); next}
211      elsif (m/^AC\s*(\S+)/) {$famac=$1; next}
212      next unless (m/^DR/); # ignore all the other crap in the prosite file for now. Note we might, at some point, want to grab all that, but that is for another time.
213      #
214      # this is the format of the DR lines:
215      # DR   P11460, FATB_VIBAN , T; P40409, FEUA_BACSU , T; P37580, FHUD_BACSU , T;
216      s/^DR\s*//;
217      foreach my $piece (split /\s*\;\s*/, $_) {
218       my ($acc, $nam, $unk)=split /\s*\,\s*/, $piece;
219       foreach my $fig ($self->swiss_pir_ids($acc)) {
220        print join "\t", ($famac, $famname, $famtype, $acc, $fig), "\n";
221       }
222      }
223     }
224    }
225    
226  =head2 swiss_pir_ids()  =head2 swiss_pir_ids()
227    
228  SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's.  SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's, and return an array of FIG id's that match the ID.
229    
230  =cut  =cut
231    
232  sub swiss_pir_ids {  sub swiss_pir_ids {
233   my ($self, $id)=@_;   my ($self, $id)=@_;
234   return () unless ($id);   return () unless ($id);
235     $id =~ s/^\s+//; $id =~ s/\s+$//; # trim off the whitespace
236    
237   my @return=($fig->by_alias("uni|$id"));   my @return=($fig->by_alias("uni|$id"));
238   return @return if ($return[0]);   return @return if ($return[0]);

Legend:
Removed from v.1.17  
changed lines
  Added in v.1.18

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3