[Bio] / FigKernelScripts / svr_find_protein.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/svr_find_protein.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 use strict;
2 :    
3 :     use Getopt::Long;
4 :     use ScriptThing;
5 :     use SAPserver;
6 :    
7 :     #
8 :     # This is a SAS Component
9 :     #
10 :    
11 :    
12 :     =head1 svr_fids_for_md5
13 :    
14 :     Output the FIG IDs and functional assignments of all features that produce a
15 :     specific protein. This script takes as input a single protein sequence on the
16 :     command-line or a FASTA file. It outputs a tab-delimited file connecting each
17 :     specified protein to its features.
18 :    
19 :     ------
20 :     Example:
21 :    
22 :     svr_find_protein MENIIARRYAKAIASRADINDFYQNLCILNFAFVLPKFKNIIESNEIKKERKMEFLDSFFDIKNSSFQNFLRLLIENSRLECIPQIVKELERQKAFKENIFVGIVYSKEKLSQENLKDLEVKLNKKFDANIKLNNKISQDDSVKIELEELGYELSFSMKALQNKLNEYILKII
23 :    
24 :     or
25 :    
26 :     svr_find_protein < prots.fasta > fids.tbl
27 :     ------
28 :    
29 :     =back
30 :    
31 :     =head2 Command-Line Options
32 :    
33 :     =over 4
34 :    
35 :     =item -n
36 :    
37 :     If specified and the protein is provided as a parameter (instead of a FASTA
38 :     file provided as input), then the name to be given to the protein in the
39 :     output file.
40 :    
41 :     =back
42 :    
43 :     =head2 Output Format
44 :    
45 :     The standard output is a tab-delimited file with four columns: (1) the
46 :     protein ID, (2) the FIG ID of a gene that produces the protein, (3) the
47 :     name of the genome containing the gene, and (4) the functional assignment of the
48 :     gene. There will frequently be multiple features for a single protein.
49 :    
50 :     =cut
51 :    
52 :     use SeedUtils;
53 :     use SAPserver;
54 :     use Getopt::Long;
55 :     use ScriptThing;
56 :     use gjoseqlib;
57 :     use Digest::MD5;
58 :    
59 :     my $usage = "usage: svr_find_protein [-n name] proteinSequence";
60 :    
61 :     my $name;
62 :     my $rc = GetOptions('n=s' => \$name);
63 :     if (! $rc) { print STDERR $usage; exit }
64 :     my $sapObject = SAPserver->new();
65 :    
66 :     my $protein = $ARGV[0];
67 :    
68 :     # This list will contain FASTA protein sequence triplets.
69 :     my @seqs;
70 :     # Are we using a protein from the command line or a FASTA file on the standard
71 :     # input?
72 :     if ($protein) {
73 :     # Here we have a protein from the command line. Insure we have a name for it.
74 :     if (! $name) {
75 :     $name = "protein1";
76 :     }
77 :     push @seqs, [$name, undef, $protein];
78 :     } else {
79 :     # Here we have input via a FASTA file. First, we do a little hack so that we have
80 :     # an option to run in this mode in the debugging environment (where we can't use
81 :     # direct STDIN support).
82 :     ScriptThing::AdjustStdin();
83 :     # Now read the FASTA tripled from the standard input.
84 :     @seqs = gjoseqlib::read_fasta(\*STDIN);
85 :     }
86 :     # Now we run through the triples in batches of 100, getting the feature IDs.
87 :     for (my $i = 0; $i < @seqs; $i += 100) {
88 :     # Get the range of triples in this group.
89 :     my $i2 = $i + 99;
90 :     $i2 = $#seqs if ($i2 > $#seqs);
91 :     # Compute the associated features.
92 :     my $protHash = $sapObject->proteins_to_fids(-prots => [map { $_->[2] } @seqs[$i .. $i2] ]);
93 :     # Now loop through the sequences in this group, writing out the features for each.
94 :     for (my $j = $i; $j <= $i2; $j++) {
95 :     # Get the current triple.
96 :     my ($id, undef, $sequence) = @{$seqs[$j]};
97 :     # Get this sequence's features.
98 :     my $fids = $protHash->{$sequence};
99 :     # Insure we found some.
100 : parrello 1.2 if (! $fids || ! @$fids) {
101 : parrello 1.1 print STDERR "No genes found for $id.\n";
102 :     } else {
103 :     # Compute the feature assignments.
104 :     my $fidHash = $sapObject->ids_to_data(-ids => $fids,
105 :     -data => ['genome-name', 'function']);
106 :     # Output the features.
107 :     for my $fid (@$fids) {
108 :     print join("\t", $id, $fid, @{$fidHash->{$fid}[0]}) . "\n";
109 :     }
110 :     }
111 :     }
112 :     }
113 :     # All done.

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3