[Bio] / FigKernelScripts / find_neighbors_using_figfams.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/find_neighbors_using_figfams.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (view) (download) (as text)

1 : gdpusch 1.1 # -*- perl -*-
2 :     ########################################################################
3 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     ########################################################################
18 :    
19 :     use strict;
20 :     use warnings;
21 :    
22 :     use FIG;
23 : gdpusch 1.10 my $fig = FIG->new();
24 : gdpusch 1.1
25 :     use FigFams;
26 :     use ToCall;
27 :     use NewGenome;
28 :    
29 :     $0 =~ m/([^\/]+)$/o;
30 :     my $self = $1;
31 :     my $usage = "usage: $self [-fams=FigfamsData] To_Call_Dir N FoundFamilies [old] > closest.N.genomes";
32 :    
33 :     if (@ARGV && ($ARGV[0] =~ m/-h(elp)?/)) {
34 :     print STDERR "\n $usage\n\n";
35 :     exit (0);
36 :     }
37 :    
38 :     my ($i, $fam_data, $to_call_dir, $N, $foundF);
39 :    
40 : gdpusch 1.8 #...First, let's determine which FigfamsData directory to use:
41 : gdpusch 1.1 for ($i=0; ($i < @ARGV) && ($ARGV[$i] !~ /^-fams=/); $i++) {}
42 :     if ($i < @ARGV) {
43 :     if ($ARGV[$i] =~ /^-fams=(\S+)/) {
44 :     if (-d $1) {
45 :     $fam_data = $1;
46 :     }
47 :     else {
48 :     die "Nonexistent FIGfams directory $1\n";
49 :     }
50 :     splice(@ARGV,$i,1);
51 :     }
52 :     }
53 :    
54 : olson 1.6 my $figfams = new FigFams($fig, $fam_data);
55 :    
56 : gdpusch 1.1 # Now pick up the normal parameters
57 :     (
58 :     ($to_call_dir = shift @ARGV) &&
59 :     ($N = shift @ARGV) &&
60 : gdpusch 1.7 ($foundF = shift @ARGV)
61 : gdpusch 1.1 )
62 :     || die "\n $usage\n\n";
63 :    
64 :    
65 :     ### This is a bit complex. Normally, one uses this program to call genes
66 :     # in a newly-sequenced genome. In this case the functionality of NewGenome.pm
67 :     # is needed. To call genes in an existing genome (to, for example, see how well
68 :     # we are doing) you would use ToCall.pm.
69 :    
70 :     my $to_call;
71 : gdpusch 1.9 my $keep_original_calls = 0;
72 : gdpusch 1.1 if ((@ARGV > 0) && ($ARGV[0] =~ /^old/i)) {
73 : gdpusch 1.9 print STDERR qq(Re-annotating original calls\n) if $ENV{VERBOSE};
74 :     $keep_original_calls = 1;
75 : gdpusch 1.10 $to_call = ToCall->new($to_call_dir);
76 : gdpusch 1.1 }
77 :     else {
78 : gdpusch 1.10 $to_call = NewGenome->new($to_call_dir);
79 : gdpusch 1.1 }
80 :    
81 : gdpusch 1.8
82 : gdpusch 1.9 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
83 :     #...Open auxiliary files (unless "Keep Original Calls" selected)
84 :     #-----------------------------------------------------------------------
85 : gdpusch 1.8 open(FOUNDFAMS, ">>$foundF") || die "Could not append-open $foundF";
86 :    
87 : gdpusch 1.10 my $called_by_file = qq($to_call_dir/called_by);
88 : gdpusch 1.8 open(CALLED_BY, ">>$called_by_file")
89 :     || die "Could not append-open $called_by_file";
90 :    
91 :    
92 :     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
93 :     #...Get the current ORF-calls if they exist;
94 :     # else, try to call a set of candidate ORFs.
95 :     #-----------------------------------------------------------------------
96 : gdpusch 1.1 my @orf_ids = $to_call->get_fids_for_type('orf');
97 :     if (not @orf_ids) {
98 : gdpusch 1.8 $to_call->possible_orfs();
99 : gdpusch 1.1 @orf_ids = $to_call->get_fids_for_type('orf');
100 :     }
101 :    
102 :     my (%seq_of, %len_of, $orf_len);
103 :     for my $fid (@orf_ids) {
104 : gdpusch 1.2 if (($orf_len = $to_call->get_feature_length($fid)) > 900) {
105 : gdpusch 1.1 $len_of{$fid} = $orf_len;
106 :     $seq_of{$fid} = $to_call->get_feature_sequence($fid);
107 :     }
108 :     }
109 :    
110 : gdpusch 1.7
111 : gdpusch 1.1 my @peg_ids = ();
112 :     my %genomes_hit;
113 :     my %weight_of_hits;
114 :     foreach my $orf_id (sort { $len_of{$b} <=> $len_of{$a} } keys %len_of) {
115 : gdpusch 1.2
116 :     print STDERR "\nChecking $orf_id ($len_of{$orf_id})\n" if $ENV{VERBOSE};
117 : gdpusch 1.1 my ($fam, $sims) = $figfams->place_in_family($seq_of{$orf_id});
118 : gdpusch 1.2 if (not defined($fam)) {
119 :     print STDERR "\tCould not place in family --- skipping\n" if $ENV{VERBOSE};
120 :     next;
121 :     }
122 :     else {
123 : gdpusch 1.1 my $fam_id = $fam->family_id();
124 :     my $func = $fam->family_function();
125 : gdpusch 1.2 print STDERR "\tplaced in $fam_id ("
126 :     , (scalar @$sims)
127 :     , " sims)\t$func\n"
128 :     if $ENV{VERBOSE};
129 : gdpusch 1.1
130 : gdpusch 1.10 my $annot = [ qq(RAST),
131 :     qq($func\nCalled by "$self" using FIGfam $fam_id.)
132 :     ];
133 :    
134 :     my $orf;
135 :     if ($keep_original_calls) {
136 :     $orf = &ToCall::PEG::new( 'ToCall::PEG', $to_call, $orf_id, $seq_of{$orf_id});
137 :     }
138 :     else {
139 :     $orf = &NewGenome::ORF::new('NewGenome::ORF', $to_call, $orf_id);
140 :     }
141 :    
142 :     my $fid = $orf->promote_to_peg($sims, $func, $annot);
143 :    
144 :     if ($fid) {
145 :     print CALLED_BY "$fid\t$self\n";
146 :     print FOUNDFAMS "$fid\t$fam_id\t$func\n";
147 : overbeek 1.3 }
148 : gdpusch 1.10 else {
149 :     die "Could not promote $orf_id";
150 : gdpusch 1.9 }
151 : gdpusch 1.8
152 : gdpusch 1.2 if ($fid) {
153 :     push @peg_ids, $fid;
154 :    
155 :     for ($i=0; ($i < $N) && ($i < @$sims); ++$i) {
156 : gdpusch 1.10 my $org_2 = &FIG::genome_of($sims->[$i]->id2);
157 :     if (not defined($genomes_hit{$org_2})) {
158 :     $genomes_hit{$org_2} = 0;
159 :     $weight_of_hits{$org_2} = 0;
160 :     }
161 :    
162 :     ++$genomes_hit{$org_2};
163 :    
164 :     if ($sims->[$i]->ln2) {
165 :     $weight_of_hits{$org_2} +=
166 :     ($sims->[$i]->bsc() / $sims->[$i]->ln2());
167 :     }
168 : gdpusch 1.2 }
169 :     }
170 :     else {
171 :     die "Could not promote $orf_id";
172 : gdpusch 1.1 }
173 :     }
174 : gdpusch 1.2
175 : gdpusch 1.1 last if (@peg_ids >= $N);
176 :     }
177 :    
178 :     my $num_pegs = @peg_ids;
179 :     warn "Found $num_pegs PEG candidates\n";
180 :    
181 :     if ($num_pegs == 0) {
182 :     die "Could not find any features of sufficient length and in a FIGfam\n";
183 :     }
184 :    
185 :     my @best = sort { $weight_of_hits{$b} <=> $weight_of_hits{$a} } keys(%genomes_hit);
186 :     for ($i=0; ($i < $N) && ($i < @best); ++$i) {
187 : gdpusch 1.10 print STDOUT (join(qq(\t), ($best[$i],
188 :     $genomes_hit{$best[$i]},
189 :     $weight_of_hits{$best[$i]},
190 :     $fig->genus_species($best[$i])
191 :     )
192 :     ),
193 :     qq(\n)
194 :     );
195 : gdpusch 1.1 }
196 : gdpusch 1.10 close(CALLED_BY);
197 :     close(FOUNDFAMS);
198 : gdpusch 1.7
199 :     $to_call->export_features();
200 :    
201 : gdpusch 1.1 exit(0);

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3