[Bio] / FigKernelScripts / find_neighbors_using_figfams.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/find_neighbors_using_figfams.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (view) (download) (as text)

1 : gdpusch 1.1 # -*- perl -*-
2 :     ########################################################################
3 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     ########################################################################
18 :    
19 :     use strict;
20 :     use warnings;
21 :    
22 :     use FIG;
23 :     my $fig = new FIG;
24 :    
25 :     use FigFams;
26 : gdpusch 1.2 my $figfams = new FigFams($fig);
27 : gdpusch 1.1
28 : olson 1.5 print STDERR "$0 using figfams in $figfams->{dir}\n";
29 :    
30 : gdpusch 1.1 use ToCall;
31 :     use NewGenome;
32 :    
33 :     $0 =~ m/([^\/]+)$/o;
34 :     my $self = $1;
35 :     my $usage = "usage: $self [-fams=FigfamsData] To_Call_Dir N FoundFamilies [old] > closest.N.genomes";
36 :    
37 :     if (@ARGV && ($ARGV[0] =~ m/-h(elp)?/)) {
38 :     print STDERR "\n $usage\n\n";
39 :     exit (0);
40 :     }
41 :    
42 :     my ($i, $fam_data, $to_call_dir, $N, $foundF);
43 :    
44 :     # First, let's determine which FigfamsData directory to use
45 :     $fam_data = "$FIG_Config::data/FigfamsData";
46 :     for ($i=0; ($i < @ARGV) && ($ARGV[$i] !~ /^-fams=/); $i++) {}
47 :     if ($i < @ARGV) {
48 :     if ($ARGV[$i] =~ /^-fams=(\S+)/) {
49 :     if (-d $1) {
50 :     $fam_data = $1;
51 :     }
52 :     else {
53 :     die "Nonexistent FIGfams directory $1\n";
54 :     }
55 :     splice(@ARGV,$i,1);
56 :     }
57 :     }
58 :    
59 :     # Now pick up the normal parameters
60 :     (
61 :     ($to_call_dir = shift @ARGV) &&
62 :     ($N = shift @ARGV) &&
63 :     ($foundF = shift @ARGV) && open(FOUNDFAMS, ">>$foundF")
64 :     )
65 :     || die "\n $usage\n\n";
66 :    
67 :    
68 :     ### This is a bit complex. Normally, one uses this program to call genes
69 :     # in a newly-sequenced genome. In this case the functionality of NewGenome.pm
70 :     # is needed. To call genes in an existing genome (to, for example, see how well
71 :     # we are doing) you would use ToCall.pm.
72 :    
73 :     my $to_call;
74 : overbeek 1.3 my $old;
75 :    
76 : gdpusch 1.1 if ((@ARGV > 0) && ($ARGV[0] =~ /^old/i)) {
77 : overbeek 1.3 $old = 1;
78 : gdpusch 1.1 $to_call = new ToCall($to_call_dir);
79 :     }
80 :     else {
81 :     $to_call = new NewGenome($to_call_dir);
82 :     }
83 :    
84 :     my @orf_ids = $to_call->get_fids_for_type('orf');
85 :     if (not @orf_ids) {
86 :     $to_call->possible_orfs;
87 :     @orf_ids = $to_call->get_fids_for_type('orf');
88 :     }
89 :    
90 :     my (%seq_of, %len_of, $orf_len);
91 :     for my $fid (@orf_ids) {
92 : gdpusch 1.2 if (($orf_len = $to_call->get_feature_length($fid)) > 900) {
93 : gdpusch 1.1 $len_of{$fid} = $orf_len;
94 :     $seq_of{$fid} = $to_call->get_feature_sequence($fid);
95 :     }
96 :     }
97 :    
98 :     my @peg_ids = ();
99 :     my %genomes_hit;
100 :     my %weight_of_hits;
101 :     foreach my $orf_id (sort { $len_of{$b} <=> $len_of{$a} } keys %len_of) {
102 : gdpusch 1.2
103 :     print STDERR "\nChecking $orf_id ($len_of{$orf_id})\n" if $ENV{VERBOSE};
104 : gdpusch 1.1 my ($fam, $sims) = $figfams->place_in_family($seq_of{$orf_id});
105 : gdpusch 1.2 if (not defined($fam)) {
106 :     print STDERR "\tCould not place in family --- skipping\n" if $ENV{VERBOSE};
107 :     next;
108 :     }
109 :     else {
110 : gdpusch 1.1 my $fam_id = $fam->family_id();
111 :     my $func = $fam->family_function();
112 : gdpusch 1.2 print STDERR "\tplaced in $fam_id ("
113 :     , (scalar @$sims)
114 :     , " sims)\t$func\n"
115 :     if $ENV{VERBOSE};
116 : gdpusch 1.1
117 : overbeek 1.3 my $fid;
118 :     if (! $old)
119 :     {
120 :     my $orf = &NewGenome::ORF::new('NewGenome::ORF', $to_call, $orf_id);
121 :     $fid = $orf->promote_to_peg($sims, $func);
122 :     }
123 :     else
124 :     {
125 :     $fid = $orf_id;
126 :     }
127 :    
128 : gdpusch 1.2 if ($fid) {
129 :     push @peg_ids, $fid;
130 :    
131 :     print FOUNDFAMS "$fid\t$fam_id\t$func\n";
132 :    
133 :     for ($i=0; ($i < $N) && ($i < @$sims); ++$i) {
134 :     ++$genomes_hit{&FIG::genome_of($sims->[$i]->id2)};
135 :     $weight_of_hits{&FIG::genome_of($sims->[$i]->id2)} +=
136 : gdpusch 1.4 ($sims->[$i]->bsc / $sims->[$i]->ln2);
137 : gdpusch 1.2 }
138 :     }
139 :     else {
140 :     die "Could not promote $orf_id";
141 : gdpusch 1.1 }
142 :     }
143 : gdpusch 1.2
144 : gdpusch 1.1 last if (@peg_ids >= $N);
145 :     }
146 :    
147 :     my $num_pegs = @peg_ids;
148 :     warn "Found $num_pegs PEG candidates\n";
149 :    
150 :     if ($num_pegs == 0) {
151 :     die "Could not find any features of sufficient length and in a FIGfam\n";
152 :     }
153 :    
154 :     my @best = sort { $weight_of_hits{$b} <=> $weight_of_hits{$a} } keys(%genomes_hit);
155 :     for ($i=0; ($i < $N) && ($i < @best); ++$i) {
156 :     print "$best[$i]\t$genomes_hit{$best[$i]}\t$weight_of_hits{$best[$i]}\n";
157 :     }
158 :     close(FOUNDFAMS);
159 :     $to_call->export_features;
160 :    
161 :     exit(0);

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3