[Bio] / FigKernelScripts / get_neighbors_and_corr_to_ref.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/get_neighbors_and_corr_to_ref.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (view) (download) (as text)

1 : overbeek 1.1 ########################################################################
2 :    
3 : olson 1.6 # This is a SAS Component
4 :    
5 : overbeek 1.1 use SeedHTML;
6 :     use strict;
7 :     use SeedEnv;
8 :     use ProtSims;
9 :     use gjoseqlib;
10 : olson 1.6 use Data::Dumper;
11 : olson 1.7 use SeedAware;
12 : overbeek 1.1
# ----------------------------------------------------------------------
# Main driver.
#
# Usage: get_neighbors_and_corr_to_ref GenomeDir
#
# GenomeDir must end in a genome id of the form <digits>.<digits>.
# Steps performed:
#   1. Read the protein FASTA in GenomeDir/Features/peg/fasta, keeping
#      only sequences longer than 30 characters.
#   2. Walk the prioritized candidate pegs, accumulating a per-genome
#      score from similarity hits, until some genome scores >= 500 or
#      the candidates run out.
#   3. Keep the (at most) 30 best-scoring genomes as references.
#   4. For each reference other than the genome itself, generate a
#      correspondence table and record the genome in
#      GenomeDir/closest.genomes (tab-separated: id, then the value
#      all_genomes() returned for that id).
#
# Dies on: missing argument, malformed genome directory name, no hits
# at all, or failure to open/close closest.genomes.
# ----------------------------------------------------------------------
my $usage = "usage: get_neighbors_and_corr_to_ref GenomeDir";
my $gdir;

$| = 1;    # unbuffered STDOUT so progress messages show up immediately

# File-level handle: figfam_pegs_for_role() below also uses it.
my $sapO = SAPserver->new;

($gdir = shift @ARGV)
    || die $usage;
($gdir =~ /(\d+\.\d+)$/) || die "Invalid Genome Directory: $gdir";
my $gdir_id = $1;

my @fasta = &gjoseqlib::read_fasta("$gdir/Features/peg/fasta");
# Entries are indexed as [id, <unused here>, sequence]; short/empty sequences are dropped.
my %id2seqH = map { ($_->[2] && (length($_->[2]) > 30)) ? ($_->[0] => $_->[2]) : () } @fasta;

&SeedUtils::verify_dir("$gdir/CorrToReferenceGenomes");
print "Finding neighbors\n";
my @poss_pegs = &prioritize_pegs_used_to_find_neighbors($gdir);
print "found " . scalar(@poss_pegs) . " poss_pegs\n";

my %counts;      # genome id => accumulated score
my $best = 0;    # highest score seen in %counts so far
my $tuple;
while (($best < 500) && ($tuple = shift @poss_pegs))
{
    my($role,$peg) = @$tuple;
    if ($id2seqH{$peg} && (length($id2seqH{$peg}) > 30))
    {
        &compute_hits_and_set_best($tuple,\%id2seqH,\%counts,\$best);
    }
}
if ($best == 0) { die "$gdir describes a genome without enough RAST-called genes to identify neighbors" }

# Highest-scoring genomes first; keep at most 30 of them.
my @reference = sort { $counts{$b} <=> $counts{$a} } keys(%counts);
if (@reference > 30) { $#reference = 29 }

my $genomesH = $sapO->all_genomes(-complete => 1);

# Three-argument open with a lexical handle: the path comes from the
# command line, so it must never be parsed as part of the open mode.
my $closest_file = "$gdir/closest.genomes";
open(my $closest_fh, '>', $closest_file)
    || die "could not open $closest_file: $!";

print "Generating correspondences for these genomes:\n";
print "\t$_\n" for @reference;
foreach my $g2 (@reference)
{
    if ($g2 ne $gdir_id)
    {
        print "Generating correspondences for $g2...\n";
        &generate_correspondence_table($g2,$gdir);
        print "Generating correspondences for $g2...done\n";
        print {$closest_fh} join("\t",($g2,$genomesH->{$g2})),"\n";
    }
}

# Buffered write errors (e.g. disk full) only surface at close time.
close($closest_fh) || die "could not close $closest_file: $!";
62 :    
# generate_correspondence_table($g2, $gdir)
#
# Writes the gene correspondence between the genome held in directory
# $gdir and reference genome $g2 to $gdir/CorrToReferenceGenomes/$g2.
#
#   $g2   - id of the reference genome
#   $gdir - genome directory; its name must end in <digits>.<digits>,
#           which is taken as the id of the genome being processed
#
# Dies with "Invalid Genome Directory: ..." when $gdir has no such
# suffix. Nothing is run when the two genome ids are identical.
sub generate_correspondence_table {
    my($g2,$gdir) = @_;

    # The trailing <digits>.<digits> of the directory name is the genome id.
    my($g1) = ($gdir =~ /(\d+\.\d+)$/);
    defined($g1) || die "Invalid Genome Directory: $gdir";

    if ($g1 ne $g2) {
        # Equivalent shell command:
        #   svr_corresponding_genes -d $gdir $g1 $g2 > $gdir/CorrToReferenceGenomes/$g2
        my $tool     = SeedAware::executable_for("svr_corresponding_genes");
        my $out_file = "$gdir/CorrToReferenceGenomes/$g2";
        my @command  = ($tool, "-d", $gdir, $g1, $g2);
        SeedAware::system_with_redirect(\@command, { stdout => $out_file });
    }
}
76 :    
# prioritize_pegs_used_to_find_neighbors($gdir)
#
# Reads $gdir/assigned_functions (lines of "<peg id>\t<function>") and
# returns a list of [role, peg] pairs for every role that is assigned
# to exactly one peg, ordered by priority:
#
#   1. roles matching /tRNA synthetase/
#   2. roles matching /ribosomal protein/
#   3. all remaining single-peg roles
#
# Each role appears once. Within a priority class the roles are sorted
# as strings, so the result is identical from run to run; plain hash
# order is randomized per process and would make the caller pick
# different candidates on identical input.
#
# If a peg occurs on several lines, the last line wins. Anything from
# a '#' onward in the function text is treated as a comment and removed.
#
# Returns the empty list (after a warning) when the file cannot be
# opened.
sub prioritize_pegs_used_to_find_neighbors {
    my($gdir) = @_;

    my $af_fh;
    if (!open($af_fh, "<", "$gdir/assigned_functions"))
    {
        warn "Cannot open $gdir/assigned_functions: $!";
        return ();
    }

    my %uniqH;    # peg id => function text (later lines override earlier ones)
    while (defined(my $line = <$af_fh>))
    {
        if ($line =~ /^(fig\|\d+\.\d+\.peg\.\d+)\t(\S[^\#]+\S)/)
        {
            $uniqH{$1} = $2;
        }
    }
    close($af_fh);

    my %by_func;    # role => [pegs carrying that role]
    foreach my $peg (sort keys(%uniqH))
    {
        my $func = $uniqH{$peg};
        # The capture above may end on the '#' that starts a comment
        # (e.g. "Some role #"), so the comment marker is stripped here.
        $func =~ s/\s*\#.*$//;
        push(@{$by_func{$func}},$peg);
    }

    # Roles carried by exactly one peg, in a stable (sorted) order.
    my @unique_roles = sort grep { @{$by_func{$_}} == 1 } keys(%by_func);

    my @synthetases        = grep { /tRNA synthetase/ }   @unique_roles;
    my @ribosomal_proteins = grep { /ribosomal protein/ } @unique_roles;

    my @prioritized;
    my %seen;
    foreach my $role (@synthetases,@ribosomal_proteins,@unique_roles)
    {
        next if $seen{$role}++;    # already emitted in a higher-priority class
        push(@prioritized,[$role,$by_func{$role}->[0]]);
    }
    return @prioritized;
}
122 :    
# compute_hits_and_set_best($tuple, $id2seqH, $counts, $bestP)
#
# Scores genomes by similarity to one candidate peg.
#
#   $tuple   - [role, peg]
#   $id2seqH - hash ref: peg id => protein sequence
#   $counts  - hash ref: genome id => accumulated score (updated in place)
#   $bestP   - scalar ref holding the highest score seen so far
#              (raised in place whenever a genome exceeds it)
#
# The peg's sequence is compared, via ProtSims::blastP, with the
# sequences figfam_pegs_for_role() returns for the role. Of the
# returned similarities only the first 50 are used: the genome of the
# hit at index i receives 50 - i points.
sub compute_hits_and_set_best {
    my($tuple,$id2seqH,$counts,$bestP) = @_;

    my($role,$peg) = @$tuple;

    print "Get figfam pegs for $role\n";
    my $figfam_pegs = &figfam_pegs_for_role($role);

    print "Compute sims\n";
    my $query = [$peg,'',$id2seqH->{$peg}];
    my @sims  = &ProtSims::blastP([$query],$figfam_pegs,10);
    print "Computed " . scalar(@sims) . " sims\n";

    # Consider at most the first 50 similarities.
    my $last_rank = (@sims < 50 ? scalar(@sims) : 50) - 1;
    foreach my $rank (0 .. $last_rank) {
        my $genome = &SeedUtils::genome_of($sims[$rank]->id2);
        $counts->{$genome} += 50 - $rank;
        $$bestP = $counts->{$genome} if ($counts->{$genome} > $$bestP);
    }
}
141 :    
# figfam_pegs_for_role($role)
#
# Collects the protein sequences of all features belonging to the
# FIGfams that the Sapling server reports for $role.
#
# Uses the file-level $sapO server handle: all_figfams() to find the
# families, figfam_fids() once per family to list their features, and
# ids_to_sequences() to fetch the protein sequences.
#
# Returns an array ref of [feature id, '', sequence] triples; features
# for which no (true-valued) sequence came back are left out.
sub figfam_pegs_for_role {
    my($role) = @_;

    my $families = $sapO->all_figfams(-roles => $role);

    # Gather the member features of every family found for the role.
    my @members;
    foreach my $family (keys %$families) {
        my $family_fids = $sapO->figfam_fids(-id => $family);
        push(@members, @$family_fids);
    }

    my $seq_of = $sapO->ids_to_sequences(-ids => \@members, -protein => 1);

    my @triples;
    foreach my $fid (keys(%$seq_of)) {
        my $sequence = $seq_of->{$fid};
        next unless $sequence;
        push(@triples, [$fid,'',$sequence]);
    }
    return \@triples;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3