[Bio] / FigKernelScripts / get_neighbors_and_corr_to_ref.pl Repository:
ViewVC logotype

Diff of /FigKernelScripts/get_neighbors_and_corr_to_ref.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Sun Dec 13 01:11:40 2009 UTC revision 1.6, Fri Feb 19 19:38:21 2010 UTC
# Line 1  Line 1 
1  ########################################################################  ########################################################################
2    
3    # This is a SAS Component
4    
5  use SeedHTML;  use SeedHTML;
6  use strict;  use strict;
7  use SeedEnv;  use SeedEnv;
8  use ProtSims;  use ProtSims;
9  use gjoseqlib;  use gjoseqlib;
10    use Data::Dumper;
11    
12  my $usage = "usage: get_neighbors_and_corr_to_ref GenomeDir";  my $usage = "usage: get_neighbors_and_corr_to_ref GenomeDir";
13  my $gdir;  my $gdir;
14    
15    $| = 1;
16    
17    my $sapO = SAPserver->new;
18    
19  ($gdir   =  shift @ARGV)  ($gdir   =  shift @ARGV)
20      || die $usage;      || die $usage;
21    ($gdir =~ /(\d+\.\d+)$/) || die "Invalid Genome Directory: $gdir";
22    my $gdir_id = $1;
23    
24  my @fasta = &gjoseqlib::read_fasta("$gdir/Features/peg/fasta");  my @fasta = &gjoseqlib::read_fasta("$gdir/Features/peg/fasta");
25  my %id2seqH = map { $_->[0] => $_->[2] } @fasta;  my %id2seqH = map { ($_->[2] && (length($_->[2]) > 30)) ? ($_->[0] => $_->[2]) : () } @fasta;
26    
27  &SeedUtils::verify_dir("$gdir/CorrToReferenceGenomes");  &SeedUtils::verify_dir("$gdir/CorrToReferenceGenomes");
28    print "Finding neighbors\n";
29  my @poss_pegs = &prioritize_pegs_used_to_find_neighbors($gdir);  my @poss_pegs = &prioritize_pegs_used_to_find_neighbors($gdir);
30    print "found " . scalar(@poss_pegs) . " poss_pegs\n";
31  my %counts;  my %counts;
32  my $best  = 0;  my $best  = 0;
33  my $tuple;  my $tuple;
34  while (($best < 500) && ($tuple = shift @poss_pegs))  while (($best < 500) && ($tuple = shift @poss_pegs))
35  {  {
36        my($role,$peg) = @$tuple;
37        if ($id2seqH{$peg} && (length($id2seqH{$peg}) > 30))
38        {
39      &compute_hits_and_set_best($tuple,\%id2seqH,\%counts,\$best);      &compute_hits_and_set_best($tuple,\%id2seqH,\%counts,\$best);
40  }  }
41    }
42  if ($best == 0) { die "$gdir describes a genome without enough RAST-called genes to identify neighbors" }  if ($best == 0) { die "$gdir describes a genome without enough RAST-called genes to identify neighbors" }
43  my @reference = sort { $counts{$b} <=> $counts{$a} } keys(%counts);  my @reference = sort { $counts{$b} <=> $counts{$a} } keys(%counts);
44  if (@reference > 30) { $#reference = 29 }  if (@reference > 30) { $#reference = 29 }
45    
 my $sapO = SAPserver->new;  
46  my $genomesH  = $sapO->all_genomes(-complete => 1);  my $genomesH  = $sapO->all_genomes(-complete => 1);
47  open(CLOSE,">$gdir/closest.genomes") || die "could not open closest.genomes";  open(CLOSE,">$gdir/closest.genomes") || die "could not open closest.genomes";
48    print "Generating correspondences for these genomes:\n";
49    print "\t$_\n" for @reference;
50  foreach my $g2 (@reference)  foreach my $g2 (@reference)
51  {  {
52        if ($g2 ne $gdir_id)
53        {
54            print "Generating correspondences for $g2...\n";
55      &generate_correspondence_table($g2,$gdir);      &generate_correspondence_table($g2,$gdir);
56            print "Generating correspondences for $g2...done\n";
57      print CLOSE join("\t",($g2,$genomesH->{$g2})),"\n";      print CLOSE join("\t",($g2,$genomesH->{$g2})),"\n";
58  }  }
59    }
60  close(CLOSE);  close(CLOSE);
61    
62  sub generate_correspondence_table {  sub generate_correspondence_table {
# Line 44  Line 64 
64    
65      ($gdir =~ /(\d+\.\d+)$/) || die "Invalid Genome Directory: $gdir";      ($gdir =~ /(\d+\.\d+)$/) || die "Invalid Genome Directory: $gdir";
66      my $g1 = $1;      my $g1 = $1;
67        if ($g1 ne $g2)
68        {
69      system "svr_corresponding_genes -d $gdir $g1 $g2 > $gdir/CorrToReferenceGenomes/$g2";      system "svr_corresponding_genes -d $gdir $g1 $g2 > $gdir/CorrToReferenceGenomes/$g2";
70  }  }
71    }
72    
73  sub prioritize_pegs_used_to_find_neighbors {  sub prioritize_pegs_used_to_find_neighbors {
74      my($gdir) = @_;      my($gdir) = @_;
# Line 53  Line 76 
76      my %by_func;      my %by_func;
77      (-s "$gdir/assigned_functions") || die "$gdir contains no assigned_functions";      (-s "$gdir/assigned_functions") || die "$gdir contains no assigned_functions";
78    
79        my %uniqH;
80      foreach my $line (`cat $gdir/assigned_functions`)      foreach my $line (`cat $gdir/assigned_functions`)
81      {      {
82          if ($line =~ /^(fig\|\d+\.\d+\.peg\.\d+)\t(\S[^\#]+\S)/)          if ($line =~ /^(fig\|\d+\.\d+\.peg\.\d+)\t(\S[^\#]+\S)/)
83          {          {
84              my($peg,$func) = ($1,$2);              $uniqH{$1} = $2;
85              $func =~ s/\s*\#$//;          }
             push(@{$by_func{$2}},$1);  
86          }          }
87    
88        foreach my $peg (keys(%uniqH))
89        {
90            my $func = $uniqH{$peg};
91            $func =~ s/\s*\#.*$//;
92            push(@{$by_func{$func}},$peg);
93      }      }
94    
95      my @synthetases        = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 } grep { $_ =~ /tRNA synthetase/ }   keys(%by_func);      my @synthetases        = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 } grep { $_ =~ /tRNA synthetase/ }   keys(%by_func);
96      my @ribosomal_proteins = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 } grep { $_ =~ /ribosomal protein/ } keys(%by_func);      my @ribosomal_proteins = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 } grep { $_ =~ /ribosomal protein/ } keys(%by_func);
97      my @ok_pegs            = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 }                                    keys(%by_func);      my @ok_pegs            = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 }                                    keys(%by_func);
# Line 82  Line 112 
112      my($tuple,$id2seqH,$counts,$bestP) = @_;      my($tuple,$id2seqH,$counts,$bestP) = @_;
113    
114      my($role,$peg) = @$tuple;      my($role,$peg) = @$tuple;
115        print "Get figfam pegs for $role\n";
116      my $figfam_pegs = &figfam_pegs_for_role($role);      my $figfam_pegs = &figfam_pegs_for_role($role);
117    
118        print "Compute sims\n";
119      my @sims        = &ProtSims::blastP([[$peg,'',$id2seqH->{$peg}]],$figfam_pegs,10);      my @sims        = &ProtSims::blastP([[$peg,'',$id2seqH->{$peg}]],$figfam_pegs,10);
120        print "Computed " . scalar(@sims) . " sims\n";
121      my $i;      my $i;
122      for ($i=0; (($i < @sims) && ($i < 50)); $i++)      for ($i=0; (($i < @sims) && ($i < 50)); $i++)
123      {      {
# Line 94  Line 127 
127      }      }
128  }  }
129    
 #### MUST update ###  
130  sub figfam_pegs_for_role {  sub figfam_pegs_for_role {
131      my($role) = @_;      my($role) = @_;
132    
133      my %figfams;      my %figfams;
134      foreach $_ (`cat /Users/rossoverbeek/FIGdisk/FIG/Data/FigfamsData/family.functions`)  
135      {      my $res = $sapO->all_figfams(-roles => $role);
136          if ((index($_,$role) >= 0) && ($_ =~ /^(FIG\d{6})/))      my @pegs;
137        for my $ff (keys %$res)
138          {          {
139              $figfams{$1} = 1;          my $fids = $sapO->figfam_fids(-id => $ff);
140          }          push(@pegs, @$fids);
141      }      }
142      my $sapO = SAPserver->new;  
143      my $genomesH  = $sapO->all_genomes(-complete => 1);      my $idsH = $sapO->ids_to_sequences(-ids => \@pegs, -protein => 1);
144      my @ids =  grep { $genomesH->{&SeedUtils::genome_of($_)} }  
                map { (($_ =~ /^(\S+)\t(\S+)/) && $figfams{$1}) ? $2 : () }  
               `cat /Users/rossoverbeek/FIGdisk/FIG/Data/FigfamsData/families.2c`;  
     my $idsH = $sapO->ids_to_sequences(-ids => \@ids, -protein => 1);  
145      return [map { my $seq = $idsH->{$_}; $seq ? [$_,'',$seq] : () } keys(%$idsH)];      return [map { my $seq = $idsH->{$_}; $seq ? [$_,'',$seq] : () } keys(%$idsH)];
146  }  }

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.6

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3