[Bio] / FigKernelScripts / get_neighbors_and_corr_to_ref.pl Repository:
ViewVC logotype

Diff of /FigKernelScripts/get_neighbors_and_corr_to_ref.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Sun Dec 13 01:11:40 2009 UTC revision 1.7, Mon Aug 30 16:08:24 2010 UTC
# Line 1  Line 1 
1  ########################################################################  ########################################################################
2    
3    # This is a SAS Component
4    
5  use SeedHTML;  use SeedHTML;
6  use strict;  use strict;
7  use SeedEnv;  use SeedEnv;
8  use ProtSims;  use ProtSims;
9  use gjoseqlib;  use gjoseqlib;
10    use Data::Dumper;
11    use SeedAware;
12    
13  my $usage = "usage: get_neighbors_and_corr_to_ref GenomeDir";  my $usage = "usage: get_neighbors_and_corr_to_ref GenomeDir";
14  my $gdir;  my $gdir;
15    
16    $| = 1;
17    
18    my $sapO = SAPserver->new;
19    
20  ($gdir   =  shift @ARGV)  ($gdir   =  shift @ARGV)
21      || die $usage;      || die $usage;
22    ($gdir =~ /(\d+\.\d+)$/) || die "Invalid Genome Directory: $gdir";
23    my $gdir_id = $1;
24    
25  my @fasta = &gjoseqlib::read_fasta("$gdir/Features/peg/fasta");  my @fasta = &gjoseqlib::read_fasta("$gdir/Features/peg/fasta");
26  my %id2seqH = map { $_->[0] => $_->[2] } @fasta;  my %id2seqH = map { ($_->[2] && (length($_->[2]) > 30)) ? ($_->[0] => $_->[2]) : () } @fasta;
27    
28  &SeedUtils::verify_dir("$gdir/CorrToReferenceGenomes");  &SeedUtils::verify_dir("$gdir/CorrToReferenceGenomes");
29    print "Finding neighbors\n";
30  my @poss_pegs = &prioritize_pegs_used_to_find_neighbors($gdir);  my @poss_pegs = &prioritize_pegs_used_to_find_neighbors($gdir);
31    print "found " . scalar(@poss_pegs) . " poss_pegs\n";
32  my %counts;  my %counts;
33  my $best  = 0;  my $best  = 0;
34  my $tuple;  my $tuple;
35  while (($best < 500) && ($tuple = shift @poss_pegs))  while (($best < 500) && ($tuple = shift @poss_pegs))
36  {  {
37        my($role,$peg) = @$tuple;
38        if ($id2seqH{$peg} && (length($id2seqH{$peg}) > 30))
39        {
40      &compute_hits_and_set_best($tuple,\%id2seqH,\%counts,\$best);      &compute_hits_and_set_best($tuple,\%id2seqH,\%counts,\$best);
41  }  }
42    }
43  if ($best == 0) { die "$gdir describes a genome without enough RAST-called genes to identify neighbors" }  if ($best == 0) { die "$gdir describes a genome without enough RAST-called genes to identify neighbors" }
44  my @reference = sort { $counts{$b} <=> $counts{$a} } keys(%counts);  my @reference = sort { $counts{$b} <=> $counts{$a} } keys(%counts);
45  if (@reference > 30) { $#reference = 29 }  if (@reference > 30) { $#reference = 29 }
46    
 my $sapO = SAPserver->new;  
47  my $genomesH  = $sapO->all_genomes(-complete => 1);  my $genomesH  = $sapO->all_genomes(-complete => 1);
48  open(CLOSE,">$gdir/closest.genomes") || die "could not open closest.genomes";  open(CLOSE,">$gdir/closest.genomes") || die "could not open closest.genomes";
49    print "Generating correspondences for these genomes:\n";
50    print "\t$_\n" for @reference;
51  foreach my $g2 (@reference)  foreach my $g2 (@reference)
52  {  {
53        if ($g2 ne $gdir_id)
54        {
55            print "Generating correspondences for $g2...\n";
56      &generate_correspondence_table($g2,$gdir);      &generate_correspondence_table($g2,$gdir);
57            print "Generating correspondences for $g2...done\n";
58      print CLOSE join("\t",($g2,$genomesH->{$g2})),"\n";      print CLOSE join("\t",($g2,$genomesH->{$g2})),"\n";
59  }  }
60    }
61  close(CLOSE);  close(CLOSE);
62    
63  sub generate_correspondence_table {  sub generate_correspondence_table {
# Line 44  Line 65 
65    
66      ($gdir =~ /(\d+\.\d+)$/) || die "Invalid Genome Directory: $gdir";      ($gdir =~ /(\d+\.\d+)$/) || die "Invalid Genome Directory: $gdir";
67      my $g1 = $1;      my $g1 = $1;
68      system "svr_corresponding_genes -d $gdir $g1 $g2 > $gdir/CorrToReferenceGenomes/$g2";      if ($g1 ne $g2)
69        {
70            my $exe = SeedAware::executable_for("svr_corresponding_genes");
71            SeedAware::system_with_redirect([$exe, "-d", $gdir, $g1, $g2],
72                                        { stdout => "$gdir/CorrToReferenceGenomes/$g2" });
73            #system "svr_corresponding_genes -d $gdir $g1 $g2 > $gdir/CorrToReferenceGenomes/$g2";
74        }
75  }  }
76    
77  sub prioritize_pegs_used_to_find_neighbors {  sub prioritize_pegs_used_to_find_neighbors {
78      my($gdir) = @_;      my($gdir) = @_;
79    
80      my %by_func;      my %by_func;
     (-s "$gdir/assigned_functions") || die "$gdir contains no assigned_functions";  
81    
82      foreach my $line (`cat $gdir/assigned_functions`)      my %uniqH;
83    
84        my $af_fh;
85        if (!open($af_fh, "<", "$gdir/assigned_functions"))
86        {
87            warn "Cannot open $gdir/assigned_functions: $!";
88            return ();
89        }
90    
91        while (defined(my $line = <$af_fh>))
92      {      {
93          if ($line =~ /^(fig\|\d+\.\d+\.peg\.\d+)\t(\S[^\#]+\S)/)          if ($line =~ /^(fig\|\d+\.\d+\.peg\.\d+)\t(\S[^\#]+\S)/)
94          {          {
95              my($peg,$func) = ($1,$2);              $uniqH{$1} = $2;
             $func =~ s/\s*\#$//;  
             push(@{$by_func{$2}},$1);  
96          }          }
97      }      }
98        close($af_fh);
99    
100        foreach my $peg (keys(%uniqH))
101        {
102            my $func = $uniqH{$peg};
103            $func =~ s/\s*\#.*$//;
104            push(@{$by_func{$func}},$peg);
105        }
106    
107      my @synthetases        = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 } grep { $_ =~ /tRNA synthetase/ }   keys(%by_func);      my @synthetases        = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 } grep { $_ =~ /tRNA synthetase/ }   keys(%by_func);
108      my @ribosomal_proteins = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 } grep { $_ =~ /ribosomal protein/ } keys(%by_func);      my @ribosomal_proteins = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 } grep { $_ =~ /ribosomal protein/ } keys(%by_func);
109      my @ok_pegs            = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 }                                    keys(%by_func);      my @ok_pegs            = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 }                                    keys(%by_func);
# Line 82  Line 124 
124      my($tuple,$id2seqH,$counts,$bestP) = @_;      my($tuple,$id2seqH,$counts,$bestP) = @_;
125    
126      my($role,$peg) = @$tuple;      my($role,$peg) = @$tuple;
127        print "Get figfam pegs for $role\n";
128      my $figfam_pegs = &figfam_pegs_for_role($role);      my $figfam_pegs = &figfam_pegs_for_role($role);
129    
130        print "Compute sims\n";
131      my @sims        = &ProtSims::blastP([[$peg,'',$id2seqH->{$peg}]],$figfam_pegs,10);      my @sims        = &ProtSims::blastP([[$peg,'',$id2seqH->{$peg}]],$figfam_pegs,10);
132        print "Computed " . scalar(@sims) . " sims\n";
133      my $i;      my $i;
134      for ($i=0; (($i < @sims) && ($i < 50)); $i++)      for ($i=0; (($i < @sims) && ($i < 50)); $i++)
135      {      {
# Line 94  Line 139 
139      }      }
140  }  }
141    
 #### MUST update ###  
142  sub figfam_pegs_for_role {  sub figfam_pegs_for_role {
143      my($role) = @_;      my($role) = @_;
144    
145      my %figfams;      my %figfams;
146      foreach $_ (`cat /Users/rossoverbeek/FIGdisk/FIG/Data/FigfamsData/family.functions`)  
147      {      my $res = $sapO->all_figfams(-roles => $role);
148          if ((index($_,$role) >= 0) && ($_ =~ /^(FIG\d{6})/))      my @pegs;
149        for my $ff (keys %$res)
150          {          {
151              $figfams{$1} = 1;          my $fids = $sapO->figfam_fids(-id => $ff);
152            push(@pegs, @$fids);
153          }          }
154      }  
155      my $sapO = SAPserver->new;      my $idsH = $sapO->ids_to_sequences(-ids => \@pegs, -protein => 1);
156      my $genomesH  = $sapO->all_genomes(-complete => 1);  
     my @ids =  grep { $genomesH->{&SeedUtils::genome_of($_)} }  
                map { (($_ =~ /^(\S+)\t(\S+)/) && $figfams{$1}) ? $2 : () }  
               `cat /Users/rossoverbeek/FIGdisk/FIG/Data/FigfamsData/families.2c`;  
     my $idsH = $sapO->ids_to_sequences(-ids => \@ids, -protein => 1);  
157      return [map { my $seq = $idsH->{$_}; $seq ? [$_,'',$seq] : () } keys(%$idsH)];      return [map { my $seq = $idsH->{$_}; $seq ? [$_,'',$seq] : () } keys(%$idsH)];
158  }  }

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.7

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3