[Bio] / FigKernelScripts / get_neighbors_and_corr_to_ref.pl Repository:
ViewVC logotype

Diff of /FigKernelScripts/get_neighbors_and_corr_to_ref.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Sun Dec 13 01:11:40 2009 UTC revision 1.5, Thu Jan 7 23:55:55 2010 UTC
# Line 11  Line 11 
11    
12  ($gdir   =  shift @ARGV)  ($gdir   =  shift @ARGV)
13      || die $usage;      || die $usage;
14    ($gdir =~ /(\d+\.\d+)$/) || die "Invalid Genome Directory: $gdir";
15    my $gdir_id = $1;
16    
17  my @fasta = &gjoseqlib::read_fasta("$gdir/Features/peg/fasta");  my @fasta = &gjoseqlib::read_fasta("$gdir/Features/peg/fasta");
18  my %id2seqH = map { $_->[0] => $_->[2] } @fasta;  my %id2seqH = map { ($_->[2] && (length($_->[2]) > 30)) ? ($_->[0] => $_->[2]) : () } @fasta;
19    
20  &SeedUtils::verify_dir("$gdir/CorrToReferenceGenomes");  &SeedUtils::verify_dir("$gdir/CorrToReferenceGenomes");
21  my @poss_pegs = &prioritize_pegs_used_to_find_neighbors($gdir);  my @poss_pegs = &prioritize_pegs_used_to_find_neighbors($gdir);
# Line 23  Line 25 
25  my $tuple;  my $tuple;
26  while (($best < 500) && ($tuple = shift @poss_pegs))  while (($best < 500) && ($tuple = shift @poss_pegs))
27  {  {
28        my($role,$peg) = @$tuple;
29        if ($id2seqH{$peg} && (length($id2seqH{$peg}) > 30))
30        {
31      &compute_hits_and_set_best($tuple,\%id2seqH,\%counts,\$best);      &compute_hits_and_set_best($tuple,\%id2seqH,\%counts,\$best);
32  }  }
33    }
34  if ($best == 0) { die "$gdir describes a genome without enough RAST-called genes to identify neighbors" }  if ($best == 0) { die "$gdir describes a genome without enough RAST-called genes to identify neighbors" }
35  my @reference = sort { $counts{$b} <=> $counts{$a} } keys(%counts);  my @reference = sort { $counts{$b} <=> $counts{$a} } keys(%counts);
36  if (@reference > 30) { $#reference = 29 }  if (@reference > 30) { $#reference = 29 }
# Line 34  Line 40 
40  open(CLOSE,">$gdir/closest.genomes") || die "could not open closest.genomes";  open(CLOSE,">$gdir/closest.genomes") || die "could not open closest.genomes";
41  foreach my $g2 (@reference)  foreach my $g2 (@reference)
42  {  {
43        if ($g2 ne $gdir_id)
44        {
45      &generate_correspondence_table($g2,$gdir);      &generate_correspondence_table($g2,$gdir);
46      print CLOSE join("\t",($g2,$genomesH->{$g2})),"\n";      print CLOSE join("\t",($g2,$genomesH->{$g2})),"\n";
47  }  }
48    }
49  close(CLOSE);  close(CLOSE);
50    
51  sub generate_correspondence_table {  sub generate_correspondence_table {
# Line 44  Line 53 
53    
54      ($gdir =~ /(\d+\.\d+)$/) || die "Invalid Genome Directory: $gdir";      ($gdir =~ /(\d+\.\d+)$/) || die "Invalid Genome Directory: $gdir";
55      my $g1 = $1;      my $g1 = $1;
56        if ($g1 ne $g2)
57        {
58      system "svr_corresponding_genes -d $gdir $g1 $g2 > $gdir/CorrToReferenceGenomes/$g2";      system "svr_corresponding_genes -d $gdir $g1 $g2 > $gdir/CorrToReferenceGenomes/$g2";
59  }  }
60    }
61    
62  sub prioritize_pegs_used_to_find_neighbors {  sub prioritize_pegs_used_to_find_neighbors {
63      my($gdir) = @_;      my($gdir) = @_;
# Line 53  Line 65 
65      my %by_func;      my %by_func;
66      (-s "$gdir/assigned_functions") || die "$gdir contains no assigned_functions";      (-s "$gdir/assigned_functions") || die "$gdir contains no assigned_functions";
67    
68        my %uniqH;
69      foreach my $line (`cat $gdir/assigned_functions`)      foreach my $line (`cat $gdir/assigned_functions`)
70      {      {
71          if ($line =~ /^(fig\|\d+\.\d+\.peg\.\d+)\t(\S[^\#]+\S)/)          if ($line =~ /^(fig\|\d+\.\d+\.peg\.\d+)\t(\S[^\#]+\S)/)
72          {          {
73              my($peg,$func) = ($1,$2);              $uniqH{$1} = $2;
74              $func =~ s/\s*\#$//;          }
             push(@{$by_func{$2}},$1);  
75          }          }
76    
77        foreach my $peg (keys(%uniqH))
78        {
79            my $func = $uniqH{$peg};
80            $func =~ s/\s*\#.*$//;
81            push(@{$by_func{$func}},$peg);
82      }      }
83    
84      my @synthetases        = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 } grep { $_ =~ /tRNA synthetase/ }   keys(%by_func);      my @synthetases        = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 } grep { $_ =~ /tRNA synthetase/ }   keys(%by_func);
85      my @ribosomal_proteins = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 } grep { $_ =~ /ribosomal protein/ } keys(%by_func);      my @ribosomal_proteins = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 } grep { $_ =~ /ribosomal protein/ } keys(%by_func);
86      my @ok_pegs            = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 }                                    keys(%by_func);      my @ok_pegs            = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 }                                    keys(%by_func);
# Line 82  Line 101 
101      my($tuple,$id2seqH,$counts,$bestP) = @_;      my($tuple,$id2seqH,$counts,$bestP) = @_;
102    
103      my($role,$peg) = @$tuple;      my($role,$peg) = @$tuple;
104    
105      my $figfam_pegs = &figfam_pegs_for_role($role);      my $figfam_pegs = &figfam_pegs_for_role($role);
106    
107      my @sims        = &ProtSims::blastP([[$peg,'',$id2seqH->{$peg}]],$figfam_pegs,10);      my @sims        = &ProtSims::blastP([[$peg,'',$id2seqH->{$peg}]],$figfam_pegs,10);
# Line 94  Line 114 
114      }      }
115  }  }
116    
117  #### MUST update ###  
118  sub figfam_pegs_for_role {  sub figfam_pegs_for_role {
119      my($role) = @_;      my($role) = @_;
120    
121      my %figfams;      my %figfams;
122      foreach $_ (`cat /Users/rossoverbeek/FIGdisk/FIG/Data/FigfamsData/family.functions`)      foreach $_ (`cat $FIG_Config::FigfamsData/family.functions`)
123      {      {
124          if ((index($_,$role) >= 0) && ($_ =~ /^(FIG\d{6})/))          if ((index($_,$role) >= 0) && ($_ =~ /^(FIG\d{6})/))
125          {          {
# Line 110  Line 130 
130      my $genomesH  = $sapO->all_genomes(-complete => 1);      my $genomesH  = $sapO->all_genomes(-complete => 1);
131      my @ids =  grep { $genomesH->{&SeedUtils::genome_of($_)} }      my @ids =  grep { $genomesH->{&SeedUtils::genome_of($_)} }
132                 map { (($_ =~ /^(\S+)\t(\S+)/) && $figfams{$1}) ? $2 : () }                 map { (($_ =~ /^(\S+)\t(\S+)/) && $figfams{$1}) ? $2 : () }
133                `cat /Users/rossoverbeek/FIGdisk/FIG/Data/FigfamsData/families.2c`;                `cat $FIG_Config::FigfamsData/families.2c`;
134      my $idsH = $sapO->ids_to_sequences(-ids => \@ids, -protein => 1);      my $idsH = $sapO->ids_to_sequences(-ids => \@ids, -protein => 1);
135      return [map { my $seq = $idsH->{$_}; $seq ? [$_,'',$seq] : () } keys(%$idsH)];      return [map { my $seq = $idsH->{$_}; ($seq && (length($seq) > 30)) ? [$_,'',$seq] : () } keys(%$idsH)];
136  }  }

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.5

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3