Parent Directory
|
Revision Log
fixes to ToCal.pm
# -*- perl -*- ######################################################################## # Copyright (c) 2003-2006 University of Chicago and Fellowship # for Interpretations of Genomes. All Rights Reserved. # # This file is part of the SEED Toolkit. # # The SEED Toolkit is free software. You can redistribute # it and/or modify it under the terms of the SEED Toolkit # Public License. # # You should have received a copy of the SEED Toolkit Public License # along with this program; if not write to the University of Chicago # at info@ci.uchicago.edu or the Fellowship for Interpretation of # Genomes at veronika@thefig.info or download a copy from # http://www.theseed.org/LICENSE.TXT. ######################################################################## use strict; use warnings; use FIG; my $fig = new FIG; use FigFams; my $figfams = new FigFams($fig); use ToCall; use NewGenome; $0 =~ m/([^\/]+)$/o; my $self = $1; my $usage = "usage: $self [-fams=FigfamsData] To_Call_Dir N FoundFamilies [old] > closest.N.genomes"; if (@ARGV && ($ARGV[0] =~ m/-h(elp)?/)) { print STDERR "\n $usage\n\n"; exit (0); } my ($i, $fam_data, $to_call_dir, $N, $foundF); # First, let's determine which FigfamsData directory to use $fam_data = "$FIG_Config::data/FigfamsData"; for ($i=0; ($i < @ARGV) && ($ARGV[$i] !~ /^-fams=/); $i++) {} if ($i < @ARGV) { if ($ARGV[$i] =~ /^-fams=(\S+)/) { if (-d $1) { $fam_data = $1; } else { die "Nonexistent FIGfams directory $1\n"; } splice(@ARGV,$i,1); } } # Now pick up the normal parameters ( ($to_call_dir = shift @ARGV) && ($N = shift @ARGV) && ($foundF = shift @ARGV) && open(FOUNDFAMS, ">>$foundF") ) || die "\n $usage\n\n"; ### This is a bit complex. Normally, one uses this program to call genes # in a newly-sequenced genome. In this case the functionality of NewGenome.pm # is needed. To call genes in an existing genome (to, for example, see how well # we are doing) you would use ToCall.pm. my $to_call; my $old; if ((@ARGV > 0) && ($ARGV[0] =~ /^old/i)) { $old = 1; $to_call = new ToCall($to_call_dir); } else { $to_call = new NewGenome($to_call_dir); } my @orf_ids = $to_call->get_fids_for_type('orf'); if (not @orf_ids) { $to_call->possible_orfs; @orf_ids = $to_call->get_fids_for_type('orf'); } my (%seq_of, %len_of, $orf_len); for my $fid (@orf_ids) { if (($orf_len = $to_call->get_feature_length($fid)) > 900) { $len_of{$fid} = $orf_len; $seq_of{$fid} = $to_call->get_feature_sequence($fid); } } my @peg_ids = (); my %genomes_hit; my %weight_of_hits; foreach my $orf_id (sort { $len_of{$b} <=> $len_of{$a} } keys %len_of) { print STDERR "\nChecking $orf_id ($len_of{$orf_id})\n" if $ENV{VERBOSE}; my ($fam, $sims) = $figfams->place_in_family($seq_of{$orf_id}); if (not defined($fam)) { print STDERR "\tCould not place in family --- skipping\n" if $ENV{VERBOSE}; next; } else { my $fam_id = $fam->family_id(); my $func = $fam->family_function(); print STDERR "\tplaced in $fam_id (" , (scalar @$sims) , " sims)\t$func\n" if $ENV{VERBOSE}; my $fid; if (! $old) { my $orf = &NewGenome::ORF::new('NewGenome::ORF', $to_call, $orf_id); $fid = $orf->promote_to_peg($sims, $func); } else { $fid = $orf_id; } if ($fid) { push @peg_ids, $fid; print FOUNDFAMS "$fid\t$fam_id\t$func\n"; for ($i=0; ($i < $N) && ($i < @$sims); ++$i) { ++$genomes_hit{&FIG::genome_of($sims->[$i]->id2)}; $weight_of_hits{&FIG::genome_of($sims->[$i]->id2)} += ($sims->[$i]->bsc / $len_of{$orf_id}); } } else { die "Could not promote $orf_id"; } } last if (@peg_ids >= $N); } my $num_pegs = @peg_ids; warn "Found $num_pegs PEG candidates\n"; if ($num_pegs == 0) { die "Could not find any features of sufficient length and in a FIGfam\n"; } my @best = sort { $weight_of_hits{$b} <=> $weight_of_hits{$a} } keys(%genomes_hit); for ($i=0; ($i < $N) && ($i < @best); ++$i) { print "$best[$i]\t$genomes_hit{$best[$i]}\t$weight_of_hits{$best[$i]}\n"; } close(FOUNDFAMS); $to_call->export_features; exit(0);
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |