[Bio] / FortyEight / find-genomes-not-in-seed.pl Repository:
ViewVC logotype

View of /FortyEight/find-genomes-not-in-seed.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Tue Feb 27 21:00:27 2007 UTC (12 years, 8 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.2: +14 -0 lines
move email support to Job48
remove stoplight status
make quality_check set up correction.status and correction.request if corrections needed

#
# Find any genomes currently in the 48-hour queue that are finished and appear to
# not exist in the SEED.
#

use strict;
use Data::Dumper;
use FIG;
use Job48;

#
# Build the lookup tables that check() consults, one entry per genome
# reported by FIG.
#
my $fig = FIG->new();

my @genomes = $fig->genomes();

my %by_tax;                      # leading part of genome id => [genome ids]
my %genome_to_name;              # genome id => genus/species string
my %name_to_genome;              # genus/species string => genome id
my %contig_to_genome;            # contig id => [genome ids]; filled in below
my %normalized_name_to_genome;   # lowercased, whitespace-free name => genome id
my %normalized_gs_to_genome;     # lowercased first two words of name => genome id

for my $g (@genomes)
{
    # Only the part of the id before the first '.' is used as a key.
    my($tax) = split(/\./, $g);
    push @{$by_tax{$tax}}, $g;

    my $gs = $fig->genus_species($g);
    $name_to_genome{$gs} = $g;
    $genome_to_name{$g} = $gs;

    #
    # normalized names: lowercase with all whitespace removed
    #
    my $ngs = lc($gs);
    $ngs =~ s/\s//g;
    $normalized_name_to_genome{$ngs} = $g;

    #
    # first two whitespace-separated words, joined and lowercased
    #
    if ($gs =~ /^\s*(\S+)\s+(\S+)/)
    {
        $normalized_gs_to_genome{lc("$1$2")} = $g;
    }
}

#warn Dumper(\%normalized_name_to_genome);
#warn Dumper(\%normalized_gs_to_genome);


#
# Pull every (genome, contig) pair from the contig_lengths table and
# index the genomes by contig id.
#
warn "Reading contigs\n";
my $res = $fig->db_handle->SQL(qq(SELECT genome, contig from contig_lengths));
foreach my $row (@$res)
{
    my($row_genome, $row_contig) = @{$row};

    # Several genomes may carry the same contig id, so keep a list.
    push(@{$contig_to_genome{$row_contig}}, $row_genome);
}
warn "done reading contigs\n";

#
# Run check() on every job for which active() is true.
#
my @jobs = grep { $_->active() } Job48::all_jobs();

foreach my $job (@jobs)
{
    # Debugging aid:
    # print "Job " . $job->id . " " . $job->genome_id . " " . $job->genome_name . "\n";
    check($job);
}

#
# Classify one job against the genomes already known to the SEED.
#
# The job's genome id and genome name are compared with the file-level
# lookup tables. The resulting status is one of:
#
#   INCOMPLETE               the job reports that it is not finished
#   NAME_IN_SEED             the exact genome name is in the SEED
#   TAX_IN_SEED              a SEED genome id has the same part before the
#                            first '.' (presumably the taxonomy id)
#   NORMALIZED_NAME_IN_SEED  the name matches ignoring case and whitespace
#   NORMALIZED_GS_IN_SEED    the first two words of the name match
#   MATCHING_CONTIG_ID       one of the job's contig ids is in the SEED
#   NEW                      none of the above matched
#
# The contig search runs only when there was no exact-name or id-prefix match,
# and a contig hit replaces a normalized-name status.
#
# Side effects: sets the job metadata keys seed.status and, where they
# apply, seed.genome_id, seed.genome_name and seed.matching_contigs; prints
# one tab-separated report line to STDOUT.
#
# Parameters: $job - the job object to classify.
# Returns: nothing meaningful.
#
sub check
{
    my($job) = @_;

    my $g = $job->genome_id();
    my $gs = $job->genome_name();
    my @inseed;
    my $status = "UNKNOWN";

    # Store a SEED genome as this job's match in the job metadata and
    # return its (id, name) pair for the report line.
    my $record_match = sub {
        my($seed_id) = @_;
        my $seed_name = $genome_to_name{$seed_id};
        $job->meta->set_metadata("seed.genome_id", $seed_id);
        $job->meta->set_metadata("seed.genome_name", $seed_name);
        return ($seed_id, $seed_name);
    };

    #
    # An empty or missing job name never counts as a name match; without
    # this guard it would be looked up under the empty-string key.
    #
    my $have_name = defined($gs) && $gs ne '';

    #
    # find normalized names
    #
    my $ngs = $have_name ? lc($gs) : '';
    $ngs =~ s/\s//g;

    my $gsonly;
    if ($have_name && $gs =~ /^\s*(\S+)\s+(\S+)/)
    {
        $gsonly = lc("$1$2");
    }

    my $exact = $have_name ? $name_to_genome{$gs} : undef;

    if (!$job->finished())
    {
        $status = "INCOMPLETE";
    }
    elsif ($exact)
    {
        $status = "NAME_IN_SEED";
        @inseed = $record_match->($exact);
    }
    else
    {
        # Keep only the part of the genome id before the first '.'.
        my $tax = defined($g) ? $g : '';
        $tax =~ s/\..*$//;

        # A plain conditional expression: "my ... if COND" has undefined
        # behaviour when COND is false.
        my @bytax = $by_tax{$tax} ? @{$by_tax{$tax}} : ();
        if (@bytax)
        {
            $status = "TAX_IN_SEED";

            # Every genome with this prefix is reported; the metadata ends
            # up holding the last one.
            for my $seedg (@bytax)
            {
                push(@inseed, $record_match->($seedg));
            }
        }
        else
        {
            my $by_name = ($ngs ne '') ? $normalized_name_to_genome{$ngs} : undef;
            my $by_gs = defined($gsonly) ? $normalized_gs_to_genome{$gsonly} : undef;

            if ($by_name)
            {
                $status = "NORMALIZED_NAME_IN_SEED";
                @inseed = $record_match->($by_name);
            }
            elsif ($by_gs)
            {
                $status = "NORMALIZED_GS_IN_SEED";
                @inseed = $record_match->($by_gs);
            }

            #
            # Search for contig names that map. Only the first contig
            # with a hit is used.
            #
            my @clist;
            for my $contig ($job->contigs())
            {
                my $glist = $contig_to_genome{$contig};
                next unless $glist;

                $status = "MATCHING_CONTIG_ID";
                for my $sg (@$glist)
                {
                    push(@inseed, $sg, $genome_to_name{$sg});
                    push(@clist, [$sg, $genome_to_name{$sg}]);
                }
                last;
            }
            $job->meta->set_metadata("seed.matching_contigs", \@clist) if @clist;

            if ($status eq 'UNKNOWN')
            {
                $status = "NEW";
            }
        }
    }
    $job->meta->set_metadata("seed.status", $status);

    print join("\t", $status, $job->id, $job->user, $g, $gs, @inseed), "\n";

}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3