[Bio] / FortyEight / imp_salvage.pl Repository:
ViewVC logotype

View of /FortyEight/imp_salvage.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Thu Sep 13 19:59:18 2007 UTC (12 years, 5 months ago) by olson
Branch: MAIN
new script

#
# Salvage annotations for the organisms that are replacements of SEED genomes.
#
# This is true if the file REPLACES exists and containst the genome ID of an
# existing SEED genome.
#
# Note that for this to work the SEED that this script is invoked from has to
# have a current copy of the canonical SEED organism data.
# 
# Salvage does the following:
#
# Determine the old->new mapping using make_peg_maps_from_fasta.
#
# Determine the set of pegs for which we will salvage assignments by finding
# the set of mapped pegs from the old organism that are in subsystems. We will
# carry across only those assignments; the others will be taken from the RAST
# pipeline. We will construct annotation file entries for the other mapped PEGs
# that have the old annotation data to keep the history, but they will be capped
# with the annotation for the RAST assignment.
#


use strict;
use FIG;
use Data::Dumper;
use FIG_Config;
use File::Copy;
use File::Basename;
use ImportJob;
use GenomeMeta;
use JobStage;

@ARGV == 1 or die "Usage: $0 job-dir\n";

my $fig = new FIG();

my $max_hits = 300;

my $jobdir = shift;

-d $jobdir or die "$0: job dir $jobdir does not exist\n";

my $stage = new JobStage('ImportJob', 'salvage', $jobdir);

$stage or die "$0: Could not create job object";

my $job = $stage->job;
my $job_id = $job->id;

$stage->log("Running on " . $stage->hostname);

$stage->set_status("running");
$stage->set_running("yes");

$stage->set_qualified_metadata("host", $stage->hostname);

#
#
# Begin
#
#

open(JOBS, "<$jobdir/rast.jobs") or $stage->fatal("Cannot open $jobdir/rast.jobs: $!");

while (my $rjdir = <JOBS>)
{
    chomp $rjdir;

    my $rj = new Job48($rjdir);
    my $repfile = $rj->orgdir() . "/REPLACES";

    my $repl = &FIG::file_head($repfile);
    if ($repl)
    {
	chomp $repl;
	do_salvage($rj, $repl);
    }
}

close(JOBS);

sub do_salvage
{
    my($job, $old_genome) = @_;

    print "Do replacement on " . $job->genome_name()  . " from $old_genome\n";

    my $orgdir = $job->orgdir();

    #
    # Figure out what the RAST assignments are. We do this in case a salvage
    # is run multiple times.
    # We stash a copy of the original RAST-generated data in rast.assigned_functions
    # and rast.annotations if these files do not yet exist.
    #

    if (! -f "$orgdir/rast.annotations" and -f "$orgdir/annotations")
    {
	copy("$orgdir/annotations", "$orgdir/rast.annotations") or
	    $stage->fatal("Cannot copy $orgdir/annotations to $orgdir/rast.annotations: $!");
    }

    #
    # We may not have yet built the assigned_functions from the proposed*functions.
    #

    if (! -f "$orgdir/assigned_functions")
    {
	my $rc = system("cat $orgdir/proposed*functions > $orgdir/assigned_functions");
	$rc == 0 or $stage->fatal("Cannot create $orgdir/assigned_functions from $orgdir/proposed*functions: rc=$rc");
    }
    
    if (! -f "$orgdir/rast.assigned_functions" and -f "$orgdir/assigned_functions")
    {
	copy("$orgdir/assigned_functions", "$orgdir/rast.assigned_functions") or
	    $stage->fatal("Cannot copy $orgdir/assigned_functions to $orgdir/rast.assigned_functions: $!");
    }

    #
    # Assignments & annotations in place. Compute mappings.
    #

    my $old_orgdir = "$FIG_Config::organisms/$old_genome";

    -d $old_orgdir or $stage->fatal("Old organism dir $old_orgdir does not exist");

    my $maps = "$orgdir/peg_maps";
    #
    # XXX - should rerun this every time, later.
    if (! -f $maps)
    {
	$stage->run_process("make_peg_maps_from_fasta",
			    "$FIG_Config::bin/make_peg_maps_from_fasta",
			    $old_orgdir,
			    $orgdir,
			    $maps);
    }
    
    #
    # Ingest the map and mark each peg with its subsystem status.
    #

    open(M, "<$maps") or $stage->fatal("Cannot open peg maps $maps: $!");
    my @map;
    my(%old_to_new, %new_to_old);
    while (<M>)
    {
	chomp;
	my($peg_old, $peg_new) = split(/\t/);
	my $ss = [$fig->peg_to_subsystems($peg_old)];
	my $ent = {
	    old => $peg_old,
	    new => $peg_new,
	    ss => $ss,
	};
	
	push(@map, $ent);
	$old_to_new{$peg_old} = $ent;
	$new_to_old{$peg_new} = $ent;
    }
    close(M);

    #
    # Given this map, we construct the new assigned_functions and annotations files.
    #
    # We start by copying rast.annotations to annotations, to get the initial history.
    #
    # Then we scan the old organism's annotations and assigned function files,
    # remembering any of the pegs that show up in the map. Any that do not,
    # we write to files unmapped.annotations and unmapped.assigned_functions, again
    # to retain history.
    #
    # Once this scan is complete, we scan the new organism's assigned functions file.
    # If a peg in there is mapped, we copy the set of annotations for the old
    # peg across into the new annotations file, mapping pegs as we go.
    #
    # If the peg was in a subsystem, we then write
    # an annotation declaring the set of subsystems the peg was in, and a final
    # annotation with the function assignment from the old org. The old
    # assignment is written to assigned_functions.
    #
    # If the peg is not in a subsystmem, we write a final annotation with the
    # rast annotation, and write the rast assignemnt to assigned_functions.
    #
    # If the peg was not mapped, we just write the rast function to assigned_functions.
    #
    # Note that we don't actually have to scan anything - all we need to do is
    # walk over the entries in the old/new map, and write out the appropriate data.
    # Anything that isn't in there was already copied from the rast version of the data.
    #

    my $new_af = $stage->open_file(">$orgdir/assigned_functions");
    copy("$orgdir/rast.assigned_functions", $new_af) or
	$stage->fatal("Cannot copy $orgdir/rast.assigned_functions to $orgdir/assigned_functions: $!");

    my $new_anno = $stage->open_file(">$orgdir/annotations");
    copy("$orgdir/rast.annotations", $new_anno) or
	$stage->fatal("Cannot copy $orgdir/rast.annotations to $orgdir/annotations: $!");

    my $unmapped_af = $stage->open_file(">$orgdir/unmapped.assigned_functions");
    my $unmapped_anno = $stage->open_file(">$orgdir/unmapped.annotations");

    my $old_af = $stage->open_file("<$old_orgdir/assigned_functions");
    my $old_anno = $stage->open_file("<$old_orgdir/annotations");

    my(%old_anno, %old_af);

    while (<$old_af>)
    {
	chomp;
	my($peg, $fun) = split(/\t/);
	if (my $ent = $old_to_new{$peg})
	{
	    $ent->{old_func} = $fun;
	}
	else
	{
	    print $unmapped_af "$_\n";
	}
    }
    close($old_af);

    {
	local $/;
	$/ = "//\n";
	
	while (<$old_anno>)
	{
	    chomp;
	    my($peg, $time, $who, $what, $val) = split(/\n/, $_, 5);

	    if (my $ent = $old_to_new{$peg})
	    {
		$val =~ s/\n*$//;
		push @{$ent->{old_anno}}, [$peg, $time, $who, $what, $val];
	    }
	    else
	    {
		print $unmapped_anno $ent . $/;
	    }
	}
    }
    close($old_anno);

    #
    # Pull in RAST assignments for use below
    #
    my %rast;
    my $rast_af = $stage->open_file("$orgdir/rast.assigned_functions");
    while (<$rast_af>)
    {
	chomp;
	my($peg, $fn) = split(/\t/);
	$rast{$peg} = $fn;
    }
    close($rast_af);
    
    for my $new_peg (sort { &FIG::by_fig_id($a, $b) }  keys %new_to_old)
    {
	my $ent = $new_to_old{$new_peg};

	#
	# Copy old annotations
	#
	for my $anno (@{$ent->{old_anno}})
	{
	    $anno->[0] = $new_peg;
	    print $new_anno join("\n", @$anno), "\n//\n";
	}

	my $old_func = $ent->{old_func};
	my $new_func = $rast{$new_peg};

	#
	# Determine if we need to update.
	#
	my @ss_list = @{$ent->{ss}};

	if ($old_func eq '')
	{
	    print "$ent->{old} => $new_peg No existing function\n";
	    print $new_anno join("\n", $new_peg, time, "salvage",
				 "No function found in original organism $ent->{old}"), "\n//\n";

	}
	elsif ($old_func eq $new_func)
	{
	    print "$ent->{old} => $new_peg functions are the same\n";
	    print $new_anno join("\n", $new_peg, time, "salvage",
				 "Old and new assignments are the same", $old_func), "\n//\n";

	}
	else
	{
	    if (@ss_list > 0)
	    {
		print "$ent->{old} => $new_peg is in a ss\n";
		
		print $new_anno join("\n", $new_peg, time, "salvage",
				     "Retaining old assignment due to membership in subsystems", "@ss_list", $old_func), "\n//\n";
		print $new_anno join("\n", $new_peg, time, "Set master function to", $old_func), "\n//\n";
		print $new_af "$new_peg\t$old_func\n";
	    }
	    else
	    {
		print "$ent->{old} => $new_peg is not in a ss\n";
		
		print $new_anno join("\n", $new_peg, time, "salvage",
				     "Using RAST assignment due to no subsystem membership", $new_func), "\n//\n";
		print $new_anno join("\n", $new_peg, time, "Set master function to", $new_func), "\n//\n";
	    }
	}
    }
    close($new_af);
    close($new_anno);
}

    
    
    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3