[Bio] / FigKernelScripts / merge_missing_gene_output.pl Repository:
ViewVC logotype

View of /FigKernelScripts/merge_missing_gene_output.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Wed Aug 4 21:13:15 2010 UTC (9 years, 8 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.2: +1 -1 lines
bugfix

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2008 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

use strict;
use warnings;
use Data::Dumper;

use FIGV;

use SeedUtils;

use SAPserver;
my $sapO = SAPserver->new();

my $figv_dir;
(($figv_dir = shift @ARGV) && (-d $figv_dir))
    || die qq(Directory \'$figv_dir\' does not exist);

my $org_dir;
(($org_dir = shift @ARGV) && (-d $org_dir))
    || die qq(Directory \'$org_dir\' does not exist);

opendir( ORG_DIR, $org_dir) || die qq(Could not opendir \'$org_dir\');

my $fig = FIGV->new($figv_dir);

my @ref_orgs = grep { m/^\d+\.\d+$/o } readdir(ORG_DIR);
closedir(ORG_DIR);
	
use constant BEFORE     => 0;
use constant AFTER      => 1;
use constant LOC        => 2;
use constant FRAMESHIFT => 3;
use constant REF_PEGS   => 4;
use constant FS_OUT     => 5;

my %new_pegs     = ();
my %ref_genomes = ();
foreach my $ref_org (@ref_orgs) {
    $/ = "\n//\n";
    my $filepath = qq($org_dir/$ref_org);
    open(FILE, qq(<$filepath))
	|| die qq(Could not read-open file \'$filepath\');
    
    while (defined(my $record = <FILE>)) {
	chomp $record;
	my @lines = split( /\n/, $record);
	
	my $last_key = q();
	my $recordH = {};
	foreach my $line (@lines) {
	    if ($line =~ m/^(\w+)\t(.*)$/o) {
		$recordH->{$1} = $2;
		$last_key = $1;
	    }
	    elsif ($line =~ m/^\+\+\+\t(.*)$/o) {
		$recordH->{$last_key} .= "\n$1";
	    }
	    else {
		die qq(Could not handle file \'$filepath\', record:\n$record\n---);
	    }
	}
	
	if (defined($recordH->{REF_PEG})) {
	    $ref_genomes{ &genome_of($recordH->{REF_PEG}) } = 1;
	}
	else {
	    warn (qq(Bad record:\n), $record, qq(//\n\n));
	    next;
	}
	
	my ($contig, $min, $max, $dir) = boundaries_of([ split(/,/o, $recordH->{LOCATION}) ]);
	my $end = ($dir eq q(+)) ? $max : $min;
	
	$recordH->{LENGTH} = 1 + abs($max-$min);
	push @ { $new_pegs{qq($contig\t$dir$end)} }, $recordH;
    }
}
print STDERR (qq(Found ), (scalar keys %new_pegs), qq( missing PEG candidates\n\n));


foreach my $key (sort { my ($x) = ($a =~ m/(\d+)$/o);
			my ($y) = ($b =~ m/(\d+)$/o);
			($x <=> $y)
			} (keys %new_pegs)
		 ) {
    
    my ($before, $after, $last_before)            = (q(), q(), q());
    my ($curr_ref_peg, $curr_trans, $curr_boundary, $curr_loc, $curr_length, $curr_FS) = (q(), q(), q(), q(), 0, q());
    my $frameshift = 0;
    my $ref_funcH  = {};
    
    my $trouble  = 0;
    my @ref_pegs = ();
    foreach my $recordH (@ { $new_pegs{$key} }) {
	
	my $ref_peg  = $recordH->{REF_PEG};
	if ($fig->is_deleted_fid($ref_peg)) {
	    print STDERR qq(Skipping due to deleted ref_peg=$ref_peg\n\n);
	    next;
	}
	
	my $ref_func = $recordH->{REF_FUNC};
	$ref_func =~ s/\s+\#.*$//o;
	print STDERR qq(ref_peg=$ref_peg,\t$ref_func\n) if $ENV{VERBOSE};
	
	my $ref_loc = $sapO->fid_locations( -ids => [$ref_peg], -boundaries => 1 );
	my ($ref_contig, $ref_beg, $ref_end, $ref_strand) = &SeedUtils::parse_location( $ref_loc->{$ref_peg} );
	
	($before, $after) = ($recordH->{BEFORE}, $recordH->{AFTER});
	if ($fig->is_deleted_fid($before) || $fig->is_deleted_fid($after)) {
	    print STDERR qq(Skipping due to deleted before=$before or after=$after\n\n);
	    next;
	}
	
	print STDERR qq(before=$before, after=$after\n) if $ENV{VERBOSE};
	($before, $after) = sort { &by_loc($a,$b) } ($recordH->{BEFORE}, $recordH->{AFTER});
	print STDERR qq(before=$before, after=$after\n) if $ENV{VERBOSE};
	
	if ($last_before && ($before ne $last_before)) {
	    $trouble = 1;
	    warn qq(before=$before, last_before=$last_before\n);
	}
	$last_before = $before;
	print STDERR qq(\n) if $ENV{VERBOSE};
	
	if ($recordH) {
	    push @ref_pegs, $ref_peg;
	    
	    if ($recordH->{LENGTH} > $curr_length) {
		$curr_ref_peg  = $ref_peg;
		$curr_length   = $recordH->{LENGTH};
		$curr_trans    = $recordH->{PROT_SEQ};
		$curr_loc      = $recordH->{LOCATION};
		$curr_boundary = &SeedUtils::boundary_loc($curr_loc);
		$curr_FS       = $recordH->{FS_RAST_OUT};
		$frameshift    ||= (index($recordH->{LOCATION},q(,)) > -1) ? 1 : 0;
	    }
	}
	else {
	    die qq(No record for key \'$key\');
	}
    }
    
    if ($trouble) {
	warn qq(Inconsistent record -- skipping\n\n);
	next;
    }
    
    my $roles = {};
    my $ssH  = $sapO->ids_to_subsystems( -ids => [@ref_pegs], -usable => 1);
    foreach my $fid (@ref_pegs) {
	my @pairs =  @ { $ssH->{$fid} };
	foreach my $pair (@pairs) {
	    my ($role, $ss) = @$pair;
	    ++$roles->{$role};
	}
    }
    my @roles = sort { $roles->{$b} <=> $roles->{$a} } (keys %$roles);
    

    $curr_FS =~ s/\n/\%0A/gso;
    print STDOUT (join(qq(\t), ($curr_loc,
				$frameshift,
				$curr_length,
				join(q(,), ($before, $after)),
				join(q(,), @ref_pegs),
				$curr_ref_peg,
				$curr_trans,
				$curr_FS
				)
		       ),
		  qq(\n)
		  );
}
exit(0);


sub by_loc {
    my ($x, $y) = @_;
#   print STDERR qq(x=$x,\ty=$y\n);
    
    my $locH  = $sapO->fid_locations( -ids => [$x, $y], -boundaries => 1 );
    
    my $loc_x = $locH->{$x};
    my $loc_y = $locH->{$y};
    
    my $cmp = &SeedUtils::location_cmp($loc_x, $loc_y);
#   print STDERR (qq(loc_x=$loc_x,\tloc_y=$loc_y\n), qq(cmp=$cmp\n\n));
    
    return $cmp;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3