[Bio] / FigKernelScripts / find_missing_genes.pl Repository:
ViewVC logotype

View of /FigKernelScripts/find_missing_genes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (download) (as text) (annotate)
Wed Aug 4 03:08:27 2010 UTC (9 years, 3 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.9: +8 -5 lines
more RAST fixes (some of these may need to be backported to use the SAP version
outside rast, but the immediate need is for the rast version)

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2008 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

my $usage = "find_missing_genes [--help] [--strict] [--verbose]  --corr CorrTable  --ref RefGenomeID";

use strict;
use warnings;
use Data::Dumper;

use Getopt::Long;

use SeedUtils;
use SAPserver;
# use SSserver;
use FS_RAST;
use FIGV;

my $corrT;
my $ref_genome;

my $help    = 0;
my $strict  = 0;
my $verbose = 0;
my $orgdir;

my $rc = GetOptions(
		    "help!"     =>  \$help,
		    "strict!"   =>  \$strict,
		    "verbose!"  =>  \$verbose,
		    "corr=s"    =>  \$corrT,
		    "ref=s"     =>  \$ref_genome,
		    "orgdir=s"  =>  \$orgdir,
		    );

if (!$rc || $help) {
    warn "\n   usage: $usage\n\n";
    exit(0);
}

if ($verbose) { $ENV{VERBOSE} = 1; }

my $fig = new FIGV($orgdir);

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Read correspondence table and build associated data structures
#-----------------------------------------------------------------------
if ((!-e $corrT) || (not open(CORR,"<$corrT"))) {
    die ("Could not open correspondence table '$corrT'\n\n   ",
	 $usage, 
	 "\n\n"
	 );
}
my @corr_table = grep { $_->[8] eq q(<=>) } map { chomp; [split(/\t/,$_)] } <CORR>; close(CORR);


if ((@corr_table < 1) || ((&SeedUtils::genome_of($corr_table[0]->[0]) ne $ref_genome) &&
	                  (&SeedUtils::genome_of($corr_table[0]->[1]) ne $ref_genome))
    ) {
    die "$corrT is not a valid correspondence table with reference genome = $ref_genome";
}

my $ref_column    = (&SeedUtils::genome_of($corr_table[0]->[0]) eq $ref_genome) ? 0 : 1;
my $nonref_column = ($ref_column +1) % 2;
my $nonref_genome = &SeedUtils::genome_of($corr_table[0]->[$nonref_column]);

my $corrH;
%$corrH = map { ($ref_column == 0) ? ($_->[0] => $_->[1]) : ($_->[1] => $_->[0]) } @corr_table;

my %corr_pegs  = map { $_->[$ref_column] => 1 } @corr_table;



#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Load PEG IDs for reference and non-reference genomes
#-----------------------------------------------------------------------
my $sapO          = SAPserver->new();
my $pegsRH        = $sapO->all_features(-ids => $ref_genome, -type => 'peg');
my $ref_pegs      = $pegsRH->{$ref_genome};

my $pegsNH        = $sapO->all_features(-ids => $nonref_genome, -type => 'peg');
my $nonref_pegs   = [$fig->all_features($nonref_genome, 'peg')];

my $ref_locsH     = $sapO->fid_locations(-ids => $ref_pegs);
map { $ref_locsH->{$_} = &SeedUtils::boundary_loc($ref_locsH->{$_}) } (keys %$ref_locsH);

my $ref_lensH;
    map { my ($x, $y);
      (undef, $x, $y)  = &SeedUtils::boundaries_of([$ref_locsH->{$_}]);
      $ref_lensH->{$_} = 1 + abs($y-$x);
  } (keys %$ref_locsH);


my $nonref_locsH = {};
for my $peg (@$nonref_pegs)
{
    my @loc = $fig->feature_location($peg);
    my $r = &SeedUtils::boundary_loc(\@loc);
    $nonref_locsH->{$peg} = $r;
}

#my $nonref_locsH = $sapO->fid_locations(-ids => $nonref_pegs);
#map { $nonref_locsH->{$_} = &SeedUtils::boundary_loc($nonref_locsH->{$_}) } (keys %$nonref_locsH);

my @ordered = sort { &SeedUtils::location_cmp($ref_locsH->{$a},$ref_locsH->{$b}) } (keys %$ref_locsH);
# @_ = map { "$_\t" . join(q(), &flatten_dumper($ref_locsH->{$_})) } @ordered;   die Dumper(\@_);


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Find "interesting" PEGs
#   (reference PEGs in subsystems w/out corresponding non-ref PEGs)
#-----------------------------------------------------------------------
my @interesting_pegs = grep { ! $corr_pegs{$_} } @$ref_pegs;
print STDERR (q(Found ), (scalar @interesting_pegs), q( interesting PEGs), "\n") if $ENV{VERBOSE};

# my $ssO     = SSserver->new();
my $in_ssH  = $sapO->is_in_subsystem(-ids => \@interesting_pegs);
my @to_seek = grep { @ { $in_ssH->{$_} } > 0 } @interesting_pegs;
my %to_seekH = map { $_ => 1 } @to_seek;
print STDERR (q(Found ), (scalar @to_seek), q( interesting PEGs in subsystems), "\n") if $ENV{VERBOSE};


my $i;
for ($i=1; ($i < $#ordered); ++$i)
{
    my $beforeR = $ordered[$i-1];
    my $geneR   = $ordered[$i];
    my $afterR  = $ordered[$i+1];
    
    my $geneR_len = $ref_lensH->{$geneR};
    
    my ($beforeN, $afterN);
    if (($to_seekH{$geneR}) &&
	($beforeN = $corrH->{$beforeR}) &&
	($afterN  = $corrH->{$afterR}))
    {
	print STDERR qq(geneR = $geneR,\tpredR = $beforeR,\tsuccR = $afterR,\tpredN = $beforeN,\tsuccN = $afterN\n) if $ENV{VERBOSE};
	
	if ($strict) {
	    my $spreadsheet_row = $sapO->is_in_subsystem_with(-ids => [ $geneR ]);
	    my $roles_of_pegH   = {};
	    my $pegs_with_roleH = {};
	    
	    foreach my $tuple (@ { $spreadsheet_row->{$geneR} }) {
		my ($ss_name, $ss_var, $ss_fid, $ss_func, $ss_role) = @$tuple;
		$roles_of_pegH->{$ss_fid}->{$ss_name} = $ss_role;
		push @ { $pegs_with_roleH->{$ss_name}->{$ss_role} }, $ss_fid;
	    }
	    
	    my $skip = 0;
	    foreach my $ss_name (keys % { $roles_of_pegH->{$geneR} }) {
		my $ss_role = $roles_of_pegH->{$geneR}->{$ss_name};
		if ((@_ = @ { $pegs_with_roleH->{$ss_name}->{$ss_role} }) > 1) {
		    print STDERR (qq(Duplicate roles: ), join(q(, ), @_), q( --- ), $ss_role, qq(\n)) if $ENV{VERBOSE};
		    $skip = 1;
		}
	    }
	    next if $skip;
	}
	print STDERR "Searching for homolog to $geneR\n" if $ENV{VERBOSE};
	
	my ($loc, $trans, undef, $annotation) = &find_gene($sapO,
							   $ref_genome,
							   $nonref_genome,
							   $geneR, $geneR_len,
							   $ref_locsH, $nonref_locsH,
							   $beforeR,   $beforeN,
							   $afterR,    $afterN
							   );
	if ($loc) {
	    print STDERR qq(Found loc=$loc\n) if $ENV{VERBOSE};
	    
	    my $len   = length($trans);
	    my $func  = '';
	    my $funcH = $sapO->ids_to_functions(-ids => [$geneR]);
	    if (defined($funcH->{$geneR})) {
		$func = $funcH->{$geneR};
	    }
	    else {
		die Dumper($funcH);
	    }
	    
	    print STDERR (join("\t", ('HIT:', $geneR, $beforeN, $afterN, $func)), "\n") if $ENV{VERBOSE};
	    
	    chomp $annotation;
	    $annotation =~ s/\n/\n\+\+\+\t/gs;
	    
	    print STDOUT ("REF_PEG\t$geneR\n",
			  "REF_FUNC\t$func\n",
			  "REF_LEN\t$len\n",
			  "BEFORE\t$beforeN\n",
			  "AFTER\t$afterN\n",
			  "LOCATION\t$loc\n",
			  "PROT_SEQ\t$trans\n",
			  "FS_RAST_OUT\t$annotation\n",
			  "//\n"
		);
	}
	else {
	    print STDERR qq(Homolog not found\n) if $ENV{VERBOSE};
	}
	
	print STDERR "//\n" if $ENV{VERBOSE};
    }
}
exit(0);



sub find_gene {
    my($sapO,
       $ref_genome, $nonref_genome,
       $geneR, $geneR_len,
       $ref_locsH,  $nonref_locsH,
       $beforeR, $beforeN,
       $afterR,  $afterN) = @_;
    
    my $genome_metrics = $sapO->genome_metrics(-ids => [ $nonref_genome ]);
    my $genetic_code_numberN = $genome_metrics->{$nonref_genome}->[2];
    
    my($cR1,$leftR1,$rightR1) = &SeedUtils::parse_location($ref_locsH->{$beforeR});
    my($cR2,$leftR2,$rightR2) = &SeedUtils::parse_location($ref_locsH->{$afterR});
    
    my $szR;
    if (($cR1 eq $cR2) && ($szR = ($leftR2 - ($rightR1+1))) && ($szR > 100))
    {
# 	print STDERR (qq(   Found candidate: ),
# 		      join(qq(,\t), ($geneR, $rightR1, $leftR2, $szR)),
# 		      (($szR > 2000) ? q( !!!) : q()),
# 		      qq(\n));

	my ($cN1, $leftN1, $rightN1) = &SeedUtils::boundaries_of([$nonref_locsH->{$beforeN}]);
	my ($cN2, $leftN2, $rightN2) = &SeedUtils::boundaries_of([$nonref_locsH->{$afterN}]);
	
	if ((not $cN1) || (not $cN2)) {
	    warn "!!! contig undefined for beforeN=$beforeN\n" if (not $cN1);
	    warn "!!! contig undefined for  afterN=$afterN\n"  if (not $cN2);
	    return undef; 
	}
	
# 	print STDERR (qq(   Corresp. NonRef: ),
# 		      join(qq(,\t), ($leftN1, $rightN1, $leftN2, $rightN2)),
# 		      qq(\n));
	
	my $left  = &SeedUtils::min($rightN1, $rightN2) + 50;
	my $right = &SeedUtils::max($leftN1,  $leftN2)  - 50;
	
	return undef if ($left > $right);     #...Gap is backwards
	
	my $gap_regionN = join(q(_), ($cN1, $left, $right));
	my $nonref_genes_in_regionH = $sapO->genes_in_region( -locations => [ $gap_regionN ] );
	
 	print STDERR (qq(   Corresp. NonRef:\n),
		      qq(       $beforeN\t$nonref_locsH->{$beforeN}\n),
		      qq(       $afterN\t$nonref_locsH->{$afterN}\n),
 		      qq(       $leftN1\t$rightN1\n),
 		      qq(       $left\t$right\n),
 		      qq(       $leftN2\t$rightN2\n),
 		      qq(\n)
		      ) if $ENV{VERBOSE};
	
	if (grep { m/^fig\|\d+\.\d+\.peg\.\d+$/o } @ { $nonref_genes_in_regionH->{$gap_regionN} }) {
	    print STDERR (qq(   Found genes in gap region: $gap_regionN\n),
			  Dumper($nonref_genes_in_regionH)
			  ) if $ENV{VERBOSE};
	    return undef;
	}
	
	#...Add guard region...
	$left  -= 100;
	$right += 100;

	
	my $szN;
	if (($cN1 eq $cN2) && ($szN = ($right - ($left-1))) && 
	    ($szN > 200) && ($szN > 0.9 * $geneR_len)
	    )
	{
	    if ($ENV{VERBOSE}) {
		print STDERR (qq(   Found candidate: ),
			      join(qq(,\t), ($geneR,
					     $cR1, $rightR1, $leftR2,
					     $cN1, $left, $right,
					     $szR, $szN)),
			      (($szR > 2000) ? q( !!!) : q()),
			      qq(\n));
	    }
	    
	    if ((abs($szN-$szR) <= (0.2 * $szR)) || (abs($szN-$szR) < 500)) {
		print STDERR qq(   Acceptable gap,\tszN=$szN,\tszR=$szR\n) if $ENV{VERBOSE};
		
		my $same =  &SeedUtils::strand_of($ref_locsH->{$beforeR}) eq
		            &SeedUtils::strand_of($ref_locsH->{$geneR});
		
		my $strand_beforeR = &SeedUtils::strand_of($ref_locsH->{$beforeR});
		my $strand_beforeN = &SeedUtils::strand_of($nonref_locsH->{$beforeN});
		
		my $strand_sought  = $same ? $strand_beforeN : &opp($strand_beforeN);
		
		my $begN  = ($strand_sought eq '+') ? $left  : $right;
		my $endN  = ($strand_sought eq '+') ? $right : $left;
		
		my $locN  = &SeedUtils::location_string($cN1, $begN, $endN);
		
		# my $gap_dnaN = $sapO->locs_to_dna(-locations => { gapN => $locN });

		my $gap_dnaN = $fig->dna_seq($nonref_genome, join("_", $cN1, $begN, $endN));
		
		# warn Dumper($locN, $gap_dnaN);
		# $gap_dnaN = $gap_dnaN->{gapN};
		
		my $params = {};
		my $prot_seqR = &protein_seq_of($sapO, $geneR);
		$params->{family}  = [[$geneR, "", $prot_seqR]];
		$params->{code}    = $genetic_code_numberN;
 		# print STDERR (qq(   prot_seqR\t), $prot_seqR, qq(\n),
		# qq(   gap_dnaN \t), $gap_dnaN,  qq(\n));
		
		return &FS_RAST::best_match_in_family($params,[$cN1,$begN,$endN,$gap_dnaN]);
	    }
	    else {
		print STDERR qq(   Unacceptable gap,\tszN=$szN,\tszR=$szR\n) if $ENV{VERBOSE};
	    }
	}
    }
    return undef;
}

sub opp {
    my ($strand) = @_;
    my $opp = ($strand eq '+') ? '-' : '+'; 
    return $opp;
}

sub protein_seq_of {
    my ($sapO, $fid) = @_;
    my $retH = $sapO->ids_to_sequences(-ids => [$fid], -protein => 1);
    return $retH->{$fid};
}
	
sub flatten_dumper {
    my @x = @_;
    my $x;

    foreach $x (@x)
    {
	$x = Dumper($x);

	$x =~ s/\$VAR\d+\s+\=\s+//o;
	$x =~ s/\n//gso;
	$x =~ s/\s+/ /go;
	$x =~ s/\'//go;
#       $x =~ s/^[^\(\[\{]+//o;
#       $x =~ s/[^\)\]\}]+$//o;
    }

    return @x;
}

sub fids_to_compare_regions_url {
    my ($fid) = @_;
    
#   return qq(<A HREF="http://seed-viewer.theseed.org/seedviewer.cgi?page=Regions&feature=$y&feature=$z">$x</A>);
    return qq(<A HREF="http://seed-viewer.theseed.org/seedviewer.cgi?page=Annotation&feature=$fid">$fid</A>);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3