[Bio] / FigKernelScripts / find_miscalled_genes.pl Repository:
ViewVC logotype

View of /FigKernelScripts/find_miscalled_genes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Thu Dec 24 15:54:16 2015 UTC (3 years, 10 months ago) by gdpusch
Branch: MAIN
CVS Tags: HEAD
Changes since 1.4: +32 -9 lines
Add test to flip to "metagenome" mode for genomes smaller than 25 kbp, so that PRODIGAL will not bail out.FigKernelPackages/Prodigal.pm

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2008 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################


my $usage = "find_miscalled_genes  [--help] [--strict] [--verbose]  [--query=QueryGenomeID | --corr=CorrTable] --ref=RefGenomeID";

use strict;
use warnings;
use Data::Dumper;

use Getopt::Long;

use SeedUtils;

# use SSserver;

use SAPserver;
my $sapO = SAPserver->new();

use FS_RAST;

my $corrT;
my $ref_genome;
my $query_genome;

my $help    = 0;
my $strict  = 0;
my $verbose = 0;
my $rc = GetOptions(
		    "help!"     =>  \$help,
		    "strict!"   =>  \$strict,
                    "verbose!"  =>  \$verbose,
		    "corr=s"    =>  \$corrT,
		    "ref=s"     =>  \$ref_genome,
		    "query=s"   =>  \$query_genome,
		    );

if (!$rc || $help) {
    warn "\n   usage: $usage\n\n";
    exit(0);
}

if ($verbose) { $ENV{VERBOSE} = 1; }


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Read correspondence table and build associated data structures
#-----------------------------------------------------------------------
my @corr_table;
if ($corrT) {
    if ((!-s $corrT) || (not open(CORR,"<$corrT"))) {
	die ("Could not open correspondence table '$corrT'\n\n   ",
	     $usage, 
	     "\n\n"
	     );
	@corr_table = map { chomp; [split(/\t/,$_)] } <CORR>; close(CORR);	
    }
}
else {
    if (defined($query_genome) && defined($ref_genome)) {
	my $corr_table = $sapO->gene_correspondence_map(
							-genome1 => $query_genome,
							-genome2 => $ref_genome,
							-fullOutput => 1,
							-passive => 0
							);
	@corr_table = @$corr_table;
    }
    else {
	die qq(Undefined query and/or reference geneome ID);
    }
}



if ((@corr_table < 1) || ((&SeedUtils::genome_of($corr_table[0]->[0]) ne $ref_genome) &&
	                  (&SeedUtils::genome_of($corr_table[0]->[1]) ne $ref_genome)))
{
    die "$corrT is not a valid correspondence table with reference genome = $ref_genome";
}
my $ref_column    = (&SeedUtils::genome_of($corr_table[0]->[0]) eq $ref_genome) ? 0 : 1;
my $nonref_column = ($ref_column +1) % 2;
my $nonref_genome = &SeedUtils::genome_of($corr_table[0]->[$nonref_column]);

my $corrH;
%$corrH = map { ($ref_column == 0) ? ($_->[0] => $_->[1]) : ($_->[1] => $_->[0]) } @corr_table;

my %corr_pegs  = map { $_->[$ref_column] => 1 } @corr_table;



#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Load PEG IDs and locations for reference and non-reference genomes
#-----------------------------------------------------------------------
my $pegsRH        = $sapO->all_features(-ids => $ref_genome, -type => 'peg');
my $ref_pegs      = $pegsRH->{$ref_genome};

my $pegsNH        = $sapO->all_features(-ids => $nonref_genome, -type => 'peg');
my $nonref_pegs   = $pegsNH->{$nonref_genome};

my $in_ssH  = $sapO->is_in_subsystem(-ids => [ @$ref_pegs, @$nonref_pegs ]);


my $ref_locsH     = $sapO->fid_locations(-ids => $ref_pegs);
map { $ref_locsH->{$_} = &SeedUtils::boundary_loc($ref_locsH->{$_}) } (keys %$ref_locsH);

my (%predR, %succR);
my @orderedR = sort { &SeedUtils::location_cmp($ref_locsH->{$a},$ref_locsH->{$b}) } (keys %$ref_locsH);
for (my $i=1; $i < $#orderedR; ++$i) {
    my $before = $orderedR[$i-1];
    my $curr   = $orderedR[$i];
    my $after  = $orderedR[$i+1];
    
    my ($cB, undef, undef, $strandB) = &SeedUtils::parse_location($ref_locsH->{$before});
    my ($c0, undef, undef, $strand0) = &SeedUtils::parse_location($ref_locsH->{$curr});
    my ($cA, undef, undef, $strandA) = &SeedUtils::parse_location($ref_locsH->{$after});
    
    if ($strand0 eq q(+)) {
	$predR{$curr} = ($cB eq $c0) ? $before : undef;
	$succR{$curr} = ($cA eq $c0) ? $after  : undef;
    }
    elsif ($strand0 eq q(-)) {
	$predR{$curr} = ($cA eq $c0) ? $after  : undef;
	$succR{$curr} = ($cB eq $c0) ? $before : undef;
    }
    else {
	die "Can\'t handle strand=$strand0 of PEG=$curr, loc=$ref_locsH->{$curr}";
    }
}


my $nonref_locsH  = $sapO->fid_locations(-ids => $nonref_pegs);
map { $nonref_locsH->{$_} = &SeedUtils::boundary_loc($nonref_locsH->{$_}) } (keys %$nonref_locsH);

my (%predN, %succN);
my @orderedN = sort { &SeedUtils::location_cmp($nonref_locsH->{$a},$nonref_locsH->{$b}) } (keys %$nonref_locsH);
for (my $i=1; $i < $#orderedN; ++$i) {
    my $before = $orderedN[$i-1];
    my $curr   = $orderedN[$i];
    my $after  = $orderedN[$i+1];
    
    my ($cB, undef, undef, $strandB) = &SeedUtils::parse_location($nonref_locsH->{$before});
    my ($c0, undef, undef, $strand0) = &SeedUtils::parse_location($nonref_locsH->{$curr});
    my ($cA, undef, undef, $strandA) = &SeedUtils::parse_location($nonref_locsH->{$after});
    
    if ($strand0 eq q(+)) {
	$predN{$curr} = ($cB eq $c0) ? $before : undef;
	$succN{$curr} = ($cA eq $c0) ? $after  : undef;
    }
    elsif ($strand0 eq q(-)) {
	$predN{$curr} = ($cA eq $c0) ? $after  : undef;
	$succN{$curr} = ($cB eq $c0) ? $before : undef;
    }
    else {
	die "Cannot handle strand=$strand0 of PEG=$curr, loc=$nonref_locsH->{$curr}";
    }
}


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Find "interesting" PEGs
#   (reference PEGs in subsystems w/out corresponding non-ref PEGs)
#-----------------------------------------------------------------------
my @pegs_to_check = grep { (@ { $in_ssH->{$_} } > 0)
				  && defined($corr_pegs{$_})
			      } @orderedR;

print STDERR (q(Found ), (scalar @pegs_to_check), q( PEGs in subsystems), qq(\n)) if $ENV{VERBOSE};

foreach my $geneR (@pegs_to_check) {
    my ($predR, $succR, $geneN, $predN, $succN);
    
    if (defined($geneN = $corrH->{$geneR}) && (not $in_ssH->{$geneN})      &&
	defined($predR = $predR{$geneR})   && defined($corrH->{$predR})    &&
	defined($succR = $succR{$geneR})   && defined($corrH->{$succR})    &&
	defined($predN = $predN{$geneN})   && ($predN eq $corrH->{$predR}) &&
	defined($succN = $succN{$geneN})   && ($succN eq $corrH->{$succR})
	)
    {
	if ($strict) {
	    my $spreadsheet_row = $sapO->is_in_subsystem_with(-ids => [ $geneR ]);
	    my $roles_of_pegH   = {};
	    my $pegs_with_roleH = {};
	    
	    foreach my $tuple (@ { $spreadsheet_row->{$geneR} }) {
		my ($ss_name, $ss_var, $ss_fid, $ss_func, $ss_role) = @$tuple;
		$roles_of_pegH->{$ss_fid}->{$ss_name} = $ss_role;
		push @ { $pegs_with_roleH->{$ss_name}->{$ss_role} }, $ss_fid;
	    }
	    
	    my $skip = 0;
	    foreach my $ss_name (keys % { $roles_of_pegH->{$geneR} }) {
		my $ss_role = $roles_of_pegH->{$geneR}->{$ss_name};
		if ((@_ = @ { $pegs_with_roleH->{$ss_name}->{$ss_role} }) > 1) {
		    print STDERR (qq(Duplicate roles: ), join(q(, ), @_), q( --- ), $ss_role, qq(\n)) if $ENV{VERBOSE};
		    $skip = 1;
		}
	    }
	    next if $skip;
	}
	print STDERR "Checking $geneR, with homolog $geneN\n" if $ENV{VERBOSE};
	
	my $funcsH = $sapO->ids_to_functions(-ids => [$geneR, $geneN]);
	my $funcR  = $funcsH->{$geneR};
	my $funcN  = $funcsH->{$geneN};
	
	if (defined($funcR) && ((not defined($funcN)) || ($funcR ne $funcN))) {
	    print STDOUT ("REF_PEG\t$geneR\n",
			  "REF_FUNC\t$funcR\n",
			  "MISCALLED\t$geneN\n",
			  "BAD_FUNC\t$funcN\n",
			  "//\n"
		          );
	}
    }
    print STDERR "//\n" if $ENV{VERBOSE};
}

exit(0);



sub flatten_dumper {
    my @x = @_;
    my $x;

    foreach $x (@x)
    {
	$x = Dumper($x);

	$x =~ s/\$VAR\d+\s+\=\s+//o;
	$x =~ s/\n//gso;
	$x =~ s/\s+/ /go;
	$x =~ s/\'//go;
#       $x =~ s/^[^\(\[\{]+//o;
#       $x =~ s/[^\)\]\}]+$//o;
    }

    return @x;
}

sub feature_url {
    my ($x) = @_;
    return qq(<A HREF="http://seed-viewer.theseed.org/seedviewer.cgi?page=Annotation&feature=$x">$x</A>);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3