[Bio] / FigKernelScripts / FFB3_make_subsys_based_families.pl Repository:
ViewVC logotype

View of /FigKernelScripts/FFB3_make_subsys_based_families.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Tue Jul 9 16:49:46 2013 UTC (6 years, 4 months ago) by olson
Branch: MAIN
CVS Tags: rast_rel_2014_0729, rast_rel_2014_0912, HEAD
Changes since 1.1: +1 -1 lines
FFB3_build_updated_FF: Add --temp to force use of given temp directory
Change hardcoded database host to seed-db-read for the annotator seed.

########################################################################
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#
########################################################################


use FIG;
use Data::Dumper;
use strict;
use Getopt::Long;
use YAML::Any 'DumpFile';
use List::Util 'first';
use IO::Handle;

# usage: FFB3_make_subsys_based_families raw-family-data

my $fig = new FIG;

(@ARGV == 2) ||
    die "Usage: FFB3_make_subsys_based_families raw-data-file log-file\n";

my $raw_data = shift;
my $log_file = shift;

die "Not overwriting existing output file $raw_data\n" if -s $raw_data;

open(my $log_fh, ">", $log_file) or die "cannot write $log_file: $!";
$log_fh->autoflush(1);


my $anno_dbh;
my $pubseed_dbh;
if ($ENV{FFB3_TEST_MODE})
{
    $anno_dbh = DBI->connect('dbi:mysql:host=seed-db-read.mcs.anl.gov;database=fig_test1', 'seed');
    $pubseed_dbh = DBI->connect('dbi:mysql:host=seed-db-read.mcs.anl.gov;database=fig_test2', 'seed');
}
else
{
    $anno_dbh = DBI->connect('dbi:mysql:host=seed-db-read.mcs.anl.gov;database=fig_anno_v5', 'seed');
    $pubseed_dbh = $fig->db_handle->{_dbh};
}
$pubseed_dbh or die;
$anno_dbh or die;

#
# Need to collect list of valid genomes.
#
my $gres = $pubseed_dbh->selectcol_arrayref(qq(SELECT genome FROM genome));
my %genomes = map { $_ => 1 } @$gres;

my(%peg2fun, %fun2peg);

#
# First collect from pubseed (assumed to be the current seed)
#
collect_ss_pegs_from_estimates($pubseed_dbh, $FIG_Config::organisms, \%genomes, \%peg2fun, \%fun2peg);

#
# Then connect to annotator seed, letting it override.
#
collect_ss_pegs($anno_dbh, \%genomes, \%peg2fun, \%fun2peg);

DumpFile($raw_data, \%fun2peg);

sub collect_ss_pegs
{
    my($dbh, $genomes, $peg2fun, $fun2peg) = @_;

    my $sth = $dbh->prepare(qq(SELECT si.protein, si.role, f.assigned_function, seek.slen, md5.md5
			       FROM subsystem_index si
			       LEFT JOIN aux_roles ar ON si.subsystem = ar.subsystem AND si.role = ar.role
			       JOIN subsystem_metadata m ON si.subsystem = m.subsystem
			       JOIN assigned_functions f ON f.prot = si.protein
			       LEFT JOIN deleted_fids df ON si.protein = df.fid
			       LEFT JOIN protein_sequence_seeks seek ON seek.id = si.protein
			       LEFT JOIN protein_sequence_MD5 md5 ON md5.id = si.protein
			       WHERE df.fid IS NULL AND
			       ar.role IS NULL AND
			       m.class_1 <> '' AND
			       m.class_1 NOT LIKE 'experimental%' COLLATE latin1_swedish_ci AND
			       m.class_1 NOT LIKE '%delete%' COLLATE latin1_swedish_ci  AND
			       si.variant != '0' AND
			       si.variant != '-1'
			      ),
			{ mysql_use_result => 1 });

    $sth->execute();

    my $n = 0;

    while (my $row = $sth->fetchrow_arrayref())
    {
	my($peg, $role, $func, $len, $md5) = @$row;
	my $com;
	if (++$n % 10000 == 0)
	{
	    print "$n\n";
	}

	next if $peg !~ /\.peg\./;
	my($genome) = $peg =~ /^fig\|(\d+\.\d+)/;
	if (!$genomes->{$genome})
	{
	    print $log_fh "bad genome\t$peg\n";
	    next;
	}

	if ((! $func) || (length($func) < 2))
	{
	    print $log_fh "missing func\t$peg\n";
	    next;
	}

	($func, $com) = $func =~ /^([^#]*?)(?:\s*\#(.*))?$/;

	if ($com =~ /trunc|framesh|fragment/)
	{
	    print $log_fh "$com\t$peg\n";
	    next;
	}

	my @roles = split /\s+[\/@]\s+|\s*;\s+/, $func;
	if (first { $_ eq $role } @roles)
	{
	    # $func = SeedUtils::strip_func_comment($func);

	    if (my $old = delete $peg2fun->{$peg})
	    {
		delete $fun2peg->{$old}->{$peg};
	    }
	    $peg2fun->{$peg} = $func;
	    $fun2peg->{$func}->{$peg} = "$len,$md5";
	}
	else
	{
#	    print $log_fh "role mismatch ss\t$peg\t$func\t
	    print $log_fh join("\t", "role mismatch ss", $peg, $func, $role) . "\n";
	}
    }
    $sth->finish();
}

sub collect_ss_pegs_from_estimates
{
    my($dbh, $orgdir, $genomes, $peg2fun, $fun2peg) = @_;

    #
    # Collect aux-role information
    #
    my $res = $dbh->selectall_arrayref(qq(SELECT s.subsystem, a.role
					  FROM subsystem_metadata s JOIN aux_roles a USING ( subsystem ))
				       );

    my %aux_role;
    $aux_role{$_->[0]}->{$_->[1]} = 1 foreach @$res;

    my $n = 0;
    for my $genome (sort { FIG::by_genome_id($a, $b) } keys %$genomes)
    {
	if (++$n % 10000 == 0)
	{
	    print "$n\n";
	}

	my $dir = "$orgdir/$genome/Subsystems";

	my $ss_fh;
	if (!open($ss_fh, "<", "$dir/subsystems"))
	{
	    warn "Cannot open $dir/subsystems: $!";
	    next;
	}

	my $b_fh;
	if (!open($b_fh, "<", "$dir/bindings"))
	{
	    warn "Cannot open $dir/bindings: $!";
	    next;
	}

	my %active_ss;
	while (<$ss_fh>)
	{
	    chomp;
	    my($ss, $vc) = split(/\t/);
	    $active_ss{$ss} = $vc if ($vc != '0' && $vc != '1');
	}
	close($ss_fh);

	my @ids;
	my %dat;
	while (<$b_fh>)
	{
	    chomp;
	    my($ss, $role, $peg) = split(/\t/);
	    next if $aux_role{$ss}->{$role};
	    next unless $active_ss{$ss};
	    
	    next if $peg !~ /\.peg\./;

	    push(@ids, $peg);
	    push(@{$dat{$peg}->{$ss}}, $role);
	}
	close($b_fh);

	if (@ids == 0)
	{
	    print STDERR "No ids found for $genome\n";
	    next;
	}

	my $ids = join(", ", map { $dbh->quote($_) } @ids);

	my $sth = $dbh->prepare(qq(SELECT f.prot, f.assigned_function, seek.slen, md5.md5
				   FROM assigned_functions f 
				   LEFT JOIN deleted_fids df ON f.prot = df.fid
				   LEFT JOIN protein_sequence_seeks seek ON seek.id = f.prot
				   LEFT JOIN protein_sequence_MD5 md5 ON md5.id = f.prot
				   WHERE df.fid IS NULL AND
				   f.prot IN ($ids)),
			    { mysql_use_result => 1 });
	$sth->execute();

	while (my $row = $sth->fetchrow_arrayref())
	{
	    my($peg, $func, $len, $md5) = @$row;
	    my $com;

	    ($func, $com) = $func =~ /^([^\#]*?)(?:\s*\#\s*(.*))?$/;
					 
    	    if ($com =~ /trunc|framesh|fragment/)
	    {
		print $log_fh "$com\t$peg\n";
		next;
	    }
					 
	    my @roles = split /\s+[\/@]\s+|\s*;\s+/, $func;

	    for my $ss (sort keys %{$dat{$peg}})
	    {
		for my $role (sort @{$dat{$peg}->{$ss}})
		{
		    if (first { $_ eq $role } @roles)
		    {

			if (my $old = delete $peg2fun->{$peg})
			{
			    delete $fun2peg->{$old}->{$peg};
			}
			$peg2fun->{$peg} = $func;
			$fun2peg->{$func}->{$peg} = "$len,$md5";
		    }
		    else
		    {
			print $log_fh "role mismatch est\t$peg\n";
		    }
		}
	    }
	}
	$sth->finish();
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3