[Bio] / FigKernelScripts / make_PATRICfams.pl Repository:
ViewVC logotype

View of /FigKernelScripts/make_PATRICfams.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Thu Aug 2 19:02:26 2012 UTC (7 years, 3 months ago) by overbeek
Branch: MAIN
CVS Tags: rast_rel_2014_0729, rast_rel_2014_0912, HEAD
Changes since 1.3: +2 -1 lines
minor fix

use strict;
use Data::Dumper;
use SeedEnv;
use DB_File;

# Making PATRICfams is normally down in three steps:
# 
#     first you run this program:
# 
# [1]         make_PATRICfams FIGfamReleaseDirectory Tmp Recall
# 
#     where Recall is the file Bob produces when recalling functions.  It contains lines
#     like
# 
#         fig|217.1.peg.17	Methionyl-tRNA synthetase (EC 6.1.1.10)	FIG01265301
# 
#     produced by calling the kmer assignments.  Using the recalls saves having to rerun all
#     of the proteins through kmers
# 
# [2]          all_entities_Feature -fields source_id | grep 'fig|' | get_relationship_Produces -c 1 -to id | cut -f2,3 > peg.md5 
# 
#     This Kbase command builds a 2-column table [Peg,MD5]
# 
# [3]          make_PATRICfams_collapse_md5.pl Tmp PATRICfamsReleaseDirectory peg.md5 


my $usage = "usage: make_PATRICfams FIGfamRelease PATRICfamRelease Recall";
my($ffR,$patricR,$recall);
(
 ($ffR        = shift @ARGV) && 
 ($patricR    = shift @ARGV) &&
 ($recall     = shift @ARGV)
)
    || die $usage;

open(FF2C,"<$ffR/families.2c")        || die "could not open $ffR/families.2c";
open(FUNCS,"<$ffR/family.functions")  || die "could not open $ffR/family.functions";

my %ffr_funcs;
tie %ffr_funcs, 'DB_File', "$ffR/assigned_functions.btree", O_RDONLY, 0, $DB_BTREE;

if (! -d $patricR) { mkdir($patricR,0777) }
open(P2C,">$patricR/families.2c") || die "could not open $patricR/families.2c";
open(PF,">$patricR/family.functions") || die "could not open $patricR/family.functions";

my @funcs = map { ($_ =~ /(FIG\d+)\t(\S.*\S)/) ? [$1,$2] : () } <FUNCS>;
close(FUNCS);
my %to_func = map { $_->[0] => $_->[1] } @funcs;
my %to_fams;
foreach $_ (@funcs)
{
    push(@{$to_fams{$_->[1]}},$_->[0]);
}

my %fam;
my %peg_to_fam;
while (defined($_ = <FF2C>))
{
    if ($_ =~ /^(FIG\d+)\t(\S+)/)
    {
	push(@{$fam{$1}},$2);
	$peg_to_fam{$2} = $1;
    }
}
close(FF2C);

foreach my $func (keys(%to_fams))
{
    if (! &SeedUtils::hypo($func))
    {
	my @same = sort @{$to_fams{$func}};
	my $i;
	for ($i=1; ($i < @same); $i++)
	{
	    my $pegs = $fam{$same[$i]};
	    push(@{$fam{$same[0]}},@$pegs);
	    foreach my $peg (@$pegs)
	    {
		$peg_to_fam{$peg} = $same[0];
	    }
	    delete $fam{$same[$i]};
	}
	$to_fams{$func} = [$same[0]];
    }
}

use FIG;
my $fig = new FIG;
open(RECALL,"<$recall") || die "could not open $recall";
while (defined($_ = <RECALL>))
{
    chop;
    my($peg,$funcRC) = split(/\t/,$_);

    if (! $peg_to_fam{$peg})
    {
	my $func;
	my $place_in = $to_fams{$funcRC};
	if (! $place_in)
	{
#	    my $func = $fig->function_of($peg,'',1);
	    $func = $ffr_funcs{$peg};
	    if ($func)
	    {
		$place_in = $to_fams{$func};
	    }
	}

	if ($place_in && (! &SeedUtils::hypo($func)))
	{
	    my $in = $place_in->[0];
	    $peg_to_fam{$peg} = $in;
	    push(@{$fam{$in}},$peg);
	}
    }
}

foreach my $fam (sort keys(%fam))
{
    print PF "$fam\t$to_func{$fam}\n";
    my $pegs = $fam{$fam};
    foreach my $peg (sort { &SeedUtils::by_fig_id($a,$b) } @$pegs)
    {
 	print P2C "$fam\t$peg\n";
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3