[Bio] / FigKernelScripts / make_subsys_based_families2.pl Repository:
ViewVC logotype

View of /FigKernelScripts/make_subsys_based_families2.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Thu Oct 8 18:53:34 2009 UTC (10 years, 4 months ago) by arodri7
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
commit changes

########################################################################
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#
########################################################################


use FIG;
my $fig = new FIG;

use Subsystem;

# usage: make_subsys_based_families [trusted] > subsys.based.families

if (@ARGV > 0)
{
    foreach $_ (`cat $ARGV[0]`)
    {
	if ($_ =~ /^(\S[^\t]+\S)/)
	{
	    $trusted{$1} = 1;
	}
    }
}


foreach $sub ($fig->all_subsystems)
{
    if (((@ARGV == 0) && $fig->usable_subsystem($sub)) || $trusted{$sub})
    {
	foreach $role ($fig->subsystem_to_roles($sub))
	{
	    #next if ($fig->is_aux_role_in_subsystem($sub,$role));
	    push(@{$subs_for_role{$role}},$sub);
	}
    }
}


my $nodesN = 6;
$roleN = 1;
@roles = sort keys(%subs_for_role);
my $roles_per_array = int ((scalar @roles)/$nodesN);
my $roleHash = {};
my $hashNumber = 0;
my $qty_in_hash = 0;


for ($i=0; ($i < @roles); $i++){
  $role = $roles[$i];
  $fam = "subsys$roleN";
  $roleN++;

  $roleHash->{$hashNumber}->{$role} = 1;

  if ( ($hashNumber != $nodesN-1) && ($qty_in_hash > $roles_per_array) ){
    $hashNumber++;
    $qty_in_hash=0;
  }
  else{
    $qty_in_hash++;
  }
}


my $tmpdir = "$FIG_Config::temp/update.FIGfams_subsys.$$";
mkdir($tmpdir,0777) || die "could not make $tmpdir";


for ($i=0; ($i < $nodesN); $i++){
  my $pid = fork();
  if ($pid) { # parent
    push @children, $pid;
  }
  elsif ($pid == 0) { # child
    &process_subset($tmpdir, $i, $roleHash->{$i}, \%subs_for_role);
    exit;
  }
  else{
    print STDERR "couldn't fork\n";
  }
}

foreach my $child (@children){
  waitpid($child, 0);
}



my $prev_id = 0;
my $id = 0;
for ($i=0; ($i < $nodesN); $i++){
  open (FH, "$tmpdir/subsys_file_$i") || die "cannot open $tmpdir/subsys_file_$i\n";
  while (my $line = <FH>){
    my ($subsys_pref, $subsys_id, $rest) = ($line) =~ /^(subsys)(\d+?)(\t.*)/;
    next if (!$subsys_pref);
    if ($subsys_id != $prev_id){
      $id++;
    }
    print $subsys_pref . $id . $rest . "\n";
    $prev_id = $subsys_id;
  }    
  close FH;
}


sub process_subset {
  my ($tmpdir, $countF, $roleHash, $subs_for_role_ref) = @_;

  my $fig = new FIG;  
  $roleN=1;
  open (FH, ">$tmpdir/subsys_file_$countF") || die "cannot open $tmpdir/subsys_file_$countF\n";
  my @roles = sort (keys %$roleHash);
  for ($i=0; ($i < @roles); $i++){
    my $fam = "subsys$roleN";
    $roleN++;
    $role = $roles[$i];

    undef %pegs;
    $subs = $subs_for_role{$role};

    foreach $sub (sort @$subs) {
      $subO = new Subsystem($sub,$fig);
      @genomes =  map { $_->[0] } @{$fig->subsystem_genomes($sub)};

      foreach $genome (@genomes){
	next if ($fig->is_eukaryotic($genome));
	my $vc = $subO->get_variant_code_for_genome($genome);
	if (($vc ne '0') && ($vc ne '-1')) {
	  @pegs_in_subsystem = grep { $_ =~ /\.peg\./ } $fig->pegs_in_subsystem_cell($sub,$genome,$role);
	  foreach $peg (grep { $fig->is_real_feature($_) } @pegs_in_subsystem) {
	    $func = $fig->function_of($peg);
	    if ((index($func,$role) >= 0)           &&
		(! $fig->possibly_truncated($peg))  &&
		(! $fig->possible_frameshift($peg))) {
	      $pegs{$peg} = 1;
	    }
	  }
	}
      }
    }
    my @pegs = sort { &FIG::by_fig_id($a,$b) } keys(%pegs);
    if (@pegs > 1){
      foreach $peg (@pegs) {
	$func = $fig->function_of_quick($peg);
	@pieces = split(/(\s+\/\s+)|(\s*[;@]\s+)/,$func);
	if (@pieces > 1) {
	  $key = join("\t",@pieces);
	  $multi{$key}->{$peg} = 1;
	}
	else {
	  print FH join("\t",($fam,$peg,$func,$role)),"\n";
	}
      }
    }
  }

  foreach $key (sort keys(%multi)){
    $x = $multi{$key};
    @pegs = keys(%$x);
    if (@pegs > 1){
      $fam = "subsys$roleN";
      $roleN++;
      
      foreach $peg (@pegs){
	print FH join("\t",($fam,$peg,scalar $fig->function_of_quick($peg))),"\n";
      }
    }
  }
  close FH;
}

	

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3