[Bio] / FigKernelScripts / initialize_alignments_for_subsystems.pl Repository:
ViewVC logotype

View of /FigKernelScripts/initialize_alignments_for_subsystems.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Thu Jan 5 01:09:56 2006 UTC (13 years, 10 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.1: +18 -0 lines
fixes to extract_genomes, salvage_subsystem_rows and initialize_alignments_for_subsystems

# -*- perl -*-
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#
########################################################################

use FIG;
my $fig = new FIG;

# usage: initialize_alignments_for_subsystems FileOfSub  [no argument means "all"]

my @subsys;
if (@ARGV == 1)
{
    @subsys = map { ($_ =~ /(\S.*\S)/) ? $1 : () } `cat $ARGV[0]`;
}
else
{
    @subsys = $fig->all_subsystems;
}

foreach my $subsys (@subsys)
{
    print STDERR "processing $subsys\n";
    my @roles = $fig->subsystem_to_roles($subsys);
    for (my $i = 0; ($i < @roles); $i++)
    {
	print STDERR "    $roles[$i]\n";
	&align_column($fig,\@roles,$i+1,$subsys);
    }
}

sub align_column {
    my($fig,$roles,$colN,$subsys) = @_;

    &check_index("$FIG_Config::data/Subsystems/$name/Alignments",$roles);
    if (($role = &which_role_for_column($colN,$roles)) &&
        ((@pegs = &seqs_to_align($fig,$role,$subsys)) > 1))
    {
        my $tmpF = "/tmp/seqs.fasta.$$";
        open(TMP,">$tmpF") || die "could not open $tmpF";

        foreach $peg (@pegs)
        {
            if ($pseq = $fig->get_translation($peg))
            {
                $pseq =~ s/[uU]/x/g;
                print TMP ">$peg\n$pseq\n";
            }
        }
        close(TMP);

        my $dir = "$FIG_Config::data/Subsystems/$subsys/Alignments/$colN";

        if (-d $dir)
        {
            system "rm -rf \"$dir\"";
        }

        &FIG::run("$FIG_Config::bin/split_and_trim_sequences \"$dir/split_info\" < $tmpF");

        if (-s "$dir/split_info/set.sizes")
        {
            open(SZ,"<$dir/split_info/set.sizes") || die " could not open $dir/split_info/set.sizes";
            while (defined($_ = <SZ>))
            {
                if (($_ =~ /^(\d+)\t(\d+)/) && ($2 > 3))
                {
                    my $n = $1;
                    &FIG::run("$FIG_Config::bin/make_phob_from_seqs \"$dir/$n\" < \"$dir/split_info\"/$n");
                }
            }
            close(SZ);
            &update_index("$FIG_Config::data/Subsystems/$subsys/Alignments/index",$colN,$role);
        }
        else
        {
            system("rm -rf \"$dir\"");
        }
    }
}



sub check_index {
    my($alignments,$roles) = @_;

    if (-s "$alignments/index")
    {
        my $ok = 1;
        foreach $_ (`cat \"$alignments/index\"`)
        {
            $ok = $ok && (($_ =~ /^(\d+)\t(\S.*\S)/) && ($roles->[$1 - 1] eq $2));
        }
        if (! $ok)
        {
            system "rm -rf \"$alignments\"";
            return 0;
        }
        return 1;
    }
    else
    {
        system "rm -rf \"$alignments\"";
    }
    return 0;
}

sub update_index {
    my($file,$colN,$role) = @_;

    my @lines = ();
    if (-s $file)
    {
        @lines = grep { $_ !~ /^$colN\t/ } `cat $file`;
    }
    push(@lines,"$colN\t$role\n");
    open(TMP,">$file") || die "could not open $file";
    foreach $_ (@lines)
    {
        print TMP $_;
    }
    close(TMP);
}

sub which_role_for_column {
    my($col,$roles) = @_;
    my($i);

    if (($col =~ /^(\d+)/) && ($1 <= @$roles))
    {
        return $roles->[$1-1];
    }
    return undef;
}

sub seqs_to_align {
    my($fig,$role,$subsys) = @_;
    my($genome,$peg,%existing_function);

    my @seqs = ();
    my @genomes = $fig->subsystem_genomes($subsys);
    foreach $genome (map { $_->[0] } @{$fig->subsystem_genomes($subsys)})
    {
	foreach $peg ($fig->pegs_in_subsystem_cell($subsys,$genome,$role))
	{
	    push(@seqs,$peg);
	    my $func = $fig->function_of($peg);
	    if (! &FIG::hypo($func))
	    {
		my @roles_of_func = $fig->roles_of_function($func);
		for (my $i=0; ($i < @roles_of_func) && ($roles_of_func[$i] ne $role); $i++) {}
		if ($i < @roles_of_func)
		{
		    $existing_function{$func} = 1;
		}
	    }
	}
    }

    my $new = 0;
    my @similar = &get_homologs($fig,\@seqs,1.0e-20);
    while (($new < 100) && ($peg = shift @similar))
    {
	push(@seqs,$peg);
	my $func = $fig->function_of($peg);
	if (! $existing_function{$func})
	{
	    $new++;
	}
    }
    return @seqs;
}

sub get_homologs {
    my($fig,$pegs,$cutoff) = @_;
    my($peg,$sim,$id2);

    my @homologs = ();
    my %got = map { $_ => 1 } @$pegs;
    my %new;

    foreach $peg (@$pegs)
    {
        foreach $sim ($fig->sims($peg,300,$cutoff,"fig"))
        {
            $id2 = $sim->id2;
            if ((! $got{$id2}) && ((! $new{$id2}) || ($new{$id2} > $sim->psc)))
            {
                $new{$id2} = $sim->psc;
            }
        }
    }
    @homologs = sort { $new{$a} <=> $new{$b} } keys(%new);
    return @homologs;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3