[Bio] / FigKernelScripts / build_syn.pl Repository:
ViewVC logotype

View of /FigKernelScripts/build_syn.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Mon Dec 5 18:56:37 2005 UTC (14 years, 3 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.1: +17 -0 lines
Add license words.

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#


# usage: build_syn 

my $tmpD = "$FIG_Config::temp";
my $tmp  = "$tmpD/tmp$$";

my $usage = "usage: build_syn SourcesInFile synonyms";
my($sources,$synF);

(
 ($sources = shift @ARGV) && opendir(SOURCES,$sources) &&
 ($synF    = shift @ARGV)
)
    || die $usage;

use strict;
my($file,$id,$seq,$n,%type,$curr,$last,@ids,$t1,$t2,$i,%syn,$syns,%same,$extra);

my @sources = map { "$sources/$_/fasta" } grep { ($_ !~ /^\./) && (-s "$sources/$_/fasta") } readdir(SOURCES);

open(ALL,">$tmp.all") || die "could not open $tmp.all";

foreach $file (@sources)
{
    if (open(TMP,"<$file"))
    {
	$/ = "\n>";
	while (defined($_ = <TMP>))
	{
	    chomp;
	    if ($_ =~ /^>?(\S+)[^\n]*\n(.*)/s)
	    {
		$id  =  $1;
		$seq =  $2;
		$id =~ s/\cA.*$//;
		$id =~ s/^([^\|]+\|[^\|]+)\|.*$/$1/;
		$seq =~ s/\s//gs;
		if (&good_seq($seq))
		{
		    if ($seq =~ s/>//g)
		    {
			print STDERR "$file has embedded >s\n";
		    }
		    print ALL ">$id\n$seq\n";
		}
	    }
	}
	close(TMP);
	$/ = "\n";
    }
    else
    {
	print STDERR "could not open $file\n";
    }
}
close(ALL);

open(ALL,"<$tmp.all") || die "could not reopen $tmp.all";
open(NORMALIZED,"| sort -T $tmpD >$tmp.normalized")
    || die "could not open $tmp.normalized";

while (defined($_ = <ALL>) && ($_ =~ /^>(\S+)/))
{
    $id = $1;
    $seq = <ALL>;
    $n = length($seq);
    $seq =  reverse(substr($seq,1,$n-2));
    $seq =~ tr/a-z/A-Z/;
    print NORMALIZED "$seq\t$id\n";
}
close(ALL);
close(NORMALIZED);

(open(NORMALIZED,"<$tmp.normalized") && (defined($last = <NORMALIZED>)))
    || die "could not open $tmp.normalized";

%type  = ( "uni"   => 0,
	   "sp"    => 1,
	   "pir"   => 2,
	   "pirnr" => 3,
	   "gi"    => 4,
	   "fig"   => 5,
	   "tr"    => 6,
	   "tn"    => 7,
	   "nrl"   => 8
);

open(SYN,">$synF") 
    || die "could not open $synF";
while ($last)
{
    ($curr,$id) = split(/\t/,$last);
    chop $id;
    undef %same;
    $same{$id} = 1;
    while (defined($last = <NORMALIZED>) && 
	   (($seq,$id) = split(/\t/,$last)) &&
	   &same_seqs($curr,$seq))
    {
	chop $id;
	$same{$id} = 1;
#	$curr = $seq;                     # do not let it walk transitively (RAO: 2/6/05)
    }

    print SYN join("\t",sort keys(%same)),"\n";
}
close(SYN);
close(NORMALIZED);
unlink("$tmp.normalized");

sub good_seq {
    my($seq) = @_;

    my $xs = ($seq =~ tr/xX//);
    my $ln = length($seq);
    return (($ln - $xs) > 25);
}

sub same_seqs {
    my($s1,$s2) = @_;

    my $ln1 = length($s1);
    my $ln2 = length($s2);
    
    return ((abs($ln1-$ln2) < (0.3 * (($ln1 < $ln2) ? $ln1 : $ln2))) &&
	    ((($ln1 <= $ln2) && (index($s2,$s1) == 0)) ||
	     (($ln1 > $ln2)  && (index($s1,$s2) == 0))));
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3