[Bio] / FigKernelScripts / check_sims_basic.pl Repository:
ViewVC logotype

View of /FigKernelScripts/check_sims_basic.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (download) (as text) (annotate)
Mon Sep 4 20:21:31 2006 UTC (13 years, 2 months ago) by redwards
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.7: +1 -1 lines
using is_environmental instead of seven 9s

# -*- perl -*-
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#


# Ignore SIGHUP.  Perl only recognises the upper-case strings 'IGNORE' and
# 'DEFAULT' as special %SIG values; any other string (such as the previous
# lower-case 'ignore') is taken as the name of a handler sub, and no such
# sub exists in this script.
$SIG{HUP} = 'IGNORE';

use FIG;
use File::Path;
use File::Basename;

# Command-line synopsis; printed (via die) when the NR argument is missing
# or names an empty file.
$usage = "usage: check_sims_basic [-delint_dir=Dir] [-logfile=log] [-synonyms=peg_synonyms_file] NR [ < sims | - | SimsDir | Sims1 Sims2 Sims3 ...] > checked.sims [2> errors (recommended if a logfile isn't specified)]";

# Option state.  These stay package globals on purpose: later sections of
# the script read $outdir, $logfile, $logfh, $synfile and %major.
$outdir  = "";
$logfile = "";
$synfile = "";
$trouble = 0;

# Walk @ARGV, removing recognised options in place and validating the rest.
#   -delint_dir=Dir   directory for per-file output; it is created here and
#                     must not already exist.
#   -logfile=log      file opened for writing; problem reports go to $logfh.
#   -synonyms=file    synonyms file; each synonym id is mapped to its
#                     principal id in %major.
# Every other argument must be '-' (standard input) or a non-empty file or
# directory.  The index only advances when the current argument is kept,
# because splice shifts the following arguments down into slot $i.
for (my $i = 0; $i < @ARGV; )
{
    my $arg = $ARGV[$i];

    if ($arg =~ m/-delint_dir=(\S+)/)
    {
        $outdir = $1;
        splice @ARGV, $i, 1;
        if (-d $outdir)
        {
            $trouble = 1;
            warn "$outdir exists";
        }
        else
        {
            mkpath($outdir, 0, 0777) || die "Could not create $outdir";
        }
    }
    elsif ($arg =~ m/-logfile=(\S+)/)
    {
        $logfile = $1;
        splice @ARGV, $i, 1;
        open($logfh, '>', $logfile) || die "Could not write-open $logfile";
    }
    elsif ($arg =~ m/-synonyms=(\S+)/)
    {
        $synfile = $1;
        splice @ARGV, $i, 1;
        if (-s $synfile)
        {
            open(my $syn_fh, '<', $synfile) || die "Could not read-open $synfile";
            while (defined(my $entry = <$syn_fh>))
            {
                chomp $entry;

                # Skip lines that do not parse; without this guard $1/$2
                # would still hold captures from an earlier match.
                # NOTE(review): the synonyms file is presumably laid out as
                # "principal,len<TAB>syn,len;syn,len;..." -- the \s* lets the
                # pattern cross that separator (the old pattern could not
                # match across whitespace).  Confirm against a real file.
                next unless $entry =~ m/^([^,]+),\d+\s*(\S+)$/;
                my ($major_syn, $syns) = ($1, $2);

                foreach my $syn (split /;/, $syns)
                {
                    # Each element is "id,length"; only the id is kept.
                    # The value stored is the principal id (this used to be
                    # the never-assigned $major, leaving every entry undef).
                    $major{$1} = $major_syn if $syn =~ m/^([^,]+)/;
                }
            }
            close($syn_fh) || die "Could not close $synfile";
        }
    }
    elsif ($arg eq '-' || -s $arg)
    {
        # '-' is advertised by the usage string as "read standard input",
        # so it must not be subjected to the file-size test.
        ++$i;
    }
    else
    {
        $trouble = 1;
        print STDERR "Invalid arg $arg\n";
        ++$i;
    }
}
die "aborting due to invalid args" if ($trouble);

# The first remaining argument is the NR FASTA file; it must exist and be
# non-empty, otherwise the usage message is printed.
(($nr = shift @ARGV) && (-s $nr))
    || die $usage;

# No similarity inputs named on the command line.
if (@ARGV == 0)
{
    if (-t STDIN)
    {
        # Standard input is a terminal, so nothing has been redirected or
        # piped in: fall back to the installed Sims directory.
        print STDERR "No arguments given --- checking $FIG_Config::data/Sims by default\n";
        push @ARGV, "$FIG_Config::data/Sims";
    }
    else
    {
        # Standard input is redirected ("NR < sims" in the usage string):
        # read the similarities from it.  The two branches used to be the
        # other way round, which ignored redirected input.
        push @ARGV, '-';
    }
}

# A single directory argument stands for every non-hidden file inside it.
if ((@ARGV == 1) && (-d $ARGV[0]))
{
    $sims_dir = shift @ARGV;
    opendir(my $sims_dh, $sims_dir) || die "Could not open $sims_dir";
    # Sorted so that files are processed in a reproducible order.
    @ARGV = map { "$sims_dir/$_" } sort grep { !/^\./ } readdir($sims_dh);
    closedir($sims_dh) || die "Could not close $sims_dir";
}

# Report every missing input before giving up, rather than only the first.
$trouble = 0;
foreach my $file (@ARGV)
{
    next if ($file eq '-');
    if (!-e $file)
    {
        print STDERR "Simfile $file does not exist\n";
        $trouble = 1;
    }
}
die "There were nonexistent input files" if $trouble;

# Default destinations: problem reports go to STDERR unless -logfile was
# given, valid sims go to STDOUT unless -delint_dir was given.
$logfh = \*STDERR unless $logfile;
$outfh = \*STDOUT unless $outdir;

# is_environmental is called as a method, so a FIG object is required; the
# script never created one, which made the call below die on an undefined
# value.
$fig ||= FIG->new();

# Collect the peg FASTA file of every environmental genome.  Directory
# entries are first restricted to names of the form <digits>.<digits>, so
# '.', '..' and stray files are never handed to is_environmental.
opendir(my $orgs_dh, "$FIG_Config::organisms") || die "Could not open dir $FIG_Config::organisms";
@env = map  { "$FIG_Config::organisms/$_/Features/peg/fasta" }
       grep { $fig->is_environmental($_) }
       grep { m/\A\d+\.\d+\z/ }
       readdir($orgs_dh);
closedir($orgs_dh) || die "Could not close dir $FIG_Config::organisms";

# Record the length of every sequence in the NR file and in the
# environmental FASTA files; %ln (id => length) is what each similarity
# line is later checked against.
foreach my $file ($nr, @env)
{
    open(my $fasta_fh, '<', $file) || die "Could not read-open $file";
    print STDERR "Loading lengths from $file ...\n" if $ENV{FIG_VERBOSE};

    # read_fasta_record is used here as returning (id, ref-to-sequence);
    # the loop stops when it returns an empty list.
    while (my ($id, $seqP) = FIG::read_fasta_record($fasta_fh))
    {
        $ln{$id} = length($$seqP);
    }
    close($fasta_fh) || die "Could not close $file";
}


# Write to $logfh the reason one side of a similarity failed validation.
#   $side     1 or 2; appended to the report tag (badlen1, undef2, ...)
#   $simfile  name of the file the line came from
#   $lineno   line number within that file
#   $id       sequence id on this side of the similarity
#   $sim_len  sequence length claimed by the similarity line
#   $sim      the (tab-normalised) similarity line itself
# Nothing is written when the id is known and its length agrees.
sub _report_bad_id
{
    my ($side, $simfile, $lineno, $id, $sim_len, $sim) = @_;

    if ($ln{$id})
    {
        # Known id: the only possible fault is a length mismatch.
        if ($ln{$id} != $sim_len)
        {
            print {$logfh} "badlen${side}\t$simfile, ${lineno}:\t$id\t$ln{$id}\t$sim_len\t$sim\n";
        }
        return;
    }

    # Unknown id: distinguish ids that appear in the synonyms table
    # ("synref") from ids that are not known at all ("undef").
    my $tag = ($synfile && $major{$id}) ? "synref" : "undef";
    print {$logfh} "${tag}${side}\t$simfile, ${lineno}:\t$id\t\t\t$sim\n";
    return;
}

# Check every similarity file: lines whose two ids are known and whose
# lengths match %ln are copied to the output; everything else is reported
# on $logfh.
foreach my $simfile (@ARGV)
{
    print STDERR "Processing $simfile\n" if $ENV{FIG_VERBOSE};

    # A fresh lexical handle per file, closed explicitly below, so that $.
    # restarts for each file.  Re-opening one bareword handle without an
    # intervening close does not reset $., which made the line numbers in
    # the log accumulate across files.
    my $sim_fh;
    if ($simfile eq '-')
    {
        $sim_fh = \*STDIN;    # three-argument open has no '-' shorthand
    }
    else
    {
        open($sim_fh, '<', $simfile) || die "Could not open $simfile";
    }

    # With -delint_dir, each input gets its own output file of the same
    # base name; otherwise $outfh is whatever was set up earlier.
    my $outfile;
    if ($outdir)
    {
        $outfile = "$outdir/" . basename($simfile);
        open(my $out, '>', $outfile) || die "could not write-open $outfile";
        $outfh = $out;
    }

    while (defined(my $sim = <$sim_fh>))
    {
        chomp $sim;

        # Collapse any run of tabs to a single tab (a plain \t\t -> \t
        # substitution leaves runs of three or more partly uncollapsed).
        $sim =~ s/\t{2,}/\t/g;

        # Expected layout: id1, id2, a number, seven integers, two scores,
        # then the two sequence lengths.  Capture groups 7 and 8 are those
        # trailing lengths (group 5 is the nested group inside group 4).
        if ($sim =~ m/^(\S+)\t(\S+)\t(\d+|\d+\.\d+)\t\d+\t\d+\t\d+\t\d+\t\d+\t\d+\t\d+\t(\d+(\.\d*)?e[-+]?\d+|\d+\.\d+)\t(\d\.\d*e[-+]?\d+|\d+\.\d+|\d+)\t(\d+)\t(\d+)/)
        {
            my ($id1, $id2, $ln1, $ln2) = ($1, $2, $7, $8);

            if ($ln{$id1} && $ln{$id2} && ($ln{$id1} == $ln1) && ($ln{$id2} == $ln2))
            {
                print {$outfh} "$sim\n";   #...print valid sims to OUTPUT
            }
            else
            {
                _report_bad_id(1, $simfile, $., $id1, $ln1, $sim);
                _report_bad_id(2, $simfile, $., $id2, $ln2, $sim);
            }
        }
        else
        {
            print {$logfh} "INVALID FORMAT\t$simfile, $.:\t$sim\n";
        }
    }

    close($sim_fh) unless $simfile eq '-';

    # Buffered write errors only surface at close, so check it.
    if ($outdir)
    {
        close($outfh) || die "Could not close $outfile";
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3