[Bio] / FigKernelScripts / extract_genomes.pl Repository:
ViewVC logotype

View of /FigKernelScripts/extract_genomes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.38 - (download) (as text) (annotate)
Mon Jan 23 19:20:54 2006 UTC (13 years, 9 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.37: +151 -0 lines
changes to support dynamic sims and updates

########################################################################
# -*- perl -*-
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#
# If the optional last argument is given, the DATA that drives the extraction is taken
# from the specified file
#

my $usage = "usage: extract_genomes [-invert] all|complete|unrestricted|FileOfGenomeIDs DataDir ExtractedDataDir [DATA-FILE]";

use FIG;
#use strict;

my $fig = new FIG;

my($invert, $genomesF, $fromFD, $toFD, %exclude, %genomes, @genomes, $n);

$invert = "";
if ($ARGV[0] eq '-invert') { $invert = shift @ARGV; }

(     ($genomesF = shift @ARGV) 
   && ($fromFD   = shift @ARGV) 
   && ($toFD     = shift @ARGV) 
)
    || die "\n   $usage\n\n";

(-d $fromFD) || die "Source data-directory $fromFD does not exist";
(!-d $toFD)  || die "Output data-directory $toFD already exists";

if ($invert) { 
    %exclude = map { $_ => 1 } map { $_ =~ /^(\d+\.\d+)/; $1 } `cut -f1 $genomesF`; 
}

if ($genomesF =~ /^all$/)
{
    @genomes = $fig->genomes();
}
if ($genomesF =~ /^complete$/)
{
    @genomes = $fig->genomes('complete');
}
if ($genomesF =~ /^unrestricted$/i)
{
    @genomes = ();
    opendir(TMP,"$fromFD/Organisms") || die "could not open $fromFD/Organisms";
    foreach $_ (grep { $_ =~ /^\d+\.\d+$/ } readdir(TMP))
    {
	if (! -s "$fromFD/Organisms/$_/RESTRICTIONS")
	{
	    push(@genomes,$_);
	}
    }
}
else
{
    @genomes = map { $_ =~ /^(\d+\.\d+)/; $1 } `cut -f1 $genomesF`;
}
    
if ($invert)
{
    opendir(FROM,"$fromFD/Organisms") || die "bad";
    my @tmp = grep { $_ =~ /^\d+\.\d+/ } readdir(FROM);
    closedir(FROM);

    %genomes = map { $_ => 1 } grep { (not defined($exclude{$_})) } @tmp;
    @genomes = sort keys %genomes;
}
else
{
    %genomes = map { $_ => 1 } @genomes;
}


my $genome;
foreach $genome (keys(%genomes))
{
    if (-s "$fromFD/Organisms/$genome/DELETE")
    {
	print STDERR "*** $genome was deleted; it will not be copied\n";
	delete $genomes{$genome};
	@genomes = sort { $a <=> $b } keys(%genomes);
    }
}

foreach $genome (@genomes)
{
#    print STDERR "$genome\n";
}

my $genomes = \%genomes;
$n = @genomes;
# print STDERR "extracting $n genomes\n";

my $struct;
if (@ARGV == 1)
{
    my @data = `cat $ARGV[0]`;
    chomp @data;

    $struct = &structure(\@data,0,0);
}
else
{
    $struct = &read_structure;
}
#print STDERR &Dumper($struct);
if (! &verify_structure($fromFD,$struct->[0]))
{
    die "aborting due to inability to verify source directory";
}
&make_extract($fromFD,$toFD,$genomes,$struct->[0]);
&condense_deletions($toFD);

print "STDERR Successful extraction\n";

sub verify_structure {
    my($dir,$struct) = @_;

    if (! $struct)     { print STDERR "failed: $dir; no structure"; return 0 }

    my($pattern,$action,$children) = @$struct;
    $dir  =~ /([^\/]+)$/;
    my $dirE = $1;
    
#   print STDERR "checking $dirE against $pattern: ";
    if (&matches($dirE,$pattern))
    {
#	print STDERR "success\n";
	if ((! $action) && (-d $dir))
	{
	    opendir(TMP,$dir) || die "could not open $dir";
	    my @sub = grep { $_ !~ /^\./ } readdir(TMP);
	    closedir(TMP);
	    
	    my($i,$j,$ok);
	    for ($i=0, $ok=1; ($i < @sub); $i++)
	    {
		for ($j=0; ($j < @$children) && (! &verify_structure("$dir/$sub[$i]",$children->[$j])); $j++) {}
		if ($j == @$children)
		{
		    $ok = 0;
		    print STDERR "failed: $dir/$sub[$i] failed to verify\n";
		}
	    }
	    return $ok;
	}
	return 1;
    }
#   print STDERR "failed\n";
    return 0;
}

sub condense_deletions {
    my($toFD) = @_;

    if (-s "$toFD/Global/deleted.features")
    {
	&remove_global_deleted_features($toFD);
    }
    
    &reorg_all_features($toFD);
}

sub remove_global_delete_features {
    my($data_dir) = @_;

    if (open(GDEL,"<$data_dir/Global/deleted.features"))
    {
	while (defined($_ = <GDEL>))
	{
	    if ($_ =~ /(fig\|(\d+\.\d+)\.([a-zA-Z0-9_]+)\.\d+)/)
	    {
		my($genome,$type,$fid) = ($2,$3,$1);
		if (open(LDEL,">>$data_dir/Organisms/$genome/Features/$type/deleted.features"))
		{
		    print LDEL "$fig\n";
		    close(LDEL);
		}
	    }
	}
	close(GDEL);
    }
    unlink("$data_dir/Global/deleted.features");
}

sub reorg_all_features {
    my($data_dir) = @_;
    my($genome,@genomes,$type,@features);

    if (opendir(ORG,"$data_dir/Organisms"))
    {
	@genomes = grep { $_ =~ /^\d+\.\d+$/ } readdir(ORG);
	closedir(ORG);

	foreach $genome (@genomes)
	{
	    if (opendir(FEAT,"$data_dir/Organisms/$genome/Features"))
	    {
		@features = grep { $_ !~ /^\./ } readdir(FEAT);
		closedir(FEAT);
		foreach $type (@features)
		{
		    my %deleted;
		    if (-s "$data_dir/Organisms/$genome/Features/$type/deleted.features")
		    {
			%deleted = map { $_ =~ /^(\S+)/; $1 => 1 } `cat $data_dir/Organisms/$genome/Features/$type/deleted.features`;
		    }
		    
		    &reorg_tbl("$data_dir/Organisms/$genome/Features/$type/tbl",\%deleted);
		    &reorg_fasta("$data_dir/Organisms/$genome/Features/$type/fasta",\%deleted);
		}
	    }
	}
    }
}
			
sub reorg_tbl {
    my($tbl,$deleted) = @_;
    my(%seen,@new_tbl,$reorg_needed);

    my @tbl = `cat $tbl`;
    for ($i = $#tbl; ($i >= 0); $i--)
    {
	if (($tbl[$i] =~ /^(\S+)/) && (! $deleted->{$1}) && (! $seen{$1}))
	{
	    $seen{$1} = 1;
	    unshift(@new_tbl,$tbl[$i]);
	}
	else
	{
	    $reorg_needed = 1;
	}
    }

    if ($reorg_needed)
    {
	if (-e "$tbl~") { unlink("$tbl~") }
	
	if ((rename($tbl,"$tbl~")) && open(TBL,">$tbl"))
	{
	    foreach $_ (@new_tbl)
	    {
		print TBL $_;
	    }
	    close(TBL);
	}
    }
}

sub reorg_fasta {
    my($fasta,$deleted) = @_;
    my(@fasta,%seen,@new_fasta,$reorg_needed,$id,$seq);

     $/ = "\n>";
    @fasta = map { $_ =~ /^>?(\S+)[^\n]*\n(.*)/s; $id = $1; $seq = $2; $seq =~ s/\s//gs; [$id,$seq] }
             `cat $fasta`;
    $/ = "\n";

    for ($i = $#fasta; ($i >= 0); $i--)
    {
	$id = $fasta[$i]->[0];
	if ((! $deleted->{$id}) && (! $seen{$id}))
	{
	    $seen{$id} = 1;
	    unshift(@new_fasta,$fasta[$i]);
	}
	else
	{
	    $reorg_needed = 1;
	}
    }

    if ($reorg_needed)
    {
	if (-e "$fasta~") { unlink("$fasta~") }
	
	if ((rename($fasta,"$fasta~")) && open(FASTA,">$fasta"))
	{
	    foreach $_ (@new_fasta)
	    {
		$id = $_->[0];
		$seq = $_->[1];
		print FASTA ">$id\n$seq\n";
	    }
	    close(FASTA);
	}
    }
}


sub make_extract {
    my($fromFD,$toFD,$genomes,$struct) = @_;

    if (! $struct)     { die $fromFD }

    my($pattern,$action,$children) = @$struct;
    $fromFD  =~ /([^\/]+)$/;
    my $dirE = $1;

    if (&matches($dirE,$pattern))
    {
#       print STDERR "extracting $dirE\n";
	if ((! $action) && (-d $fromFD))
	{
	    if (@$children < 1) { print STDERR &Dumper($struct); die "bad structure" }
	    mkdir($toFD,0777) || die "could not make $toFD";
	    opendir(TMP,$fromFD) || die "could not open $fromFD";
	    my @sub = grep { $_ !~ /^\./ } readdir(TMP);
	    closedir(TMP);

	    my($sub);
	    foreach $sub (@sub)
	    {
		my($i);
		for ($i=0; 
		     ($i < @$children) && (! &matches($sub,$children->[$i]->[0]));
		     $i++) {}
		if ($i < @$children)
		{
		    &make_extract("$fromFD/$sub","$toFD/$sub",$genomes,$children->[$i]);
		}
		else
		{
		    print STDERR &Dumper($fromFD,$sub,$struct);
		    die "somethings is screwed up";
		}
	    }
	}
	else
	{
	    if ($action)
	    {
#	        print STDERR "    invoking $action\n";
		&$action($fromFD,$toFD,$genomes);
	    }
	}
    }
}


sub matches {
    my($str,$pattern) = @_;

    if ($str eq $pattern) { return 1 }
    return (($pattern =~ /^\/(.*)\/$/) && ($str =~ /$1/));
}
	

sub read_structure {
    
    my @data = <DATA>;
    chomp @data;

    return &structure(\@data,0,0);
}

sub structure {
    my($data,$i,$level) = @_;

    my $line = $data->[$i];
    if ($line =~ /^\t{$level}(\S[^,]*)(,([^,]+))?/)
    {
	my $pattern  = $1;
	my $action   = $3;
	my $children = [];

	my($child,$tuple,$j);
	$j = $i + 1;
	while ($tuple  = &structure($data,$j,$level+1))
	{
	    ($child,$j) = @$tuple;
	    push(@$children,$child);
	}
	return [[$pattern,$action,$children],$j];
    }
    return undef;
}

sub check_and_copy {
    my($from,$to,$genomes) = @_;

    if (($from =~ /(\d+\.\d+)$/) && $genomes->{$1})
    {
        &FIG::run("cp -r \"$from\" \"$to\"");
    }
}

sub check_and_filter {
    my($from,$to,$genomes) = @_;

    if (($from =~ /(\d+\.\d+)$/) && $genomes->{$1})
    {
	&filter($from,$to,$genomes);
    }
}

sub filter {
    my($from,$to,$genomes) = @_;

    open(FROM,"<$from") || die "could not open $from";
    open(TO,">$to")     || die "could not open $to";

    my($line);
    while (defined($line = <FROM>))
    {
	my(@bad) = grep { ($_ =~ /fig\|(\d+\.\d+)/) && (! $genomes->{$1}) } ($line =~ /fig\|\d+\.\d+\./g);
	if (@bad == 0)
	{
	    print TO $line;
	}
    }
    close(FROM);
    close(TO);
}


sub copy {
    my($from,$to,$genomes) = @_;

    open(FROM,"<$from") || die "could not open $from";
    open(TO,">$to")     || die "could not open $to";

    my($line);
    while (defined($line = <FROM>))
    {
	print TO $line;
    }
    close(FROM);
    close(TO);
}

sub erase {
    my($from,$to,$genomes) = @_;

###  null routine is equivalent to erasing a file or directory

}

sub filter_fasta_and_formatdb {
    my($from,$to,$genomes) = @_;

    $/ = "\n>";
    open(FROM,"<$from") || die "could not open $from";
    open(TO,">$to")     || die "could not open $to";

    my($line);
    while (defined($line = <FROM>))
    {
	chomp;
	if ($line =~ /^>?((\S+)[^\n]*\n.*)/s)
	{
	    my $all_but = $1;
	    my $id      = $2;
	    if (! (($id =~ /^fig\|(\d+\.\d+)\./) && (! $genomes->{$1})))
	    {
		print TO ">$all_but\n";
	    }
	}
    }
    $/ = "\n";
    close(FROM);
    close(TO);
    if (-s $to)
    {
#	&FIG::run("formatdb -i $to -p T 2> /dev/null");
    }
}

sub filter_synonyms {
    my($from,$to,$genomes) = @_;

    open(FROM,"<$from") || die "could not open $from";
    open(TO,">$to")     || die "could not open $to";

    while (defined($_ = <FROM>))
    {
	chomp;
	my($hdr,$rest) = split(/\t/,$_);
	my @rest = grep { ! (($_ =~ /^fig\|(\d+\.\d+)/) && (! $genomes->{$1})) } 
	           split(/;/,$rest);
	if (@rest > 0) 
	{
	    if (($hdr =~ /^fig\|(\d+\.\d+)/) && (! $genomes->{$1}))
	    {
		if (@rest > 1)
		{
		    $hdr = shift @rest;
		    print TO "$hdr\t",join(";",@rest),"\n";
		}
	    }
	    else
	    {
		print TO "$hdr\t",join(";",@rest),"\n";
	    }
	}
    }
    close(FROM);
    close(TO);
}

sub copyR {
    my($from,$to,$genomes) = @_;

    &FIG::run("cp -r \"$from\" \"$to\"");
}

sub filter_rows {
    my($from,$to,$genomes) = @_;

    open(FROM,"<$from") || die "could not open $from";
    open(TO,">$to")     || die "could not open $to";

    my($line);
    while (defined($line = <FROM>))
    {
	if (! (($line =~ /^(\d+\.\d+)\t/) && (! $genomes->{$1})))
	{
	    print TO $line;
	}
    }
    close(FROM);
    close(TO);
}

__DATA__
/^Data.*$/
	Assignments
		/^\S+$/,erase
	CouplingData
		PCHs
			/^\d+\.\d+$/,check_and_filter
		scores,filter
	CloseStrainSets
		/^[A-Z].*$/
			genomes,filter
			sets,filter_rows
			Blocks
				genome.tbl,filter_rows
				region.tbl,filter_rows
				intergenic_region.tbl,filter_rows
				intergenic_block.tbl,copy
				block.tbl,copy
			/^.*$/,copy
	HOPSS,copyR
	P2PQueue,erase
	AbstractFuncCoupling
		/^\S+$/
			Data
				/^\d+\.\d+$/,filter
	Global
		Attributes
			/^[^\#].*[^~]$/,filter
		BBHs
			/^\d+\.\d+$/,check_and_filter
		LinksToTools,copy
		ControlledVocabulary,copyR
		DAS
			/^\S+$/,erase
		Literature
			/^\S+$/,copy
		Models
			/^\d+\.\d+$/,copy
		ProteinFamilies
			Sources
				FIG
					/^\S+$/,filter
				/^\S+$/,copyR
			/^id.map$/,filter
			/^local.*$/,filter
		background_jobs
			/^\S+$/,erase
		/pirsfcorrespondence.txt*/,filter
		/pirsfinfo.dat*/,filter
		chromosomal_clusters,filter
		queued_similarities,filter
		conflicted.pegs,filter
		deleted.features,filter
		ext_func.table,copy
		ext_org.table,copy
		function.synonyms,copy
		/cv_search*/,copyR
		/CVS*/,copyR
		nr,filter_fasta_and_formatdb
		/^nr\./,erase
		pch_pins,filter
		peg.synonyms,filter_synonyms
		pirsfcorrespondance.txt,filter
		pirsfinfo.dat,copy
		role.neighborhoods,copy
	Indexes
		/^\S+$/,erase
	JobQueue
		/^\S+$/,erase
	KEGG,copyR
	NR,copyR
	NewSims
		/^\S+$/,filter
	Sims
		/^\S+$/,filter
	Subsystems
		/^\S.*\S$/
			Alignments,erase
			Backup,copyR
			/^[CEV].*$/,copy
			curation.log,copy
			notes,copy
			reactions,copy
			spreadsheet,filter_rows
			SubsystemDiagrams,copyR
			diagrams,copyR
			/^assignments*/,erase
			/^rowss*/,erase
			constructs,copyR
			rows,erase
			/\.log$/,copyR
			MAP_SUPPORT,copyR
			/^.*~$/,erase
	Organisms
		/^\d+\.\d+$/,check_and_copy
	Readonly,erase
	RELEASE,copy

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3