[Bio] / FigKernelScripts / exclude_genomes.pl Repository:
ViewVC logotype

View of /FigKernelScripts/exclude_genomes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Tue Feb 5 02:28:56 2008 UTC (11 years, 9 months ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.2: +1 -1 lines
Fixed a POD error.

# -*- perl -*-
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#


=pod

=head1 exclude_genomes

  Usage:     exclude_genomes  FileOfExcludedGenomeIDs DataDir ExtractedDataDir

  Function:  Converse of `extract_genomes` --- takes a file of genomes to exclude,
             rather than to extract. 

  Author:    Gordon D. Pusch (2004-Mar-29)

=cut

my $usage = "usage: exclude_genomes FileOfExcludedGenomeIDs DataDir ExtractedDataDir";

use FIG;
use strict;

my($genomesF,$fromFD,$toFD,%genomes,@genomes,$n);

(
 ($genomesF = shift @ARGV) &&
 ($fromFD   = shift @ARGV) && (-d $fromFD) &&
 ($toFD     = shift @ARGV) && (! -d $toFD)
)
    || die $usage;

mkdir($toFD,0777) || die "could not make $toFD";

my %excluded = map { chomp $_; $_ => 1 } `cut -f1 $genomesF`;
@genomes = ();
opendir(TMP,"$fromFD/Organisms") || die "could not open $fromFD/Organisms";
foreach $_ (grep { $_ =~ /^\d+\.\d+$/ } readdir(TMP))
{
    unless ($excluded{$_})
    {
	push(@genomes,$_);
    }
}
%genomes = map { $_ => 1 } @genomes;
my $genomes = \%genomes;
$n = @genomes;
print STDERR "extracting $n genomes\n";

&FIG::run("cd $fromFD; cp -rp Indexes KEGG NR $toFD");
print STDERR "copied basic stuff\n";
&copy_sims($genomes,"$fromFD/Sims","$toFD/Sims");
&copy_sims($genomes,"$fromFD/NewSims","$toFD/NewSims");
print STDERR "copied sims\n";
mkdir("$toFD/Organisms",0777) || die "could not make $toFD/Organisms";
opendir(TMP,"$fromFD/Organisms") || die "could not open $fromFD/Organisms";
my @all_genomes = grep { $_ =~ /^\d+\.\d+$/ } readdir(TMP);
closedir(TMP);

my $genome;
foreach $genome (@all_genomes)
{
    if ($genomes->{$genome})
    {
	&FIG::run("cp -rp $fromFD/Organisms/$genome $toFD/Organisms");
    }
}
print STDERR "copied organisms\n";

&copy_global($genomes,"$fromFD/Global","$toFD/Global");
print STDERR "copied Global\n";


sub copy_global {
    my($genomes,$globalF,$globalT) = @_;

    mkdir($globalT,0777) || die "could not make $globalT";
    &FIG::run("cd $globalF; cp -rf LinksToTools function.synonyms ext* role.neighborhoods $globalT");
    &copy_protein_families("$globalF/ProteinFamilies","$globalT/ProteinFamilies",$genomes);
    &copy_nr("$globalF/nr","$globalF/peg.synonyms","$globalT/nr","$globalT/peg.synonyms",$genomes);
    print STDERR "copied nr\n";
    &copy_sets("$globalF/chromosomal_clusters","$globalT/chromosomal_clusters",$genomes);
    &copy_sets("$globalF/pch_pins","$globalT/pch_pins",$genomes);
    print STDERR "copied sets\n";
    &copy_sim_gen("$globalF/SimGen","$globalT/SimGen",$genomes);
    print STDERR "copied SimGen\n";
}

sub copy_fasta_and_formatdb {
    my($from,$to,$genomes) = @_;

    &copy_fasta($from,$to,$genomes);
    &FIG::run("$FIG_Config::ext_bin/formatdb -p T -i $to");
}

sub copy_fasta {
    my($from,$to,$genomes) = @_;
    my($line,$x,$id,@set);

    open(IN,"<$from") || die "could not open $from";
    open(OUT,">$to") || die "could not open $to";
    $line = <IN>;
    while ($line && ($line =~ /^>(\S+)/))
    {
	$id = $1;
	@set = ($line);
	while (defined($line = <IN>) && ($line !~ /^>/))
	{
	    push(@set,$line);
	}
	
	if (($id =~ /^fig\|(\d+\.\d+)/) && (! $genomes->{$1}))
	{
#	    print STDERR "dropping $id from nr\n";
	}
	else
	{
	    foreach $x (@set)
	    {
		print OUT $x;
	    }
	}
    }
    close(IN);
    close(OUT);
}

sub copy_nr {
    my($from_nr,$from_syn,$to_nr,$to_syn,$genomes) = @_;
    my($main,$rest,@rest,$line);

    &copy_fasta($from_nr,$to_nr,$genomes);

    open(IN,"<$from_syn") || die "could not open $from_syn";
    open(OUT,">$to_syn")  || die "could not open $to_syn";
    while (defined($line = <IN>))
    {
	chop $line;
	($main,$rest) = split(/\t/,$line);
	@rest = grep { ! (($_ =~ /^fig\|(\d+\.\d+)/) && (! $genomes->{$1})) } split(/;/,$rest);
	
	if (@rest > 0)
	{
	    if (($main =~ /^fig\|(\d+\.\d+).*,(\d+)/) && (! $genomes->{$1}))
	    {
#               print STDERR "deleting entry for $main\n";
	    }
	    else
	    {
		print OUT "$main\t",join(";",@rest),"\n";
	    }
	}
    }
    close(IN);
    close(OUT);
}

sub copy_sim_gen {
    my($from,$to,$genomes) = @_;
    my(@files,$file);

    unless ((-s "$from/exemplar.fasta") && (-s "$from/nohit.fasta"))
    {
	print STDERR "Missing either $from/exemplar.fasta or $from/nohit.fasta\n\t--- skipping copying SimGen\n\n";
	return;
    }
    
    mkdir($to,0777) || die "could not make $to";
    &copy_fasta_and_formatdb("$from/exemplar.fasta","$to/exemplar.fasta",$genomes);
    &copy_fasta_and_formatdb("$from/nohit.fasta","$to/nohit.fasta",$genomes);
    mkdir("$to/AccessSets",0777) || die "could not make $to/AccessSets";
    opendir(TMP,"$from/AccessSets") || die "could not make $from/AccessSets";
    my @subdirs = grep { $_ =~ /^\d+$/ } readdir(TMP);
    closedir(TMP);

    my($subdir);
    foreach $subdir (@subdirs)
    {
	mkdir("$to/AccessSets/$subdir",0777) || die "could not make $to/AccessSets/$subdir";
	opendir(TMP,"$to/AccessSets/$subdir") || die "could not open $to/AccessSets/$subdir";
	@files = grep { $_ =~ /^(\d+)$/ } readdir(TMP);
	closedir(TMP);
	foreach $file (@files)
	{
	    &copy_fasta("$from/AccessSets/$subdir/$file","$to/AccessSets/$subdir/$file",$genomes);
	    &FIG::run("formatdb -p T -i $to/AccessSets/$subdir/$file");
	}
    }
}

sub copy_sets {
    my($from,$to,$genomes) = @_;
    my($line,$curr,@set,$x);

    my @deleted = ();
    open(IN,"<$from") || die "could not open $from";
    open(OUT,">$to")  || die "could not open $to";
    $line = <IN>;
    while ($line && ($line =~ /^(\d+)/))
    {
	$curr = $1;
	@set = ();
	while ($line && ($line =~ /^$curr\t/))
	{
	    push(@set,$line);
	    $line = <IN>;
	}

	if (@set > 1)
	{
	    foreach $x (@set)
	    {
		print OUT $x;
	    }
	}
	else
	{
	    push(@deleted,$curr);
	}
    }
    close(IN);
    close(OUT);
    return @deleted;
}

sub copy_protein_families {
    my($from,$to,$genomes) = @_;
    my($line);

    mkdir($to,0777) || die "could not make $to";
    my @deleted = &copy_sets("$from/families","$to/families",$genomes);
    my %deleted = map { $_ => 1 } @deleted;
    open(IN,"<$from/family_function") || die "could not open $from/family_function";
    open(OUT,">$to/family_function") || die "could not open $to/family_function";
    while (defined($line = <IN>))
    {
	if (($line =~ /^(\d+)/) && (! $deleted{$1}))
	{
	    print OUT $line;
	}
    }
    close(IN);
    close(OUT);
}

sub copy_sims {
    my($genomes,$simsF,$simsT) = @_;
    my($file);

    if (opendir(TMP,$simsF))
    {
	mkdir($simsT,0777) || die "could not make $simsT";
	my @files = grep { $_ !~ /^\./ } readdir(TMP);
	closedir(TMP);
	foreach $file (@files)
	{
	    &strip_file("$simsF/$file","$simsT/$file",$genomes);
	}
    }
}

sub strip_file {
    my($from,$to,$genomes) = @_;
    my($line,@hits,$i);

    open(IN,"<$from") || die "could not open $from";
    open(OUT,">$to")  || die "could not open $to";
    while (defined($line = <IN>))
    {
	@hits = ($line =~ /fig\|\d+\.\d+\.[a-z]{2,4}\.\d+/g);
	for ($i=0; ($i < @hits) && ($hits[$i] =~ /^fig\|(\d+\.\d+)/) && $genomes->{$1}; $i++) {}
	if ($i == @hits)
	{
	    print OUT $line;
	}
    }
    close(IN);
    close(OUT);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3