[Bio] / FigKernelScripts / harvest_genomes_from_samples.pl Repository:
ViewVC logotype

View of /FigKernelScripts/harvest_genomes_from_samples.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Sun Jan 20 22:05:59 2013 UTC (6 years, 9 months ago) by overbeek
Branch: MAIN
CVS Tags: rast_rel_2014_0729, rast_rel_2014_0912, HEAD
possibly buggy - salvaged after the fact

use strict;
use Data::Dumper;

use SeedUtils;
use Proc::ParallelLoop;
use Getopt::Long;

my $processes = 1;
my $sampleDir;
my $new_genomes;
my $usage = "usage: harvest_genomes_from_samples -d SamplesDir -n NewGenomeObjects [-p Processes]";
my $rc    = GetOptions("p=i"            => \$processes,
		       "n=s"            => \$new_genomes,
                       "d=s"            => \$sampleDir
                      );
if (! $rc) { print STDERR $usage; exit }
(-d $sampleDir) || die "You need to specify an existing Samples directory";
($new_genomes && ((-d $new_genomes) || mkdir($new_genomes,0777)))
    || die "you need to give a new genomes directory";

my @samples = sort &in_directory($sampleDir);
my $args = [map { [$new_genomes,$sampleDir,$_] } @samples];
&pareach($args,\&do_one,{ Max_Workers => $processes });


sub do_one {
    my($tuple) = @_;
    my($new_genomes,$SampleDir,$sample) = @$tuple;
    print STDERR "processing $sample\n";
    my $dir = "$sampleDir/$sample";
    if (-d "$dir/Sets")
    {
	my @sets = &in_directory("$dir/Sets");
	foreach my $set (@sets)
	{
	    my $setD = "$dir/Sets/$set";
	    if (&complete($setD) && (! &multiple($setD)) && (! -d "$new_genomes/$set"))
	    {
		my $name   = &scientific_name($setD);
		my $domain = &domain($setD,$name);
		my $genetic_code = &genetic_code($setD,$name);
		&SeedUtils::run("fasta_to_genome '$name' $domain $genetic_code < $setD/contigs | annotate_genome > $setD/annotated.genome");
		&SeedUtils::run("genomeTO_to_reconstructionTO < $setD/annotated.genome > metabolic.reconstruction");
		&SeedUtils::run("cp -r $setD/annotated.genome $new_genomes/$set");
	    }
	}
    }
}

sub scientific_name {
    my($setD) = @_;
    
    my %poss_genus;
    foreach $_ (`cut -f5 $setD/kmer.report`)
    {
	if ($_ =~ /^(\S+)/) { $poss_genus{$1}++ }
    }
    my @poss = sort { $poss_genus{$b} <=> $poss_genus{$a} } keys(%poss_genus);
    return (@poss > 0) ? "$poss[0] sp." : "unknown prokaryotic organism";
}

sub domain {
    my($setD,$name) = @_;

    return "Bacteria";
}

sub genetic_code {
    my($setD,$name) = @_;

    if ($name !~ /(Mycoplasma|Ureoplasma)/) { return 11 }
}

sub complete {
    my($setD) = @_;

    if (! "$setD/status") { return () }
    my @tmp = `grep "genome is complete" $setD/status`;
    return (@tmp == 1);
}

sub multiple {
    my($setD) = @_;

    if (! "$setD/status") { return () }
    my @tmp = `grep multiple $setD/status`;
    return (@tmp == 1);
}

sub in_directory {
    my($dir) = @_;

    opendir(D,$dir) || die "could not open directory $dir";
    my @in = grep { $_ !~ /^\./ } readdir(D);
    closedir(D);
    return @in;
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3