[Bio] / FigKernelScripts / export_phage_data.pl Repository:
ViewVC logotype

View of /FigKernelScripts/export_phage_data.pl

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.3 - (download) (as text) (annotate)
Sun Oct 9 18:46:38 2011 UTC (8 years, 4 months ago) by redwards
Branch: MAIN
CVS Tags: mgrast_version_3_2, mgrast_dev_12152011, mgrast_release_3_1_2, mgrast_dev_10262011
Changes since 1.2: +29 -2 lines
A phage download exporter



This script exports the data from the phage genomes (only), into a directory that we specify. Here it is hard coded to one that Rob uses. Basically creates subdirectories with the current date, and exports all data in both genbank and fasta format. 

This may well be run as a cron job!


use strict;
use FIG;
my $fig=new FIG;
use Phage;
my $phage=new Phage;

my $destdir = "/var/www/phantome/Downloads";
unless (-e $destdir) {
	die "FATAL: $destdir does not exist. Are you running this on a machine that is not phantome?";

# figure out the date
my $timestamp = time;
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($timestamp);
$year += 1900;

my $date = sprintf("%04d-%02d-%02d", $year, $mon, $mday);

# make the directory structure for the outputs
# Something like:
# /var/www/phantome/Downloads/
#                             DNA/all_sequences/
#                             DNA/by_genome/DATE
#                             proteins/all_sequences
#                             proteins/by_genome/DATE
#                             genomes/genbank/DATE
#                             genomes/gff3/DATA

foreach my $subd (qw[DNA proteins genomes]) {`touch $destdir/$subd`}

mkdir "$destdir/DNA/by_genome/$date", 0755;
mkdir "$destdir/proteins/by_genome/$date", 0755;
mkdir "$destdir/genomes/genbank/$date", 0755;
mkdir "$destdir/genomes/gff3/$date", 0755;

# open the generic files that we'll need
open(CONTIGS, ">$destdir/DNA/all_sequences/phage_contigs_$timestamp.fasta") || die "Can't open contigs $destdir/DNA/all_sequences/phage_contigs_$timestamp.fasta";
open(PROTEINS, ">$destdir/proteins/all_sequences/phage_proteins_$timestamp.fasta") || die "can't open all proteins $destdir/proteins/all_sequences/phage_proteins_$timestamp.fasta";

# iterate through the phages and export all the data:
my @children;
foreach my $phage ($phage->phages()) {
	my $n;
	my $gs = $fig->genus_species($phage);
	print STDERR "$gs ($phage)\n";
	# open the genome file
	open(DNA, ">$destdir/DNA/by_genome/$date/$phage.fasta") || die "can't open $destdir/DNA/by_genome/$date/$phage.fasta";
	# process the contigs
	foreach my $contig ($fig->contigs_of($phage)) {
	 	my $end = $fig->contig_ln($phage, $contig);
		my $dnaseq = $fig->dna_seq($phage, ($contig."_1_".$end));
		print CONTIGS ">$contig [$phage] [$gs]\n$dnaseq\n";
		print DNA ">$contig\n$dnaseq\n";
	close DNA;

	open(PROT, ">$destdir/proteins/by_genome/$date/$phage.fasta") || die "can't open $destdir/proteins/by_genome/$date/$phage.fasta";
	# process the proteins
	foreach my $peg ($fig->pegs_of($phage)) {
		my $trans= $fig->get_translation($peg);
		my $fn = scalar($fig->function_of($peg));
                my $ss = join("; ", map {$_->[0]} $fig->subsystems_for_peg($peg));
                if (!$ss) {$ss = "Not in a subsystem"}
		print PROT ">$peg [$fn] [$ss]\n$trans\n";
		print PROTEINS ">$peg [$fn] [$ss] [$phage] [$gs]\n$trans\n";
	close PROT;

	# finally, make the gff and genbank files
	# for the gbk files: seed2genbank -g $i -o $i.gbk   -t all -color phage
	my $output = "-o $destdir/genomes/genbank/$date/$phage.gbk";
	my $opts = " -g $phage -t all -color phage ";
	my $s2g = '/home/fig/FIGdisk/FIG/bin/seed2genbank';
	my $pid = $fig->run_in_background(sub {system("$s2g $opts $output")});
	push @children, $pid;
	print STDERR "Started $s2g $opts $output with PID: $pid\n";

	# for the gff3 files
	$output = "-o $destdir/genomes/gff3/$date/$phage.gff3";
	$opts = " -g $phage -t all ";
	$s2g = '/home/fig/FIGdisk/FIG/bin/seed2gff';
	$pid = $fig->run_in_background(sub {system("$s2g $opts $output")});
	push @children, $pid;
	print STDERR "Started $s2g $opts $output with PID: $pid\n";

	# this is just to sleep and let the children catch up!
	unless ($n % 5) {sleep 5}
close CONTIGS;

#compress those files

system("gzip $destdir/DNA/all_sequences/phage_contigs_$timestamp.fasta");
system("gzip $destdir/proteins/all_sequences/phage_proteins_$timestamp.fasta");

# now update the link to current
system("ln -s $destdir/proteins/all_sequences/phage_proteins_$timestamp.fasta $destdir/proteins/all_sequences/current");
system("ln -s $destdir/DNA/all_sequences/phage_contigs_$timestamp.fasta $destdir/DNA/all_sequences/current");

# now we have to wait for the children to die before we can make tar archives
print  STDERR "waiting for children to finish\n"; my $t=time;
foreach my $child (@children) {
	waitpid($child, 0);
print STDERR "Waiting took ", (time - $t), " seconds \n";

my $tar = "tar zcf $date.tgz $date";

my @tarChildren;
my @directories = ("$destdir/DNA/by_genome/", "$destdir/proteins/by_genome/", "$destdir/genomes/gff3/", "$destdir/genomes/genbank/");

foreach my $dir (@directories) {	
	my $pid = $fig->run_in_background(sub {chdir($dir); system($tar)});
	push @tarChildren, $pid;

print  STDERR "waiting for tars to finish \n"; $t=time;
foreach my $child (@tarChildren) {
	waitpid($child, 0);
print STDERR "Waiting took ", (time - $t), " seconds \n";

# update the current symlink. Do this last in case anything craps out
foreach my $thisdir ("$destdir/DNA/by_genome/", "$destdir/proteins/by_genome/", "$destdir/genomes/genbank/", "$destdir/genomes/gff3/") {
	if (-l "$thisdir/current") {unlink "$thisdir/current"}
	`ln -s $thisdir/$date $thisdir/current`;

# create the mysql dump file
system("mysqldump -S /var/run/mysqld/mysqld.sock -u seed -ptheseed seed > $destdir/MySQLDump/mysqldump.$date");
system("gzip $destdir/mysqldump.$date");

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3