[Bio] / FigKernelScripts / export_phage_data.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/export_phage_data.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

#__perl__

=pod

=head1 DESCRIPTION

This script exports the data from the phage genomes (only) into a download
directory. The destination is hard coded to the one that Rob uses. It creates
subdirectories named after the current date and exports every phage genome as
FASTA (DNA and proteins), GenBank and GFF3.

This may well be run as a cron job!

=cut

use strict;
use warnings;
use POSIX qw(strftime);
use FIG;
use Phage;

# $fig answers the per-genome queries; $phage supplies the list of phage genomes.
my $fig   = FIG->new;
my $phage = Phage->new;

# Root of the download area; every file this script writes lives below it.
# -d (not -e) so that a plain file with this name is rejected as well.
my $destdir = "/var/www/phantome/Downloads";
unless (-d $destdir) {
    die "FATAL: $destdir does not exist. Are you running this on a machine that is not phantome?";
}

# Figure out the date. $timestamp (epoch seconds) names the combined FASTA
# files; $date (YYYY-MM-DD, local time) names the per-run subdirectories.
my $timestamp = time;
my $date      = strftime("%Y-%m-%d", localtime($timestamp));

# Make the directory structure for the outputs.
#
# Something like:
#
# /var/www/phantome/Downloads/
#     DNA/all_sequences/
#     DNA/by_genome/DATE
#
#     proteins/all_sequences
#     proteins/by_genome/DATE
#
#     genomes/genbank/DATE
#     genomes/gff3/DATE

use File::Path qw(mkpath);

# mkpath creates any missing parent directory and croaks on failure, so a
# broken layout stops the run here instead of surfacing later as a failed open.
mkpath(
    [
        "$destdir/DNA/all_sequences",
        "$destdir/DNA/by_genome/$date",
        "$destdir/proteins/all_sequences",
        "$destdir/proteins/by_genome/$date",
        "$destdir/genomes/genbank/$date",
        "$destdir/genomes/gff3/$date",
    ],
    0,       # not verbose
    0755,
);

# Refresh the timestamps of the top-level directories, which is what `touch`
# did for existing directories, but without a shell and without the risk of
# creating a plain file where a directory is needed.
my $now = time;
foreach my $subd (qw[DNA proteins genomes]) {
    utime($now, $now, "$destdir/$subd")
        || warn "Can't update the timestamp of $destdir/$subd: $!\n";
}

# Open the generic files that we'll need: one FASTA file with every contig and
# one with every protein. Both stay open while the genomes are exported.
my $all_contigs  = "$destdir/DNA/all_sequences/phage_contigs_$timestamp.fasta";
my $all_proteins = "$destdir/proteins/all_sequences/phage_proteins_$timestamp.fasta";
open(CONTIGS, '>', $all_contigs)   || die "Can't open contigs $all_contigs: $!";
open(PROTEINS, '>', $all_proteins) || die "can't open all proteins $all_proteins: $!";

# Iterate through the phages and export all the data.
#
# NOTE(review): if run_in_background forks, each child inherits whatever is
# still buffered on CONTIGS/PROTEINS -- confirm a child cannot flush that data
# a second time when it exits.
my @children;    # PIDs of the background GenBank/GFF3 converters
my $n = 0;       # genomes processed; kept outside the loop so the throttle below can fire
foreach my $genome ($phage->phages()) {
    my $gs = $fig->genus_species($genome);
    print STDERR "$gs ($genome)\n";

    # Write the contigs to the per-genome file and to the combined file.
    my $dna_file = "$destdir/DNA/by_genome/$date/$genome.fasta";
    open(DNA, '>', $dna_file) || die "can't open $dna_file: $!";
    foreach my $contig ($fig->contigs_of($genome)) {
        my $end    = $fig->contig_ln($genome, $contig);
        my $dnaseq = $fig->dna_seq($genome, ($contig . "_1_" . $end));
        print CONTIGS ">$contig [$genome] [$gs]\n$dnaseq\n";
        print DNA ">$contig\n$dnaseq\n";
    }
    close(DNA) || warn "error closing $dna_file: $!\n";

    # Write the proteins to the per-genome file and to the combined file.
    my $prot_file = "$destdir/proteins/by_genome/$date/$genome.fasta";
    open(PROT, '>', $prot_file) || die "can't open $prot_file: $!";
    foreach my $peg ($fig->pegs_of($genome)) {
        my $trans = $fig->get_translation($peg);
        my $fn    = scalar($fig->function_of($peg));
        $fn = '' unless defined $fn;    # an unannotated peg prints as "[]"
        print PROT ">$peg [$fn]\n$trans\n";
        print PROTEINS ">$peg [$fn] [$genome] [$gs]\n$trans\n";
    }
    close(PROT) || warn "error closing $prot_file: $!\n";

    # Finally, make the genbank and gff3 files in the background.
    # Each command is an argument list so that no shell is involved.
    my @jobs = (
        [
            '/home/fig/FIGdisk/FIG/bin/seed2genbank',
            '-g', $genome, '-t', 'all', '-color', 'phage',
            '-o', "$destdir/genomes/genbank/$date/$genome.gbk",
        ],
        [
            '/home/fig/FIGdisk/FIG/bin/seed2gff',
            '-g', $genome, '-t', 'all',
            '-o', "$destdir/genomes/gff3/$date/$genome.gff3",
        ],
    );
    foreach my $job (@jobs) {
        my @cmd = @$job;
        my $pid = $fig->run_in_background(sub { system(@cmd) });
        push @children, $pid;
        print STDERR "Started @cmd with PID: $pid\n";
    }

    # This is just to sleep and let the children catch up: pause for five
    # seconds after every fifth genome.
    $n++;
    unless ($n % 5) { sleep 5 }
}

# The combined files are complete once every genome has been written.
close(CONTIGS)  || warn "error closing the combined contigs file: $!\n";
close(PROTEINS) || warn "error closing the combined proteins file: $!\n";

# Wait for every PID in the list, report any child that exited with a non-zero
# status, and print how long the wait took.
my $reap = sub {
    my (@pids) = @_;
    my $started = time;
    foreach my $pid (@pids) {
        my $reaped = waitpid($pid, 0);
        # $? is only meaningful when waitpid really collected this child.
        if ($reaped == $pid && $? != 0) {
            warn "child $pid finished with wait status $?\n";
        }
    }
    print STDERR "Waiting took ", (time - $started), " seconds \n";
};

# Now we have to wait for the children to die before we can make tar archives.
print STDERR "waiting for children to finish\n";
$reap->(@children);

# One archive per output type: DATE.tgz is created next to the DATE directory.
my @tar = ('tar', 'zcf', "$date.tgz", $date);

my @tarChildren;
my @directories = ("$destdir/DNA/by_genome/", "$destdir/proteins/by_genome/", "$destdir/genomes/gff3/", "$destdir/genomes/genbank/");

foreach my $dir (@directories) {
    my $pid = $fig->run_in_background(sub {
        # Never run tar from the wrong directory: skip the archive instead.
        unless (chdir($dir)) {
            warn "can't chdir to $dir, not creating $date.tgz there: $!\n";
            return 1;
        }
        system(@tar);
    });
    push @tarChildren, $pid;
}

print STDERR "waiting for tars to finish \n";
$reap->(@tarChildren);


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3