[Bio] / FigKernelScripts / p3-rep-prots.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/p3-rep-prots.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (view) (download) (as text)

1 : parrello 1.1 =head1 Create Representative Genome Server Directory
2 :    
3 :     p3-rep-prots.pl [options] outDir
4 :    
5 :     This script processes a list of genome IDs to create a directory suitable for use by the L<RepresentativeGenomes> server.
6 : parrello 1.3 It will extract all the instances of the specified seed protein (default is Phenylanyl synthetase alpha chain). The list of genome IDs and
7 :     names will go in the output file C<complete.genomes> and a FASTA of the seed proteins in C<6.1.1.20.fasta>.
8 : parrello 1.1
9 :     =head2 Parameters
10 :    
11 :     The positional parameter is the name of the output directory. If it does not exist, it will be created.
12 :    
13 :     The standard input can be overriddn using the options in L<P3Utils/ih_options>.
14 :    
15 :     Additional command-line options are those given in L<P3Utils/col_options> plus the following
16 :     options.
17 :    
18 :     =over 4
19 :    
20 : parrello 1.3 =item clear
21 : parrello 1.1
22 : parrello 1.3 Clear the output directory if it already exists. The default is to leave existing files in place.
23 : parrello 1.1
24 : parrello 1.3 =item prot
25 : parrello 1.1
26 : parrello 1.3 Role name of the protein to use. The default is C<Phenylalanyl-tRNA synthetase alpha chain>.
27 : parrello 1.1
28 : parrello 1.4 =item dna
29 :    
30 :     If specified, a C<6.1.1.20.dna.fasta> file will be produced in addition to the others, containing
31 :     the DNA sequences of the proteins.
32 :    
33 : parrello 1.5 =item binning
34 :    
35 :     If specified, a seed protein database suitable for binning will be produced with the specified name.
36 :     (This is similar to the C<dna> option, but produces the comments in a slightly different format).
37 :    
38 : parrello 1.4 =item debug
39 :    
40 :     If specified, status messages for the PATRIC3 API will be displayed.
41 :    
42 : parrello 1.1 =back
43 :    
44 :     =cut
45 :    
46 :     use strict;
47 :     use P3DataAPI;
48 :     use P3Utils;
49 :     use Stats;
50 :     use File::Copy::Recursive;
51 : parrello 1.2 use RoleParse;
52 : parrello 1.3 use Time::HiRes;
53 :     use Math::Round;
54 : parrello 1.4 use FastA;
55 :     use Data::Dumper;
56 : parrello 1.1
57 :     $| = 1;
58 :     # Get the command-line options.
59 :     my $opt = P3Utils::script_opts('outDir', P3Utils::col_options(), P3Utils::ih_options(),
60 : parrello 1.3 ['clear', 'clear the output directory if it exists'],
61 :     ['prot=s', 'name of the protein to use', { default => 'Phenylalanyl-tRNA synthetase alpha chain' }],
62 : parrello 1.4 ['dna', 'produce a DNA FASTA file in addition to the default files'],
63 : parrello 1.5 ['binning=s', 'produce a SEED protein binning database in the named file'],
64 : parrello 1.4 ['debug', 'show P3 API messages']
65 : parrello 1.1 );
66 :     # Get the output directory name.
67 :     my ($outDir) = @ARGV;
68 :     if (! $outDir) {
69 :     die "No output directory specified.";
70 :     } elsif (! -d $outDir) {
71 :     print "Creating directory $outDir.\n";
72 :     File::Copy::Recursive::pathmk($outDir) || die "Could not create $outDir: $!";
73 :     } elsif ($opt->clear) {
74 :     print "Erasing directory $outDir.\n";
75 :     File::Copy::Recursive::pathempty($outDir) || die "Error clearing $outDir: $!";
76 :     }
77 : parrello 1.4 # Check for DNA mode.
78 :     my $dnaFile;
79 :     if ($opt->dna) {
80 :     $dnaFile = "$outDir/6.1.1.20.dna.fasta";
81 :     }
82 : parrello 1.5 my $binning = $opt->binning;
83 : parrello 1.1 # Create the statistics object.
84 :     my $stats = Stats->new();
85 :     # Create a filter from the protein name.
86 : parrello 1.3 my $protName = $opt->prot;
87 :     my @filter = (['eq', 'product', $protName]);
88 : parrello 1.2 # Save the checksum for the seed role.
89 : parrello 1.3 my $roleCheck = RoleParse::Checksum($protName);
90 : parrello 1.1 # Create a list of the columns we want.
91 : parrello 1.4 my @cols = qw(genome_id genome_name patric_id aa_sequence_md5 product);
92 : parrello 1.5 my $dnaMode;
93 :     if ($dnaFile || $binning) {
94 : parrello 1.4 push @cols, 'na_sequence_md5';
95 : parrello 1.5 $dnaMode = 1;
96 : parrello 1.4 }
97 : parrello 1.1 # Open the output files.
98 :     print "Setting up files.\n";
99 :     open(my $gh, '>', "$outDir/complete.genomes") || die "Could not open genome output file: $!";
100 :     open(my $fh, '>', "$outDir/6.1.1.20.fasta") || die "Could not open FASTA output file: $!";
101 : parrello 1.5 my ($bh, $nh);
102 : parrello 1.4 if ($dnaFile) {
103 :     open($nh, '>', $dnaFile) || die "Could not open DNA output file: $!";
104 :     }
105 : parrello 1.5 if ($binning) {
106 :     open($bh, '>', $binning) || die "Could not open binning output file: $!";
107 :     }
108 : parrello 1.1 # Get access to PATRIC.
109 :     my $p3 = P3DataAPI->new();
110 : parrello 1.4 if ($opt->debug) {
111 :     $p3->debug_on(\*STDERR);
112 :     }
113 : parrello 1.1 # Open the input file.
114 :     my $ih = P3Utils::ih($opt);
115 :     # Read the incoming headers.
116 :     my ($outHeaders, $keyCol) = P3Utils::process_headers($ih, $opt);
117 : parrello 1.4 # Get the full list of proteins.
118 :     print "Reading proteins.\n";
119 : parrello 1.3 my $start0 = time;
120 : parrello 1.4 my $protList = P3Utils::get_data($p3, feature => \@filter, \@cols);
121 :     print scalar(@$protList) . " proteins returned in " . Math::Round::nearest(0.01, time - $start0) . " seconds.\n";
122 :     # Get the list of genomes we want.
123 :     print "Reading genomes.\n";
124 :     my $genomes = P3Utils::get_col($ih, $keyCol);
125 :     my %genomes = map { $_ => 1 } @$genomes;
126 :     print scalar(@$genomes) . " genomes found in input file.\n";
127 :     my ($gCount, $pCount) = 0;
128 :     # This will track the proteins for each genome. It maps a genome ID to a list of protein tuples [name, seq, dna].
129 :     my %proteins;
130 :     # Loop through the proteins.
131 :     print "Processing proteins.\n";
132 :     for my $prot (@$protList) {
133 :     my ($genome, $name, $fid, $seq, $product, $dna) = @$prot;
134 :     if ($fid) {
135 :     # We have a real feature, check the genome.
136 :     if (! $genomes{$genome}) {
137 :     $stats->Add(filteredGenome => 1);
138 :     } else {
139 : parrello 1.2 my $check = RoleParse::Checksum($product // '');
140 :     if ($check ne $roleCheck) {
141 :     $stats->Add(funnyProt => 1);
142 :     } else {
143 : parrello 1.4 push @{$proteins{$genome}}, [$name, $seq, $dna];
144 : parrello 1.2 $stats->Add(protFound => 1);
145 :     }
146 : parrello 1.1 }
147 :     }
148 : parrello 1.4 $pCount++;
149 :     print "$pCount proteins processed.\n" if $pCount % 10000 == 0;
150 :     }
151 :     # Process the genomes one at a time, remembering MD5s.
152 :     my %md5s;
153 :     print "Processing genomes.\n";
154 :     for my $genome (@$genomes) {
155 :     if (! $proteins{$genome}) {
156 :     $stats->Add(genomeNotFound => 1);
157 :     } else {
158 : parrello 1.1 my @prots = @{$proteins{$genome}};
159 :     $stats->Add(genomeFound => 1);
160 :     if (scalar @prots > 1) {
161 :     # Skip if we have multiple proteins.
162 :     $stats->Add(multiProt => 1);
163 : parrello 1.4 delete $proteins{$genome};
164 : parrello 1.1 } else {
165 : parrello 1.4 # Remember the genome name and sequence.
166 :     my ($name, $protMd5, $dnaMd5) = @{$prots[0]};
167 :     $proteins{$genome} = [$name, $protMd5, $dnaMd5];
168 :     $md5s{$protMd5} = 1;
169 :     if ($dnaMd5) {
170 :     $md5s{$dnaMd5} = 1;
171 :     }
172 :     $stats->Add(genomeSaved => 1);
173 :     }
174 :     }
175 :     }
176 :     # Get the sequences.
177 :     print "Reading MD5s.\n";
178 :     my $start1 = time;
179 :     my $md5Hash = $p3->lookup_sequence_data_hash([keys %md5s]);
180 :     print "Sequences retrieved in " . (time - $start1) . " seconds.\n";
181 :     for my $genome (keys %proteins) {
182 :     my ($name, $protMd5, $dnaMd5) = @{$proteins{$genome}};
183 :     my $seq = $md5Hash->{$protMd5};
184 :     if (! $seq) {
185 :     $stats->Add(missingProtein => 1);
186 :     } else {
187 :     print $gh "$genome\t$name\n";
188 :     print $fh ">$genome\n$seq\n";
189 : parrello 1.5 if ($dnaMd5) {
190 : parrello 1.4 my $dna = $md5Hash->{$dnaMd5};
191 :     if (! $dna) {
192 :     $stats->Add(missingDna => 1);
193 :     } else {
194 : parrello 1.5 if ($nh) {
195 :     print $nh ">$genome\n$dna\n";
196 :     $stats->Add(dnaOut => 1);
197 :     }
198 :     if ($bh) {
199 :     print $bh ">fig|$genome.peg.X $genome\t$name\n$dna\n";
200 :     $stats->Add(binDnaOut => 1);
201 :     }
202 : parrello 1.4 }
203 : parrello 1.1 }
204 :     }
205 : parrello 1.4 $stats->Add(genomeOut => 1);
206 :     $gCount++;
207 :     print "$gCount genomes processed.\n" if $gCount % 10000 == 0;
208 : parrello 1.1 }
209 : parrello 1.4 $stats->Add(timeElapsed => int(time - $start0));
210 : parrello 1.3 print "All done.\n" . $stats->Show();

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3