Parent Directory
|
Revision Log
Revision 1.4 - (view) (download) (as text)
1 : | parrello | 1.1 | =head1 Create Representative Genome Server Directory |
2 : | |||
3 : | p3-rep-prots.pl [options] outDir | ||
4 : | |||
5 : | This script processes a list of genome IDs to create a directory suitable for use by the L<RepresentativeGenomes> server. | ||
6 : | parrello | 1.3 | It will extract all the instances of the specified seed protein (default is Phenylanyl synthetase alpha chain). The list of genome IDs and |
7 : | names will go in the output file C<complete.genomes> and a FASTA of the seed proteins in C<6.1.1.20.fasta>. | ||
8 : | parrello | 1.1 | |
9 : | =head2 Parameters | ||
10 : | |||
11 : | The positional parameter is the name of the output directory. If it does not exist, it will be created. | ||
12 : | |||
13 : | The standard input can be overriddn using the options in L<P3Utils/ih_options>. | ||
14 : | |||
15 : | Additional command-line options are those given in L<P3Utils/col_options> plus the following | ||
16 : | options. | ||
17 : | |||
18 : | =over 4 | ||
19 : | |||
20 : | parrello | 1.3 | =item clear |
21 : | parrello | 1.1 | |
22 : | parrello | 1.3 | Clear the output directory if it already exists. The default is to leave existing files in place. |
23 : | parrello | 1.1 | |
24 : | parrello | 1.3 | =item prot |
25 : | parrello | 1.1 | |
26 : | parrello | 1.3 | Role name of the protein to use. The default is C<Phenylalanyl-tRNA synthetase alpha chain>. |
27 : | parrello | 1.1 | |
28 : | parrello | 1.4 | =item dna |
29 : | |||
30 : | If specified, a C<6.1.1.20.dna.fasta> file will be produced in addition to the others, containing | ||
31 : | the DNA sequences of the proteins. | ||
32 : | |||
33 : | =item debug | ||
34 : | |||
35 : | If specified, status messages for the PATRIC3 API will be displayed. | ||
36 : | |||
37 : | parrello | 1.1 | =back |
38 : | |||
39 : | =cut | ||
40 : | |||
41 : | use strict; | ||
42 : | use P3DataAPI; | ||
43 : | use P3Utils; | ||
44 : | use Stats; | ||
45 : | use File::Copy::Recursive; | ||
46 : | parrello | 1.2 | use RoleParse; |
47 : | parrello | 1.3 | use Time::HiRes; |
48 : | use Math::Round; | ||
49 : | parrello | 1.4 | use FastA; |
50 : | use Data::Dumper; | ||
51 : | parrello | 1.1 | |
52 : | $| = 1; | ||
53 : | # Get the command-line options. | ||
54 : | my $opt = P3Utils::script_opts('outDir', P3Utils::col_options(), P3Utils::ih_options(), | ||
55 : | parrello | 1.3 | ['clear', 'clear the output directory if it exists'], |
56 : | ['prot=s', 'name of the protein to use', { default => 'Phenylalanyl-tRNA synthetase alpha chain' }], | ||
57 : | parrello | 1.4 | ['dna', 'produce a DNA FASTA file in addition to the default files'], |
58 : | ['debug', 'show P3 API messages'] | ||
59 : | parrello | 1.1 | ); |
60 : | # Get the output directory name. | ||
61 : | my ($outDir) = @ARGV; | ||
62 : | if (! $outDir) { | ||
63 : | die "No output directory specified."; | ||
64 : | } elsif (! -d $outDir) { | ||
65 : | print "Creating directory $outDir.\n"; | ||
66 : | File::Copy::Recursive::pathmk($outDir) || die "Could not create $outDir: $!"; | ||
67 : | } elsif ($opt->clear) { | ||
68 : | print "Erasing directory $outDir.\n"; | ||
69 : | File::Copy::Recursive::pathempty($outDir) || die "Error clearing $outDir: $!"; | ||
70 : | } | ||
71 : | parrello | 1.4 | # Check for DNA mode. |
72 : | my $dnaFile; | ||
73 : | if ($opt->dna) { | ||
74 : | $dnaFile = "$outDir/6.1.1.20.dna.fasta"; | ||
75 : | } | ||
76 : | parrello | 1.1 | # Create the statistics object. |
77 : | my $stats = Stats->new(); | ||
78 : | # Create a filter from the protein name. | ||
79 : | parrello | 1.3 | my $protName = $opt->prot; |
80 : | my @filter = (['eq', 'product', $protName]); | ||
81 : | parrello | 1.2 | # Save the checksum for the seed role. |
82 : | parrello | 1.3 | my $roleCheck = RoleParse::Checksum($protName); |
83 : | parrello | 1.1 | # Create a list of the columns we want. |
84 : | parrello | 1.4 | |
85 : | my @cols = qw(genome_id genome_name patric_id aa_sequence_md5 product); | ||
86 : | if ($dnaFile) { | ||
87 : | push @cols, 'na_sequence_md5'; | ||
88 : | } | ||
89 : | parrello | 1.1 | # Open the output files. |
90 : | print "Setting up files.\n"; | ||
91 : | open(my $gh, '>', "$outDir/complete.genomes") || die "Could not open genome output file: $!"; | ||
92 : | open(my $fh, '>', "$outDir/6.1.1.20.fasta") || die "Could not open FASTA output file: $!"; | ||
93 : | parrello | 1.4 | my $nh; |
94 : | if ($dnaFile) { | ||
95 : | open($nh, '>', $dnaFile) || die "Could not open DNA output file: $!"; | ||
96 : | } | ||
97 : | parrello | 1.1 | # Get access to PATRIC. |
98 : | my $p3 = P3DataAPI->new(); | ||
99 : | parrello | 1.4 | if ($opt->debug) { |
100 : | $p3->debug_on(\*STDERR); | ||
101 : | } | ||
102 : | parrello | 1.1 | # Open the input file. |
103 : | my $ih = P3Utils::ih($opt); | ||
104 : | # Read the incoming headers. | ||
105 : | my ($outHeaders, $keyCol) = P3Utils::process_headers($ih, $opt); | ||
106 : | parrello | 1.4 | # Get the full list of proteins. |
107 : | print "Reading proteins.\n"; | ||
108 : | parrello | 1.3 | my $start0 = time; |
109 : | parrello | 1.4 | my $protList = P3Utils::get_data($p3, feature => \@filter, \@cols); |
110 : | print scalar(@$protList) . " proteins returned in " . Math::Round::nearest(0.01, time - $start0) . " seconds.\n"; | ||
111 : | # Get the list of genomes we want. | ||
112 : | print "Reading genomes.\n"; | ||
113 : | my $genomes = P3Utils::get_col($ih, $keyCol); | ||
114 : | my %genomes = map { $_ => 1 } @$genomes; | ||
115 : | print scalar(@$genomes) . " genomes found in input file.\n"; | ||
116 : | my ($gCount, $pCount) = 0; | ||
117 : | # This will track the proteins for each genome. It maps a genome ID to a list of protein tuples [name, seq, dna]. | ||
118 : | my %proteins; | ||
119 : | # Loop through the proteins. | ||
120 : | print "Processing proteins.\n"; | ||
121 : | for my $prot (@$protList) { | ||
122 : | my ($genome, $name, $fid, $seq, $product, $dna) = @$prot; | ||
123 : | if ($fid) { | ||
124 : | # We have a real feature, check the genome. | ||
125 : | if (! $genomes{$genome}) { | ||
126 : | $stats->Add(filteredGenome => 1); | ||
127 : | } else { | ||
128 : | parrello | 1.2 | my $check = RoleParse::Checksum($product // ''); |
129 : | if ($check ne $roleCheck) { | ||
130 : | $stats->Add(funnyProt => 1); | ||
131 : | } else { | ||
132 : | parrello | 1.4 | push @{$proteins{$genome}}, [$name, $seq, $dna]; |
133 : | parrello | 1.2 | $stats->Add(protFound => 1); |
134 : | } | ||
135 : | parrello | 1.1 | } |
136 : | } | ||
137 : | parrello | 1.4 | $pCount++; |
138 : | print "$pCount proteins processed.\n" if $pCount % 10000 == 0; | ||
139 : | } | ||
140 : | # Process the genomes one at a time, remembering MD5s. | ||
141 : | my %md5s; | ||
142 : | print "Processing genomes.\n"; | ||
143 : | for my $genome (@$genomes) { | ||
144 : | if (! $proteins{$genome}) { | ||
145 : | $stats->Add(genomeNotFound => 1); | ||
146 : | } else { | ||
147 : | parrello | 1.1 | my @prots = @{$proteins{$genome}}; |
148 : | $stats->Add(genomeFound => 1); | ||
149 : | if (scalar @prots > 1) { | ||
150 : | # Skip if we have multiple proteins. | ||
151 : | $stats->Add(multiProt => 1); | ||
152 : | parrello | 1.4 | delete $proteins{$genome}; |
153 : | parrello | 1.1 | } else { |
154 : | parrello | 1.4 | # Remember the genome name and sequence. |
155 : | my ($name, $protMd5, $dnaMd5) = @{$prots[0]}; | ||
156 : | $proteins{$genome} = [$name, $protMd5, $dnaMd5]; | ||
157 : | $md5s{$protMd5} = 1; | ||
158 : | if ($dnaMd5) { | ||
159 : | $md5s{$dnaMd5} = 1; | ||
160 : | } | ||
161 : | $stats->Add(genomeSaved => 1); | ||
162 : | } | ||
163 : | } | ||
164 : | } | ||
165 : | # Get the sequences. | ||
166 : | print "Reading MD5s.\n"; | ||
167 : | my $start1 = time; | ||
168 : | my $md5Hash = $p3->lookup_sequence_data_hash([keys %md5s]); | ||
169 : | print "Sequences retrieved in " . (time - $start1) . " seconds.\n"; | ||
170 : | for my $genome (keys %proteins) { | ||
171 : | my ($name, $protMd5, $dnaMd5) = @{$proteins{$genome}}; | ||
172 : | my $seq = $md5Hash->{$protMd5}; | ||
173 : | if (! $seq) { | ||
174 : | $stats->Add(missingProtein => 1); | ||
175 : | } else { | ||
176 : | print $gh "$genome\t$name\n"; | ||
177 : | print $fh ">$genome\n$seq\n"; | ||
178 : | if ($nh && $dnaMd5) { | ||
179 : | my $dna = $md5Hash->{$dnaMd5}; | ||
180 : | if (! $dna) { | ||
181 : | $stats->Add(missingDna => 1); | ||
182 : | } else { | ||
183 : | print $nh ">$genome\n$dna\n"; | ||
184 : | $stats->Add(dnaOut => 1); | ||
185 : | } | ||
186 : | parrello | 1.1 | } |
187 : | } | ||
188 : | parrello | 1.4 | $stats->Add(genomeOut => 1); |
189 : | $gCount++; | ||
190 : | print "$gCount genomes processed.\n" if $gCount % 10000 == 0; | ||
191 : | parrello | 1.1 | } |
192 : | parrello | 1.4 | $stats->Add(timeElapsed => int(time - $start0)); |
193 : | parrello | 1.3 | print "All done.\n" . $stats->Show(); |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |