25 |
|
|
26 |
Role name of the protein to use. The default is C<Phenylalanyl-tRNA synthetase alpha chain>. |
Role name of the protein to use. The default is C<Phenylalanyl-tRNA synthetase alpha chain>. |
27 |
|
|
28 |
|
=item dna |
29 |
|
|
30 |
|
If specified, a C<6.1.1.20.dna.fasta> file will be produced in addition to the others, containing |
31 |
|
the DNA sequences of the proteins. |
32 |
|
|
33 |
|
=item debug |
34 |
|
|
35 |
|
If specified, status messages for the PATRIC3 API will be displayed. |
36 |
|
|
37 |
=back |
=back |
38 |
|
|
39 |
=cut |
=cut |
46 |
use RoleParse; |
use RoleParse; |
47 |
use Time::HiRes; |
use Time::HiRes; |
48 |
use Math::Round; |
use Math::Round; |
49 |
|
use FastA; |
50 |
|
use Data::Dumper; |
51 |
|
|
52 |
$| = 1; |
$| = 1; |
53 |
# Get the command-line options. |
# Get the command-line options. |
54 |
my $opt = P3Utils::script_opts('outDir', P3Utils::col_options(), P3Utils::ih_options(), |
my $opt = P3Utils::script_opts('outDir', P3Utils::col_options(), P3Utils::ih_options(), |
55 |
['clear', 'clear the output directory if it exists'], |
['clear', 'clear the output directory if it exists'], |
56 |
['prot=s', 'name of the protein to use', { default => 'Phenylalanyl-tRNA synthetase alpha chain' }], |
['prot=s', 'name of the protein to use', { default => 'Phenylalanyl-tRNA synthetase alpha chain' }], |
57 |
|
['dna', 'produce a DNA FASTA file in addition to the default files'], |
58 |
|
['debug', 'show P3 API messages'] |
59 |
); |
); |
60 |
# Get the output directory name. |
# Get the output directory name. |
61 |
my ($outDir) = @ARGV; |
my ($outDir) = @ARGV; |
68 |
print "Erasing directory $outDir.\n"; |
print "Erasing directory $outDir.\n"; |
69 |
File::Copy::Recursive::pathempty($outDir) || die "Error clearing $outDir: $!"; |
File::Copy::Recursive::pathempty($outDir) || die "Error clearing $outDir: $!"; |
70 |
} |
} |
71 |
|
# Check for DNA mode. |
72 |
|
my $dnaFile; |
73 |
|
if ($opt->dna) { |
74 |
|
$dnaFile = "$outDir/6.1.1.20.dna.fasta"; |
75 |
|
} |
76 |
# Create the statistics object. |
# Create the statistics object. |
77 |
my $stats = Stats->new(); |
my $stats = Stats->new(); |
78 |
# Create a filter from the protein name. |
# Create a filter from the protein name. |
81 |
# Save the checksum for the seed role. |
# Save the checksum for the seed role. |
82 |
my $roleCheck = RoleParse::Checksum($protName); |
my $roleCheck = RoleParse::Checksum($protName); |
83 |
# Create a list of the columns we want. |
# Create a list of the columns we want. |
84 |
my @cols = qw(genome_name patric_id aa_sequence product); |
|
85 |
|
my @cols = qw(genome_id genome_name patric_id aa_sequence_md5 product); |
86 |
|
if ($dnaFile) { |
87 |
|
push @cols, 'na_sequence_md5'; |
88 |
|
} |
89 |
# Open the output files. |
# Open the output files. |
90 |
print "Setting up files.\n"; |
print "Setting up files.\n"; |
91 |
open(my $gh, '>', "$outDir/complete.genomes") || die "Could not open genome output file: $!"; |
open(my $gh, '>', "$outDir/complete.genomes") || die "Could not open genome output file: $!"; |
92 |
open(my $fh, '>', "$outDir/6.1.1.20.fasta") || die "Could not open FASTA output file: $!"; |
open(my $fh, '>', "$outDir/6.1.1.20.fasta") || die "Could not open FASTA output file: $!"; |
93 |
|
my $nh; |
94 |
|
if ($dnaFile) { |
95 |
|
open($nh, '>', $dnaFile) || die "Could not open DNA output file: $!"; |
96 |
|
} |
97 |
# Get access to PATRIC. |
# Get access to PATRIC. |
98 |
my $p3 = P3DataAPI->new(); |
my $p3 = P3DataAPI->new(); |
99 |
|
if ($opt->debug) { |
100 |
|
$p3->debug_on(\*STDERR); |
101 |
|
} |
102 |
# Open the input file. |
# Open the input file. |
103 |
my $ih = P3Utils::ih($opt); |
my $ih = P3Utils::ih($opt); |
104 |
# Read the incoming headers. |
# Read the incoming headers. |
105 |
my ($outHeaders, $keyCol) = P3Utils::process_headers($ih, $opt); |
my ($outHeaders, $keyCol) = P3Utils::process_headers($ih, $opt); |
106 |
# Count the batches of input. |
# Get the full list of proteins. |
107 |
|
print "Reading proteins.\n"; |
108 |
my $start0 = time; |
my $start0 = time; |
109 |
my $gCount = 0; |
my $protList = P3Utils::get_data($p3, feature => \@filter, \@cols); |
110 |
# Loop through the input. |
print scalar(@$protList) . " proteins returned in " . Math::Round::nearest(0.01, time - $start0) . " seconds.\n"; |
111 |
while (! eof $ih) { |
# Get the list of genomes we want. |
112 |
my $couplets = P3Utils::get_couplets($ih, $keyCol, $opt); |
print "Reading genomes.\n"; |
113 |
# Convert the couplets to contain only genome IDs. |
my $genomes = P3Utils::get_col($ih, $keyCol); |
114 |
my @couples = map { [$_->[0], [$_->[0]]] } @$couplets; |
my %genomes = map { $_ => 1 } @$genomes; |
115 |
$stats->Add(genomeRead => scalar @couples); |
print scalar(@$genomes) . " genomes found in input file.\n"; |
116 |
# Get the features of interest for these genomes. |
my ($gCount, $pCount) = 0; |
117 |
my $protList = P3Utils::get_data($p3, feature => \@filter, \@cols, genome_id => \@couples); |
# This will track the proteins for each genome. It maps a genome ID to a list of protein tuples [name, seq, dna]. |
|
$gCount += scalar @couples; |
|
|
# Collate them by genome ID, discarding the nulls. |
|
118 |
my %proteins; |
my %proteins; |
119 |
|
# Loop through the proteins. |
120 |
|
print "Processing proteins.\n"; |
121 |
for my $prot (@$protList) { |
for my $prot (@$protList) { |
122 |
my ($genome, $name, $fid, $sequence, $product) = @$prot; |
my ($genome, $name, $fid, $seq, $product, $dna) = @$prot; |
123 |
if ($fid) { |
if ($fid) { |
124 |
# We have a real feature, check the function. |
# We have a real feature, check the genome. |
125 |
|
if (! $genomes{$genome}) { |
126 |
|
$stats->Add(filteredGenome => 1); |
127 |
|
} else { |
128 |
my $check = RoleParse::Checksum($product // ''); |
my $check = RoleParse::Checksum($product // ''); |
129 |
if ($check ne $roleCheck) { |
if ($check ne $roleCheck) { |
130 |
$stats->Add(funnyProt => 1); |
$stats->Add(funnyProt => 1); |
131 |
} else { |
} else { |
132 |
push @{$proteins{$genome}}, [$name, $sequence]; |
push @{$proteins{$genome}}, [$name, $seq, $dna]; |
133 |
$stats->Add(protFound => 1); |
$stats->Add(protFound => 1); |
134 |
} |
} |
135 |
} |
} |
136 |
} |
} |
137 |
# Process the genomes one at a time. |
$pCount++; |
138 |
for my $genome (keys %proteins) { |
print "$pCount proteins processed.\n" if $pCount % 10000 == 0; |
139 |
|
} |
140 |
|
# Process the genomes one at a time, remembering MD5s. |
141 |
|
my %md5s; |
142 |
|
print "Processing genomes.\n"; |
143 |
|
for my $genome (@$genomes) { |
144 |
|
if (! $proteins{$genome}) { |
145 |
|
$stats->Add(genomeNotFound => 1); |
146 |
|
} else { |
147 |
my @prots = @{$proteins{$genome}}; |
my @prots = @{$proteins{$genome}}; |
148 |
$stats->Add(genomeFound => 1); |
$stats->Add(genomeFound => 1); |
149 |
if (scalar @prots > 1) { |
if (scalar @prots > 1) { |
150 |
# Skip if we have multiple proteins. |
# Skip if we have multiple proteins. |
151 |
$stats->Add(multiProt => 1); |
$stats->Add(multiProt => 1); |
152 |
|
delete $proteins{$genome}; |
153 |
|
} else { |
154 |
|
# Remember the genome name and sequence. |
155 |
|
my ($name, $protMd5, $dnaMd5) = @{$prots[0]}; |
156 |
|
$proteins{$genome} = [$name, $protMd5, $dnaMd5]; |
157 |
|
$md5s{$protMd5} = 1; |
158 |
|
if ($dnaMd5) { |
159 |
|
$md5s{$dnaMd5} = 1; |
160 |
|
} |
161 |
|
$stats->Add(genomeSaved => 1); |
162 |
|
} |
163 |
|
} |
164 |
|
} |
165 |
|
# Get the sequences. |
166 |
|
print "Reading MD5s.\n"; |
167 |
|
my $start1 = time; |
168 |
|
my $md5Hash = $p3->lookup_sequence_data_hash([keys %md5s]); |
169 |
|
print "Sequences retrieved in " . (time - $start1) . " seconds.\n"; |
170 |
|
for my $genome (keys %proteins) { |
171 |
|
my ($name, $protMd5, $dnaMd5) = @{$proteins{$genome}}; |
172 |
|
my $seq = $md5Hash->{$protMd5}; |
173 |
|
if (! $seq) { |
174 |
|
$stats->Add(missingProtein => 1); |
175 |
} else { |
} else { |
|
# Get the genome name and sequence. |
|
|
my ($name, $seq) = @{$prots[0]}; |
|
176 |
print $gh "$genome\t$name\n"; |
print $gh "$genome\t$name\n"; |
177 |
print $fh ">$genome\n$seq\n"; |
print $fh ">$genome\n$seq\n"; |
178 |
$stats->Add(genomeOut => 1); |
if ($nh && $dnaMd5) { |
179 |
|
my $dna = $md5Hash->{$dnaMd5}; |
180 |
|
if (! $dna) { |
181 |
|
$stats->Add(missingDna => 1); |
182 |
|
} else { |
183 |
|
print $nh ">$genome\n$dna\n"; |
184 |
|
$stats->Add(dnaOut => 1); |
185 |
} |
} |
186 |
} |
} |
|
print "$gCount genomes processed at " . Math::Round::nearest(0.01, (time - $start0) / $gCount) . " seconds/genome.\n"; |
|
187 |
} |
} |
188 |
|
$stats->Add(genomeOut => 1); |
189 |
|
$gCount++; |
190 |
|
print "$gCount genomes processed.\n" if $gCount % 10000 == 0; |
191 |
|
} |
192 |
|
$stats->Add(timeElapsed => int(time - $start0)); |
193 |
print "All done.\n" . $stats->Show(); |
print "All done.\n" . $stats->Show(); |