[Bio] / FigKernelScripts / p3-rep-prots.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/p3-rep-prots.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (view) (download) (as text)

1 : parrello 1.1 =head1 Create Representative Genome Server Directory
2 :    
3 :     p3-rep-prots.pl [options] outDir
4 :    
5 :     This script processes a list of genome IDs to create a directory suitable for use by the L<RepresentativeGenomes> server.
6 : parrello 1.3 It will extract all the instances of the specified seed protein (default is Phenylanyl synthetase alpha chain). The list of genome IDs and
7 :     names will go in the output file C<complete.genomes> and a FASTA of the seed proteins in C<6.1.1.20.fasta>.
8 : parrello 1.1
9 :     =head2 Parameters
10 :    
11 :     The positional parameter is the name of the output directory. If it does not exist, it will be created.
12 :    
13 :     The standard input can be overriddn using the options in L<P3Utils/ih_options>.
14 :    
15 :     Additional command-line options are those given in L<P3Utils/col_options> plus the following
16 :     options.
17 :    
18 :     =over 4
19 :    
20 : parrello 1.3 =item clear
21 : parrello 1.1
22 : parrello 1.3 Clear the output directory if it already exists. The default is to leave existing files in place.
23 : parrello 1.1
24 : parrello 1.3 =item prot
25 : parrello 1.1
26 : parrello 1.3 Role name of the protein to use. The default is C<Phenylalanyl-tRNA synthetase alpha chain>.
27 : parrello 1.1
28 : parrello 1.4 =item dna
29 :    
30 :     If specified, a C<6.1.1.20.dna.fasta> file will be produced in addition to the others, containing
31 :     the DNA sequences of the proteins.
32 :    
33 :     =item debug
34 :    
35 :     If specified, status messages for the PATRIC3 API will be displayed.
36 :    
37 : parrello 1.1 =back
38 :    
39 :     =cut
40 :    
41 :     use strict;
42 :     use P3DataAPI;
43 :     use P3Utils;
44 :     use Stats;
45 :     use File::Copy::Recursive;
46 : parrello 1.2 use RoleParse;
47 : parrello 1.3 use Time::HiRes;
48 :     use Math::Round;
49 : parrello 1.4 use FastA;
50 :     use Data::Dumper;
51 : parrello 1.1
52 :     $| = 1;
53 :     # Get the command-line options.
54 :     my $opt = P3Utils::script_opts('outDir', P3Utils::col_options(), P3Utils::ih_options(),
55 : parrello 1.3 ['clear', 'clear the output directory if it exists'],
56 :     ['prot=s', 'name of the protein to use', { default => 'Phenylalanyl-tRNA synthetase alpha chain' }],
57 : parrello 1.4 ['dna', 'produce a DNA FASTA file in addition to the default files'],
58 :     ['debug', 'show P3 API messages']
59 : parrello 1.1 );
60 :     # Get the output directory name.
61 :     my ($outDir) = @ARGV;
62 :     if (! $outDir) {
63 :     die "No output directory specified.";
64 :     } elsif (! -d $outDir) {
65 :     print "Creating directory $outDir.\n";
66 :     File::Copy::Recursive::pathmk($outDir) || die "Could not create $outDir: $!";
67 :     } elsif ($opt->clear) {
68 :     print "Erasing directory $outDir.\n";
69 :     File::Copy::Recursive::pathempty($outDir) || die "Error clearing $outDir: $!";
70 :     }
71 : parrello 1.4 # Check for DNA mode.
72 :     my $dnaFile;
73 :     if ($opt->dna) {
74 :     $dnaFile = "$outDir/6.1.1.20.dna.fasta";
75 :     }
76 : parrello 1.1 # Create the statistics object.
77 :     my $stats = Stats->new();
78 :     # Create a filter from the protein name.
79 : parrello 1.3 my $protName = $opt->prot;
80 :     my @filter = (['eq', 'product', $protName]);
81 : parrello 1.2 # Save the checksum for the seed role.
82 : parrello 1.3 my $roleCheck = RoleParse::Checksum($protName);
83 : parrello 1.1 # Create a list of the columns we want.
84 : parrello 1.4
85 :     my @cols = qw(genome_id genome_name patric_id aa_sequence_md5 product);
86 :     if ($dnaFile) {
87 :     push @cols, 'na_sequence_md5';
88 :     }
89 : parrello 1.1 # Open the output files.
90 :     print "Setting up files.\n";
91 :     open(my $gh, '>', "$outDir/complete.genomes") || die "Could not open genome output file: $!";
92 :     open(my $fh, '>', "$outDir/6.1.1.20.fasta") || die "Could not open FASTA output file: $!";
93 : parrello 1.4 my $nh;
94 :     if ($dnaFile) {
95 :     open($nh, '>', $dnaFile) || die "Could not open DNA output file: $!";
96 :     }
97 : parrello 1.1 # Get access to PATRIC.
98 :     my $p3 = P3DataAPI->new();
99 : parrello 1.4 if ($opt->debug) {
100 :     $p3->debug_on(\*STDERR);
101 :     }
102 : parrello 1.1 # Open the input file.
103 :     my $ih = P3Utils::ih($opt);
104 :     # Read the incoming headers.
105 :     my ($outHeaders, $keyCol) = P3Utils::process_headers($ih, $opt);
106 : parrello 1.4 # Get the full list of proteins.
107 :     print "Reading proteins.\n";
108 : parrello 1.3 my $start0 = time;
109 : parrello 1.4 my $protList = P3Utils::get_data($p3, feature => \@filter, \@cols);
110 :     print scalar(@$protList) . " proteins returned in " . Math::Round::nearest(0.01, time - $start0) . " seconds.\n";
111 :     # Get the list of genomes we want.
112 :     print "Reading genomes.\n";
113 :     my $genomes = P3Utils::get_col($ih, $keyCol);
114 :     my %genomes = map { $_ => 1 } @$genomes;
115 :     print scalar(@$genomes) . " genomes found in input file.\n";
116 :     my ($gCount, $pCount) = 0;
117 :     # This will track the proteins for each genome. It maps a genome ID to a list of protein tuples [name, seq, dna].
118 :     my %proteins;
119 :     # Loop through the proteins.
120 :     print "Processing proteins.\n";
121 :     for my $prot (@$protList) {
122 :     my ($genome, $name, $fid, $seq, $product, $dna) = @$prot;
123 :     if ($fid) {
124 :     # We have a real feature, check the genome.
125 :     if (! $genomes{$genome}) {
126 :     $stats->Add(filteredGenome => 1);
127 :     } else {
128 : parrello 1.2 my $check = RoleParse::Checksum($product // '');
129 :     if ($check ne $roleCheck) {
130 :     $stats->Add(funnyProt => 1);
131 :     } else {
132 : parrello 1.4 push @{$proteins{$genome}}, [$name, $seq, $dna];
133 : parrello 1.2 $stats->Add(protFound => 1);
134 :     }
135 : parrello 1.1 }
136 :     }
137 : parrello 1.4 $pCount++;
138 :     print "$pCount proteins processed.\n" if $pCount % 10000 == 0;
139 :     }
140 :     # Process the genomes one at a time, remembering MD5s.
141 :     my %md5s;
142 :     print "Processing genomes.\n";
143 :     for my $genome (@$genomes) {
144 :     if (! $proteins{$genome}) {
145 :     $stats->Add(genomeNotFound => 1);
146 :     } else {
147 : parrello 1.1 my @prots = @{$proteins{$genome}};
148 :     $stats->Add(genomeFound => 1);
149 :     if (scalar @prots > 1) {
150 :     # Skip if we have multiple proteins.
151 :     $stats->Add(multiProt => 1);
152 : parrello 1.4 delete $proteins{$genome};
153 : parrello 1.1 } else {
154 : parrello 1.4 # Remember the genome name and sequence.
155 :     my ($name, $protMd5, $dnaMd5) = @{$prots[0]};
156 :     $proteins{$genome} = [$name, $protMd5, $dnaMd5];
157 :     $md5s{$protMd5} = 1;
158 :     if ($dnaMd5) {
159 :     $md5s{$dnaMd5} = 1;
160 :     }
161 :     $stats->Add(genomeSaved => 1);
162 :     }
163 :     }
164 :     }
165 :     # Get the sequences.
166 :     print "Reading MD5s.\n";
167 :     my $start1 = time;
168 :     my $md5Hash = $p3->lookup_sequence_data_hash([keys %md5s]);
169 :     print "Sequences retrieved in " . (time - $start1) . " seconds.\n";
170 :     for my $genome (keys %proteins) {
171 :     my ($name, $protMd5, $dnaMd5) = @{$proteins{$genome}};
172 :     my $seq = $md5Hash->{$protMd5};
173 :     if (! $seq) {
174 :     $stats->Add(missingProtein => 1);
175 :     } else {
176 :     print $gh "$genome\t$name\n";
177 :     print $fh ">$genome\n$seq\n";
178 :     if ($nh && $dnaMd5) {
179 :     my $dna = $md5Hash->{$dnaMd5};
180 :     if (! $dna) {
181 :     $stats->Add(missingDna => 1);
182 :     } else {
183 :     print $nh ">$genome\n$dna\n";
184 :     $stats->Add(dnaOut => 1);
185 :     }
186 : parrello 1.1 }
187 :     }
188 : parrello 1.4 $stats->Add(genomeOut => 1);
189 :     $gCount++;
190 :     print "$gCount genomes processed.\n" if $gCount % 10000 == 0;
191 : parrello 1.1 }
192 : parrello 1.4 $stats->Add(timeElapsed => int(time - $start0));
193 : parrello 1.3 print "All done.\n" . $stats->Show();

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3