[Bio] / FigKernelScripts / p3-rep-prots.pl Repository:
ViewVC logotype

Diff of /FigKernelScripts/p3-rep-prots.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Tue Aug 21 20:35:55 2018 UTC revision 1.4, Fri Oct 19 16:06:14 2018 UTC
# Line 25  Line 25 
25    
26  Role name of the protein to use. The default is C<Phenylalanyl-tRNA synthetase alpha chain>.  Role name of the protein to use. The default is C<Phenylalanyl-tRNA synthetase alpha chain>.
27    
28    =item dna
29    
30    If specified, a C<6.1.1.20.dna.fasta> file will be produced in addition to the others, containing
31    the DNA sequences of the proteins.
32    
33    =item debug
34    
35    If specified, status messages for the PATRIC3 API will be displayed.
36    
37  =back  =back
38    
39  =cut  =cut
# Line 37  Line 46 
46  use RoleParse;  use RoleParse;
47  use Time::HiRes;  use Time::HiRes;
48  use Math::Round;  use Math::Round;
49    use FastA;
50    use Data::Dumper;
51    
52  $| = 1;  $| = 1;
53  # Get the command-line options.  # Get the command-line options.
54  my $opt = P3Utils::script_opts('outDir', P3Utils::col_options(), P3Utils::ih_options(),  my $opt = P3Utils::script_opts('outDir', P3Utils::col_options(), P3Utils::ih_options(),
55          ['clear', 'clear the output directory if it exists'],          ['clear', 'clear the output directory if it exists'],
56          ['prot=s', 'name of the protein to use', { default => 'Phenylalanyl-tRNA synthetase alpha chain' }],          ['prot=s', 'name of the protein to use', { default => 'Phenylalanyl-tRNA synthetase alpha chain' }],
57            ['dna', 'produce a DNA FASTA file in addition to the default files'],
58            ['debug', 'show P3 API messages']
59          );          );
60  # Get the output directory name.  # Get the output directory name.
61  my ($outDir) = @ARGV;  my ($outDir) = @ARGV;
# Line 55  Line 68 
68      print "Erasing directory $outDir.\n";      print "Erasing directory $outDir.\n";
69      File::Copy::Recursive::pathempty($outDir) || die "Error clearing $outDir: $!";      File::Copy::Recursive::pathempty($outDir) || die "Error clearing $outDir: $!";
70  }  }
71    # Check for DNA mode.
72    my $dnaFile;
73    if ($opt->dna) {
74        $dnaFile = "$outDir/6.1.1.20.dna.fasta";
75    }
76  # Create the statistics object.  # Create the statistics object.
77  my $stats = Stats->new();  my $stats = Stats->new();
78  # Create a filter from the protein name.  # Create a filter from the protein name.
# Line 63  Line 81 
81  # Save the checksum for the seed role.  # Save the checksum for the seed role.
82  my $roleCheck = RoleParse::Checksum($protName);  my $roleCheck = RoleParse::Checksum($protName);
83  # Create a list of the columns we want.  # Create a list of the columns we want.
84  my @cols = qw(genome_name patric_id aa_sequence product);  
85    my @cols = qw(genome_id genome_name patric_id aa_sequence_md5 product);
86    if ($dnaFile) {
87        push @cols, 'na_sequence_md5';
88    }
89  # Open the output files.  # Open the output files.
90  print "Setting up files.\n";  print "Setting up files.\n";
91  open(my $gh, '>', "$outDir/complete.genomes") || die "Could not open genome output file: $!";  open(my $gh, '>', "$outDir/complete.genomes") || die "Could not open genome output file: $!";
92  open(my $fh, '>', "$outDir/6.1.1.20.fasta") || die "Could not open FASTA output file: $!";  open(my $fh, '>', "$outDir/6.1.1.20.fasta") || die "Could not open FASTA output file: $!";
93    my $nh;
94    if ($dnaFile) {
95        open($nh, '>', $dnaFile) || die "Could not open DNA output file: $!";
96    }
97  # Get access to PATRIC.  # Get access to PATRIC.
98  my $p3 = P3DataAPI->new();  my $p3 = P3DataAPI->new();
99    if ($opt->debug) {
100        $p3->debug_on(\*STDERR);
101    }
102  # Open the input file.  # Open the input file.
103  my $ih = P3Utils::ih($opt);  my $ih = P3Utils::ih($opt);
104  # Read the incoming headers.  # Read the incoming headers.
105  my ($outHeaders, $keyCol) = P3Utils::process_headers($ih, $opt);  my ($outHeaders, $keyCol) = P3Utils::process_headers($ih, $opt);
106  # Count the batches of input.  # Get the full list of proteins.
107    print "Reading proteins.\n";
108  my $start0 = time;  my $start0 = time;
109  my $gCount = 0;  my $protList = P3Utils::get_data($p3, feature => \@filter, \@cols);
110  # Loop through the input.  print scalar(@$protList) . " proteins returned in " . Math::Round::nearest(0.01, time - $start0) . " seconds.\n";
111  while (! eof $ih) {  # Get the list of genomes we want.
112      my $couplets = P3Utils::get_couplets($ih, $keyCol, $opt);  print "Reading genomes.\n";
113      # Convert the couplets to contain only genome IDs.  my $genomes = P3Utils::get_col($ih, $keyCol);
114      my @couples = map { [$_->[0], [$_->[0]]] } @$couplets;  my %genomes = map { $_ => 1 } @$genomes;
115      $stats->Add(genomeRead => scalar @couples);  print scalar(@$genomes) . " genomes found in input file.\n";
116      # Get the features of interest for these genomes.  my ($gCount, $pCount) = 0;
117      my $protList = P3Utils::get_data($p3, feature => \@filter, \@cols, genome_id => \@couples);  # This will track the proteins for each genome. It maps a genome ID to a list of protein tuples [name, seq, dna].
     $gCount += scalar @couples;  
     # Collate them by genome ID, discarding the nulls.  
118      my %proteins;      my %proteins;
119    # Loop through the proteins.
120    print "Processing proteins.\n";
121      for my $prot (@$protList) {      for my $prot (@$protList) {
122          my ($genome, $name, $fid, $sequence, $product) = @$prot;      my ($genome, $name, $fid, $seq, $product, $dna) = @$prot;
123          if ($fid) {          if ($fid) {
124              # We have a real feature, check the function.          # We have a real feature, check the genome.
125            if (! $genomes{$genome}) {
126                $stats->Add(filteredGenome => 1);
127            } else {
128              my $check = RoleParse::Checksum($product // '');              my $check = RoleParse::Checksum($product // '');
129              if ($check ne $roleCheck) {              if ($check ne $roleCheck) {
130                  $stats->Add(funnyProt => 1);                  $stats->Add(funnyProt => 1);
131              } else {              } else {
132                  push @{$proteins{$genome}}, [$name, $sequence];                  push @{$proteins{$genome}}, [$name, $seq, $dna];
133                  $stats->Add(protFound => 1);                  $stats->Add(protFound => 1);
134              }              }
135          }          }
136      }      }
137      # Process the genomes one at a time.      $pCount++;
138      for my $genome (keys %proteins) {      print "$pCount proteins processed.\n" if $pCount % 10000 == 0;
139    }
140    # Process the genomes one at a time, remembering MD5s.
141    my %md5s;
142    print "Processing genomes.\n";
143    for my $genome (@$genomes) {
144        if (! $proteins{$genome}) {
145            $stats->Add(genomeNotFound => 1);
146        } else {
147          my @prots = @{$proteins{$genome}};          my @prots = @{$proteins{$genome}};
148          $stats->Add(genomeFound => 1);          $stats->Add(genomeFound => 1);
149          if (scalar @prots > 1) {          if (scalar @prots > 1) {
150              # Skip if we have multiple proteins.              # Skip if we have multiple proteins.
151              $stats->Add(multiProt => 1);              $stats->Add(multiProt => 1);
152                delete $proteins{$genome};
153            } else {
154                # Remember the genome name and sequence.
155                my ($name, $protMd5, $dnaMd5) = @{$prots[0]};
156                $proteins{$genome} = [$name, $protMd5, $dnaMd5];
157                $md5s{$protMd5} = 1;
158                if ($dnaMd5) {
159                    $md5s{$dnaMd5} = 1;
160                }
161                $stats->Add(genomeSaved => 1);
162            }
163        }
164    }
165    # Get the sequences.
166    print "Reading MD5s.\n";
167    my $start1 = time;
168    my $md5Hash = $p3->lookup_sequence_data_hash([keys %md5s]);
169    print "Sequences retrieved in " . (time - $start1) . " seconds.\n";
170    for my $genome (keys %proteins) {
171        my ($name, $protMd5, $dnaMd5) = @{$proteins{$genome}};
172        my $seq = $md5Hash->{$protMd5};
173        if (! $seq) {
174            $stats->Add(missingProtein => 1);
175          } else {          } else {
             # Get the genome name and sequence.  
             my ($name, $seq) = @{$prots[0]};  
176              print $gh "$genome\t$name\n";              print $gh "$genome\t$name\n";
177              print $fh ">$genome\n$seq\n";              print $fh ">$genome\n$seq\n";
178              $stats->Add(genomeOut => 1);          if ($nh && $dnaMd5) {
179                my $dna = $md5Hash->{$dnaMd5};
180                if (! $dna) {
181                    $stats->Add(missingDna => 1);
182                } else {
183                    print $nh ">$genome\n$dna\n";
184                    $stats->Add(dnaOut => 1);
185          }          }
186      }      }
     print "$gCount genomes processed at " . Math::Round::nearest(0.01, (time - $start0) / $gCount) . " seconds/genome.\n";  
187  }  }
188        $stats->Add(genomeOut => 1);
189        $gCount++;
190        print "$gCount genomes processed.\n" if $gCount % 10000 == 0;
191    }
192    $stats->Add(timeElapsed => int(time - $start0));
193  print "All done.\n" . $stats->Show();  print "All done.\n" . $stats->Show();

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.4

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3