[Bio] / FigKernelScripts / p3-rep-prots.pl Repository:
ViewVC logotype

Diff of /FigKernelScripts/p3-rep-prots.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Fri Mar 16 01:24:56 2018 UTC revision 1.3, Tue Aug 21 20:35:55 2018 UTC
# Line 3  Line 3 
3      p3-rep-prots.pl [options] outDir      p3-rep-prots.pl [options] outDir
4    
5  This script processes a list of genome IDs to create a directory suitable for use by the L<RepresentativeGenomes> server.  This script processes a list of genome IDs to create a directory suitable for use by the L<RepresentativeGenomes> server.
6  It will extract all the instances of the specified seed protein (Phenylanyl synthetase alpha chain) and only  It will extract all the instances of the specified seed protein (default is Phenylanyl synthetase alpha chain). The list of genome IDs and
7  keep genomes with a single instance of reasonable length. The list of genome IDs and names will go in the output file  names will go in the output file C<complete.genomes> and a FASTA of the seed proteins in C<6.1.1.20.fasta>.
 C<complete.genomes> and a FASTA of the seed proteins in C<6.1.1.20.fasta>.  
8    
9  =head2 Parameters  =head2 Parameters
10    
# Line 18  Line 17 
17    
18  =over 4  =over 4
19    
 =item minlen  
   
 The minimum acceptable length for the protein. The default is 209.  
   
 =item maxlen  
   
 The maximum acceptable length for the protein. The default is 485.  
   
20  =item clear  =item clear
21    
22  Clear the output directory if it already exists. The default is to leave existing files in place.  Clear the output directory if it already exists. The default is to leave existing files in place.
23    
24    =item prot
25    
26    Role name of the protein to use. The default is C<Phenylalanyl-tRNA synthetase alpha chain>.
27    
28  =back  =back
29    
30  =cut  =cut
# Line 40  Line 35 
35  use Stats;  use Stats;
36  use File::Copy::Recursive;  use File::Copy::Recursive;
37  use RoleParse;  use RoleParse;
38    use Time::HiRes;
39    use Math::Round;
40    
41  $| = 1;  $| = 1;
42  # Get the command-line options.  # Get the command-line options.
43  my $opt = P3Utils::script_opts('outDir', P3Utils::col_options(), P3Utils::ih_options(),  my $opt = P3Utils::script_opts('outDir', P3Utils::col_options(), P3Utils::ih_options(),
44          ['minlen=i', 'minimum protein length', { default => 209 }],          ['clear', 'clear the output directory if it exists'],
45          ['maxlen=i', 'maximum protein length', { default => 485 }],          ['prot=s', 'name of the protein to use', { default => 'Phenylalanyl-tRNA synthetase alpha chain' }],
         ['clear', 'clear the output directory if it exists']  
46          );          );
47  # Get the output directory name.  # Get the output directory name.
48  my ($outDir) = @ARGV;  my ($outDir) = @ARGV;
# Line 62  Line 58 
58  # Create the statistics object.  # Create the statistics object.
59  my $stats = Stats->new();  my $stats = Stats->new();
60  # Create a filter from the protein name.  # Create a filter from the protein name.
61  my @filter = (['eq', 'product', 'Phenylalanyl tRNA-synthetase alpha chain']);  my $protName = $opt->prot;
62    my @filter = (['eq', 'product', $protName]);
63  # Save the checksum for the seed role.  # Save the checksum for the seed role.
64  my $roleCheck = "WCzieTC/aZ6262l19bwqgw";  my $roleCheck = RoleParse::Checksum($protName);
65  # Create a list of the columns we want.  # Create a list of the columns we want.
66  my @cols = qw(genome_name patric_id aa_sequence product);  my @cols = qw(genome_name patric_id aa_sequence product);
 # Get the length options.  
 my $minlen = $opt->minlen;  
 my $maxlen = $opt->maxlen;  
67  # Open the output files.  # Open the output files.
68  print "Setting up files.\n";  print "Setting up files.\n";
69  open(my $gh, '>', "$outDir/complete.genomes") || die "Could not open genome output file: $!";  open(my $gh, '>', "$outDir/complete.genomes") || die "Could not open genome output file: $!";
# Line 81  Line 75 
75  # Read the incoming headers.  # Read the incoming headers.
76  my ($outHeaders, $keyCol) = P3Utils::process_headers($ih, $opt);  my ($outHeaders, $keyCol) = P3Utils::process_headers($ih, $opt);
77  # Count the batches of input.  # Count the batches of input.
78  my $batches = 0;  my $start0 = time;
79    my $gCount = 0;
80  # Loop through the input.  # Loop through the input.
81  while (! eof $ih) {  while (! eof $ih) {
     $batches++;  
     print "Processing batch $batches.\n";  
82      my $couplets = P3Utils::get_couplets($ih, $keyCol, $opt);      my $couplets = P3Utils::get_couplets($ih, $keyCol, $opt);
83      # Convert the couplets to contain only genome IDs.      # Convert the couplets to contain only genome IDs.
84      my @couples = map { [$_->[0], [$_->[0]]] } @$couplets;      my @couples = map { [$_->[0], [$_->[0]]] } @$couplets;
85      $stats->Add(genomeRead => scalar @couples);      $stats->Add(genomeRead => scalar @couples);
86      # Get the features of interest for these genomes.      # Get the features of interest for these genomes.
87      my $protList = P3Utils::get_data($p3, feature => \@filter, \@cols, genome_id => \@couples);      my $protList = P3Utils::get_data($p3, feature => \@filter, \@cols, genome_id => \@couples);
88        $gCount += scalar @couples;
89      # Collate them by genome ID, discarding the nulls.      # Collate them by genome ID, discarding the nulls.
90      my %proteins;      my %proteins;
91      for my $prot (@$protList) {      for my $prot (@$protList) {
# Line 115  Line 109 
109              # Skip if we have multiple proteins.              # Skip if we have multiple proteins.
110              $stats->Add(multiProt => 1);              $stats->Add(multiProt => 1);
111          } else {          } else {
112              # Get the genome name and sequence, then check the length of the sequence.              # Get the genome name and sequence.
113              my ($name, $seq) = @{$prots[0]};              my ($name, $seq) = @{$prots[0]};
             my $len = length($seq);  
             if ($len < $minlen) {  
                 $stats->Add(protTooShort => 1);  
             } elsif ($len > $maxlen) {  
                 $stats->Add(protTooLong => 1);  
             } else {  
                 # Here we have a good genome.  
114                  print $gh "$genome\t$name\n";                  print $gh "$genome\t$name\n";
115                  print $fh ">$genome\n$seq\n";                  print $fh ">$genome\n$seq\n";
116                  $stats->Add(genomeOut => 1);                  $stats->Add(genomeOut => 1);
117              }              }
118          }          }
119      }      print "$gCount genomes processed at " . Math::Round::nearest(0.01, (time - $start0) / $gCount) . " seconds/genome.\n";
120  }  }
121  print "All done.\n" . $stats->Show();  print "All done.\n" . $stats->Show();

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.3

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3