[Bio] / FigKernelScripts / p3-rep-prots.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/p3-rep-prots.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 =head1 Create Representative Genome Server Directory
2 :    
3 :     p3-rep-prots.pl [options] outDir
4 :    
5 :     This script processes a list of genome IDs to create a directory suitable for use by the L<RepresentativeGenomes> server.
6 :     It will extract all the instances of the specified seed protein (usually Phenylanyl synthetase alpha chain) and only
7 :     keep genomes with a single instance of reasonable length. The list of genome IDs and names will go in the output file
8 :     C<complete.genomes> and a FASTA of the seed proteins in C<6.1.1.20.fasta>.
9 :    
10 :     =head2 Parameters
11 :    
12 :     The positional parameter is the name of the output directory. If it does not exist, it will be created.
13 :    
14 :     The standard input can be overriddn using the options in L<P3Utils/ih_options>.
15 :    
16 :     Additional command-line options are those given in L<P3Utils/col_options> plus the following
17 :     options.
18 :    
19 :     =over 4
20 :    
21 :     =item protein
22 :    
23 :     The description string of the desired protein role. The default is C<Phenylalanyl-tRNA synthetase alpha chain>.
24 :    
25 :     =item minlen
26 :    
27 :     The minimum acceptable length for the protein. The default is 209.
28 :    
29 :     =item maxlen
30 :    
31 :     The maximum acceptable length for the protein. The default is 485.
32 :    
33 :     =item clear
34 :    
35 :     Clear the output directory if it already exists. The default is to leave existing files in place.
36 :    
37 :     =back
38 :    
39 :     =cut
40 :    
41 :     use strict;
42 :     use P3DataAPI;
43 :     use P3Utils;
44 :     use Stats;
45 :     use File::Copy::Recursive;
46 :    
47 :     $| = 1;
48 :     # Get the command-line options.
49 :     my $opt = P3Utils::script_opts('outDir', P3Utils::col_options(), P3Utils::ih_options(),
50 :     ['protein=s', 'protein role description', { default => 'Phenylalanyl-tRNA synthetase alpha chain'}],
51 :     ['minlen=i', 'minimum protein length', { default => 209 }],
52 :     ['maxlen=i', 'maximum protein length', { default => 485 }],
53 :     ['clear', 'clear the output directory if it exists']
54 :     );
55 :     # Get the output directory name.
56 :     my ($outDir) = @ARGV;
57 :     if (! $outDir) {
58 :     die "No output directory specified.";
59 :     } elsif (! -d $outDir) {
60 :     print "Creating directory $outDir.\n";
61 :     File::Copy::Recursive::pathmk($outDir) || die "Could not create $outDir: $!";
62 :     } elsif ($opt->clear) {
63 :     print "Erasing directory $outDir.\n";
64 :     File::Copy::Recursive::pathempty($outDir) || die "Error clearing $outDir: $!";
65 :     }
66 :     # Create the statistics object.
67 :     my $stats = Stats->new();
68 :     # Create a filter from the protein name.
69 :     my @filter = (['eq', 'product', $opt->protein]);
70 :     # Create a list of the columns we want.
71 :     my @cols = qw(genome_name patric_id aa_sequence);
72 :     # Get the length options.
73 :     my $minlen = $opt->minlen;
74 :     my $maxlen = $opt->maxlen;
75 :     # Open the output files.
76 :     print "Setting up files.\n";
77 :     open(my $gh, '>', "$outDir/complete.genomes") || die "Could not open genome output file: $!";
78 :     open(my $fh, '>', "$outDir/6.1.1.20.fasta") || die "Could not open FASTA output file: $!";
79 :     # Get access to PATRIC.
80 :     my $p3 = P3DataAPI->new();
81 :     # Open the input file.
82 :     my $ih = P3Utils::ih($opt);
83 :     # Read the incoming headers.
84 :     my ($outHeaders, $keyCol) = P3Utils::process_headers($ih, $opt);
85 :     # Count the batches of input.
86 :     my $batches = 0;
87 :     # Loop through the input.
88 :     while (! eof $ih) {
89 :     $batches++;
90 :     print "Processing batch $batches.\n";
91 :     my $couplets = P3Utils::get_couplets($ih, $keyCol, $opt);
92 :     # Convert the couplets to contain only genome IDs.
93 :     my @couples = map { [$_->[0], [$_->[0]]] } @$couplets;
94 :     $stats->Add(genomeRead => scalar @couples);
95 :     # Get the features of interest for these genomes.
96 :     my $protList = P3Utils::get_data($p3, feature => \@filter, \@cols, genome_id => \@couples);
97 :     # Collate them by genome ID, discarding the nulls.
98 :     my %proteins;
99 :     for my $prot (@$protList) {
100 :     my ($genome, $name, $fid, $sequence) = @$prot;
101 :     if ($fid) {
102 :     push @{$proteins{$genome}}, [$name, $sequence];
103 :     $stats->Add(protFound => 1);
104 :     }
105 :     }
106 :     # Process the genomes one at a time.
107 :     for my $genome (keys %proteins) {
108 :     my @prots = @{$proteins{$genome}};
109 :     $stats->Add(genomeFound => 1);
110 :     if (scalar @prots > 1) {
111 :     # Skip if we have multiple proteins.
112 :     $stats->Add(multiProt => 1);
113 :     } else {
114 :     # Get the genome name and sequence, then check the length of the sequence.
115 :     my ($name, $seq) = @{$prots[0]};
116 :     my $len = length($seq);
117 :     if ($len < $minlen) {
118 :     $stats->Add(protTooShort => 1);
119 :     } elsif ($len > $maxlen) {
120 :     $stats->Add(protTooLong => 1);
121 :     } else {
122 :     # Here we have a good genome.
123 :     print $gh "$genome\t$name\n";
124 :     print $fh ">$genome\n$seq\n";
125 :     $stats->Add(genomeOut => 1);
126 :     }
127 :     }
128 :     }
129 :     }
130 :     print "All done.\n" . $stats->Show();

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3