[Bio] / FigKernelScripts / svr_summarize_contigs.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/svr_summarize_contigs.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :     use strict;
3 :    
4 :     use Getopt::Long;
5 :     use SAPserver;
6 :     use ScriptThing;
7 :    
8 :     #
9 :     # This is a SAS Component.
10 :     #
11 :    
12 :     =head1 svr_summarize_contigs
13 :    
14 :     svr_summarize_contigs <genome_ids.tbl >genome_data.tbl
15 :    
16 :     For each incoming genome ID, return statistics about its contigs.
17 :    
18 :     This script takes as input a tab-delimited file with genome IDs at the end of each
19 :     line. For each genome ID, a single output line is produced containing statistics
20 :     about the genome's contigs.
21 :    
22 :     This is a pipe command: the input is taken from the standard input and the output
23 :     is to the standard output.
24 :    
25 :     The data fields produced for each genome (and which are appended to each output
26 :     line in this order) are as follows:
27 :    
28 :     =over 4
29 :    
30 :     =item 1
31 :    
32 :     number of contigs
33 :    
34 :     =item 2
35 :    
36 :     mean contig length
37 :    
38 :     =item 3
39 :    
40 :     median contig length
41 :    
42 :     =item 4
43 :    
44 :     total number of base pairs
45 :    
46 :     =item 5
47 :    
48 :     total number of ambiguity characters
49 :    
50 :     =item 6
51 :    
52 :     total number of GC pairs
53 :    
54 :     =back
55 :    
56 :     =head2 Command-Line Options
57 :    
58 :     =over 4
59 :    
60 :     =item url
61 :    
62 :     The URL for the Sapling server, if it is to be different from the default.
63 :    
64 :     =item c
65 :    
66 :     Column index. If specified, indicates that the input IDs should be taken from the
67 :     indicated column instead of the last column. The first column is column 1.
68 :    
69 :     =back
70 :    
71 :     =cut
72 :    
73 :     # Parse the command-line options.
74 :     my $url = '';
75 :     my $column = '';
76 :     my $opted = GetOptions('url=s' => \$url, 'c=i' => \$column);
77 :     if (! $opted) {
78 : parrello 1.2 print "usage: svr_summarize_contigs [--url=http://...] [--c=N] <input >output\n";
79 : parrello 1.1 } else {
80 :     # Get the server object.
81 :     my $sapServer = SAPserver->new(url => $url);
82 :     # The main loop processes chunks of input.
83 :     while (my @tuples = ScriptThing::GetBatch(\*STDIN, undef, $column)) {
84 :     # Ask the server for results.
85 :     my $document = $sapServer->genome_contigs(-ids => [map { $_->[0] } @tuples]);
86 :     # Loop through the IDs, producing output.
87 :     for my $tuple (@tuples) {
88 :     my ($id, $line) = @$tuple;
89 :     # Get this genome's data.
90 :     my $contigList = $document->{$id};
91 :     # Did we get something?
92 :     if (! $contigList) {
93 :     # No. Write an error notification.
94 :     print STDERR "None found: $id\n";
95 :     } else {
96 :     # Yes. Get the DNA for each contig.
97 :     my $contigHash = $sapServer->contig_sequences(-ids => $contigList);
98 :     # We'll accumulate our base-pair totals in here.
99 :     my ($allPairs, $ambigPairs, $gcPairs) = (0, 0, 0);
100 :     # This will contain a list of the contig lengths.
101 :     my @lengths;
102 :     # Loop through the contig DNA.
103 :     for my $contigID (@$contigList) {
104 :     my $dna = $contigHash->{$contigID};
105 :     # Record the length.
106 :     my $length = length $dna;
107 :     push @lengths, $length;
108 :     $allPairs += $length;
109 :     # Record the GC pairs.
110 :     $gcPairs += $dna =~ tr/gcGC//;
111 :     # Count the ambiguity letters.
112 :     $ambigPairs++ while ($dna =~ /[^agctu]/ig);
113 :     }
114 :     # Compute the mean and median contig length.
115 :     my ($mean, $median);
116 :     my $contigCount = scalar @lengths;
117 :     if ($contigCount == 0) {
118 :     # If there are no contigs, we return 0 for both values.
119 :     $mean = 0;
120 :     $median = 0;
121 :     } else {
122 :     # Here the mean and median have actual values. First, the mean.
123 :     $mean = $allPairs / $contigCount;
124 :     # Sort the contig lengths and locate the middle.
125 :     my @sorted = sort @lengths;
126 :     my $midPoint = int($contigCount / 2);
127 :     $median = $sorted[$midPoint];
128 :     if ($contigCount % 2 == 0) {
129 :     # We have an even number of contigs, so the median is between the
130 :     # middle values.
131 :     $median = ($median + $sorted[$midPoint - 1]) / 2;
132 :     } else {
133 :     # Odd number of contigs: the median is the middle value.
134 :     $median = $sorted[$midPoint];
135 :     }
136 :     }
137 :     # We have our analysis. Produce the output line for this genome.
138 :     print join("\t", $line, $contigCount, $mean, $median, $allPairs, $ambigPairs, $gcPairs) . "\n";
139 :     }
140 :     }
141 :     }
142 :     }
143 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3