Parent Directory
|
Revision Log
Revision 1.2 - (view) (download) (as text)
1 : | parrello | 1.1 | #!/usr/bin/perl -w |
2 : | use strict; | ||
3 : | |||
4 : | use Getopt::Long; | ||
5 : | use SAPserver; | ||
6 : | use ScriptThing; | ||
7 : | |||
8 : | # | ||
9 : | # This is a SAS Component. | ||
10 : | # | ||
11 : | |||
12 : | =head1 svr_summarize_contigs | ||
13 : | |||
14 : | svr_summarize_contigs <genome_ids.tbl >genome_data.tbl | ||
15 : | |||
16 : | For each incoming genome ID, return statistics about its contigs. | ||
17 : | |||
18 : | This script takes as input a tab-delimited file with genome IDs at the end of each | ||
19 : | line. For each genome ID, a single output line is produced containing statistics | ||
20 : | about the genome's contigs. | ||
21 : | |||
22 : | This is a pipe command: the input is taken from the standard input and the output | ||
23 : | is to the standard output. | ||
24 : | |||
25 : | The data fields produced for each genome (and which are appended to each output | ||
26 : | line in this order) are as follows: | ||
27 : | |||
28 : | =over 4 | ||
29 : | |||
30 : | =item 1 | ||
31 : | |||
32 : | number of contigs | ||
33 : | |||
34 : | =item 2 | ||
35 : | |||
36 : | mean contig length | ||
37 : | |||
38 : | =item 3 | ||
39 : | |||
40 : | median contig length | ||
41 : | |||
42 : | =item 4 | ||
43 : | |||
44 : | total number of base pairs | ||
45 : | |||
46 : | =item 5 | ||
47 : | |||
48 : | total number of ambiguity characters | ||
49 : | |||
50 : | =item 6 | ||
51 : | |||
52 : | total number of GC pairs | ||
53 : | |||
54 : | =back | ||
55 : | |||
56 : | =head2 Command-Line Options | ||
57 : | |||
58 : | =over 4 | ||
59 : | |||
60 : | =item url | ||
61 : | |||
62 : | The URL for the Sapling server, if it is to be different from the default. | ||
63 : | |||
64 : | =item c | ||
65 : | |||
66 : | Column index. If specified, indicates that the input IDs should be taken from the | ||
67 : | indicated column instead of the last column. The first column is column 1. | ||
68 : | |||
69 : | =back | ||
70 : | |||
71 : | =cut | ||
72 : | |||
73 : | # Parse the command-line options. | ||
74 : | my $url = ''; | ||
75 : | my $column = ''; | ||
76 : | my $opted = GetOptions('url=s' => \$url, 'c=i' => \$column); | ||
77 : | if (! $opted) { | ||
78 : | parrello | 1.2 | print "usage: svr_summarize_contigs [--url=http://...] [--c=N] <input >output\n"; |
79 : | parrello | 1.1 | } else { |
80 : | # Get the server object. | ||
81 : | my $sapServer = SAPserver->new(url => $url); | ||
82 : | # The main loop processes chunks of input. | ||
83 : | while (my @tuples = ScriptThing::GetBatch(\*STDIN, undef, $column)) { | ||
84 : | # Ask the server for results. | ||
85 : | my $document = $sapServer->genome_contigs(-ids => [map { $_->[0] } @tuples]); | ||
86 : | # Loop through the IDs, producing output. | ||
87 : | for my $tuple (@tuples) { | ||
88 : | my ($id, $line) = @$tuple; | ||
89 : | # Get this genome's data. | ||
90 : | my $contigList = $document->{$id}; | ||
91 : | # Did we get something? | ||
92 : | if (! $contigList) { | ||
93 : | # No. Write an error notification. | ||
94 : | print STDERR "None found: $id\n"; | ||
95 : | } else { | ||
96 : | # Yes. Get the DNA for each contig. | ||
97 : | my $contigHash = $sapServer->contig_sequences(-ids => $contigList); | ||
98 : | # We'll accumulate our base-pair totals in here. | ||
99 : | my ($allPairs, $ambigPairs, $gcPairs) = (0, 0, 0); | ||
100 : | # This will contain a list of the contig lengths. | ||
101 : | my @lengths; | ||
102 : | # Loop through the contig DNA. | ||
103 : | for my $contigID (@$contigList) { | ||
104 : | my $dna = $contigHash->{$contigID}; | ||
105 : | # Record the length. | ||
106 : | my $length = length $dna; | ||
107 : | push @lengths, $length; | ||
108 : | $allPairs += $length; | ||
109 : | # Record the GC pairs. | ||
110 : | $gcPairs += $dna =~ tr/gcGC//; | ||
111 : | # Count the ambiguity letters. | ||
112 : | $ambigPairs++ while ($dna =~ /[^agctu]/ig); | ||
113 : | } | ||
114 : | # Compute the mean and median contig length. | ||
115 : | my ($mean, $median); | ||
116 : | my $contigCount = scalar @lengths; | ||
117 : | if ($contigCount == 0) { | ||
118 : | # If there are no contigs, we return 0 for both values. | ||
119 : | $mean = 0; | ||
120 : | $median = 0; | ||
121 : | } else { | ||
122 : | # Here the mean and median have actual values. First, the mean. | ||
123 : | $mean = $allPairs / $contigCount; | ||
124 : | # Sort the contig lengths and locate the middle. | ||
125 : | my @sorted = sort @lengths; | ||
126 : | my $midPoint = int($contigCount / 2); | ||
127 : | $median = $sorted[$midPoint]; | ||
128 : | if ($contigCount % 2 == 0) { | ||
129 : | # We have an even number of contigs, so the median is between the | ||
130 : | # middle values. | ||
131 : | $median = ($median + $sorted[$midPoint - 1]) / 2; | ||
132 : | } else { | ||
133 : | # Odd number of contigs: the median is the middle value. | ||
134 : | $median = $sorted[$midPoint]; | ||
135 : | } | ||
136 : | } | ||
137 : | # We have our analysis. Produce the output line for this genome. | ||
138 : | print join("\t", $line, $contigCount, $mean, $median, $allPairs, $ambigPairs, $gcPairs) . "\n"; | ||
139 : | } | ||
140 : | } | ||
141 : | } | ||
142 : | } | ||
143 : |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |