[Bio] / FigMetagenomeTools / countfastachars.pl Repository:
ViewVC logotype

Annotation of /FigMetagenomeTools/countfastachars.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : olson 1.1 #!/usr/bin/perl -w
2 :    
3 :     # count fasta characters
4 :    
5 : olson 1.2 my $have_meta;
6 :    
7 :     eval {
8 :     require GenomeMeta;
9 :     import GenomeMeta;
10 :     $have_meta++;
11 :     };
12 :    
13 : olson 1.1 use strict;
14 :    
15 :     my $rewrite; # should we rewrite the file?
16 :     my $summary; #do we just want a summary
17 :     my $protein; # is it a protein
18 : olson 1.2 my $meta;
19 :     my $meta_key = "countfasta.default";
20 :     unless (@ARGV) {die "$0: <-m metafile> <-k meta key> <-r rewrite file> <-s summary> <-p protein files> <fasta files or sequences to count>\n"}
21 :     while (@ARGV)
22 :     {
23 :     my $file = shift;
24 :     if ($file eq "-r") {$rewrite=1; next}
25 :     if ($file eq "-s") {$summary=1; next}
26 :     if ($file eq "-p") {$protein=1; next}
27 :     if ($file eq '-m')
28 :     {
29 :     $have_meta or die "metafile support not available\n";
30 :     $meta = new GenomeMeta(undef, shift);
31 :     next;
32 :     }
33 :     if ($file eq '-k')
34 :     {
35 :     $meta_key = shift;
36 :     next;
37 :     }
38 :     if (-e $file)
39 :     {
40 :     my $runningtotal; my $number;
41 :     my %long; my %short;
42 :     $long{'length'}=1;
43 :     $short{'length'}=1000000000; # use these to store shortest and longest
44 :    
45 :     if ($file =~ /\.gz$/)
46 :     {
47 :     open(IN, "gunzip -c $file|") || die "Can't open pipe to gunzip";
48 :     }
49 :     else
50 :     {
51 :     open (IN, $file)|| die "Can't open $file\n";
52 :     }
53 :    
54 :     unless ($summary) {
55 :     open (OUT, ">$file.count") || die "Can't open $file.count for writing\n";
56 :     if ($protein) {
57 :     print OUT "Sequence\tLength\tRunning Total\n";
58 :     }
59 :     else {
60 :     print OUT "Sequence\tA\tG\tC\tT\tN\tLine total\tRunning total\n";
61 :     }
62 :     }
63 :     if ($rewrite) {
64 :     open REWRITE, ">$file.fasta" || die "Can't open $file.fasta for writing\n";
65 :     }
66 :     my $seq; my $tag;
67 :     while (<IN>) {
68 :     chomp;
69 :     if (/^>/) {
70 :     if ($seq) {
71 :     my $length=length($seq);
72 :     if ($length > $long{'length'}) {%long=('length'=>$length, 'tag'=>$tag)}
73 :     if ($length < $short{'length'}) {%short=('length'=>$length, 'tag'=>$tag)}
74 :     if ($protein) {
75 :     $runningtotal+=length($seq); $number++;
76 :     unless ($summary) {print OUT "$tag\t", length($seq), "\t", $runningtotal, "\n"}
77 :     undef $seq;
78 :     }
79 :     else {
80 :     my $a = $seq =~ s/a/A/ig || '0';
81 :     my $g = $seq =~ s/g/G/ig || '0';
82 :     my $c = $seq =~ s/c/C/ig || '0';
83 :     my $t = $seq =~ s/t/T/ig || '0';
84 :     my $n = $seq =~ s/n/N/ig || '0';
85 :     my $total = $a+$g+$c+$t+$n;
86 :     $runningtotal+=$total; $number++;
87 :     undef $seq;
88 :     unless ($total == $length) {print STDERR "WARNING: Sequence contains letters that are not A, G, C, T, or N\n"}
89 :     if ($rewrite) {$seq =~ s/(.{0,60})/$1\n/g; chomp($seq); print REWRITE "$tag\n$seq"}
90 :     unless ($summary) {print OUT "$tag\t$a\t$g\t$c\t$t\t$n\t$total\t$runningtotal\n"}
91 :     }
92 :     }
93 :     $tag=$_;
94 :     }
95 :     else {s/\s//g; $seq.=$_}
96 :     }
97 :     close(IN);
98 :    
99 :     my $length=length($seq);
100 :     if ($length > $long{'length'}) {%long=('length'=>$length, 'tag'=>$tag)}
101 :     if ($length < $short{'length'}) {%short=('length'=>$length, 'tag'=>$tag)}
102 :     if ($protein) {
103 :     $runningtotal+=length($seq); $number++;
104 :     unless ($summary) {print OUT "$tag\t", length($seq), "\t$runningtotal\n"}
105 :     }
106 :     else {
107 :     my $a = $seq =~ s/a/A/ig || '0';
108 :     my $g = $seq =~ s/g/G/ig || '0';
109 :     my $c = $seq =~ s/c/C/ig || '0';
110 :     my $t = $seq =~ s/t/T/ig || '0';
111 :     my $n = $seq =~ s/n/N/ig || '0';
112 :     my $total = $a+$g+$c+$t+$n;
113 :     undef $seq;
114 :     $runningtotal+=$total; $number++;
115 :     unless ($total == $length) {print STDERR "WARNING: Sequence contains letters that are not A, G, C, T, or N\n"}
116 :     if ($rewrite) {$seq =~ s/(.{0,60})/$1\n/g; chomp($seq); print REWRITE "$tag\n$seq"}
117 :     unless ($summary) {print OUT "$tag\t$a\t$g\t$c\t$t\t$n\t$total\t$runningtotal\n"}
118 :     }
119 :     if ($summary) {
120 :     print "$file: number of seqs:$number total: $runningtotal, average: ", $runningtotal/$number, ", Shortest: $short{'tag'} ($short{'length'}) Longest: $long{'tag'} ($long{'length'})\n";
121 :     }
122 :    
123 :     if ($meta)
124 :     {
125 :     $meta->set_metadata("${meta_key}.file", $file);
126 :     $meta->set_metadata("${meta_key}.num_seqs", $number);
127 :     $meta->set_metadata("${meta_key}.total", $runningtotal);
128 :     $meta->set_metadata("${meta_key}.average", $runningtotal / $number);
129 :     $meta->set_metadata("${meta_key}.shortest_seq", $short{'tag'});
130 :     $meta->set_metadata("${meta_key}.shortest_len", $short{'length'});
131 :     $meta->set_metadata("${meta_key}.longest_seq", $long{'tag'});
132 :     $meta->set_metadata("${meta_key}.longest_len", $long{'length'});
133 :     }
134 :     }
135 :     else {
136 :     my $seq = $file;
137 :     my $length=length($seq);
138 :     my $a = $seq =~ s/a/A/ig || '0';
139 :     my $g = $seq =~ s/g/G/ig || '0';
140 :     my $c = $seq =~ s/c/C/ig || '0';
141 :     my $t = $seq =~ s/t/T/ig || '0';
142 :     my $n = $seq =~ s/n/N/ig || '0';
143 :     my $total = $a+$g+$c+$t+$n;
144 :     unless ($total == $length) {print STDERR "WARNING: Sequence contains letters that are not A, G, C, T, or N\n"}
145 :     print "A\tG\tC\tT\tN\tLine total\n";
146 :     print "$a\t$g\t$c\t$t\t$n\t$total\n";
147 : olson 1.1 }
148 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3