[Bio] / FigMetagenomeTools / countfastachars.pl Repository:
ViewVC logotype

Annotation of /FigMetagenomeTools/countfastachars.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : olson 1.1 #!/usr/bin/perl -w
2 :    
3 :     # count fasta characters
4 :    
5 :     use strict;
6 :    
7 :     my $rewrite; # should we rewrite the file?
8 :     my $summary; #do we just want a summary
9 :     my $protein; # is it a protein
10 :     unless (@ARGV) {die "$0: <-r rewrite file> <-s summary> <-p protein files> <fasta files or sequences to count>\n"}
11 :     foreach my $file (@ARGV) {
12 :     if ($file eq "-r") {$rewrite=1; next}
13 :     if ($file eq "-s") {$summary=1; next}
14 :     if ($file eq "-p") {$protein=1; next}
15 :     if (-e $file) {
16 :     my $runningtotal; my $number;
17 :     my %long; my %short; $long{'length'}=1; $short{'length'}=1000000000; # use these to store shortest and longest
18 :     if ($file =~ /\.gz$/) {open(IN, "gunzip -c $file|") || die "Can't open pipe to gunzip"}
19 :     else {open (IN, $file)|| die "Can't open $file\n"}
20 :     unless ($summary) {
21 :     open (OUT, ">$file.count") || die "Can't open $file.count for writing\n";
22 :     if ($protein) {print OUT "Sequence\tLength\tRunning Total\n"}
23 :     else {print OUT "Sequence\tA\tG\tC\tT\tN\tLine total\tRunning total\n"}
24 :     }
25 :     if ($rewrite) {open REWRITE, ">$file.fasta" || die "Can't open $file.fasta for writing\n"}
26 :     my $seq; my $tag;
27 :     while (<IN>) {
28 :     chomp;
29 :     if (/^>/) {
30 :     if ($seq) {
31 :     my $length=length($seq);
32 :     if ($length > $long{'length'}) {%long=('length'=>$length, 'tag'=>$tag)}
33 :     if ($length < $short{'length'}) {%short=('length'=>$length, 'tag'=>$tag)}
34 :     if ($protein) {
35 :     $runningtotal+=length($seq); $number++;
36 :     unless ($summary) {print OUT "$tag\t", length($seq), "\t", $runningtotal, "\n"}
37 :     undef $seq;
38 :     }
39 :     else {
40 :     my $a = $seq =~ s/a/A/ig || '0';
41 :     my $g = $seq =~ s/g/G/ig || '0';
42 :     my $c = $seq =~ s/c/C/ig || '0';
43 :     my $t = $seq =~ s/t/T/ig || '0';
44 :     my $n = $seq =~ s/n/N/ig || '0';
45 :     my $total = $a+$g+$c+$t+$n;
46 :     $runningtotal+=$total; $number++;
47 :     undef $seq;
48 :     unless ($total == $length) {print STDERR "WARNING: Sequence contains letters that are not A, G, C, T, or N\n"}
49 :     if ($rewrite) {$seq =~ s/(.{0,60})/$1\n/g; chomp($seq); print REWRITE "$tag\n$seq"}
50 :     unless ($summary) {print OUT "$tag\t$a\t$g\t$c\t$t\t$n\t$total\t$runningtotal\n"}
51 :     }
52 :     }
53 :     $tag=$_;
54 :     }
55 :     else {s/\s//g; $seq.=$_}
56 :     }
57 :     my $length=length($seq);
58 :     if ($length > $long{'length'}) {%long=('length'=>$length, 'tag'=>$tag)}
59 :     if ($length < $short{'length'}) {%short=('length'=>$length, 'tag'=>$tag)}
60 :     if ($protein) {
61 :     $runningtotal+=length($seq); $number++;
62 :     unless ($summary) {print OUT "$tag\t", length($seq), "\t$runningtotal\n"}
63 :     }
64 :     else {
65 :     my $a = $seq =~ s/a/A/ig || '0';
66 :     my $g = $seq =~ s/g/G/ig || '0';
67 :     my $c = $seq =~ s/c/C/ig || '0';
68 :     my $t = $seq =~ s/t/T/ig || '0';
69 :     my $n = $seq =~ s/n/N/ig || '0';
70 :     my $total = $a+$g+$c+$t+$n;
71 :     undef $seq;
72 :     $runningtotal+=$total; $number++;
73 :     unless ($total == $length) {print STDERR "WARNING: Sequence contains letters that are not A, G, C, T, or N\n"}
74 :     if ($rewrite) {$seq =~ s/(.{0,60})/$1\n/g; chomp($seq); print REWRITE "$tag\n$seq"}
75 :     unless ($summary) {print OUT "$tag\t$a\t$g\t$c\t$t\t$n\t$total\t$runningtotal\n"}
76 :     }
77 :     if ($summary) {print "$file: number of seqs:$number total: $runningtotal, average: ", $runningtotal/$number, ", Shortest: $short{'tag'} ($short{'length'}) Longest: $long{'tag'} ($long{'length'})\n"}
78 :     }
79 :     else {
80 :     my $seq = $file;
81 :     my $length=length($seq);
82 :     my $a = $seq =~ s/a/A/ig || '0';
83 :     my $g = $seq =~ s/g/G/ig || '0';
84 :     my $c = $seq =~ s/c/C/ig || '0';
85 :     my $t = $seq =~ s/t/T/ig || '0';
86 :     my $n = $seq =~ s/n/N/ig || '0';
87 :     my $total = $a+$g+$c+$t+$n;
88 :     unless ($total == $length) {print STDERR "WARNING: Sequence contains letters that are not A, G, C, T, or N\n"}
89 :     print "A\tG\tC\tT\tN\tLine total\n";
90 :     print "$a\t$g\t$c\t$t\t$n\t$total\n";
91 :     }
92 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3