[Bio] / Sprout / AAFrequencies.pl Repository:
ViewVC logotype

Annotation of /Sprout/AAFrequencies.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     =head1 Amino Acid Frequency Script
4 :    
5 :     This script reads a protein FASTA file and displays the number of times
6 :     each amino acid occurs. In addition, a SEED organism directory can be specified
7 :     and all of the proteins in all of the genomes will be counted.
8 :    
9 :     The currently-supported command-line options are as follows.
10 :    
11 :     =over 4
12 :    
13 :     =item figDir
14 :    
15 :     If specified, the input file name will be assumed to be a SEED organism directory. The protein
16 :     FASTA files for all the organisms will be scanned.
17 :    
18 :     =back
19 :    
20 :     There is one positional parameter: the name of the input FASTA file or organism directory,
21 :    
22 :    
23 :     =cut
24 :    
25 :     use strict;
26 :     use Stats;
27 :     use Getopt::Long;
28 :    
29 :     # Create the statistics object.
30 :     my $stats = Stats->new();
31 :     # Get the command-line options.
32 :     my $figDir;
33 :     my $ok = GetOptions(figDir => \$figDir);
34 :     my ($inFile) = @ARGV;
35 :     if (! $ok || ! $inFile) {
36 :     die "AAFrequencies [ --figDir ] infile";
37 :     }
38 :     # Compute the input files.
39 :     my @files;
40 :     if (! $figDir) {
41 :     if (! -f $inFile) {
42 :     die "Input file $inFile not found.";
43 :     } else {
44 :     @files = ($inFile);
45 :     }
46 :     } else {
47 :     # Here we have a SEED organism directory.
48 :     my $dh;
49 :     if (! opendir($dh, $inFile)) {
50 :     die "Could not open SEED directory $inFile.";
51 :     } else {
52 :     my @genomes = grep { $_ =~ /^\d+\.\d+$/ } readdir($dh);
53 :     closedir $dh;
54 :     print scalar(@genomes) . " genome directories found.\n";
55 :     for my $genome (@genomes) {
56 :     $stats->Add(genomes => 1);
57 :     my $file = "$inFile/$genome/Features/peg/fasta";
58 :     if (-f $file) {
59 :     push @files, $file;
60 :     }
61 :     }
62 :     }
63 :     }
64 :     # The counts will go in here.
65 :     my %counts;
66 :     # Process each FASTA file individually.
67 :     print scalar(@files) . " files to process.\n";
68 :     for my $file (@files) {
69 :     print "Processing $file.\n";
70 :     $stats->Add(inFiles => 1);
71 :     my $ih;
72 :     if (! open($ih, "<$file")) {
73 :     print "Could not open file: $!\n";
74 :     $stats->Add(fileFail => 1);
75 :     } else {
76 :     # Loop through the FASTA file.
77 :     while (! eof $ih) {
78 :     my $line = <$ih>;
79 :     chomp $line;
80 :     $stats->Add(lineIn => 1);
81 :     # Check for a header line.
82 :     if (substr($line, 0, 1) eq ">") {
83 :     $stats->Add(headerIn => 1);
84 :     } else {
85 :     # Here we have a data line.
86 :     $stats->Add(dataIn => 1);
87 :     # Loop through the letters.
88 :     for my $letter (split "", $line) {
89 :     $stats->Add(lettersIn => 1);
90 :     $counts{$letter}++;
91 :     }
92 :     }
93 :     }
94 :     }
95 :     }
96 :     # Sort the results and output them.
97 :     print "Result counts.\n\n";
98 :     my @letters = sort { $counts{$b} <=> $counts{$a} } keys %counts;
99 :     for my $letter (@letters) {
100 :     my $number = $counts{$letter};
101 :     if (length($number) < 10) {
102 :     $number = (" " x (10 - length($number))) . $number;
103 :     }
104 :     print "$letter $number\n";
105 :     }
106 :     print "\nAll done:\n" . $stats->Show();

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3