[Bio] / Sprout / AAFrequencies.pl Repository:
ViewVC logotype

View of /Sprout/AAFrequencies.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Tue Oct 7 17:12:21 2014 UTC (4 years, 6 months ago) by parrello
Branch: MAIN
CVS Tags: HEAD
Miscellaneous loader fixes.

#!/usr/bin/perl -w

=head1 Amino Acid Frequency Script

This script reads a protein FASTA file and displays the number of times
each amino acid occurs. In addition, a SEED organism directory can be specified
and all of the proteins in all of the genomes will be counted.

The currently-supported command-line options are as follows.

=over 4

=item figDir

If specified, the input file name will be assumed to be a SEED organism directory. The protein 
FASTA files for all the organisms will be scanned.
  
=back

There is one positional parameter: the name of the input FASTA file or organism directory,


=cut

use strict;
use Stats;
use Getopt::Long;

# Create the statistics object.
my $stats = Stats->new();
# Get the command-line options.
my $figDir;
my $ok = GetOptions(figDir => \$figDir);
my ($inFile) = @ARGV;
if (! $ok || ! $inFile) {
	die "AAFrequencies [ --figDir ] infile";
}
# Compute the input files.
my @files;
if (! $figDir) {
	if (! -f $inFile) {
		die "Input file $inFile not found.";
	} else {
		@files = ($inFile);
	}
} else {
	# Here we have a SEED organism directory.
	my $dh;
	if (! opendir($dh, $inFile)) {
		die "Could not open SEED directory $inFile.";
	} else {
		my @genomes = grep { $_ =~ /^\d+\.\d+$/ } readdir($dh);
		closedir $dh;
		print scalar(@genomes) . " genome directories found.\n";
		for my $genome (@genomes) {
			$stats->Add(genomes => 1);
			my $file = "$inFile/$genome/Features/peg/fasta";
			if (-f $file) {
				push @files, $file;
			}
		}
	}
}
# The counts will go in here.
my %counts;
# Process each FASTA file individually.
print scalar(@files) . " files to process.\n";
for my $file (@files) {
	print "Processing $file.\n";
	$stats->Add(inFiles => 1);
	my $ih;
	if (! open($ih, "<$file")) {
		print "Could not open file: $!\n";
		$stats->Add(fileFail => 1);
	} else {
		# Loop through the FASTA file.
		while (! eof $ih) {
			my $line = <$ih>;
			chomp $line;
			$stats->Add(lineIn => 1);
			# Check for a header line.
			if (substr($line, 0, 1) eq ">") {
				$stats->Add(headerIn => 1);
			} else {
				# Here we have a data line.
				$stats->Add(dataIn => 1);
				# Loop through the letters.
				for my $letter (split "", $line) {
					$stats->Add(lettersIn => 1);
					$counts{$letter}++;
				}
			}
		}
	}
}
# Sort the results and output them.
print "Result counts.\n\n";
my @letters = sort { $counts{$b} <=> $counts{$a} } keys %counts;
for my $letter (@letters) {
	my $number = $counts{$letter};
	if (length($number) < 10) {
		$number = (" " x (10 - length($number))) . $number;
	}
	print "$letter $number\n";
}
print "\nAll done:\n" . $stats->Show();

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3