[Bio] / Sprout / CommonFunctions.pl Repository:
ViewVC logotype

View of /Sprout/CommonFunctions.pl

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.1 - (download) (as text) (annotate)
Tue Oct 7 17:12:21 2014 UTC (4 years, 4 months ago) by parrello
Branch: MAIN
Miscellaneous loader fixes.

#!/usr/bin/perl -w

=head1 Common Functions Script

This script reads an assigned-functions file and outputs the number of times
each function occurs. In addition, a SEED organism directory can be specified
and all of the assignments in all of the genomes will be counted. Suspicious
functional assignments (ones that are too long, or have odd punctuation), will
be ignored. Only CDS functions will be processed.

The currently-supported command-line options are as follows.

=over 4

=item figDir

If specified, the input file name will be assumed to be a SEED organism directory. The protein 
FASTA files for all the organisms will be scanned.

=item keep

The number of functional assignments to keep in the final output. The default is 65536.

There are two positional parameters the name of the input assigned functions file or organism directory, and
the name of an output directory. Three output files are produced in this directory, all in tab-delimited

=over 4

=item functions.tbl

A map of MD5 codes to functional assignment strings, unsorted.

=item counts.tbl

A map of MD5 codes to ocurrence counts, sorted by count in descending order.

=item bestfuns.tbl

A list of the most common functional assignments, containing the MD5 code, the count, and
the functional assignment, sorted by count in descending order.



use strict;
use Stats;
use Getopt::Long;
use SeedUtils;
use Digest::MD5;

# Create the statistics object.
my $stats = Stats->new();
# Get the command-line options.
my $figDir;
my $keep = 65536;
my $ok = GetOptions(figDir => \$figDir, "keep=i" => \$keep);
my ($inFile, $outDir) = @ARGV;
if (! $ok || ! $inFile || ! $outDir) {
	die "CommonFunctions [ --figDir --keep=65536 ] infile outDir";
# Compute the input files.
my @files;
if (! $figDir) {
	if (! -f $inFile) {
		die "Input file $inFile not found.";
	} else {
		@files = ($inFile);
} else {
	# Here we have a SEED organism directory.
	my $dh;
	if (! opendir($dh, $inFile)) {
		die "Could not open SEED directory $inFile.";
	} else {
		my @genomes = grep { $_ =~ /^\d+\.\d+$/ } readdir($dh);
		closedir $dh;
		print scalar(@genomes) . " genome directories found.\n";
		for my $genome (@genomes) {
			$stats->Add(genomes => 1);
			my $file = "$inFile/$genome/assigned_functions";
			if (-f $file) {
				push @files, $file;
# Check the output directory.
if (! -d $outDir) {
	die "Output directory $outDir not found.";
# The counts will go in here, keyed by MD5.
my %counts;
# Open the output file for the MD5-to-function map.
my $mh;
open($mh, ">$outDir/functions.tbl") || die "Could not open function file: $!";
# Process each assignment file individually.
print scalar(@files) . " files to process.\n";
for my $file (@files) {
	print "Processing $file.\n";
	$stats->Add(inFiles => 1);
	my $ih;
	if (! open($ih, "<$file")) {
		print "Could not open file: $!\n";
		$stats->Add(fileFail => 1);
	} else {
		# This hash collects all the assignments by PEG ID so that overridden assignments can be ignored.
		my %pegs;
		# Loop through the assignment file.
		while (! eof $ih) {
			# Read this assignment.
			my ($fid, $function) = split /\t|\n/, <$ih>;
			$stats->Add(lineIn => 1);
			# Check to insure we want to keep this function.
			if ($fid !~ /peg/) {
				$stats->Add(nonPeg => 1);
			} else {
				my ($roles, $errors) = SeedUtils::roles_for_loading($function);
				if (! defined $roles) {
					# Function appears suspicious.
					$stats->Add(invalidFunction => 1);
				} elsif ($errors) {
					# Functions roles are too long, indicating explanatory text.
					$stats->Add(functionLongRoles => 1);
				} else {
					# Here the function is valid and belongs to a PEG. Compute its
					# MD5.
					my $md5 = Digest::MD5::md5_hex($function);
					# Check to see if we are replacing an existing assignment.
					if (exists $pegs{$fid}) {
						# Yes we are. We expect this to be rare. We must back out the
						# old assignment.
						$stats->Add(functionReplaced => 1);
					# Record this function.
					if (! exists $counts{$md5}) {
						# It's a new function, so output the MD5.
						print $mh "$md5\t$function\n";
						$stats->Add(uniqueFunctions => 1);
					$pegs{$fid} = $md5;
					$stats->Add(functionCounted => 1);
# Close the MD5 map file.
close $mh;
undef $mh;
# Output the MD5-to-count map.
my $ch;
open($ch, "| sort -k2,2nr -t\"\t\" >$outDir/counts.tbl") || die "Could not open counts file: $!";
print "Producing count file\n";
for my $md5 (keys %counts) {
	my $count = $counts{$md5};
	if ($count) {
		$stats->Add(countOut => 1);
		print $ch "$md5\t$count\n";
# Release the memory for the counts hash.
%counts = ();
# Close the count file.
close $ch;
undef $ch;
# Now we read the first N entries in the counts output back into the hash.
print "Rereading counts file.\n";
open($ch, "<$outDir/counts.tbl") || die "Could not reopen counts file: $!";
for (my $i = 0; $i < $keep && ! eof $ch; $i++) {
	my ($md5, $count) = split /\t|\n/, <$ch>;
	$stats->Add(countIn => 1);
	$counts{$md5} = $count;
close $ch;
# Finally, we use the MD5 map to match these counts to the actual function strings. We will
# read the whole MD5 map, and output a line for each MD5 found in the counts hash, sorting
# the result.
print "Producing output file.\n";
open($mh, "<$outDir/functions.tbl") || die "Could not reopen function file: $!";
open(my $oh, "| sort -k2,2nr -t\"\t\" >$outDir/bestfuns.tbl") || die "Could not open output file: $!";
while (! eof $mh) {
	my ($md5, $function) = split /\t|\n/, <$mh>;
	$stats->Add(md5In => 1);
	# Check to see if this is a function we're interested in.
	my $count = $counts{$md5};
	if ($count) {
		print $oh "$md5\t$count\t$function\n";
		$stats->Add(outputLine => 1);
# Close the files.
close $mh;
close $oh;
# All done.
print "All done:\n" . $stats->Show();

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3