[Bio] / FigMetagenomeTools / countfastachars.pl Repository:
ViewVC logotype

View of /FigMetagenomeTools/countfastachars.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Fri Mar 16 20:51:09 2007 UTC (12 years, 9 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_rel_2008_0806, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_rel_2008_0923, mgrast_release_3_0, mgrast_dev_03252011, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, mgrast_rel_2008_0919, mgrast_rel_2008_1110, myrast_33, mgrast_rel_2008_0917, mgrast_dev_04052011, mgrast_dev_02222011, HEAD
Changes since 1.1: +137 -81 lines
add genomemeta support

#!/usr/bin/perl -w

# count fasta characters

my $have_meta;

eval {
    require GenomeMeta;
    import GenomeMeta;
    $have_meta++;
};

use strict;

my $rewrite; # should we rewrite the file?
my $summary; #do we just want a summary
my $protein; # is it a protein
my $meta;
my $meta_key = "countfasta.default";
unless (@ARGV) {die "$0: <-m metafile> <-k meta key> <-r rewrite file> <-s summary> <-p protein files> <fasta files or sequences to count>\n"}
while (@ARGV)
{
    my $file = shift;
    if ($file eq "-r") {$rewrite=1; next}
    if ($file eq "-s") {$summary=1; next}
    if ($file eq "-p") {$protein=1; next}
    if ($file eq '-m')
    {
	$have_meta or die "metafile support not available\n";
	$meta = new GenomeMeta(undef, shift);
	next;
    }
    if ($file eq '-k')
    {
	$meta_key = shift;
	next;
    }
    if (-e $file)
    {
	my $runningtotal; my $number;
	my %long; my %short;
	$long{'length'}=1;
	$short{'length'}=1000000000; # use these to store shortest and longest
	
	if ($file =~ /\.gz$/)
	{
	    open(IN, "gunzip -c $file|") || die "Can't open pipe to gunzip";
	}
	else
	{
	    open (IN, $file)|| die "Can't open $file\n";
	}
	
	unless ($summary) {
	    open (OUT, ">$file.count") || die "Can't open $file.count for writing\n";
	    if ($protein) {
		print OUT "Sequence\tLength\tRunning Total\n";
	    }
	    else {
		print OUT "Sequence\tA\tG\tC\tT\tN\tLine total\tRunning total\n";
	    }
	}
	if ($rewrite) {
	    open REWRITE, ">$file.fasta" || die "Can't open $file.fasta for writing\n";
	}
	my $seq; my $tag;
	while (<IN>) {
	    chomp;
	    if (/^>/) {
		if ($seq) {
		    my $length=length($seq);
		    if ($length > $long{'length'}) {%long=('length'=>$length, 'tag'=>$tag)}
		    if ($length < $short{'length'}) {%short=('length'=>$length, 'tag'=>$tag)}
		    if ($protein) {
			$runningtotal+=length($seq); $number++;
			unless ($summary) {print OUT "$tag\t", length($seq), "\t", $runningtotal, "\n"}
			undef $seq;
		    }
		    else {
			my $a = $seq =~ s/a/A/ig || '0';
			my $g = $seq =~ s/g/G/ig || '0';
			my $c = $seq =~ s/c/C/ig || '0';
			my $t = $seq =~ s/t/T/ig || '0';
			my $n = $seq =~ s/n/N/ig || '0';
			my $total = $a+$g+$c+$t+$n;
			$runningtotal+=$total; $number++;
			undef $seq;
			unless ($total == $length) {print STDERR "WARNING: Sequence contains letters that are not A, G, C, T, or N\n"}
			if ($rewrite) {$seq =~ s/(.{0,60})/$1\n/g; chomp($seq); print REWRITE "$tag\n$seq"}
			unless ($summary) {print OUT "$tag\t$a\t$g\t$c\t$t\t$n\t$total\t$runningtotal\n"}
		    }
		}
		$tag=$_;
	    }
	    else {s/\s//g; $seq.=$_}
	}
	close(IN);
	
	my $length=length($seq);
	if ($length > $long{'length'}) {%long=('length'=>$length, 'tag'=>$tag)}
	if ($length < $short{'length'}) {%short=('length'=>$length, 'tag'=>$tag)}
	if ($protein) {
	    $runningtotal+=length($seq); $number++;
	    unless ($summary) {print OUT "$tag\t", length($seq), "\t$runningtotal\n"}
	}
	else {
	    my $a = $seq =~ s/a/A/ig || '0';
	    my $g = $seq =~ s/g/G/ig || '0';
	    my $c = $seq =~ s/c/C/ig || '0';
	    my $t = $seq =~ s/t/T/ig || '0';
	    my $n = $seq =~ s/n/N/ig || '0';
	    my $total = $a+$g+$c+$t+$n;
	    undef $seq;
	    $runningtotal+=$total; $number++;
	    unless ($total == $length) {print STDERR "WARNING: Sequence contains letters that are not A, G, C, T, or N\n"}
	    if ($rewrite) {$seq =~ s/(.{0,60})/$1\n/g; chomp($seq); print REWRITE "$tag\n$seq"}
	    unless ($summary) {print OUT "$tag\t$a\t$g\t$c\t$t\t$n\t$total\t$runningtotal\n"}
	}
	if ($summary) {
	    print "$file: number of seqs:$number total: $runningtotal, average: ", $runningtotal/$number, ", Shortest: $short{'tag'} ($short{'length'}) Longest: $long{'tag'} ($long{'length'})\n";
	}

	if ($meta)
	{
	    $meta->set_metadata("${meta_key}.file", $file);
	    $meta->set_metadata("${meta_key}.num_seqs", $number);
	    $meta->set_metadata("${meta_key}.total", $runningtotal);
	    $meta->set_metadata("${meta_key}.average", $runningtotal / $number);
	    $meta->set_metadata("${meta_key}.shortest_seq", $short{'tag'});
	    $meta->set_metadata("${meta_key}.shortest_len", $short{'length'});
	    $meta->set_metadata("${meta_key}.longest_seq", $long{'tag'});
	    $meta->set_metadata("${meta_key}.longest_len", $long{'length'});
	}
    }
    else {
	my $seq = $file;
	my $length=length($seq);
	my $a = $seq =~ s/a/A/ig || '0';
	my $g = $seq =~ s/g/G/ig || '0';
	my $c = $seq =~ s/c/C/ig || '0';
	my $t = $seq =~ s/t/T/ig || '0';
	my $n = $seq =~ s/n/N/ig || '0';
	my $total = $a+$g+$c+$t+$n;
	unless ($total == $length) {print STDERR "WARNING: Sequence contains letters that are not A, G, C, T, or N\n"}
	print "A\tG\tC\tT\tN\tLine total\n";
	print "$a\t$g\t$c\t$t\t$n\t$total\n";
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3