[Bio] / StandaloneTools / histo_seqs Repository:
ViewVC logotype

View of /StandaloneTools/histo_seqs

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (annotate)
Fri May 26 22:33:05 2006 UTC (13 years, 4 months ago) by overbeek
Branch: MAIN
CVS Tags: HEAD
Changes since 1.3: +30 -21 lines
New "-by_chars" option plus minor cosmetic changes. -- /gdp

#!/usr/bin/perl -w

$usage = "histo_seqs [-norm] [-null] [-nolabel] [-get_gc] [-get_dna] < fasta.file > fasta.cumul";
if (defined($ARGV[0]) && ($ARGV[0] =~ m/-help/))
{
    print STDERR "\n\t$usage\n\n";
    exit(1);
}

$norm = $null = $nolabel = $gc = "";
while (@ARGV)
{
    if    ($ARGV[0] =~ m/-norm/)      { $norm = shift; }
    elsif ($ARGV[0] =~ m/-null/)      { $null = shift; }
    elsif ($ARGV[0] =~ m/-nolabel/)   { $nolabel  = shift; }
    elsif ($ARGV[0] =~ m/-by_chars/)  { $by_chars = shift; }
    elsif ($ARGV[0] =~ m/-get_gc/)    { $get_gc   = shift; }
    elsif ($ARGV[0] =~ m/-get_dna/)   { $get_dna  = shift; }
    elsif (-s $ARGV[0])               { $file = shift;  open(FILE, "<$file") || die "could not read-open $file"; $fh = \*FILE; }
    else  { die "Invalid arg $ARGV[0] --- usage: $usage"; }
}

$chars = 0;
$num_seqs = 0;
$gc = $at = 1;
while ( ($head, $seqP) = &get_a_fasta_record($fh) )
{
    if (defined($$seqP) && defined($seq_len = length($$seqP)))
    {
	++$num_seqs;
	$head =~ m/^(\S+)/;  $id = $1;
	
	unless (defined($histo{$seq_len})) 
	{ 
	    $histo{$seq_len} = 0;
	    unless ($nolabel || $null)  { $id_set{$seq_len} = []; }
	}
	
	$chars  += $seq_len;
	$histo{$seq_len} += 1;
	unless ($nolabel || $null)  { $x = $id_set{$seq_len};  push(@$x, $id); }
	
	if ($get_gc)
	{
	    $gc += ($$seqP =~ tr/gcGC//);
	    $at += ($$seqP =~ tr/atAT//);
	}
	
	if ($get_dna)
	{
	    $a += ($$seqP =~ tr/aA//);
	    $c += ($$seqP =~ tr/cC//);
	    $g += ($$seqP =~ tr/gG//);
	    $t += ($$seqP =~ tr/tT//);
	}
    }
}

$cumul  = 0;
$expect = 0;
foreach $seq_len (sort {$a <=> $b} keys %histo)
{
    $expect += $histo{$seq_len} * $seq_len;
    
    $cumul  += $by_chars ? ($seq_len * $histo{$seq_len}) : $histo{$seq_len};
    
    $plot = $norm ? ($by_chars ? $cumul/$num_chars : $cumul/$num_seqs) : $cumul ;
    
    if (! defined($min))     { $min = $seq_len; }
    
    if ((! defined($median)) && ($cumul > ($by_chars ? $num_chars : $num_seqs) / 2 ))  { 
	$median = $seq_len; 
    }
    
    unless ($null)
    {
	print "$seq_len\t$histo{$seq_len}\t$plot";
	print "\t", join(", ", @{$id_set{$seq_len}}) unless ($nolabel || $null);
	print "\n";
    }
    
    $max = $seq_len;
}
$expect  = int(0.5 + 10*$expect/$cumul)/10;

print  STDERR "\nThere are $chars chars in $num_seqs seqs.";
printf STDERR " (G+C = %4.1f%%)", 100*$gc/($gc+$at) if ($get_gc);
printf STDERR " (A:%u, C:%u, G:%u, T:%u, Ambig:%u)", $a, $c, $g, $t, ($chars - ($a +$c + $g + $t)) if ($get_dna);
print  STDERR "\nmin length = $min, median length = $median, mean length = $expect, max length = $max\n\n";


sub get_a_fasta_record
{
    my ($fh) = @_;
    my ( $old_eol, $entry, @lines, $head, $seq, @result );
    
    if (not defined($fh))  { $fh = \*STDIN; }
    
    $old_eol = $/;
    $/ = "\n>";
 
    if (defined($entry = <$fh>))
    {
	chomp $entry;
	@lines  =  split( /\n/, $entry );
	$head   =  shift @lines;
	$head   =~ s/^>?//;
	$seq    =  join( "", @lines );
	
	@result =  ($head, \$seq);
    }
    else
    {
	@result = ();
    }
    
    $/ = $old_eol;
    return @result;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3