[Bio] / FigMetagenomeTools / RobInformationTheory.pm Repository:
ViewVC logotype

View of /FigMetagenomeTools/RobInformationTheory.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Tue Apr 10 16:37:39 2007 UTC (12 years, 8 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_rel_2008_0806, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_rel_2008_0923, mgrast_release_3_0, mgrast_dev_03252011, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, mgrast_rel_2008_0919, mgrast_rel_2008_1110, myrast_33, mgrast_rel_2008_0917, mgrast_dev_04052011, mgrast_dev_02222011, HEAD
new stuff that didn't make it in before

#__perl__

=pod

=head1 RobInforamtionTheory

I am playing with information theory and sequence quality, and so this module is to handle that.

=cut

package RobInformationTheory;
use strict;

=head1 new

instantiate

=cut

sub new {
  my ($class, %args)=@_;
   my $self = bless{},$class;
 return $self;
}


=head1 Shannons_H

Calculate Shannon's H on a DNA sequence. 

Shannon's H is calculated from the probability of the occurence of each base, using this formula:

H = - sum[ Pi log(2) Pi]  bits per base, where Pi is the fraction of each base.

Thus a random sequence has P(A) = P(G) = P(C) = P(T) = 0.25.

H = -( [0.25 * log(2) 0.25] + [0.25 * log(2) 0.25] + [0.25 * log(2) 0.25] + [0.25 * log(2) 0.25])

H = -( [0.25 * -2] + [0.25 * -2] + [0.25 * -2] + [0.25 * -2])

H = 2

The lower the H, the less random the sequences are.

At the moment this is specific for DNA.

=cut

sub Shannons_H {
	my ($self, $seq)=@_;
	return undef unless ($seq);
	$seq=uc($seq);
	$seq =~ s/\s+//g;
	my %count;
	my $n;
	while ($seq)
	{
		my $base=chop($seq);
		if ($base eq "N" || $base eq "X") {$n++; next}
		if ($base !~ /[ACGT]/) {print STDERR "Found $base which is not ACG or T. We are increasing the number of bases possible\n"}
		$count{$base}++;
		$n++;
	}
	my $h;
	foreach my $base (keys %count)
	{
		my $p = $count{$base}/$n;
		$h += ($p * (log($p)/log(2))) if ($p);
	}
	return -$h;
}

=pod hs_contig_coverage

This does something a little specific, and needs one file.

For each read I have a tuple of number of contigs, H score, genome number. From this data, we are going to choose sequences just from genomes that we know about, and then calculate a h score/contig coverage mapping.

=cut

sub hs_contig_coverage {
	my ($self, $genomes)=@_;
	my %genomes=map {$_=>1} @$genomes;
	open(IN, "gunzip -c seq_info_table.txt.gz|") || die "Can't open seq_info_table.txt.gz";
	my %contigs; my %count;
	while (<IN>)
	{
		chomp;
		my ($id, $co, $hs, $gen)=split /\t/;
		next unless ($genomes{$gen});
		unless (defined $co) {$co=1} # a contig of 1
		next unless (defined $hs); # don't continue if we don't have a score for it
		my $hs=(int($hs*100))/100; # convert the hs to two dp
		$contigs{$hs}+=$co;
		$count{$hs}++;
	}
	close IN;
	my $av;
	map {$av->{$_}=$contigs{$_}/$count{$_}} keys %count;
	return $av;
}
		
=head1 Var_Shannons_H

Calculate Shannon's H for a sequence of defined length.

=cut

sub Var_Shannons_H {
	my ($self, $seq, $c)=@_;
	return undef unless ($seq);
	$seq=uc($seq);
	$seq =~ s/\s+//g;
	my %count;
	my $n;
	for (my $i=0; $i<=length($seq); $i++)
	{
		my $base=substr($seq, $i, $c);
		next unless (length($base) == $c);
		$count{$base}++;
		$n++;
	}
	my $h;
	foreach my $base (keys %count)
	{
		my $p = $count{$base}/$n;
		$h += ($p * (log($p)/log(2))) if ($p);
	}
	return -$h;
}

	
		
1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3