[Bio] / FigMetagenomeTools / Rob.pm Repository:
ViewVC logotype

View of /FigMetagenomeTools/Rob.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Fri Apr 13 15:31:28 2007 UTC (12 years, 7 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_rel_2008_0806, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_rel_2008_0923, mgrast_release_3_0, mgrast_dev_03252011, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, mgrast_rel_2008_0919, mgrast_rel_2008_1110, myrast_33, mgrast_rel_2008_0917, mgrast_dev_04052011, mgrast_dev_02222011, HEAD
new file

# a collection of stuff that I want for perl

package Rob;
use strict;

=head1 new

 instantiate

=cut

sub new {
	my ($class, %arg) = @_;
	my $self = bless{},$class;
	return $self;
}


=head1 clean_genome

 remove the crap out of genome names so you can open them with treeing programs

 my $cleangenome=clean_genome($genome);

=cut

sub clean_genome {
 my ($self, $genome)=@_;
 $genome =~ s/\'//g; $genome =~ s/\://g;$genome=~s/\(//g; $genome=~ s/\)//g;
 return $genome;
}

=head1 read_and_exec_file
  
  read and execute some code in a file. e.g. if I have a hash stored in the file, this will execute it
  We will not actually execute the code, and that way you should be able to get the answer with
  eval rob->read_exec_file($file)

=cut

sub read_exec_file {
 my ($self, $file)=@_;
 open (IN, $file) || die "Can't open $file";
 my $read;
 while (<IN>) {
  $read .= $_;
  }
  return $read;
}

=head1 argopts

My own version of Get::Opts

Pass in a REFERENCE to @ARGV, and a list of options that you want to extract. Other things on the command line are returned as
space separated.

Note that each of the objects are returned in order.

   my ($arg1, $arg2, $arg3, $rest_args)=$rob->argopts(\@ARGV, "-a", "b", "-c");


=cut

sub argopts {
 my ($self, $argv, @argu)=@_;
 return unless ($argv);
 my %need; my $i=0;
 my @found;
 foreach my $a (@argu) 
 {
  unless ($a =~ /^-/) {$a="-".$a}
  $need{$a}=$i;
  $found[$i]=undef;
  $i++;
 }
 my $rest;
 while (@$argv)
 {
  my $test=shift @$argv;
  if (defined $need{$test}) {$found[$need{$test}]=shift @$argv}
  else {$rest .= " ".$test." "}
 }
 
 return (@found, $rest);
}


=head1 sum

 Calculate the sum of an array

=cut

sub sum {
	my ($self, $arr)=@_;
	return 0 unless ($arr);
	my $sum=0;
	foreach (@$arr) {$sum += $_}
	return $sum;
}
	     

=head1 mean

 Calculate the mean on an array
 e.g. my $mean=Rob->mean(\@array);

 This is the arithmetical mean (i.e. sum/n)

=cut

sub mean {
 my ($self, $arr)=@_;
 return 0 unless ($arr);
 return 0 unless (@$arr);
 my $sum;
 foreach (@$arr) {$sum+=$_};
 return $sum/(scalar @$arr);
}

=head1 median

 Calculate the median for an array
 e.g. my $median=Rob->median(\@array);
 
 This is the median, the middle number on a list. Note I am not going to return mode.

=cut

sub median {
 my ($self, $arr)=@_;
 return 0 unless ($arr);
 @$arr=sort {$a <=> $b} @$arr;
 #@$arr=sort @$arr;
 my $mid=$#$arr/2;
 if ($mid == int($mid)) {
  return $$arr[$mid];
 } else {
  return ($$arr[$mid - 0.5] + $$arr[$mid + 0.5])/2;
 }
}

=head1 min

    Return the minimum value in an array.
    e.g. my $min=Rob->min($array);

=cut

sub min {
    my ($self, $arr)=@_;
    return undef unless ($arr);
    my $min=1e99;
    map {($_ < $min) ? ($min = $_) : 1} @$arr;
    return $min;
}


=head1 max

    Return the maximum value in an array.
    e.g. my $max=Rob->max($array);

=cut

sub max {
    my ($self, $arr)=@_;
    return undef unless ($arr);
    my $max=0;
    map {($_ > $max) ? ($max = $_) : 1} @$arr;
    return $max;
}

=head1 stdev
 
 Calculate the standard deviation on an array.

 e.g. my $stdev=Rob->stdev(\@array);
      my $stdev=Rob->stdev([1,2,3,4]);

=cut

sub stdev {        
 my ($self, $arr)=@_;
 return 0 unless ($arr);
 my ($n, $sum, $square)=(0,0,0);
 foreach (@$arr) {
  $n++;
  $sum += $_;
  $square += ($_**2);
 }
 
 #stdev = sqrt(((n.sum(x squared))-((sum x) squared))/n(n-1))
 unless ($n*($n-1)) {die "STDEV at div by zero as $sum, $square, $n\n"; return 0}
 unless ($n*($n-1)) {print STDERR "STDEV at div by zero as $sum, $square, $n\n"; return 0}
 my $stdev = (($n * $square)-($sum * $sum))/($n*($n-1));
 if ($stdev < 0) {die "STdev less than zero ($stdev) as $sum, $square, $n\n"}
 if ($stdev < 0) {print STDERR "STdev less than zero ($stdev) as $sum, $square, $n\n"}
 $stdev=sqrt($stdev);
 return $stdev;
}  

=head1 read_fasta

Read a fasta format file and return a hash with the data, but this is not just limited to fasta sequence files - it will also handle quality scores and such.

Takes a file as argument, and an optional boolean to ensure that it is a quality file

usage: 
	my $fa=$rob->read_fasta($fastafile);
	my $fa=$rob->read_fasta("fastafile.gz"); # can also handle gzipped files
	my $qu=$rob->read_fasta($qualfile, 1);

Returns a reference to a hash - keys are IDs values are values

=cut

sub read_fasta {
 my ($self, $file, $qual)=@_;
 if ($file =~ /.gz$/) {open(IN, "gunzip -c $file|") || die "Can't open a pipe to $file"}
 else {open (IN, $file) || die "Can't open $file"}
 my %f; my $t; my $s; my $newlinewarning;
 while (<IN>) {
  if (/\r/) 
  {
  	print STDERR "The fasta file $file contains \\r new lines. It should not do this. Please complain bitterly.\n" unless ($newlinewarning);
	$newlinewarning=1;
  	s/\r/\n/g; 
	s/\n\n/\n/g;
  }
  chomp;
  if (/^>/) {
   s#^>##;
   if ($t) {
    if ($qual) {$s =~ s/\s+/ /g; $s =~ s/\s$/ /; $s =~ s/^\s+//}
    else {$s =~ s/\s+//g}
    $f{$t}=$s;
    undef $s;
   }
   $t=$_;
  }
  else {$s .= " " . $_ . " "}
 }
 if ($qual) {$s =~ s/\s+/ /g; $s =~ s/\s$/ /; $s =~ s/^\s+//}
 else {$s =~ s/\s+//g}
 $f{$t}=$s;
 close IN;
 return \%f;
}

=head1 rc

Reverse complement a sequence

=cut

sub rc {
 my ($self, $seq)=@_;
 $seq =~ tr/GATCgatc/CTAGctag/;
 return reverse $seq;
}

=head1 rand

Randomize an array using the fisher-yates shuffle described in the perl cookbook.

=cut

sub rand {
  my ($self, $array) = @_;
  my $i;
  for ($i = @$array; --$i; ) {
   my $j = int rand ($i+1);
   next if $i == $j;
   @$array[$i,$j] = @$array[$j,$i];
  }
  return $array;
}

=head1 factorial

Return the factorial of a number. This method will calculate the factorial of any number, but also stores the results, so things are returned much quicker

=cut

sub factorial {
	my ($self, $n)=@_;
	
	return $self->{'factorial'}->[$n] if (defined $self->{'factorial'}->[$n]);
	$n=171 if ($n > 170); # 170 factorial is 7.25741561530799e+306. 171! is infinite for 32-bit computers :)
	my $res;
	if ($n == 0) {$res=1}
	else {$res=$n*$self->factorial($n-1)} # recursive using itself.
	$self->{'factorial'}->[$n]=$res;
	return $res;
}


=head1 combined_oligos_in

Find all the oligos in a sequence within a given size range, and combine all the oligos into a single list

use:
my $arr=$rob->combined_oligos_in($seq, $min, $max);

=cut

sub combined_oligos_in {
	my ($self, $seq, $min, $max)=@_;
	my $alloligos;
	foreach my $len ($min..$max)
	{
		map {$alloligos->{$_}=1} keys %{$self->oligos_in($seq, $len)};
		my $new;
		map {$new->{$_}=1} @{$self->combine_oligos([keys %$alloligos])};
		$alloligos=$new;
	}
	return [keys %$alloligos];
}


=head1 oligos_in

Find all the oligos in a fasta sequence

Returns a hash of with the keys are all the oligos of a given size that appear once or more in any sequence. The values are the number of occurences.

Note that the oligos are returned all in uppercase.

Usage:
	my $arr=$rob->oligos_in($seq, 5);

=cut

sub oligos_in {
	my ($self, $seq, $len)=@_;
	$seq=uc($seq);
	my $posn=0; my %oligo;
	while($posn<=length($seq)-$len)
	{
		my $s=substr($seq, $posn, $len);
		$oligo{$s}++;
		$posn++;
	}
	return \%oligo;
}
		

=head1 combine_oligos

Combine oligos to remove any oligos that are shorter and redundant.

For example, combine

	TTT
	TTTTA
	TTTTAGG

and just use TTTTAGG

Pass in a reference to an array of sequences, and retrieve a reference to an array of combined oligos.


=cut

sub combine_oligos {
	my ($self, $arr)=@_;

	my @oligos=sort {length($b) <=> length($a)} @$arr;

	my %all;
	foreach my $o (@oligos)
	{
		my $seen=-1;
		foreach my $k (keys %all)
		{
			$seen=index($k, $o);
			last if ($seen > -1);
		}
		if ($seen == -1) {$all{$o}=1}
	}
	return [keys %all];
}


=head1 Tm

Please Note: This is stolen 100% from BioPerl, but since I wrote that bioperl module it is not plaigarism :)
You should preferentially use the bp module for this, but I don't want to since one system I am working on
doesn't have it at the moment and this is the quickest way to get it working.

Title   : Tm()
	Usage   : $tm = $primer->Tm(-salt=>'0.05', -oligo=>'0.0000001', -seq=>'ATTACTTATATCA')
	Function: Calculates and returns the Tm (melting temperature) of the primer
	Returns : A scalar containing the Tm.
	Args    : -salt set the Na+ concentration on which to base the calculation (default=0.05 molar).
	: -oligo set the oligo concentration on which to base the calculation (default=0.00000025 molar).
	: -seq the sequence. This is the only required argument
	Notes   : Calculation of Tm as per Allawi et. al Biochemistry 1997 36:10581-10594.  Also see
	documentation at http://biotools.idtdna.com/analyzer/ as they use this formula and
	have a couple nice help pages.  These Tm values will be about are about 0.5-3 degrees
	off from those of the idtdna web tool.  I don't know why.

	This was suggested by Barry Moore (thanks!). See the discussion on the bioperl-l
	with the subject "Bio::SeqFeature::Primer Calculating the PrimerTM"

=cut

sub Tm  {
	my ($self, %args) = @_;
	my $salt_conc = 0.05; #salt concentration (molar units)
		my $oligo_conc = 0.00000025; #oligo concentration (molar units)
		if ($args{'-salt'}) {$salt_conc = $args{'-salt'}} #accept object defined salt concentration
			if ($args{'-oligo'}) {$oligo_conc = $args{'-oligo'}} #accept object defined oligo concentration
				my $sequence;
	if ($args{'-seq'}) {$sequence=uc $args{'-seq'}}
	unless ($sequence) {print STDERR "No sequence provided for Tm calculation. returned 0\n";  return 0}
	my $length = length($sequence);
	my @dinucleotides;
	my $enthalpy;
	my $entropy;
#Break sequence string into an array of all possible dinucleotides
	while ($sequence =~ /(.)(?=(.))/g) {
		push @dinucleotides, $1.$2;
	}
#Build a hash with the thermodynamic values
	my %thermo_values = ('AA' => {'enthalpy' => -7.9,
			'entropy'  => -22.2},
			'AC' => {'enthalpy' => -8.4,
			'entropy'  => -22.4},
			'AG' => {'enthalpy' => -7.8,
			'entropy'  => -21},
			'AT' => {'enthalpy' => -7.2,
			'entropy'  => -20.4},
			'CA' => {'enthalpy' => -8.5,
			'entropy'  => -22.7},
			'CC' => {'enthalpy' => -8,
			'entropy'  => -19.9},
			'CG' => {'enthalpy' => -10.6,
			'entropy'  => -27.2},
			'CT' => {'enthalpy' => -7.8,
			'entropy'  => -21},
			'GA' => {'enthalpy' => -8.2,
			'entropy'  => -22.2},
			'GC' => {'enthalpy' => -9.8,
			'entropy'  => -24.4},
			'GG' => {'enthalpy' => -8,
			'entropy'  => -19.9},
			'GT' => {'enthalpy' => -8.4,
				'entropy'  => -22.4},
			'TA' => {'enthalpy' => -7.2,
				'entropy'  => -21.3},
			'TC' => {'enthalpy' => -8.2,
				'entropy'  => -22.2},
			'TG' => {'enthalpy' => -8.5,
				'entropy'  => -22.7},
			'TT' => {'enthalpy' => -7.9,
				'entropy'  => -22.2},
			'A' =>  {'enthalpy' => 2.3,
				'entropy'  => 4.1},
			'C' =>  {'enthalpy' => 0.1,
				'entropy'  => -2.8},
			'G' =>  {'enthalpy' => 0.1,
				'entropy'  => -2.8},
			'T' =>  {'enthalpy' => 2.3,
				'entropy'  => 4.1}
	);
#Loop through dinucleotides and calculate cumulative enthalpy and entropy values
	for (@dinucleotides) {
		unless (defined $thermo_values{$_}{enthalpy}) {print STDERR "No enthalpy for |$_|\n"; $thermo_values{$_}{enthalpy}=0}
		unless (defined $thermo_values{$_}{entropy}) {print STDERR "No entropy for |$_|\n"; $thermo_values{$_}{entropy}=0}
		$enthalpy += $thermo_values{$_}{enthalpy};
		$entropy += $thermo_values{$_}{entropy};
	}
#Account for initiation parameters
	$enthalpy += $thermo_values{substr($sequence, 0, 1)}{enthalpy};
	$entropy += $thermo_values{substr($sequence, 0, 1)}{entropy};
	$enthalpy += $thermo_values{substr($sequence, -1, 1)}{enthalpy};
	$entropy += $thermo_values{substr($sequence, -1, 1)}{entropy};
#Symmetry correction
	$entropy -= 1.4;
	my $r = 1.987; #molar gas constant
		my $tm = ($enthalpy * 1000 / ($entropy + ($r * log($oligo_conc))) - 273.15 + (12* (log($salt_conc)/log(10))));
	return $tm;
}




1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3