[Bio] / FigKernelScripts / compute_mw_and_pi.pl Repository:
ViewVC logotype

View of /FigKernelScripts/compute_mw_and_pi.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Wed Feb 21 16:27:38 2007 UTC (12 years, 8 months ago) by mkubal
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.1: +21 -23 lines
one genome at a time

# -*- perl -*-
use FIG;

# usage: compute_mw_and_pi genome
# The results are written in attribute format to /vol/seed-attributes/molecular_weight/$genome_molecular_weight_attributes.txt and 
# /vol/seed-attributes/isoelectric_point/$genome_isoelectric_point_attributes.txt 

$fig = new FIG;

my $genome,$name;

$genome = shift(@ARGV);

if(!$genome){
    print "please supply SEED genome id as argumnets\n";
    exit;
}

open(MW,">/vol/seed-attributes/molecular_weight/$genome_molecular_weight_attributes.txt");
open(ISO,">/vol/seed-attributes/isoelectric_point/$genome_isoelectric_point_attributes.txt");

# moleculare weight or amino acids
our %aaweights = ('A' =>  89.09,
		  'C' => 121.16,
		  'D' => 133.10,
		  'E' => 147.13,
		  'F' => 165.19,
		  'G' =>  75.07,
		  'H' => 155.16,
		  'I' => 131.17,
		  'K' => 146.19,
		  'L' => 131.17,
		  'M' => 149.21,
		  'N' => 132.12,
		  'P' => 115.13,
		  'Q' => 146.15,
		  'R' => 174.20,
		  'S' => 105.09,
		  'T' => 119.12,
		  'V' => 117.15,
		  'W' => 204.22,
		  'Y' => 181.19, 
		  '*' => 0,
		  'X' => 0); 

our $h2oweight = 18.016;

our %aa3letternames = ('A' => 'Ala',
		       'B' => 'Asx',
		       'C' => 'Cys',
		       'D' => 'Asp',
		       'E' => 'Glu',
		       'F' => 'Phe',
		       'G' =>  'Gly',
		       'H' => 'His',
		       'I' => 'Ile',
		       'K' => 'Lys',
		       'L' => 'Leu',
		       'M' => 'Met',
		       'N' => 'Asn',
		       'P' => 'Pro',
		       'Q' => 'Gln',
		       'R' => 'Arg',
		       'S' => 'Ser',
		       'T' => 'Thr',
		       'V' => 'Val',
		       'W' => 'Trp',
		       'Y' => 'Tyr', 
		       '*' => 'Stop',
		       'Z' => 'Glx',
		       'X' => '?'); 


@pegs = $fig->pegs_of($genome);
foreach my $peg (@pegs){
    my $sequence = $fig->get_translation($peg);
    
    my $mw = &calc_molweight($sequence);
    if($mw =~/(\d+\.\d{2})/){$mw = $1}
    my $pI = &calc_pI($sequence);
    if($pI =~/(\d+\.\d{2})/){$pI = $1}   
    print MW "$peg\tmolecular_weight\t$mw\n";
    print ISO "$peg\tisoelectric_point\t$pI\n";
}

########################################################
# calculate the molecular weight of a protein sequence #
########################################################
sub calc_molweight {
    my ($aaseq)=@_;

    my $molwt  = 0;  
    my $numaa  = length ($aaseq);
    
    my $i = 0; 
    my $aa;
    while ($aa = substr($aaseq, $i, 1)) {
        $molwt += $aaweights{$aa};
        $i++;
    }

    # Subtract a water for each peptide bond.
    $molwt -= (($numaa - 1) * $h2oweight );

    return ($molwt);
};


###############################
# calculate isoelectric point #
###############################
sub calc_pI {
    ###################################################################
    #  Assume that no electrostatic interactions perturb ionization.
    #  Assume that the pI lies between pH1 and pH13.
    #
    #  Net Charge = number of positively charges residues 
    #               minus the number of negatively charged residues 
    #               plus  the number of protonated amino termini 
    #               minus the number of deprotonated termini
    #
    #  For each amino acid capable of charge, the number
    #  of protonated residues is determined by the following equation:
    #
    #     Np = Nt * [H+] / ([H+] + Kn)
    #
    #     where Np is the number of protonated residues, Nt is the number of
    #     residues of a specific amino acid, [H+] is the H+ concentration,
    #     and Kn is the dissociation constant of the amino acid.  
    #
    #  Optional Input:
    #  picalc (PIVAR,AMINOTERMINI,CARBOXYLTERMINI)
    #
    #  where PIVAR is the max. variance in the pI permitted (default=0.000001).
    #        AMINOTERMINI is the number of amino termini (default=1).
    #        CARBOXYLTERMINI is the number of carboxyl termini (default=1).
    ###################################################################

    my ($aaseq)=@_;
    
    my $allowederror = 0.00001; 

    my $numaterm;
    my $numcterm;
    if (defined($_[1])) { $numaterm = $_[1]; }
    else { $numaterm = 1; }

    if (defined($_[2])) { $numcterm = $_[2]; }
    else { $numcterm = 1; }

    # Count lysines, arginines, histidines, 
    # glutamates, aspartates,
    # cysteines, tyrosines, amino termini, carboxyl termini, ambiguities.

     $_ = $aaseq;
    my $numR = tr/R/R/;
    my $numK = tr/K/K/;
    my $numH = tr/H/H/;
    my $numY = tr/Y/Y/;
    my $numC = tr/C/C/;
    my $numE = tr/E/E/;
    my $numD = tr/D/D/;

    my $numX = tr/X/X/;
    my $numN = tr/N/N/;

    ## A  quick search method for the pI will involve a method that requires
    ## a low number of loops through the function.  Therefore, an interval
    ## search method will be used to narrow the search range (an interval where
    ## the net charge goes from negative to positive).  Then, a binary search
    ## method will be used to search for the 'netcharge=0' point within this
    ## interval.  Perhaps a Newton search method would be faster?  The desired
    ## error limit is within +/- 0.01 for the pI.

    # Set Ka values of charged amino acids: 
    
    my $KaR = 10**(-12.48);
    my $KaK = 10**(-10.53);
    my $KaH = 10**(-6);
    my $KaY = 10**(-10.07);
    my $KaC = 10**(-10.28);
    my $KaE = 10**(-4.25);
    my $KaD = 10**(-3.65);
    my $Kaaterm = 10**(-8.56);
    my $Kacterm = 10**(-3.56);

    # define helper functions
    # since both methods use a lot of variables defined in this function,
    # both are implemented as coderefs

    ##################################################
    # calculate the netcharge                        #
    # Input:  proton concentration                   #
    # Output: net charge of the amino acid sequence. #
    ##################################################
    my $netcharge = sub {
	my ($in)=@_;
	
	$numR*$in/($in+$KaR) + $numK*$in/($in+$KaK) + $numH*$in/($in+$KaH) - $numY + $numY*$in/($in+$KaY) - $numC + $numC*$in/($in+$KaC) - $numE + $numE*$in/($in+$KaE) - $numD + $numD*$in/($in+$KaD) + $numaterm*$in/($in+$Kaaterm) - $numcterm + $numcterm*$in/($in+$Kacterm);
	
    };

    ########################################################################
    # perform binary search                                                #  
    # Input: $high and $low boundaries                                     #
    # Exit:  when the net charge at $mid is 0 or within the allowed error. #
    ########################################################################
    my $binarysearch;

    $binarysearch = sub {
	my ($a,$b, $lastmid)=@_;
	
	my $mid = ($b + $a) / 2;
	my $net = &$netcharge(10**(-$mid));
	
	if ( ($net == 0) || (($mid - $lastmid)**2 <= $allowederror) ) {
	    return $mid;
	}
	
	if ($net > 0) { 
	    &$binarysearch($a, $mid, $mid); 
	}
	else { 
	    &$binarysearch($mid, $b, $mid); 
	}
    };


    my $i;
    for ($i=2; $i<15; $i++) {
	if (&$netcharge(10**(-$i)) <= 0) { last; }
    } 

    my $high = $i;
    my $low  = $i - 1;
    my $pI;
    if (&$netcharge(10**(-$high)) == 0) {
	$pI = $high;
    }
    elsif (&$netcharge(10**(-$low)) == 0) {
	$pI = $low;
    }
    else {
	$pI = &$binarysearch($high, $low, 0);
    }

    return ($pI);   
};



MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3