[Bio] / FigKernelScripts / make_calls_for_start_predictions.pl Repository:
ViewVC logotype

View of /FigKernelScripts/make_calls_for_start_predictions.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Fri Nov 30 21:53:05 2007 UTC (12 years, 4 months ago) by gdpusch
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.3: +3 -2 lines
Added FIGV support. -- /gdp

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

# usage: make_calls_for_start_predictions [parms_file] < stream_of_objects > tbl_entries

if (@ARGV > 0)
{
    $parms_file = shift @ARGV;
    $parms = &set_parms($parms_file);
}
else
{
    $parms = &default_parms();
}
$p_sim         = $parms->[0];
$p_dicodon     = $parms->[1];
$p_rbs4        = $parms->[2];
$p_rbs5        = $parms->[3];
$p_rbs6        = $parms->[4];
$p_rbs7        = $parms->[5];
$p_atg         = $parms->[6];
$p_ttg         = $parms->[7];
$p_gtg         = $parms->[8];
$p_ov1         = $parms->[9];
$p_ov10        = $parms->[10];
$p_ov20        = $parms->[11];
$p_ov50        = $parms->[12];
$p_ov100       = $parms->[13];

$/ = "///\n";
while (defined($obj = <STDIN>))
{
    chomp $obj;
    if ($obj =~ /^(.*?\n)\/\/\n(.*)/s)
    {
        ($hdr,$start_data) = ($1,$2);
        
        $kv = &load_kv($hdr);
	$hdr =~ s/\nNEW_START_POS=\d+//s;
	$hdr =~ s/\nNEW_START_LOC=\S+//s;

        $id      = $kv->{"ID"};
        $contig  = $kv->{'CONTIG'};
        $beg     = $kv->{'BEGIN'};
        $end     = $kv->{'END'};
        @starts  = map { $_ =~ s/\nSCORE\S+//s; $_ } split(/\n\/\/\n/,$start_data);
        @poss = ();
        foreach $poss_start (@starts)
        {
            $sim_sc         = 0;
            $dicodon_sc     = 0;
            $start_codon_sc = 0;
            $rbs_sc         = 0;
            $ov_sc          = 0;
            $kv2        = &load_kv($poss_start);
            $start      = $kv2->{'START'};
            if (exists($kv2->{'SIM'}))
            {
                $sim_sc = $p_sim * $kv2->{'SIM'};
            }
            if (exists($kv2->{'DICODON'}))
            {
                $dicodon_sc = $p_dicodon * $kv2->{'DICODON'};
            }
            if (exists($kv2->{'START_CODON'}))
            {
                if ($kv2->{'START_CODON'} eq "ATG")
                {
                    $start_codon_sc = $p_atg;
                }
                elsif ($kv2->{'START_CODON'} eq "TTG")
                {
                    $start_codon_sc = $p_ttg;
                }
                elsif ($kv2->{'START_CODON'} eq "GTG")
                {
                    $start_codon_sc = $p_gtg;
                }
                else
                {
                    $start_codon_sc = 0;
                }
            }
            if (exists($kv2->{'RBS_LEN'}))
            {
                $rbs_ln = $kv2->{'RBS_LEN'};
                $rbs_sc = ($p_rbs4 * (($rbs_ln >= 4) ? 1 : 0)) +
                          ($p_rbs5 * (($rbs_ln >= 5) ? 1 : 0)) +
                          ($p_rbs6 * (($rbs_ln >= 6) ? 1 : 0)) +
                          ($p_rbs7 * (($rbs_ln >= 7) ? 1 : 0));
            }
            if (exists($kv2->{'OVERLAP'}))
            {
                $ov_ln  = $kv2->{'OVERLAP'};
                $ov_sc  = ($p_ov1   * (($ov_ln >=   1) ? 1 : 0)) +
                          ($p_ov10  * (($ov_ln >=  10) ? 1 : 0)) +
                          ($p_ov20  * (($ov_ln >=  20) ? 1 : 0)) +
                          ($p_ov50  * (($ov_ln >=  50) ? 1 : 0)) +
                          ($p_ov100 * (($ov_ln >= 100) ? 1 : 0));
            }
            $sc = $sim_sc + $dicodon_sc + $start_codon_sc + $rbs_sc + $ov_sc;
            push(@poss,[$sc,$start]);
            $poss_starts{$poss_start} = $sc;    # just for printing later
        }
        @poss = sort { $b->[0] <=> $a->[0] } @poss;

        print "$hdr";
        if (@poss == 0)
        {
            print STDERR "could not call a start for $id\n";
            # next line puts out tbl format with score as last field
            # print "$id\t",join("_",($contig,$beg,$end)),"\t0\n";    # sc = 0
	    print "NEW_START_POS=0\n";
	    print "NEW_START_LOC=", join("_",($contig,$beg1,$end)), "\n";
        }
        else
        {
            $best_start = $poss[0]->[1];
            # print "BEST_START=$best_start\n";
            $beg1 = ($beg < $end) ? ($beg + $best_start - 1) : ($beg - $best_start + 1);
	    # next line puts out tbl format with score as last field
	    # print "$id\t", join("_",($contig,$beg1,$end)), "\t$poss[0]->[0]\n";
	    print "NEW_START_POS=$poss[0]->[1]\n";
	    print "NEW_START_LOC=", join("_",($contig,$beg1,$end)), "\n";
        }
	print "//\n";
        foreach $poss_start (@starts)
        {
            print "$poss_start\n";
	    print "SCORE=$poss_starts{$poss_start}\n";
	    print "//\n";
        }
	print "///\n";
    }
}

sub load_kv {
    my($obj) = @_;
    my($kv,$kvH);

    $kvH = {};
    foreach $kv (split(/\n/,$obj))
    {
        if ($kv =~ /^([^=]+)=(.*)$/)
        {
            $kvH->{$1} = $2;
        }
    }
    return $kvH;
}

sub set_parms {
    my($file) = @_;

# parms:
#
#         parms->[0]  = multiplier for sims
#         parms->[1]  = multiplier for dicodon usage
#         parms->[2]  = multiplier for RBS len >= 4
#         parms->[3]  = multiplier for RBS len >= 5
#         parms->[4]  = multiplier for RBS len >= 6
#         parms->[5]  = multiplier for RBS len >= 7
#         parms->[6]  = multiplier for start-codon ATG
#         parms->[7]  = multiplier for start-codon TTG
#         parms->[8]  = multiplier for start-codon GTG
#         parms->[9]  = multiplier for overlap len >= 1
#         parms->[10]  = multiplier for overlap len >= 10
#         parms->[11]  = multiplier for overlap len >= 20
#         parms->[12] = multiplier for overlap len >= 50
#         parms->[13] = multiplier for overlap len >= 100
#            

    my $parms = &default_parms;
    my $i = 0;
    foreach $_ (`cat $file`)
    {
        chomp $_;
        $parms->[$i] = $_;
        $i++;
    }
    return $parms;
}

sub default_parms {

    return [
               # this top set gets about 70%
               # 0.4,                        # sims
               # 0.2,                        # dicodon
               # 0.1, 0.06, 0.005, 0.005,    # rbs
               # 0.1,                        # start-codon
               # 0.2, 0.05, 0.05, 0.05, 0.05 # overlap
               ###############
               # 1.0,                        # sims
               # 0.0,                        # dicodon
               # 0.0, 0.00, 0.000, 0.000,    # rbs
               # 0.0,                        # start-codon
               # 0.0, 0.00, 0.00, 0.00, 0.00 # overlap

               ###############  this set gets 90.2 % in buchnera
               # 400,2,20,20,20,20,80,4,4,2,20,20,40,40

               ###############  this set gets 81.3% in test subsystem
               0.999,                       # sims
               0.0,                       # dicodon
               0.05, 0.05, 0.05, 0.05,      # rbs
               0.25, 0.01, 0.01,             # start-codon
               0.005, 0.1, 0.1, 0.2, 0.2  # overlap
           ];
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3