[Bio] / FigKernelScripts / rationalize_blast.pl Repository:
ViewVC logotype

View of /FigKernelScripts/rationalize_blast.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Mon Dec 5 18:56:38 2005 UTC (14 years, 2 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07
Changes since 1.2: +17 -0 lines
Add license words.

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

#  rationalize_blastp
#
#  Output fields:
#
# Query=  query_id  query_def  query_len
# >       sbjct_id  sbjct_def  sbjct_len
# HSP  score  exp  p_n  p_val  n_match  n_ident  n_sim  n_gap  dir  q1  q2  q_sq  s1  s2  s_sq
# HSP  score  exp  p_n  p_val  n_match  n_ident  n_sim  n_gap  dir  q1  q2  q_sq  s1  s2  s_sq
# .
# .
# .
# HSP  score  exp  p_n  p_val  n_match  n_ident  n_sim  n_gap  dir  q1  q2  q_sq  s1  s2  s_sq
# .
# .
# .
# >       sbjct_id  sbjct_def  sbjct_len
# HSP  score  exp  p_n  p_val  n_match  n_ident  n_sim  n_gap  dir  q1  q2  q_sq  s1  s2  s_sq
# .
# .
# .
#
#  1    2    3   4    5      6      7       8     9   10  11 12  13  14 15 16
# HSP score exp p_n p_val n_match n_ident n_sim n_gap dir q1 q2 q_sq s1 s2 s_sq

use strict;

my ($qid, $qdef, $qlen, $q1, $q2, $qseq) = ("","",0,0,0,"");
my ($sid, $sdef, $slen, $s1, $s2, $sseq) = ("","",0,0,0,"");
my ($sdesc);
my ($s, $e, $n, $p, $ident, $outof, $posit, $ngap, $frame);

while (<>) {
	chomp;
	s/ +$//;  #  trim trailing spaces (there probably are not any)

	if ( $qseq && ( /^>/   # end of an HSP; report it -------------------
	             || /^Parameters:/
	             || /^ Score = /
	             || /^ +Plus Strand HSPs:/
	             || /^ +Minus Strand HSPs:/
	             || /^ +Database:/
	              ) ) {
		if ($q1 && $s1 && ($qid ne $sid)) {
			if ($sdesc) {
				print "$sdesc\n";
				$sdesc = "";
			}

			if (! $frame) {
				$frame = (seqdir($q1,$q2) * seqdir($s1,$s2) > 0) ? "+" : "-";
			}

			#  my $tmp = $qseq . $sseq;
			#  $ngap = $tmp =~ s/-//g;

			print join "\t", "HSP", $s, $e, $n, $p,
			                 $outof, $ident, $posit, $ngap, $frame,
			                 $q1, $q2, $qseq,
			                 $s1, $s2, $sseq . "\n";

			$frame = "";
		}

		$q1 = $s1 = 0;
		$qseq = $sseq = "";
	}


	if ( s/^Query= +// ) {   # Query sequence description --------------------
		$qdef = $_;
		$qlen = 0;

		while (<>) {  # continue reading to query length
			s/ +$//;

			if ( /^  +[(]([1-9][\d,]*) letters.*[)]/ ) {   # end of description
				$qlen = $1;                      # grab query length
				$qlen =~ s/,//g;                 # get rid of commas
				($qid, $qdef) = split_id($qdef); # parse query description
				                                 # print query description
				print join "\t", "Query=", $qid, $qdef, $qlen . "\n";
				last;                            # done with query description
			}

			elsif ( s/^ +// ) {                  # continuation of description
				$qdef .= (($qdef =~ /-$/) ? "" : " ") . $_;
			}

			else { last; }                       # something else is an error
		}
		
		if (! $qlen) {
			$_ || ($_ = defined($_) ? "" : "[EOF]");
			die "Error parsing query definition for sequence length:\n$qdef\n<<<here>>>\n$_\n\n";
		}

		$sid = $sdef = "";
		$q1 = 0;
	}


	elsif ( /^>/ ) {         # Subject sequence description ------------------
		$sdef = $_;
		$slen = 0;

		while (<>) {
		        chomp;
			if ( /^  *Length = ([1-9][\d,]*)/ ) {  # length is end of desc.
				$slen = $1;
				$slen =~ s/,//g;

				($sid, $sdef) = split_id($sdef);
				$sdesc = join "\t", ">", $sid, $sdef, $slen;
				last;
			}

			elsif ( s/^  +// ) {     #  multiple spaces = continuation
				$sdef .= (($sdef =~ /-$/) ? "" : " ") . $_;
			}

			elsif ( s/^ /\001/ ) {   #  merged nr entries start with one space
				$sdef .= $_;
			}

			else { last; }           #  something else is an error
		}

		if (! $slen) {
			$_ || ($_ = defined($_) ? "" : "[EOF]");
			die "Error parsing subject definition for sequence length:\n$sdesc\n<<<here>>>\n$_\n\n";
		}

		$q1 = 0;
	}


	elsif ( /^ Score = +([\d.e+-]+) / ) {   # Score is start of an HSP -------------
		$s = $1;

		/Expect = +([^ ,]+)/ || /Expect[(]\d+[)] = +([^ ,]+)/
			|| die "Error parsing Score line for Expect:\n$sdesc\n<<<here>>>\n$_\n";

		$e = $1; 
		$n = ( /Expect[(](\d+)[)]/ ) ? $1 : (( /P[(](\d+)[)]/ ) ? $1 : 1);
		$p = ( / = +(\S+)$/   ) ? $1 : $e;

		defined($_ = <>) || die "End-of-file while looking for Identities line:\n$sdesc\n<<<here>>>\n";

		/^ Identities = +(\d+)\/(\d+)/ || die "Error parsing Identities line:\n$sdesc\n<<<here>>>\n$_\n";

		$ident = $1;
		$outof = $2;
		$posit = ( /Positives = +(\d+)/ ) ? $1 : $ident;
		$ngap  = ( /Gaps = +(\d+)/      ) ? $1 : 0;

		$q1 = $s1 = 0;
		$qseq = $sseq = "";
		
		$e = ($e =~ /^e-/) ? "1.0$e" : $e;
		$p = ($p =~ /^e-/) ? "1.0$p" : $p;
	}


	elsif ( /^ Frame = +(\S+)/ ) {  # frame of a translated blast ------------
		$frame = $1;
		$q1 = 0;
	}


	elsif ( s/^Query: +// ) {   # query sequence data ------------------------
		my ($t1, $t2, $t3) = split / +/, $_;
		if (! $q1) { $q1 = $t1; }
		elsif ($t1 != $q2 + 1) {
			print STDERR "Warning: Query position $t1 follows $q2 in $qid vs $sid:\n";
			print STDERR "$_\nFlushing alignment fragment\n\n";
			# flush this line and next 3
			defined($_ = <>) && defined($_ = <>) && defined($_ = <>)
				|| die "End-of-file while reading alignment:\n$sdesc\n";
			next;
		}
		$qseq .= $t2;
		$q2 = $t3;
		#  Flush the alignment match symbol line
		defined($_ = <>) || die "End-of-file while reading alignment:\n$sdesc\n";
	}


	elsif ( s/^Sbjct: +// ) {   # subject sequence data ----------------------
		my ($t1, $t2, $t3) = split / +/, $_;
		if (! $s1) { $s1 = $t1; }
		elsif (($t1 != $s2 + 1) && ($t1 != $s2 - 1)) {
			print STDERR "Warning: Subject position $t1 follows $s2 in $qid vs $sid:\n";
			print STDERR "$_\nFlushing alignment fragment\n\n";
			# flush this line and next 1
			defined($_ = <>)
				|| die "End-of-file while reading alignment:\n$sdesc\n";
			next;
			}
		$sseq .= $t2;
		$s2 = $t3;
		#  Flush the blank line
		defined($_ = <>) || die "End-of-file while reading alignment:\n$sdesc\n";
	}
}


sub split_id {
	my ($str) = shift;
	my ($id, $def);
	($str =~ m/^>?\s*(\S+)(:?\s+(.*))?/s) || return ();
	$id  = $1;
	$def = $3 || "";
	return ($id, $def);
}


sub seqdir {
	my ($from, $to) = @_;
	return ($from <= $to) ? 1 : -1;
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3