[Bio] / FigKernelScripts / extract_metagene_contigs.pl Repository:
ViewVC logotype

View of /FigKernelScripts/extract_metagene_contigs.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12 - (download) (as text) (annotate)
Mon Aug 23 23:29:45 2010 UTC (9 years, 3 months ago) by gdpusch
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.11: +11 -2 lines
Updated search for executables to revised software environment.

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

use FIG;
use File::Basename;
use strict;
use Pod::Text;

use strict;
use Pod::Text;
# use File::Basename;   $this_tool_name = basename($0);
if ((@ARGV == 1) && ($ARGV[0] =~ m/-help/))  {
    pod2text($0);  exit(0);
}

=pod

=over 5

=item Usage:     extract_metagene_contigs [-help] [-output-directory=output directory]  fasta_in fasta_out

=item Function:  Given a contig fasta input file, metagene tool is run to find any contig which may have multiple gene calls. 
                 If a contig with multiple gene calls is found, multiple new sequences are created in the output fasta file, 
                 otherwise the original contig is written to the output fasta file.
                 Three output files will be generated: 
                      - The raw metagene output
                      - The parsed metagene output of those sequences which have multiple genes, and their start and end locations.
                      - The new fasta file generated which include the subsequences generated from the metagene output.

                -output-directory      Directory address where the output files should be written.
                -peg                   Default is false. If true [peg=T], then the new ids assigned will be peg ids
                -genome                genome id for peg sequences
                -AA                    Default is false. If true [AA=T], the output sequences will be given as amino acid sequences.
                -exec                  Default is metagene. Metagene version to run. Options are -exec=mga or -exec=metagene
                -featuremap            File where you wish the new feature to contig map to be written to. Default is "metagene.map"
                -startpeg              when creating peg feature id, this would be the number to start at. Default is 1.
                -called_by             if present, it will write a file feature.called_by which states that metagene called the gene

=back

=cut

my $outD = "";
my $trouble = 0;
my $input_eol_marker = $/;
my $fasta_in = "";
my $fasta_out = "";
my $first_line  = 1;
my $featuremap;
my $startpeg = 1;
my $featurecalled;

my $badchar = 0;
my $max_bad = 500;
my $width       = 50;
my ($peg_flag,$genome_id,$amino,$tool);
my $exec = "$FIG_Config::metagene_bin/metagene";
my $tool="metagene";

while ($ARGV[0] =~ m/^-/) {
    if ($ARGV[0] =~ m/-output-directory=(\S+)/) {
        $outD = $1;
    }
    elsif ($ARGV[0] =~ m/-peg=(\S+)/) {
	$peg_flag=1 if ($1 eq "T");
    }
    elsif ($ARGV[0] =~ m/-genome=(\S+)/) {
	$genome_id=$1;
    }
    elsif ($ARGV[0] =~ m/-AA=(\S+)/) {
	$amino=1 if ($1 eq "T");
    }
    elsif ($ARGV[0] =~ m/-featuremap=(\S+)/) {
	$featuremap=$1;
    }
    elsif ($ARGV[0] =~ m/-called_by=(\S+)/) {
	$featurecalled=$1;
    }
    elsif ($ARGV[0] =~ m/-startpeg=(\S+)/) {
	$startpeg=$1;
    }
    elsif ($ARGV[0] =~ m/-exec=(\S+)/) {
	if ($1 eq "mga") {
	    $tool="mga";
	    if    (-x "$FIG_Config::mga_bin/mga_exec") {
		$exec="$FIG_Config::mga_bin/mga_exec";
	    }
	    elsif (-x "$FIG_Config::mga_bin/mga") {
		$exec="$FIG_Config::mga_bin/mga";
	    }
	    else {
		$trouble = 1;
	    }
	}
	elsif ($1 eq "metagene") {
	    $tool="metagene";
	    $exec = "$FIG_Config::metagene_bin/$1";
	}
	else {
	    $trouble = 1;
	    warn "Can't find MGA";
	}
    }
    else {
        $trouble = 1;
        print STDERR "bad arg $ARGV[0]\n\n";
    }

    shift;
}

if (@ARGV == 2) {
    $fasta_in  = shift;
    $fasta_out = shift;

    if (!-s $fasta_in) {
        $trouble = 1;
        warn "Input file $fasta_in does not exist\n";
    }
}

if ($trouble || @ARGV) {
    warn qq(There were invalid arguments: ), join(" ", @ARGV), qq(\n\n);
    pod2text($0);
    die "aborted";
}

my $fig = new FIG;
if (!$outD){
    pod2text($0);  exit(0);
}

# run metagene on the raw input file
my @metagene_args;
if ($tool eq "mga") {
    push (@metagene_args, "-s");
}
push (@metagene_args, ($fasta_in, "> $outD/metagene.out"));

$fig->run(join(" ", $exec, @metagene_args));


if (!$featuremap)
{
    $featuremap = "$outD/metagene_seqs.map";
}

if ($featurecalled)
{
    open (CALLED_BY, ">>$featurecalled");
}

open (FH_MG, ">>$featuremap");    # write the sequences that need to be divided into subsequences in this file

# parse the metagene output file
my $metagene_hits = {};
if ($tool eq "metagene")
{
    $/ = "\n\n";
}
open (FH, "$outD/metagene.out");
my $count=$startpeg;

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# ... Parse Metagene / MetageneAnnotator ouput ...
#-----------------------------------------------------------------------
my @mg_hits = ();
if ($tool eq "metagene")
{
    while (my $block = <FH>){
	my ($contig_id) = ($block) =~ m/^\# (.*?)\n/;
	my @lines = split (/\n/, $block);
	if (@lines >= 4){   # write into a file the new sequences
	    foreach my $line (@lines){
		next if ($line =~ /^\#/);
		
		chomp $line;
		my ($start, $end, $strand, undef, $trunc) = split (/\t/, $line);
		push @mg_hits, [$contig_id, $start, $end, $strand, $trunc, $line];
		
#  		$metagene_hits->{$contig_id}->{$new_fid}->{start}  = $map_start;
# 		$metagene_hits->{$contig_id}->{$new_fid}->{stop}   = $map_end;
# 		$metagene_hits->{$contig_id}->{$new_fid}->{strand} = $strand;
	    }
	}
    }
}
elsif ($tool eq "mga")
{
    my $line = <FH>;
    while ($line && $line =~ m/^\#\s+(\S+)/)
    {
	my $contig_id = $1;
	$line= <FH>; $line= <FH>;
	$line =  <FH>;
	while ($line && $line !~ /^\#/)
	{
	    chomp $line;
	    my (undef, $start, $end, $strand, $frame, $trunc) = split (/\t/, $line);
	    push @mg_hits, [$contig_id, $start, $end, $strand, $frame, $trunc, $line];
	    
# 	    $metagene_hits->{$contig_id}->{$new_fid}->{start}  = $map_start;
# 	    $metagene_hits->{$contig_id}->{$new_fid}->{stop}   = $map_end;
# 	    $metagene_hits->{$contig_id}->{$new_fid}->{strand} = $strand;

	    $line = <FH>;
	}
    }
}
close (FH);
$/ = "\n" if ($tool eq "metagene");


foreach my $call (@mg_hits) {
    my ($contig_id, $start, $end, $strand, $frame, $trunc, $line) = @$call;
    
    my ($map_start, $map_end, $sign);
    
    if ($strand eq "+") {
	$map_start = $start;
	$map_end   = $end;
    }
    else {
	$map_start = $end;
	$map_end   = $start;
    }
    
    if    ($tool eq qq(metagene)) {
	if ($trunc ne qq(complete)) {
	    warn qq(Skipping METAGENE partial: $line\n);
	    next;
	}
    }
    elsif ($tool eq qq(mga)) {
	$sign = ($map_end <=> $map_start);
	if    ($trunc eq qq(01)) {
	    $map_start = $map_end   - $sign * ( 3 * int( (1 + abs($map_end-$map_start)) / 3) - 1);
	    warn qq(Fixed truncated START:\t($start, $end) ==> ($map_start, $map_end):\t$line\n);
	}
	elsif ($trunc eq qq(10)) {
	    $map_end   = $map_start + $sign * ( 3 * int( (1 + abs($map_end-$map_start)) / 3) - 1);
	    warn qq(Fixed truncated STOP:\t($start, $end) ==> ($map_start, $map_end):\t$line\n);
	}
	elsif ($trunc eq qq(00)) {
	    if ($strand eq "+")
	    {
		$map_start = $map_start + $frame;
	    }
	    elsif ($strand eq "-")
	    {
		$map_start = $map_start - $frame;
	    }

	    $map_end = $map_start + $sign * ( 3 * int( (1 + abs($map_end-$map_start)) / 3) - 1);
	    #warn qq(Skipping MGA double-truncated partial: $line\n);
	    warn qq(Fixed MGA double-truncated partial: $line\n);
	    #next;
	}
    }
    else {
	die qq(Unrecognized tool: $tool);
    }
    
    my $new_fid;
    if ($peg_flag) {
	$new_fid = "fig|".$genome_id.".peg.".$count;
	$count++;
    }
    else {
	$new_fid = $contig_id . "_mg_" . $map_start . "_" . $map_end;
    }
    
    $metagene_hits->{$contig_id}->{$new_fid}->{start}  = $map_start;
    $metagene_hits->{$contig_id}->{$new_fid}->{stop}   = $map_end;
    $metagene_hits->{$contig_id}->{$new_fid}->{strand} = $strand;

    print FH_MG     "$new_fid\t$contig_id"."_".$map_start."_". $map_end."\n";
    print CALLED_BY "$new_fid\t$tool\n" if ($featurecalled);
}
close (FH_MG);
close (CALLED_BY) if ($featurecalled);


if ($amino)
{
    $fig->run("get_fasta_for_tbl_entries $fasta_in < $featuremap > $fasta_out");
}
else
{
#    die qq(Disabled option --- aborting);
    
    my $fh_in  = \*STDIN;
    my $fh_out = \*STDOUT;
    
    open($fh_in,  "<$fasta_in")   || die "Could not read-open $fasta_in";
    open($fh_out, ">>$fasta_out") || die "Could not append-open $fasta_out";
    
    while (my($head, $seqP) = &get_fasta_record($fh_in)) {
#	my $contig_len = length($$seqP);
	
	if (defined ($metagene_hits->{$head})){
	    my @new_fids;
	    if ($peg_flag) {
		@new_fids = sort { FIG::by_fig_id($a,$b) } keys %{$metagene_hits->{$head}};
	    }
	    else {
		@new_fids = keys %{$metagene_hits->{$head}};
	    }
	    
	    foreach my $new_fid ( @new_fids){
		#if ($new_fid eq "fig|441534.3.peg.1805")
		#{
		#    print "debug_time\n";
		#}
		my $start  = $metagene_hits->{$head}->{$new_fid}->{start};
		my $stop   = $metagene_hits->{$head}->{$new_fid}->{stop};
		my $strand = $metagene_hits->{$head}->{$new_fid}->{strand};
		
		my ($mg_contig);
		if ($start > $stop)
		{
		    $mg_contig = substr($$seqP, $stop-1, $start-($stop-1));
		}
		else
		{
		    $mg_contig = substr($$seqP, $start-1, $stop-($start-1));
		}
		if ($strand eq "-") {
		    $mg_contig = &FIG::rev_comp( \$mg_contig );
		    $mg_contig = $$mg_contig;
		}
		&display_id_and_seq( $fig, $new_fid, \$mg_contig, $fh_out, $amino );
	    }
	}
	elsif (!$peg_flag)
	{
	    &display_id_and_seq( $fig, $head, $seqP, $fh_out, $amino );
	}
    }
    close ($fh_in);
    close ($fh_out);
}

sub get_fasta_record {
    my ( $fh ) = @_;
    my ( $old_eol, $entry, @lines, $head, $seq);
    
    if (not defined($fh))  { $fh = \*STDIN; }
    $old_eol = $/;
    $/ = "$input_eol_marker>";
    
    my @record = ();
    if (defined($entry = <$fh>)) {
	chomp $entry;
	@lines =  split( /$input_eol_marker/, $entry );
	while (@lines and not $lines[0])  { shift @lines; }
	
	$head  =  shift @lines;
	if ($first_line) {
	    $first_line = 0;
	        
	    if (not ($head  =~ s/^\s*>//)) {
		$trouble = 1;
		warn $head;
		die "ERROR: File does not appear to be in FASTA format\n";
	    }
	}
	else {
	    if ($head  =~ s/^\s*>//) {
		$trouble = 1;
		warn $head;
		die "Spurious beginning-of record mark found in record $.\n";
	    }
	}
	
	foreach my $ln (@lines) {
	    $_  =  $ln;
	    $ln =~ s/\s//g;
	        
	    print STDERR "$head: contains X's\n"    if ($ln =~ s/x/n/ig);
	    print STDERR "$head: contains colons\n" if ($ln =~ s/://g);
	        
	    while ($ln =~ s/([^ACGTUMRWSYKBDHVN]+)/n/i) {
		$trouble = 1;
		$badchar++;
		print STDERR ">$head:\tbad char $1 at ", pos($ln), " at line $.\n";
	    }
	}
	
	$seq   =  join( "", @lines );
	$seq   =~ s/\cM//g;
	$seq   =~ tr/a-z/A-Z/;
	@record = ($head, \$seq);
    }
    
    $/ = $old_eol;
    return @record;
}


sub display_id_and_seq {
    my( $fig, $id, $seq, $fh, $amino ) = @_;
    my ( $i, $n, $ln );
    
    if (! defined($fh) )  { $fh = \*STDOUT; }
    
    print $fh ">$id\n";
    my $dna_seq = $$seq;
    my $sequence;
    
    if ($amino) {
	my $code_number = 11;
	my $change_start_to_M=1;
	#my $genetic_code_hash = $fig->genetic_code($code_number);
	my $genetic_code_hash = $fig->standard_genetic_code();
	$sequence = $fig->translate($dna_seq, $genetic_code_hash, $change_start_to_M);
    }
    else {
	$sequence = $dna_seq;
    }
    
    $n = length($sequence);
#   confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
    for ($i=0; ($i < $n); $i += $width) {
	if (($i + $width) <= $n) {
	    $ln = substr($sequence,$i,$width);
	}
	else {
	    $ln = substr($sequence,$i,($n-$i));
	}
	
	print $fh "$ln\n";
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3