[Bio] / FigKernelScripts / k-class-get-bork-universals.pl Repository:
ViewVC logotype

View of /FigKernelScripts/k-class-get-bork-universals.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Fri Apr 17 18:00:27 2015 UTC (4 years, 6 months ago) by jdavis
Branch: MAIN
CVS Tags: HEAD
This gets a set of universal proteins for a genome using blast

#! /usr/bin/env perl 

use strict;
use Universal_Seqs;
use gjoseqlib;
use BlastInterface;
use Data::Dumper;
use Getopt::Long;

my $usage = 'get_bork_universals.pl [options] contigs > universals.fasta
    options:
    -p PSEEDmode, the program tries to give the file name as the header line in the fasta file
        This wont work for the pseed because it is /vol/pseed/FIGdisk/FIG/Data/Organisms/83333.1/contigs
        and everything would get named "contigs". If you declare -p, it will grab the second to last 
        direcory and use it instead.
';

my ($individual, $help, $pseed);
my $opts = GetOptions('h'   => \$help,
                      'i'   => \$individual, 
                      'p'   => \$pseed);
                      
if ($help){die "$usage\n"}

my @seqs = @Universal_Seqs::Universal_Seqs;

my $file = shift @ARGV;
my $name;
if  ($pseed)
{
    $name = $file;
    $name =~ s/\/vol\/pseed\/FIGdisk\/FIG\/Data\/Organisms\///g; 
    $name =~ s/\/.+//g;
}
else
{
    $name = $file;
    $name =~ s/.+\///g; 
}

open (IN, "$file") or die "cant open $file";
my @contigs = gjoseqlib::read_fasta(\*IN);


unless ( @seqs ){die "Universal sequences is empty\n"}

my %opts = ( #blastplus     => 1,
			 evalue        => 0.001,
			 #minIden       => $IS_Data{$IS_Element}{end_min_id},
			 #minCovS       => $IS_Data{$IS_Element}{end_min_cov},
			 num_threads   => 8);                          

my @hsps    = &BlastInterface::blastx( \@contigs, \@seqs, \%opts );


# Get best hsps.  I didn't turn this into a subroutine 
#  because I have the query and subj inverted in this use case.
# @hsps looks like this:
#----------------------------------------------------
# ( qid, sid, %id, alilen, mismatch, gaps, qstart, qend, sstart, send, eval, bit, qlen, slen)
#    0    1    2      3       4        5      6      7      8      9    10    11    12   13
#----------------------------------------------------
my %seen;
my @best_hsps;
for my $i (0..$#hsps)
{
	my $peg = $hsps[$i][1];
	unless (exists $seen{$peg})
	{
		my $id    = $hsps[$i][0];
		my $begin = $hsps[$i][6];
		my $end   = $hsps[$i][7];			
		push @best_hsps, [$peg, $id, $begin, $end];
		$seen{$peg} = 1;
	}
}
my @best_hsps_sorted = sort { $a->[0] cmp $b->[0] } @best_hsps;	

my $cat_seq;
for my $i (0..$#best_hsps_sorted)
{
	my @subseq = &subseq ($best_hsps_sorted[$i][1], $best_hsps_sorted[$i][2], $best_hsps_sorted[$i][3], @contigs);
	
	if ($individual)
	{
		&gjoseqlib::print_alignment_as_fasta (@subseq);
	}
	else
	{
		$cat_seq = $cat_seq.$subseq[0][2];
		
	}	
}

my @cat = [$name,,$cat_seq];
&gjoseqlib::print_alignment_as_fasta (@cat);


#----------------------------------------------------
# sub subseq gets a subsequence from a gary-formatted tuple
# adds the location to the comment as "begin_end".
# my @subseq = subseq ($id, $begin, $end, @sequence);
#  Watch out make sure you have your start and stop right to get
#  Them all out in the same direction.
#----------------------------------------------------
sub subseq
{
	my $cont_id =  shift  @_;
	my $begin = shift @_;
	my $end = shift @_;
	my @seq = @_;

	my @contig = grep{$_->[0] eq $cont_id}@seq;
	my $len = abs($end - $begin);
	my $start = $begin;
	
	my $subseq;
	if ($begin > $end)
	{
		$start = $end;
	}
   
	my $segmentf = substr $contig[0][2],($start - 1),($len + 1);
	my $subseq = $segmentf;
	
	if ($begin > $end)
	{
		my $segmentr = 	&gjoseqlib::complement_DNA_seq( $subseq );
		$subseq = $segmentr; 
	}
		
	my @new = [$cont_id, "$begin"."_"."$end", $subseq];
	
	return @new;
} 

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3