[Bio] / StandaloneTools / parse_refseq_without_FIG.pl Repository:
ViewVC logotype

View of /StandaloneTools/parse_refseq_without_FIG.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Wed Aug 30 21:30:44 2006 UTC (13 years, 1 month ago) by overbeek
Branch: MAIN
CVS Tags: HEAD
Script to parse RefSeq files into SEED skeletons. -- /gdp

#!/usr/bin/perl -w

use Carp;
use Data::Dumper;


$usage = "usage: parse_genbank [-genome=Genome_Bioname] [-tax=taxonomy] Genome_ID Dir genbank.entry";

while (@ARGV && ($ARGV[0] =~ m/^-/))
{
    if    ($ARGV[0] =~ m/^-{1,2}genome=\S+/) 
    {
	$force_genome =  shift @ARGV;
	$force_genome =~ s/^-{1,2}genome=\"?//;
	$force_genome =~ s/\"$//;
	$force_genome =~ s/\s+/ /g;
	
	print STDERR "\nGenome bioname will be taken as: \"$force_genome\"\n";
    }
    elsif ($ARGV[0] =~ m/^-{1,2}tax=\S+/) 
    {
	$force_tax =  shift @ARGV;
	$force_tax =~ s/^-{1,2}tax=\"?//;
	$force_tax =~ s/\"$//;
	$force_tax =~ s/\s+/ /g;
	if ($force_tax !~ m/\.$/) { $force_tax .= "."; }
	
	print STDERR "\nTaxonomy will be taken as: \"$force_tax\"\n";
    }
    else
    {
	die "Could not handle $ARGV[0]";
    }
}


(
 ($org_id = shift @ARGV) &&
 ($dir    = shift @ARGV) &&
 ($input  = shift @ARGV)
)
    || die $usage;

if (!-d $dir) { system "rm -fR $dir/*"; }

($locs,$trans,$assgn,$aliases) = &pass1($input,$dir);
# print STDERR "Contents of assgn:\n", Dumper($assgn);
&pass2($org_id,$dir,$input,$locs,$trans,$assgn,$aliases);

if (! -s "$dir/contigs")            { unlink("$dir/contigs"); print STDERR "no contigs in $dir\n"; }
if (! -s "$dir/assigned_functions") { unlink("$dir/assigned_functions"); print STDERR "no assigned_functions in $dir\n"; }
if (! -s "$dir/Features/peg/tbl")   { 
    system "rm -rf $dir/Features/peg"; 
    print STDERR "no PEGs in $dir --- deleting empty files\n"; 
}
if (! -s "$dir/Features/rna/tbl")   { 
    system "rm -rf $dir/Features/rna"; 
    print STDERR "no RNAs in $dir --- deleting empty files\n"; 
}
# if ((! -s "$dir/contigs") && (! -s "$dir/Features/peg/tbl")) { system "rm -rf $dir" }

exit 0;


sub pass1 {
    my($input) = @_;
    
    print STDERR "\nBeginning pass1\n";
    
    &verify_dir("$dir/Features/peg");
    &verify_dir("$dir/Features/rna");
    open(CONTIGS,">$dir/contigs")  || die "could not open $dir/contigs";
    open(TAX,">$dir/TAXONOMY")     || die "could not open $dir/TAXONOMY";
    open(GENOME,">$dir/GENOME")    || die "could not open $dir/GENOME";
    open(PROJECT,">$dir/PROJECT")  || die "could not open $dir/PROJECT";
    print PROJECT "Parsed NCBI Reference Sequences\n";
    close(PROJECT);
    open(TBLRNA,">$dir/Features/rna/tbl")     || die "could not open $dir/Features/rna/tbl";
    open(FASTARNA,">$dir/Features/rna/fasta") || die "could not open $dir/Features/rna/fasta";

    $idNr = 1;
    $prefixR = "fig|$org_id.rna.";

    open(IN,"<$input") || die "aborted";
    $/ = "\n//\n";
    my $locs    = {};
    my $trans   = {};
    my $assgn   = {};
    my $aliases = {};
    
    while ($record = <IN>)
    {
	if ($record =~ /LOCUS\s+(\S+)/s)
	{
	    $id = $1;
	    $is_dna = ($record =~ /[^\n]+\sDNA\s/);
	    
	    if ($record =~ /\n {0,4}ORGANISM\s+(\S[^\n]+\S)((\n\s{10,14}\S[^\n]+\S)+)/s)
	    {
		$genome = $1;
		$tax =  $2;
		$tax =~ s/\n\s+//g;
		$tax =~ s/ {2,}/ /g;
		$tax =~ s/^ {1,}//;
		if (! $written_genome)
		{
		    if ($force_genome) { $genome = $force_genome; }
		    print GENOME "$genome\n";
		    close(GENOME);
		    if ($force_tax) { $tax = $force_tax; }
		    print TAX "$tax\n";
		    close(TAX);
		    
		    $written_genome = "$tax\t$genome";
		}
		elsif (($written_genome ne "$tax\t$genome") && (!$force_genome || !$force_tax)) 
		{
		    print STDERR "serious mismatch in GENOME/TAX for $id\n$written_genome\n$tax\t$genome\n\n";
		}
	    }
	    
	    $is_virus = ($tax =~ m/Virus/);
	    if (($is_dna || $is_virus) && ($record =~ /\nORIGIN(.*?)(\n\/\/|\nLOCUS)/s))
	    {
		$seq = $1;
		$seq =~ s/\s//gs;
		$seq =~ s/\d//g;
		&display_id_and_seq($id,\$seq,\*CONTIGS);
	    }
	    else
	    {
		$seq = "";
		print STDERR "could not find sequence for $id\n" if $ENV{VERBOSE};
	    }
	    
	    if    ((@genes = split /\n {4,6}gene {8,}/s, $record) > 1) { shift @genes; }
	    elsif ((@genes = split /\n {4,6}CDS {9,}/s,  $record) > 1) { shift @genes; @genes = map { "\n     CDS             $_" } @genes; }
	    else  { 
		@genes = ();
		print STDERR "Could not find any gene entries in $id\n";
	    }
	    print STDERR "Read ", (scalar @genes), " genes from $id\n" if $ENV{VERBOSE};
	    print STDERR join("\n//\n", @genes), "\n//\n" if ($ENV{VERBOSE} && ($ENV{VERBOSE} > 1));

	    $num_rec = $num_cds = $num_rna = 0;
	    foreach $gene (@genes)
	    {
		next if ($gene =~ /\/pseudo/);
		
		$flag = 0;
		++$num_rec;
		while ($gene =~ /\n\s{4,6}CDS\s+([^\n]+(\n {20,}[^\n]*)+)/gs)
		{
		    ++$flag;
		    ++$num_cds;
		    $cds = $1;
		    if (($cds !~ /\/pseudo/) && (($cds !~ /\/exception/) || ($cds =~ /\/translation/)))
		    {
			&process_cds($id,$is_dna,\$cds,$locs,$trans,$assgn,$aliases);
		    }
		}
	    
		while ($gene =~ /\n\s{3,6}\S+RNA\s+([^\n]+(\n\s{20,22}\S[^\n]*)+)/gs)
		{
		    ++$flag;
		    ++$num_rna;
		    $rna = $1;
		    &process_rna($id,\$rna,$prefixR,\$idNr,\$seq,\*TBLRNA,\*FASTARNA);
		}

		if ($flag != 1)
		{
		    print STDERR "Variant record:\n$gene\n\n";
		}
	    }
	    print STDERR "From $id, read $num_rec records, $num_cds CDSs, $num_rna RNAs\n" if $ENV{VERBOSE};
	    $tot_rec += $num_rec;
	    $tot_cds += $num_cds;
	    $tot_rna += $num_rna;
	}
    }
    print STDERR "Read $tot_rec records, $tot_cds CDSs, $tot_rna RNAs\n";
    
    close(IN);
    close(TBLRNA);
    close(FASTARNA);
    close(CONTIGS);

    return ($locs,$trans,$assgn,$aliases);
}

sub prot_id {
    my($cdsP) = @_;

    if ($$cdsP =~ /\/protein_id=\"([^\"]+)\"/s)
    {
	return $1;
    }
    return "";
}

sub process_cds {
    my($contigID,$is_dna,$cdsP,$locs,$trans,$assgn,$aliases) = @_;
    my($protid,$loc,$precise,@aliases,$func,$seq);
    
    if ($protid = &prot_id($cdsP)) {} else { print STDERR "Problem with entry:", Dumper($cdsP); return; }
    
    if (($loc,$precise) = &get_loc($contigID,$cdsP))
    {
	if (!defined($locs->{$protid}))     { $locs->{$protid}    = []; }
	push(@{$locs->{$protid}},[$is_dna,$loc]);
    }
    
    if (@aliases = &get_aliases($cdsP))
    {
	if (!defined($aliases->{$protid}))  { $aliases->{$protid} = []; }
	push(@{$aliases->{$protid}},[@aliases]);
    }
    
    if ($func    = &get_func($cdsP))
    {
	if (!defined($assgn->{$protid}))    { $assgn->{$protid}   = []; }
	push(@{$assgn->{$protid}},$func);
    }
    
    if (defined($seq = &get_trans($cdsP)))
    {
	if (!defined($trans->{$protid}))    { $trans->{$protid}   = []; }
	my $x = $trans->{$protid};
	my $i;
	for ($i=0; $i < @$x; ++$i) { last if ($x->[$i] eq $seq); }
	unless ($i < @$x) { push(@$x, $seq); }
    }
    else
    {
	print STDERR "Could not extract translation from:\n$$cdsP\n";
    }
}

sub get_loc {
    my($contigID,$cdsP) = @_;
    my($beg,$end,$loc,$locS,$iter,$n,$args,@pieces);
    my($precise);

    if ($$cdsP =~ /^(([^\n]+)((\n\s{21,23}[^\/ ][^\n]+)+)?)/s)
    {
	$locS = $1;
	$locS =~ s/\s//g;
	$precise = ($locS !~ /[<>]/);

	@pieces = ();
	$n = 0;
	$iter = 500;
	while (($locS !~ /^\[\d+\]$/) && $iter)
	{
	    $iter--;
	    if ($locS =~ s/[<>]?(\d+)\.\.[<>]?(\d+)/\[$n\]/)
	    {
		push(@pieces,["loc",$1,$2]);
		$n++;
	    }

	    if ($locS =~ s/([,\(])(\d+)([,\)])/$1\[$n\]$3/)
	    {
		push(@pieces,["loc",$2,$2]);
		$n++;
	    }

	    if ($locS =~ s/join\((\[\d+\](,\[\d+\])*)\)/\[$n\]/)
	    {
		$args = $1;
		push(@pieces,["join",map { $_ =~ /^\[(\d+)\]$/; $1 } split(/,/,$args)]);
		$n++;
	    }

	    if ($locS =~ s/complement\((\[\d+\](,\[\d+\])*)\)/\[$n\]/g)
	    {
		$args = $1;
		push(@pieces,["complement",map { $_ =~ /^\[(\d+)\]$/; $1 } split(/,/,$args)]);
		$n++;
	    }
	}
	if ($locS =~ /^\[(\d+)\]$/)
	{
	    $loc = &conv(\@pieces,$1,$contigID);
	}
	else
	{
	    print STDERR Dumper(["could not parse $locS $iter",$cdsP]);
	    die "aborted";
	}

	my @locs = split(/,/,$loc);
#	&trim_stop(\@locs);
	$loc = join(",",@locs);
    }
    else
    {
	print STDERR Dumper($cdsP); die "could not parse location";
	die "aborted";
    }
    return ($loc,$precise);
}

sub trim_stop {
    my($locs) = @_;
    my($to_trim,$n);

    $to_trim = 3;
    while ($to_trim && (@$locs > 0))
    {
	$n  = @$locs - 1;
	if ($locs->[$n] =~ /^(\S+)_(\d+)_(\d+)$/)
	{
	    if ($2 <= ($3-$to_trim))
	    {
		$locs->[$n] = join("_",($1,$2,$3-$to_trim));
		$to_trim = 0;
	    }
	    elsif ($2 >= ($3 + $to_trim))
	    {
		$locs->[$n] = join("_",($1,$2,$3+$to_trim));
		$to_trim = 0;
	    }
	    else
	    {
		$to_trim -= abs($3-$2) + 1;
		pop @$locs;
	    }
	}
	else
	{
	    die "could not parse $locs->[$n]";
	}
    }
}


sub conv {
    my($pieces,$n,$contigID) = @_;

    if ($pieces->[$n]->[0] eq "loc")
    {
	return join("_",$contigID,@{$pieces->[$n]}[1..2]);
    }
    elsif ($pieces->[$n]->[0] eq "join")
    {
	return join(",",map { &conv($pieces,$_,$contigID) } @{$pieces->[$n]}[1..$#{$pieces->[$n]}]);
    }
    elsif ($pieces->[$n]->[0] eq "complement")
    {
	return join(",",&complement(join(",",map { &conv($pieces,$_,$contigID) } @{$pieces->[$n]}[1..$#{$pieces->[$n]}])));;
    }
}

sub complement {
    my($loc) = @_;
    my(@locs);

    @locs = reverse split(/,/,$loc);
    foreach $loc (@locs)
    {
	$loc =~ /^(\S+)_(\d+)_(\d+)$/;
	$loc = join("_",($1,$3,$2));
    }
    return join(",",@locs);
}

sub get_aliases {
    my($cdsP) = @_;
    my($db_ref);

    my @aliases = ();
    while ($$cdsP =~ /\/(protein_id|gene|locus_tag)=\"([^\"]+)\"/sg)
    {
	push(@aliases,$2);
    }
    
    while ($$cdsP =~ /\/db_xref=\"([^\"]+)\"/sg)
    {
	$db_ref = $1;
	$db_ref =~ s/^GI:/gi\|/;
	$db_ref =~ s/SWISS-PROT:/sp\|/;
	push(@aliases,$db_ref);
    }

    return @aliases;
}

sub get_trans {
    my($cdsP) = @_;
    my $tran = undef;
    
    if ($$cdsP =~ /\/translation=\"([^\"]*)\"/s)
    {
	$tran = $1;
	$tran =~ s/\s//gs;
    }
    elsif ($$cdsP =~ /\/protein_id=\"([^\"]+)\"/)
    {
	my $tran = "";
	#...nothing else needs to be done
    }
    else
    {
	print STDERR "No translation found in:\n$$cdsP\n";
    }
    
    return $tran;
}

sub get_func {
    my($cdsP) = @_;
    my $func = "";
    
    if (($$cdsP =~ /\/product=\"([^\"]*)\"/s) && ($func = &reduce_func($1)) && &ok_func($func) && ($func =~ / /))
    {
        $func =~ s/\n/ /gs;
	$func =~ s/\s+/ /gs;
    }
    elsif (($$cdsP =~ /\/function=\"([^\"]*)\"/s) && ($func = &reduce_func($1)) && &ok_func($func))
    {
        $func =~ s/\n/ /gs;
	$func =~ s/\s+/ /gs;
    }
    elsif (($$cdsP =~ /\/note=\"([^\"]*)\"/s) && ($func = &reduce_func($1)) && &ok_func($func))
    {
        $func =~ s/\n/ /gs;
	$func =~ s/\s+/ /gs;
    }
    else
    {
	$func = "";
    }
    
    $func = &fixup_func($func);
    return $func;
}

sub fixup_func {
    my($func) = @_;

    $func =~ s/^COG\d+:\s+//i;
    $func =~ s/^ORFID:\S+\s+//i;
    
    $func =~ s/\n/ /gs;
    $func =~ s/\s+/ /gs;
    
    return $func;
}

sub ok_func {
    my($func) = @_;

    return (
            ($func !~ /^[a-zA-Z]{1,3}\d+[a-zA-Z]{0,3}(\.\d+([a-zA-Z])?)?$/o) &&
            ($func !~ /^\d+$/o) &&
            ($func !~ /ensangp/io) &&
            ($func !~ /agr_/io) &&
	    ($func !~ /^SC.*:/o) &&
	    ($func !~ /^RIKEN/o) &&
	    ($func !~ /\slen: \d+/o) &&
	    ($func !~ /^similar to/io) &&
	    ($func !~ /^CPX/io) &&
	    ($func !~ /^\S{3,4}( or \S+ protein)?$/io) &&
	    ($func !~ /^putative$/o) &&
	    ($func !~ /\; putative$/o) &&
	    ($func !~ /\scomes from/io) &&
	    ($func !~ /^unknown( function)?$/io) &&
	    ($func !~ /^(hypothetical|unknown|predicted|expressed)( (function|protein))?$/io) &&
            ($func !~ /^orf /io) &&
            ($func !~ /^ORF\s?\d+[lr]?$/io) &&
            ($func !~ /^[a-z]{1,3}\s?\d+$/io) &&
	    ($func !~ /^conserved hypothetical protein$/io) &&
	    ($func !~ /uncharacterized( conserved)? protein/io) &&
	    ($func !~ /(ORF )?(identified|located|predicted) by (sequence similarity|GLIMMER|GeneMark|CRITICA)/io) &&
	    ($func !~ /^conserved hypothetical protein$/io)
	   )
}

sub reduce_func {
    my ($func) = @_;
    
    $func =~ s/\n/ /gs;
    $func =~ s/\s+/ /gs;
    
    my @funcs = split /(\;\s+|\s+\/\s+)/, $func;
    
    my @out = ();
    my $subfunc;
    foreach $subfunc (@funcs)
    {
	$subfunc =~ s/^\([A-Z]+\d+\) //o;
	
	next if ($subfunc =~ /^COG\d+/o);
	next if ($subfunc =~ /PFAM\:\d+/i);
	next if ($subfunc =~ /^contains Pfam profile/i);
	next if ($subfunc =~ /^[a-zA-Z]{1,3}\d+[a-zA-Z]{0,3}(\.\d+([a-zA-Z])?)?$/o);
	next if ($subfunc =~ /^\d+$/o);
	next if ($subfunc =~ /ensangp/io);
	next if ($subfunc =~ /agr_/io);
	next if ($subfunc =~ /^SC.*:/o);
	next if ($subfunc =~ /^RIKEN/o);
	next if ($subfunc =~ /\slen: \d+/o);
	next if ($subfunc =~ /^similar to/io);
	next if ($subfunc =~ /^CPX/io);
	next if ($subfunc =~ /^\S{3,4}( or \S+ protein)?$/io);
	next if ($subfunc =~ /^(putative|probable|hypothetical|unknown)$/o);
	next if ($subfunc =~ /\scomes from/io);
	next if ($subfunc =~ /^unknown( function)?$/io);
	next if ($subfunc =~ /^(putative|probable|hypothetical|unknown|predicted|expressed|conserved).* (protein|gene|coding region)/io);
	next if ($subfunc =~ /^orf,? /io);
	next if ($subfunc =~ /^ORF\s?\d+[lr]?$/io);
	next if ($subfunc =~ /^[a-z]{1,3}\s?\d+$/io);
	next if ($subfunc =~ /^INSERTION ELEMENT/);
	next if ($subfunc =~ /^IS\d+/o);
	next if ($subfunc =~ /^Identified by .*homology/io);
	next if ($subfunc =~ /^conserved hypothetical( .*KD)?( protein)?$/io);
	next if ($subfunc =~ /^uncharacterized( conserved)?( .*KD)? protein/io);
	next if ($subfunc =~ /^(ORF )?(identified|located|predicted) (by|using) (sequence similarity|BLASTX|TBLASTN|GLIMMER|Gen[e]?Mark.*|CRITICA)/io);
	next if ($subfunc =~ /no significant/io);
	next if ($subfunc =~ /unnamed protein product/io);
	next if ($subfunc =~ /^not classified/io);
	next if ($subfunc =~ /^unusual splice$/io);
	next if ($subfunc =~ /^unique hypothetical$/io);
	next if ($subfunc =~ /unknown/io);
	next if ($subfunc =~ /^conserved within genome/io);
	next if ($subfunc =~ /^narrowly conserved/io);
	next if ($subfunc =~ /^elements of external origin/io);
	next if ($subfunc =~ /^observed by proteomics citation/io);
	next if ($subfunc =~ /^some similarities/io);
	next if ($subfunc =~ /^similarity to/io);
	next if ($subfunc =~ /^corresponds to/io);
	next if ($subfunc =~ /^early protein$/io);
	next if ($subfunc =~ /^go_component\: /io);
	next if ($subfunc =~ /^Function Code\:/io);
	next if ($subfunc =~ /^overriding stop codon/io);
	next if ($subfunc =~ /^Accession /io);
	next if ($subfunc =~ /^gln\|/);
	next if ($subfunc =~ /\%\s*identity in.*overlap/io);
	next if ($subfunc =~ /FASTA scores\:/io);
	next if ($subfunc =~ /FASTA hit/io);
	next if ($subfunc =~ /blast. (match|hit)/io);

	$subfunc =~ s/^putative//io;
	$subfunc =~ s/^probable //io;
	$subfunc =~ s/^predicted //io;
	$subfunc =~ s/^(conserved )?hypothetical( protein)?( .*KD)?//io;
	$subfunc =~ s/^Uncharacterized //io;
	$subfunc =~ s/^unknown,? //io;
	
	$subfunc =~ s/, putative$//io;
	$subfunc =~ s/ \(putative\)$//io;
	$subfunc =~ s/, probable .*\{imported\}.*$//io;
	$subfunc =~ s/,? hypothetical protein$//io;
	$subfunc =~ s/,? similar to .*$//io;
	$subfunc =~ s/,? in .* region.*$//io;
	$subfunc =~ s/,? FOR INSERTION( SEQUENCE)? ELEMENT IS\d+$//io;
	
	$subfunc =~ s/^TRANSMEMBRANE PROTEIN$/Transmembrane protein/io;
	$subfunc =~ s/^TRANSCRIPTIONAL REGULATORY PROTEIN$/Transcriptional regulatory protein/io;
	    
	next if ($subfunc =~ m/^\.$/);
	next if ($subfunc =~ /^(putative|probable|hypothetical|conserved|unknown|highly|expressed|enzyme|miscellaneous|weakly|transport|phenotype)$/io);
	next if ($subfunc =~ /^STRUCTURAL ELEMENTS$/io);
	next if ($subfunc =~ /^Cell Exterior$/io);

	if ($subfunc =~ /\S/) { push @out, $subfunc; }
    }
    
    return join(" / ", @out);
}

sub process_rna {
    my($contigID,$cdsP,$prefix,$idNr,$seq,$fh_tbl,$fh_dna) = @_;
    my($loc,@aliases,$func,$trans,$id,$precise);
    
    ($loc,$precise)  = &get_loc($contigID,$cdsP);
    $func    = &get_rna_func($cdsP);
    
    if ($loc)
    {
	my $dna  = &extract_seq($seq,$loc);
	$id = $prefix . "$$idNr";
	$$idNr++;
	if (! $func )
	{
	    $func = "";
	    print STDERR "Could not get func from:\n$$cdsP\n";
	}
	
	if (defined($dna) && $dna && length($dna))
	{
	    print $fh_tbl "$id\t$loc\t$func\n";
	    &display_id_and_seq($id,\$dna,$fh_dna);
	}
	else
	{
	    print STDERR "Could not extract DNA sequence of $loc for $id\n";
	}
    }
    else
    {
	print STDERR "No loc for:\n$$cdsP\n";
    }
}

sub get_rna_func {
    my($cdsP) = @_;
    my $func = "";

    if ($$cdsP =~ /\/product=\"([^\"]*)\"/s)
    {
	$func = $1;
	$func =~ s/\s+/ /gs;
    }
    elsif ($$cdsP =~ /\/gene=\"([^\"]*)\"/s)
    {
	$func = $1;
	$func =~ s/\s+/ /gs;
    }
    elsif ($$cdsP =~ /\/note=\"([^\"]*)\"/s)
    {
	$func = $1;
	$func =~ s/\s+/ /gs;
    }
    return $func;
}



sub pass2 {
    my ($org_id,$dir,$input,$locs,$trans,$assgn,$aliases) = @_;
    my ($func, $fig_id);
    
    open(TBLPEG,">$dir/Features/peg/tbl") || die "could not open $dir/Features/peg/tbl";
    open(FASTAPEG,">$dir/Features/peg/fasta") || die "could not open $dir/Features/peg/fasta";
    open(ASSIGNMENTS,">$dir/assigned_functions") || die "could not open $dir/assigned_functions";

    $idNp = 1;
    $prefixP = "fig|$org_id.peg.";

    print STDERR "Beginning pass2 --- ", (scalar keys %$locs), " protids in locs\n";
    
    foreach $protid (keys(%$locs))
    {
	@locs = sort { $b->[0] <=> $a->[0] } @{$locs->{$protid}};
	push(@tbl,[$protid,$locs[0]->[1]]);
    }
    
    @tbl = sort { $a->[1] =~ /^([^,]+)_(\d+)_(\d+)/;   ($c1, $b1, $e1) = ($1,$2,$3);
		  $b->[1] =~ /^([^,]+)_(\d+)_(\d+)/;   ($c2, $b2, $e2) = ($1,$2,$3);
		  (($c1 cmp $c2) || (&min($b1, $e1) <=> &min($b2, $e2)) || ($a->[0] cmp $b->[0])) } @tbl;
    
    foreach $entry (@tbl)
    {
	($protid,$loc) = @$entry;
	@aliases = ();
	if ($x = $aliases->{$protid})
	{
	    undef %aliases;
	    foreach $y (@$x)
	    {
		foreach $z (@$y)
		{
		    if (! $aliases{$z})
		    {
			$aliases{$z} = 1;
			push(@aliases,$z);
		    }
		}
	    }
	}

	$fig_id = $prefixP . $idNp;
	
	if ($x = $trans->{$protid})
	{
	    if (@$x == 0) { 
		print STDERR "WARNING: No sequence for protid $protid\n";
	    } else {
		if (@$x > 1) { 
		    print STDERR "WARNING: Multiple sequences for protid $protid\n";
		    print STDERR join("\n", @$x), "\n\n";
		}
		
		++$idNp;
		print TBLPEG   "$fig_id\t$loc\t",join("\t",@aliases),"\n";
		print FASTAPEG ">$fig_id\n$x->[0]\n";
	    }
	}
	
	if (defined($x = $assgn->{$protid}) && @$x)
	{
	    $func = join(" / ", @$x);
	    print ASSIGNMENTS "$fig_id\t$func\n" if $func;
	}
    }
    close(TBLPEG);
    close(FASTAPEG);
    close(ASSIGNMENTS);
}

sub verify_dir {
    my($dir) = @_;

    if (-d $dir) { return }
    if ($dir =~ /^(.*)\/[^\/]+$/)
    {
        &verify_dir($1);
    }
    mkdir($dir,0777) || die "could not make $dir";
    chmod 0777,$dir;
}

sub display_id_and_seq {
    my( $id, $seq, $fh ) = @_;

    if (! defined($fh) )  { $fh = \*STDOUT; }

    print $fh ">$id\n";
    &display_seq($seq, $fh);
}

sub display_seq {
    my ( $seq, $fh ) = @_;
    my ( $i, $n, $ln );

    if (! defined($fh) )  { $fh = \*STDOUT; }

    $n = length($$seq);
#   confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
    for ($i=0; ($i < $n); $i += 60)
    {
        if (($i + 60) <= $n)
        {
            $ln = substr($$seq,$i,60);
        }
        else
        {
            $ln = substr($$seq,$i,($n-$i));
        }
        print $fh "$ln\n";
    }
}

sub extract_seq {
    my($seqP,$loc) = @_;
    my($contig,$beg,$end,$contig_seq);
    my($plus,$minus,$len);
    
    $plus = $minus = 0;
    my $strand = "";
    my @loc = split(/,/,$loc);
    my @subseq = ();
    
    if (!defined($seqP))
    {
	warn "Undefined sequence pointer";
	return "";
    }
    
    if (!defined($$seqP))
    {
	warn "Undefined sequence";
	return "";
    }
    
    $len = length($$seqP);
    if ($len == 0)
    {
	warn "Zero-length sequence";
	return "";
    }
    
    foreach $loc (@loc)
    {
        if ($loc =~ /^\S+_(\d+)_(\d+)$/)
        {
            if ($1 < $2)
            {
                $plus++;
            }
            elsif ($2 < $1)
            {
                $minus++;
            }
        }
    }
    
    if ($plus > $minus)
    {
        $strand = "+";
    }
    elsif ($plus < $minus)
    {
        $strand = "-";
    }

    foreach $loc (@loc)
    {
        if ($loc =~ /^(\S+)_(\d+)_(\d+)$/)
        {
            ($contig,$beg,$end) = ($1,$2,$3);
	    if (($beg <= $len) && ($end <= $len))
	    {
		if (($beg < $end) || (($beg == $end) && ($strand eq "+")))
		{
		    $strand = "+";
		    push(@subseq,substr($$seqP,$beg-1,($end+1-$beg)));
		}
		else
		{
		    $strand = "-";
		    push(@subseq,&reverse_comp(substr($$seqP,$end-1,($beg+1-$end))));
		}
	    }
	    else
	    {
		print STDERR "Invalid locus $loc (contig length $len)\n";
	    }
        }
	else
	{
	    print STDERR "Malformed locus $loc\n";
	}
    }
    return join("",@subseq);
}

sub translate {
    my( $dna,$code,$start) = @_;
    my( $i,$j,$ln );
    my( $x,$y );
    my( $prot );

    if (! defined($code))
    {
        $code = &standard_genetic_code;
    }
    $ln = length($dna);
    $prot = "X" x ($ln/3);
    $dna =~ tr/a-z/A-Z/;

    for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++)
    {
        $x = substr($dna,$i,3);
        if ($y = $code->{$x})
        {
            substr($prot,$j,1) = $y;
        }
    }

    if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/))
    {
        substr($prot,0,1) = 'M';
    }
    return $prot;
}

sub standard_genetic_code {

    my $code = {};

    $code->{"AAA"} = "K";
    $code->{"AAC"} = "N";
    $code->{"AAG"} = "K";
    $code->{"AAT"} = "N";
    $code->{"ACA"} = "T";
    $code->{"ACC"} = "T";
    $code->{"ACG"} = "T";
    $code->{"ACT"} = "T";
    $code->{"AGA"} = "R";
    $code->{"AGC"} = "S";
    $code->{"AGG"} = "R";
    $code->{"AGT"} = "S";
    $code->{"ATA"} = "I";
    $code->{"ATC"} = "I";
    $code->{"ATG"} = "M";
    $code->{"ATT"} = "I";
    $code->{"CAA"} = "Q";
    $code->{"CAC"} = "H";
    $code->{"CAG"} = "Q";
    $code->{"CAT"} = "H";
    $code->{"CCA"} = "P";
    $code->{"CCC"} = "P";
    $code->{"CCG"} = "P";
    $code->{"CCT"} = "P";
    $code->{"CGA"} = "R";
    $code->{"CGC"} = "R";
    $code->{"CGG"} = "R";
    $code->{"CGT"} = "R";
    $code->{"CTA"} = "L";
    $code->{"CTC"} = "L";
    $code->{"CTG"} = "L";
    $code->{"CTT"} = "L";
    $code->{"GAA"} = "E";
    $code->{"GAC"} = "D";
    $code->{"GAG"} = "E";
    $code->{"GAT"} = "D";
    $code->{"GCA"} = "A";
    $code->{"GCC"} = "A";
    $code->{"GCG"} = "A";
    $code->{"GCT"} = "A";
    $code->{"GGA"} = "G";
    $code->{"GGC"} = "G";
    $code->{"GGG"} = "G";
    $code->{"GGT"} = "G";
    $code->{"GTA"} = "V";
    $code->{"GTC"} = "V";
    $code->{"GTG"} = "V";
    $code->{"GTT"} = "V";
    $code->{"TAA"} = "*";
    $code->{"TAC"} = "Y";
    $code->{"TAG"} = "*";
    $code->{"TAT"} = "Y";
    $code->{"TCA"} = "S";
    $code->{"TCC"} = "S";
    $code->{"TCG"} = "S";
    $code->{"TCT"} = "S";
    $code->{"TGA"} = "*";
    $code->{"TGC"} = "C";
    $code->{"TGG"} = "W";
    $code->{"TGT"} = "C";
    $code->{"TTA"} = "L";
    $code->{"TTC"} = "F";
    $code->{"TTG"} = "L";
    $code->{"TTT"} = "F";

    return $code;
}

sub reverse_comp {
    my($seq) = @_;

    return ${&rev_comp(\$seq)};
}

sub rev_comp {
    my( $seqP ) = @_;
    my( $rev  );

    $rev =  reverse( $$seqP );
    $rev =~ tr/a-z/A-Z/;
    $rev =~ tr/ACGTUMRWSYKBDHV/TGCAAKYWSRMVHDB/;
    return \$rev;
}

sub min {
    my ($x, $y) = @_;
    
    return (($x < $y) ? $x : $y );
}

sub max {
    my ($x, $y) = @_;
    
    return (($x > $y) ? $x : $y );
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3