[Bio] / FigWebServices / diffsF.cgi Repository:
ViewVC logotype

View of /FigWebServices/diffsF.cgi

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (download) (annotate)
Thu Nov 11 17:51:51 2010 UTC (9 years ago) by disz
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.8: +1 -1 lines
FigFam 6 digit problem

# -*- perl -*-
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

use GenoGraphics;

use FIG;
my $fig = new FIG;

use FigFam;
my $default_dir = "$FIG_Config::FigfamsData";

use HTML;

use CGI;
my $cgi = new CGI;

if (0)
{
    my $VAR1;
    eval(join("",`cat /tmp/diffsF_parms`));
    $cgi = $VAR1;
#   print STDERR &Dumper($cgi);
}

if (0)
{
    print $cgi->header;
    my @params = $cgi->param;
    print "<pre>\n";
    foreach $_ (@params)
    {
	print "$_\t:",join(",",$cgi->param($_)),":\n";
    }

    if (0)
    {
	if (open(TMP,">/tmp/diffsF_parms"))
	{
	    print TMP &Dumper($cgi);
	    close(TMP);
	}
    }
    exit;
}

use constant FID    =>  0;
use constant LOCUS  =>  1;
use constant CONTIG =>  2;
use constant START  =>  3;
use constant STOP   =>  4;
use constant LEN    =>  5;
use constant STRAND =>  6;
use constant TYPE   =>  7;
use constant TAXID  =>  8;

my $html = [];
unshift @$html, "<TITLE>Feature Differences</TITLE>\n";
$org1   = $cgi->param('org1');
$org2   = $cgi->param('org2');
$gene   = $cgi->param('gene');
$align1 = $cgi->param('align1');
$align2 = $cgi->param('align2');

if (!((-d $org1) || (-d $org2)))
{
    push(@$html,$cgi->h1("bad parms: gene=$gene align1=$align1 align2=$align2 org1=$org1 org2=$org2"));
}
elsif ($gene = $cgi->param('gene'))
{
    &show_region($fig,$cgi,$html,$org1,$gene);
}
elsif (($gene1 = $cgi->param('align1')) && ($gene2 = $cgi->param('align2')))
{
    &show_alignment($fig, $cgi, $html, $gene1, $gene2, $org1, $org2);
}
else
{
    &list_diffs($fig, $cgi, $html, $org1, $org2);
}
&HTML::show_page($cgi,$html);

sub show_region {
    my($fig,$cgi,$html,$org,$gene) = @_;
    my @feat  = &load_tbl2($org,$gene);
    for ($i=0; ($i < @feat) && ($feat[$i]->[FID] ne $gene); $i++) {}
    if ($i < @feat)
    {
	$first = ($i > 4) ? $i-5 : 0;
	$last  = ($i < (@feat - 5)) ? $i+5 : @feat - 1;
	my $gg = [];
	my $genes = [];
	$min = 10000000;
	$max = 0;
	while ($first <= $last)
	{
	    $f = $feat[$first];
	    $beg = $f->[START];
	    $end = $f->[STOP];
	    $min = &FIG::min($min,$beg);
	    $min = &FIG::min($min,$end);
	    $max = &FIG::max($max,$beg);
	    $max = &FIG::max($max,$end);
	    $fid = $f->[FID];
#	    $fid =~ /\.([a-z]+\.\d+)$/;
#	    $info = $1;
	    push(@$genes,[ &FIG::min($beg,$end),
			   &FIG::max($beg,$end),
			   ($beg < $end) ? "rightArrow" : "leftArrow",
			   ($fid eq $gene) ? "red" : "blue",
			   "",
			   "",
			   "$fid: $beg to $end",
			   ""
			   ] );
	    $first++;
	}
	my $map = ['Region',0,$max+1-$min,&decr_coords($genes,$min)];
	push(@$gg,$map);
	push( @$html, @{ &GenoGraphics::render( $gg, 1000, 4, 1 ) } );
    }
}
	    
sub decr_coords {
    my($genes,$min) = @_;
    my($gene);

    foreach $gene (@$genes)
    {
	$gene->[0] -= $min;
	$gene->[1] -= $min;
    }
    return $genes;
}

sub list_diffs {
    my ($fig, $cgi, $html, $org1, $org2) = @_;
    
    $org1T = &load_tbl1($org1);
    $org2T = &load_tbl1($org2);
    
    my @both;
    my @just1;
    foreach $key (sort { by_locus($org1T->{$a}, $org1T->{$b}) } keys %$org1T)
    {
	$type1  = $org1T->{$key}->[TYPE];
	$locus1 = $org1T->{$key}->[LOCUS];

	if ($org2T->{$key})
	{
	    $type2  = $org2T->{$key}->[TYPE];
	    $locus2 = $org2T->{$key}->[LOCUS];
	    if (($type1 ne $type2) || ($locus1 ne $locus2))
	    {
		$fid1 = $org1T->{$key}->[FID];
		$fid2 = $org2T->{$key}->[FID];
		$link1 = &link($org1,$fid1);
		$link2 = &link($org2,$fid2);
		
		if (($len1 = $org1T->{$key}->[LEN]) < ($len2 = $org2T->{$key}->[LEN])) {
		    $changed = qq(lengthened);
		}
		else {
		    $changed = qq(shortened);
		}
		
		push @both, [ $changed, $fid1, $link1, $len1, $fid2, $link2, $len2, ($len2 - $len1) ];
	    }
	}
	else
	{
	    $fid1 = $org1T->{$key}->[FID];
	    $link1 = &link($org1,$fid1);
	    push @just1, [ $fid1, $link1, $org1T->{$key}->[LEN] ];
	}
    }
    
    my @just2;
    foreach $key (sort { by_locus($org2T->{$a},$org2T->{$b}) } keys %$org2T)
    {
	if (! $org1T->{$key})
	{
	    $type2  = $org2T->{$key}->[TYPE];
	    $locus2 = $org2T->{$key}->[LOCUS];
	    $fid2 = $org2T->{$key}->[FID];
	    $link2 = &link($org2,$fid2);
	    push @just2, [ $fid2, $link2, $org2T->{$key}->[LEN] ];
	}
    }
    
    push @$html, map {
	$align = qq(<A HREF="diffsF.cgi?align1=$_->[1]&align2=$_->[4]&org1=$org1&org2=$org2">align</A>);
	$cgi->h2("$align $_->[0]: $_->[2] ($_->[3] bp) => $_->[5] ($_->[6] bp), diff=$_->[7]")
	}
    sort { 
	abs($b->[7]) <=> abs($a->[7])
	} @both;
    
    
    push @$html, map {
	$cgi->h2("just2: $_->[1] ($_->[2] bp)")
	}
    sort { 
	$b->[2] <=> $a->[2]
	} @just2;
    
    
    push @$html, map {
	$cgi->h2("just1: $_->[1] ($_->[2] bp)")
	}
    sort { 
	$b->[2] <=> $a->[2]
	} @just1;
}

sub show_alignment {
    my ($fig, $cgi, $html, $gene1, $gene2, $org1, $org2) = @_;
    
    my @neighbors;
    my %neighbors;
    my $neighbors;
    if (-s "$org2/neighbors") {
	@neighbors = sort { $a <=> $b} map { m/^(\d+\.\d+)/o ? ($1) : () } `cat $org2/neighbors`;
	%neighbors = map  { $_  => 1 } @neighbors;
	$neighbors = join(" ", @neighbors);
	@neighbors ? push(@$html, $cgi->h2("Close neighbors are: " . join(" ", @neighbors)))
                   : push(@$html, $cgi->h2("No close nieghbors found")) && return
		   ;
    }
    else {
	push(@$html, $cgi->h2("Close neighbors file $org2/neighbors does not exist"))
	    && return;
    }

    
    my $md5exec   = (($_ = `which md5sum`) !~ m/^no/) ? "md5sum" 
                     : (($_ = `which md5`) !~ m/^no/) ? "md5" 
                     : push(@$html, $cgi->h2("Could not find an MD5 checksum tool")) && return
		     ;
    my $chksum = `echo $neighbors | $md5exec`;   chomp $chksum;
    my $tmp_close = "$FIG_Config::temp/tmp_close_$chksum";
    
    my ($size, $mod);
    if ((not ($size = (-s $tmp_close))) || ($mod = (-M $tmp_close > 0.1))) {
	push @$html, $cgi->h2("Rebuilding DB $tmp_close, size=$size, mod=$mod");
	
	system("rm -f $tmp_close");
	foreach my $org (@neighbors) {
	    push @$html, $cgi->h2("Appending fasta for $org to $tmp_close");
	    system("cat $FIG_Config::organisms/$org/Features/peg/fasta >> $tmp_close")
		&& ((push @$html
		     , $cgi->h2("Could not append $FIG_Config::organisms/$org/Features/peg/fasta to $tmp_close"))
		    && return);
	}
	system("$FIG_Config::ext_bin/formatdb -i $tmp_close")
	    && ((push @$html, $cgi->h2("Could not run $FIG_Config::ext_bin/formatdb on $tmp_close, err=$?"))
		&& return);
    }
    
    my $tmp_fid  = "$FIG_Config::temp/tmp_fid_$$";
    
    open(TMP_FID, ">$tmp_fid")
	|| ((push @$html, $cgi->h2("Could not write-open $tmp_fid")) && &cleanup($tmp_fid, $tmp_seqs) && return);
    print TMP_FID "$gene1\n";
    $seq_gene1 =  `$FIG_Config::bin/pull_fasta_entries $org1/Features/peg/fasta < $tmp_fid`;
    $seq_gene1 =~ s/^>fig/>old/;
    
    open(TMP_FID, ">$tmp_fid")
	|| ((push @$html, $cgi->h2("Could not write-open $tmp_fid")) && &cleanup($tmp_fid, $tmp_seqs) && return);
    print TMP_FID "$gene2\n";
    $seq_gene2 =  `$FIG_Config::bin/pull_fasta_entries $org2/Features/peg/fasta < $tmp_fid`;
    $seq_gene2 =~ s/^>fig/>new/;
    
    my ($figfam, $fam_id, @members);
    my @tmp = grep { m/^(\S+)/o && ($1 eq $gene2) } `cat $org2/found`;
    if (not @tmp) {
	push @$html, $cgi->h2("No family found for gene2=$gene2")
    }
    else {
	if ($tmp[0] =~ m/^\S+\t(FIG\d+)/o) {
	    $figfam = new FigFam($fig, $1, $default_dir);
	    $fam_id = $figfam->family_id();
	    push @$html, $cgi->h2("$gene2 is in Family $fam_id");
	    @members = $figfam->list_members();
	}
    	else {
	    $tmp[0] =~ s/\t/ /go;
	    push(@$html, $cgi->h2("Could not parse family from entry: $tmp[0]"))
		&& &cleanup($tmp_fid, $tmp_seqs) && return;
	}
    }
    
    if (@members) {
	push @$html, $cgi->h2(qq($fam_id has ) . (scalar @members) . qq( members));
	if (@members > 15) {
	    @members = grep { $neighbors{$fig->genome_of($_)} } @members;
	    my $s    = (@members == 1) ? qq() : qq(s);
	    my $were = (@members == 1) ? qq(was from a) : qq(were from);
	    push @$html, $cgi->h2((scalar @members) . qq( member$s of $fam_id $were close neighbor$s));
	}
    }
    else {
	push @$html, $cgi->h2("No family members found --- searching with BLASTP");
	if (@members = &find_top_blast_hits($fig, $cgi, $html, $seq_gene2, $tmp_close)) {
	    if (@members > 15) {
		@members = grep { $neighbors{$fig->genome_of($_)} } @members;
		my $s    = (@members == 1) ? qq() : qq(s);
		my $were = (@members == 1) ? qq(was from a) : qq(were from);
		push @$html, $cgi->h2((scalar @members) . qq( BLASTP hit$s $were close neighbor$s));
	    }
	}
	else {
	    push( @$html, $cgi->h2("No BLAST sims for $gene2 against neighbors") )
		&& &cleanup($tmp_fid, $tmp_seqs) && return;
	}
    }
    
    my $tmp_seqs = "$FIG_Config::temp/tmp_seqs_$$";
    open(TMP_SEQS, ">$tmp_seqs")
	|| ((push @$html, $cgi->h2("Could not write-open $tmp_seqs")) && &cleanup($tmp_fid, $tmp_seqs) && return);
    foreach my $id (@members) {
	my $seq = $fig->get_translation($id);
	if ($seq) {
#	    push @$html, $cgi->h2("Got seq for $id");
	    &FIG::display_id_and_seq($id, \$seq, \*TMP_SEQS);
	}
	else {
	    push @$html, $cgi->h2("No seq for $id");
	}
    }
    
    print TMP_SEQS $seq_gene1;
    print TMP_SEQS $seq_gene2;
    
    close(TMP_SEQS)
	|| ((push @$html, $cgi->h2("Could not close $tmp_seqs"))
	    && &cleanup($tmp_fid, $tmp_seqs) && return);
    
    system("$FIG_Config::ext_bin/clustalw -infile=$tmp_seqs -align -outorder=aligned > /dev/null")
	&& ((push @$html, $cgi->h2("clustalw failed on file $tmp_seqs"))
	    && &cleanup($tmp_fid, $tmp_seqs) && return);
    
    push @$html, qq(<DIV><FONT FACE="Courier">\n);
    @tmp = `cat $tmp_seqs.aln`;
    @tmp = map { &fid_link($_) } @tmp;
    push @$html, (@tmp ? (@tmp) : ("No lines read from $tmp_seqs.aln"));
    push @$html, qq(</FONT></DIV>\n);
    
    &cleanup($tmp_fid, $tmp_seqs);
}

sub cleanup {
    my ($tmp_seqs, $tmp_fid) =@_;
    return if $ENV{DEBUG};
    
    system("rm -f $tmp_fid $tmp_seqs*")
	&& ((push @$html, $cgi->h2("Could not remove temporary files"))
	    && return);
}

sub find_top_blast_hits {
    my ($fig, $cgi, $html, $query_seq, $tmp_close) = @_;
    
    my $tmp_query = "$FIG_Config::temp/tmp_query_$$";
    open( TMP_QUERY, ">$tmp_query" )
	|| ((push @$html, $cgi->h2("Could not write-open $tmp_query")) && return);
    print TMP_QUERY $query_seq;
    close(TMP_QUERY) 
	|| ((push @$html, $cgi->h2("Could not close $tmp_query")) && return);
    
    my @hits = `$FIG_Config::ext_bin/blastall -i $tmp_query -d $tmp_close -p blastp -m8 -FF -e1.0e-10 | cut -f2`;
    chomp @hits;
    
    push @$html, $cgi->h2(qq(BLAST found ) . (scalar @hits) . qq( hits\n));
    return @hits;
}

sub load_tbl1
{
    my ($dir) = @_;
    my ($entry, $id, $locus, $contig, $beg, $end, $len, $strand, $taxid, $type);

    open(TBL, "cat $dir/Features/*/tbl |") || die "Could not open $dir";
    
    my $tbl = {};
    while (defined($entry = <TBL>))
    {
	chomp $entry;
	
	($id, $locus) = split /\t/, $entry;
	$id =~ m/^[^\|]+\|(\d+\.\d+)\.([^\.]+)/;
	($taxid, $type) = ($1, $2);
	
	if ((($contig, $beg, $end, $len, $strand) = &from_locus($locus)) 
	   && defined($contig) && $contig
	   && defined($beg)    && $beg
	   && defined($end)    && $end
	   && defined($len)    && $len
           && defined($strand) && $strand
           )
	{
	    $tbl->{"$contig\t$strand$end"} = [ $id, $locus, $contig, $beg, $end, $len, $strand, $type, $taxid ];
	}
	else
	{
	    warn "INVALID ENTRY:\t$entry\n";
	}
    }
    
    return $tbl;
}

sub load_tbl2
{
    my ($dir,$gene) = @_;
    
    my ($entry, $id, $locus, $contig, $beg, $end, $len, $strand, $taxid, $type, $keep);

    open(TBL, "cat $dir/Features/*/tbl |") || die "Could not open $dir";
    
    my $tbl = {};
    while (defined($entry = <TBL>))
    {
	chomp $entry;
	
	($id, $locus) = split /\t/, $entry;
	$id =~ m/^[^\|]+\|(\d+\.\d+)\.([^\.]+)/;
	($taxid, $type) = ($1, $2);
	
	if ((($contig, $beg, $end, $len, $strand) = &from_locus($locus)) 
	   && defined($contig) && $contig
	   && defined($beg)    && $beg
	   && defined($end)    && $end
	   && defined($len)    && $len
           && defined($strand) && $strand
           )
	{
	    push(@{$tbl->{$contig}}, [ $id, $locus, $contig, $beg, $end, $len, $strand, $type, $taxid ]);
	    if ($id eq $gene) { $keep = $contig }
	}
	else
	{
	    warn "INVALID ENTRY:\t$entry\n";
	}
    }
    $keep || die "no contig for gene=$gene";
    return sort { by_locus($a,$b) } @{$tbl->{$keep}};
}

sub from_locus
{
    my ($locus) = @_;
    my ($contig, $beg, $end);
    
    if (  ($locus =~ m/^([^,]+)_(\d+)_\d+/)  && ($contig = $1) && ($beg = $2)
       && ($locus =~ m/[^,]+_\d+_(\d+)$/) && ($end = $1)
       )
    {
	return ($contig, $beg, $end, (1+abs($end-$beg)), (($beg < $end) ? '+' : '-'));
    }
    else
    {
	die "Invalid locus $locus";
    }
    
    return ();
}

sub by_locus
{
    my ($a, $b) = @_;

    my (undef, undef, $A_contig, $A_beg, $A_end, $A_len, $A_strand) = @$a;
    my (undef, undef, $B_contig, $B_beg, $B_end, $B_len, $B_strand) = @$b;

    return (  ($A_contig cmp $B_contig) 
	   || (&FIG::min($A_beg, $A_end) <=> &FIG::min($B_beg, $B_end))
	   || ($B_len <=> $A_len)
	   || ($A_strand cmp $B_strand)
	   );
}

sub link {
    my($org,$fid) = @_;

    return "<a href=diffsF.cgi?org1=$org&gene=$fid>$fid</a>";
}

sub fid_link {
    my($line) = @_;
    
    chomp $line;
    if ($line =~ m/^(fig\S+)\b(.*)$/o) {
	my ($id, $rest) = ($1, $2);
	$rest =~ s/\s/\&nbsp\;/go;
	$line =  qq(<A HREF="http://anno-3.nmpdr.org/anno/FIG/protein.cgi?prot=$id">$id</A>$rest);
    }
    else {
	$line =~ s/\s/\&nbsp\;/go;
    }
    
    return $line . qq(<BR>\n);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3