[Bio] / FigWebServices / anno_server.cgi Repository:
ViewVC logotype

View of /FigWebServices/anno_server.cgi

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.16 - (download) (annotate)
Thu Oct 28 19:12:56 2010 UTC (9 years, 3 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_dev_06072011, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, mgrast_dev_04052011, mgrast_dev_02222011
Changes since 1.15: +66 -0 lines
kmer dataset querying changes

use strict;
use ANNO;
use File::Basename;
use FIG;
use ServerThing;

=head1 Annotation Support Server 

For documentation of this server's functions, see L<ANNOserver>.

=cut

my $have_fcgi;
eval {
    require CGI::Fast;
    $have_fcgi = 1;
};

use Data::Dumper;
use FFs;
use FF;
use Kmers;
use FIG_Config;

use YAML;

my $rna_tool = "/vol/search_for_rnas-2007-0625/search_for_rnas";

my $annoServer = new ANNO();

my $kmer_default_name;
my %kmer_dir;
my %kmer;

my $kmer_dir = $FIG_Config::KmerData;
my $kmer_dirH = $FIG_Config::KmerDataDetailed;

if (!defined($kmer_dir) && !defined($kmer_dirH))
{
    die "Kmer directory not specified";
}
elsif (ref($kmer_dirH))
{
    ref($kmer_dirH) eq 'HASH' or die "Invalid \$FIG_Config::kmer_dirH (must be string or hash reference)";
    while (my($name, $ent) = each(%$kmer_dirH))
    {
	if ($ent->{dir} eq '')
	{
	    die "Missing directory entry for kmer data $name";
	}
	elsif (! -d $ent->{dir})
	{
	    die "Kmer data $ent->{dir} does not exist";
	}
	$kmer_dir{$name} = $ent->{dir};
	if ($ent->{default})
	{
	    if ($kmer_default_name)
	    {
		warn "Multiple default kmer directories configured; using $name $ent->{dir}";
	    }
	    $kmer_default_name = $name;
	}
    }
}
elsif (! -d $kmer_dir)
{
    die "Kmer directory $kmer_dir does not exist";
}
else
{
    $kmer_default_name = basename($kmer_dir);
    $kmer_dir{$kmer_default_name} = $kmer_dir;
}

while (my($name, $dir) = each(%kmer_dir))
{
    my $kmers;
    eval { 
	$kmers = Kmers->new($dir);
    };
    if ($kmers)
    {
	$kmer{$name} = $kmers;
    }
    else
    {
	warn "Error loading kmers $name $dir: $@";
	delete $kmer_dir{$name};
    }
}

my $kmer_default = $kmer{$kmer_default_name};
$annoServer->set_kmer_data($kmer_default);

$| = 1;

my $clean_up = 0;

my $header = "Content-type: text/plain\n\n";

my $max_requests = 50;

#
# If no CGI vars, assume we are invoked as a fastcgi service.
#
my $n_requests = 0;
if ($have_fcgi && $ENV{REQUEST_METHOD} eq '')
{
    #
    # Make mysql autoreconnect.
    #

    while (($max_requests == 0 || $n_requests++ < $max_requests) &&
	   (my $cgi = new CGI::Fast()))
    {
	eval {
	    &process_request($cgi);
	};
	if ($@)
	{
	    if (ref($@) ne 'ARRAY')
	    {
		warn "code died, cgi=$cgi returning error\n";
		print $cgi->header(-status => '500 error in body of cgi processing');
		print $@;
	    }
	}
    endloop:
    }
}
else
{
    my $cgi = new CGI();
    &process_request($cgi);
}

exit;


sub process_request
{
    my($cgi) = @_;

    my $function = $cgi->param('function');
    my $arg_str = $cgi->param('args');

    $function or myerror($cgi, "500 missing argument", "missing function argument");

    if ($function eq "assign_function_to_prot") {
	print $cgi->header();
	my @id = $cgi->param('id_seq');

	my %params = map { my $v = $cgi->param($_); defined($v) ? ($_ => $cgi->param($_)) : () }
		qw(-kmer -scoreThreshold -hitThreshold -seqHitThreshold -normalizeScores -detailed 
		   -assignToAll -kmerDataset -determineFamily -returnFamilySims);

	$params{-all} = 1;

	my $kmers;
	my $kmer_fasta = "$FIG_Config::KmerData/extra_prok_seqs.fasta";
	if (defined(my $ds = $params{-kmerDataset}))
	{
	    $kmers = $kmer{$ds};
	    ref($kmers) or myerror($cgi, "500 invalid dataset name $ds");
	    $kmer_fasta = "$kmer_dir{$ds}/extra_prok_seqs.fasta";
	}
	else
	{
	    $kmers = $kmer_default;
	}

	@id or myerror($cgi, "500 missing id_seq", "figfam server missing id_seq argument");

	my @missing;
	foreach my $parm (@id) {
	    my ($id, $seq) = split /,/, $parm;
	    my $triple = [$id, undef, $seq];
	    my @res = $kmers->assign_functions_to_prot_set(-seqs => [$triple], %params);
	    my $res = $res[0];
	    if ($res->[1] || !$params{-assignToAll})
	    {
		print YAML::Dump($res);
	    }
	    elsif ($params{-assignToAll})
	    {
		push(@missing, $triple);
	    }
	}
	if ($params{-assignToAll} && @missing)
	{
	    my @rest = $kmers->assign_functions_using_similarity(-seqs => \@missing,
								 -simDb => $kmer_fasta);
	    
	    print YAML::Dump(@rest);
	}

    } elsif ($function eq "call_genes") {
	my @id = $cgi->param('id_seq');
	@id or myerror($cgi, "500 missing id_seq", "figfam server missing id_seq argument");

	my %params = map { my $v = $cgi->param($_); defined($v) ? ($_ => $cgi->param($_)) : () }
		qw(-geneticCode -minContigLen -verbose);

	my $genetic_code = ($params{-geneticCode} =~ /^(\d+)$/ ? $1 : 11);

	my $min_training_len = ($params{-minContigLen} = ~ /^(\d+)$/ ? $1 : 2000);
	#
	# Create fasta of the contig data.
	#

	my $fh;
	my $tmp = "$FIG_Config::temp/contigs.$$";
	my $tmp2 = "$FIG_Config::temp/contigs.aa.$$";
	my $tbl = "$FIG_Config::temp/tbl.$$";
	my $tbl2 = "$FIG_Config::temp/tbl2.$$";
	open($fh, ">", $tmp);

	foreach my $parm (@id) {
	    my ($id, $seq) = split /,/, $parm;
	    &FIG::display_id_and_seq($id, \$seq, $fh);
	}
	close($fh);
	
	# Training stuff.
	my $trainingParms = "";
	my @trainSet = $cgi->param('training_set');
	if (@trainSet) {
	    my $tmp3 = "$FIG_Config::temp/tbl3.$$";
	    my $fh3;
	    open($fh3, ">", $tmp3);
	    while (@trainSet) {
		my $loc = pop @trainSet;
		my $id = pop @trainSet;
		print $fh3 "$id\t$loc\n";
	    }
	    $trainingParms = "-train=$tmp3";
	    my @trainContigs = $cgi->param('train_seq');
	    if (@trainContigs) {
		undef $fh3;
		my $tmp4 = "$FIG_Config::temp/fasta1.$$";
		open($fh3, ">", $tmp4);
		foreach my $parm (@trainContigs) {
		    my ($id, $seq) = split /,/, $parm;
		    &FIG::display_id_and_seq($id, \$seq, $fh3);
		}
		close($fh3);
		$trainingParms .= ",$tmp4";
	    }
	}
	# Verbose check
	my $verbose = ($params{-verbose} ? "-verbose" : "");
	if ($verbose) {
	    warn "Input file: $tmp; training parameter=$trainingParms\n";
	}
	# Call glimmer
	my $res = system("$FIG_Config::bin/run_glimmer3 $verbose -minlen=$min_training_len $trainingParms -code=$genetic_code 1.1 $tmp > $tbl");
	if ($res != 0)
	{
	    myerror($cgi, "500 glimmer run failed");
	}

	my $fh2;
	open($fh, "<", $tbl);
	open($fh2, ">", $tbl2);
	my $ctr = 1;
	my $encoded_tbl = [];
	while (<$fh>)
	{
	    chomp;
	    my(@a)  = split(/\t/);
	    $a[0] = sprintf("prot_%05d", $ctr++);
	    push(@a, $a[1]);
	    print $fh2 join("\t", @a), "\n";
	    my ($contig, $beg, $end) = ($a[1] =~ /^(\S+)_(\d+)_(\d+)$/);
	    push @$encoded_tbl, [$a[0], $contig, $beg, $end];
	}
	close($fh);
	close($fh2);

	$res = system("$FIG_Config::bin/get_fasta_for_tbl_entries -code=$genetic_code $tmp < $tbl2 > $tmp2");
	if ($res != 0)
	{
	    myerror($cgi, "500 get_fasta_for_tbl_entries failed");
	}
	
	if (!open($fh,"<", $tmp2))
	{
	    myerror($cgi, "Cannot open output file $tmp2");
	}
	print $cgi->header();
	my $out;
	my $buf;
	while (read($fh, $buf, 4096))
	{
	    $out .= $buf;
	}
	close($fh);
	print YAML::Dump([$out, $encoded_tbl]);
	#unlink($tmp);
	#unlink($tmp2);
	#unlink($tbl);
	#unlink($tbl2);

    } elsif ($function eq "find_rnas") {
	my @id = $cgi->param('id_seq');

	my %params = map { my $v = $cgi->param($_); defined($v) ? ($_ => $cgi->param($_)) : () }
		qw(-genus -species -domain);
	
	
	@id or myerror($cgi, "500 missing id_seq", "figfam server missing id_seq argument");

	$params{-genus} or myerror($cgi, "500 missing genus", "figfam server missing genus argument");
	$params{-species} or myerror($cgi, "500 missing species", "figfam server missing species argument");
	$params{-domain} or myerror($cgi, "500 missing domain", "figfam server missing domain argument");

	#
	# Create fasta of the contig data.
	#

	my $fh;
	my $tmp_dir = "$FIG_Config::temp/find_rnas.$$";
	my $log = "$tmp_dir/log";
	&FIG::verify_dir($tmp_dir);
	my $tmp = "$tmp_dir/contigs";
	my $tmp2 = "$tmp_dir/contigs2";
	my $tbl = "$tmp_dir/tbl";
	my $tbl2 = "$tmp_dir/tbl2";

	open($fh, ">", $tmp);

	foreach my $parm (@id) {
	    my ($id, $seq) = split /,/, $parm;
	    &FIG::display_id_and_seq($id, \$seq, $fh);
	}
	close($fh);

	my $cmd = "$rna_tool --tmpdir=$tmp_dir --contigs=$tmp --orgid=1 --domain=$params{-domain} --genus=$params{-genus} --species=$params{-species}";
	warn "Run: $cmd\n";
	#
	# Need to clear the PERL5LIB from the environment since tool is configured to use its own.
	#
	my $res = system("cd $tmp_dir; env PERL5LIB= $cmd > $tbl 2> $log");
	if ($res != 0)
	{
	    myerror($cgi, "500 $rna_tool run failed");
	}

	my $fh2;
	open($fh, "<", $tbl);
	open($fh2, ">", $tbl2);
	my $ctr = 1;
	my $encoded_tbl = [];
	while (<$fh>)
	{
	    chomp;
	    my(@a)  = split(/\t/);

	    my $new = sprintf("rna_%05d", $ctr++);

	    print $fh2 join("\t", $new, $a[1]), "\n";
	    my ($contig, $beg, $end) = ($a[1] =~ /^(\S+)_(\d+)_(\d+)$/);
	    push @$encoded_tbl, [$new, $contig, $beg, $end, $a[2]];
	}
	close($fh);
	close($fh2);

	$res = system("$FIG_Config::bin/get_dna $tmp < $tbl2 > $tmp2");
	if ($res != 0)
	{
	    myerror($cgi, "500 get_dna failed");
	}
	
	if (!open($fh,"<", $tmp2))
	{
	    myerror($cgi, "Cannot open output file $tmp2");
	}
	print $cgi->header();
	my $out;
	my $buf;
	while (read($fh, $buf, 4096))
	{
	    $out .= $buf;
	}
	close($fh);
	print YAML::Dump([$out, $encoded_tbl]);
	#unlink($tmp);
	#unlink($tmp2);
	#unlink($tbl);
	#unlink($tbl2);

    } elsif ($function eq "assign_functions_to_DNA") {
	print $cgi->header();
	my @id = $cgi->param('id_seq');
	my %params = map { my $v = $cgi->param($_); defined($v) ? ($_ => $cgi->param($_)) : () }
		qw(-kmer -minHits -maxGap -kmerDataset);

	my $min_hits = $params{-minHits};
	my $max_gap = $params{-maxGap};

	my $kmer = $params{-kmer};

	my $kmers;
	if (defined(my $ds = $params{-kmerDataset}))
	{
	    $kmers = $kmer{$ds};
	    ref($kmers) or myerror($cgi, "500 invalid dataset name $ds");
	}
	else
	{
	    $kmers = $kmer_default;
	}

	@id or myerror($cgi, "500 missing id_seq", "figfam server missing id_seq argument");
#	 open(L, ">>/tmp/log");
#	 L->autoflush(1);
#	 print L Dumper(\@id);
	foreach my $parm (@id) {
	    my ($id, $seq) = split /,/, $parm;
	    # print L "try $id\n$seq\n";
	    my $res;
	    eval {
#		 print L Dumper($seq, $min_hits, $max_gap);
		$res = $kmers->assign_functions_to_PEGs_in_DNA($kmer, $seq, $min_hits, $max_gap);

#		 print L Dumper($res);
	    };
	    if ($@)
	    {
		myerror($cgi, "500 failure on assign_functions_to_PEGs_in_DNA", $@);
	    }
	    
#	     print L Dumper($res);
	    print YAML::Dump(map { [$id, $_ ] } @$res);
#	     print L "OK\n";
	}
    } elsif ($function eq "get_dataset") {

	my $args = YAML::Load($arg_str);

	my $ds;
	if (defined($ds = $args->{'-kmerDataset'}))
	{
	    my $kmers = $kmer{$ds};
	    ref($kmers) or myerror($cgi, "500 invalid dataset name $ds");
	}
	else
	{
	    $ds = $kmer_default_name;
	}
	print $cgi->header();
	print YAML::Dump($ds);
    } elsif ($function eq "get_vector_basis_sets") {

	my $args = YAML::Load($arg_str);

	my $kmers;
	if (defined(my $ds = $args->{'-kmerDataset'}))
	{
	    $kmers = $kmer{$ds};
	    ref($kmers) or myerror($cgi, "500 invalid dataset name $ds");
	}
	else
	{
	    $kmers = $kmer_default;
	}

	my $dir = $kmers->dir();

	my $res = {};
	my @todo = ([function => "$dir/family.vector.def"], [otu => "$dir/setI"]);
	for my $ent (@todo)
	{
	    my($what, $file) = @$ent;
	    if (open(my $fh, "<", $file))
	    {
		local $/ = undef;
		$res->{$what} = <$fh>;
		close($fh);
	    }
	    else
	    {
		push(@{$res->{errors}}, "Cannot open $file: $!");
	    }
	}

	print $cgi->header();
	print YAML::Dump($res);
    
    } elsif ($function eq "get_active_datasets") {

	my $res = {};
	for my $ds (keys %kmer)
	{
	    my $kmer = $kmer{$ds};
	    my $l = [$kmer->get_available_kmer_sizes()];
	    $res->{$ds} = $l;
	}

	print $cgi->header();
	print YAML::Dump([$kmer_default_name, $res]);
    
    } else {
	ServerThing::RunRequest($cgi, $annoServer);
    }
}

exit;

sub get_string_param
{
    my($cgi, $name) = @_;

    my $str = $cgi->param($name);
    if ($str =~ /^(\S+)/)
    {
	return $1;
    }
    else
    {
	return undef;
    }
    
}

#
#The FIGfam server processes requests of the form:
#
#    1. PLACE-IN-FAMILY takes as input a list of protein sequences.  It
#       returns a list where each element describes the outcome of
#       trying to place the corresponding input sequence into a
#       FIGfam.  Each output can be either
#
#                COULD-NOT-PLACE-IN-FAMILY
#       or
#                ID FUNCTION
#
#       where ID is of the form FIGxxxxxx and FUNCTION is the family
#       function. 
#
#    2. MEMBERS-OF-FAMILIES takes as input a list of FIGfam IDs.  The
#       output is a list of functions for those families
#       (INVALID-FAMILY will be returned for IDs that do not correspond
#       to an active family), as well as a list of the IDs in each family.
#
#    3. SHOULD-BE-MEMBER takes as input a list of 2-tuples
#
#             [FIGfam-ID,protein sequence]
#
#       It returns a list of boolean values indicating whether or not
#       the indicated protein sequence can be placed in the designated
#       family.
#
#    4. ALL-FAMILIES returns a list of [FIGfam-ID,function] tuples.
#
#
#    5. ASSIGN-FUNCTION-TO-PROT is similar to PLACE-IN-FAMILY, except
#       that the returned list contains either
#
#                COULD-NOT-PLACE-IN-FAMILY
#       or
#                ID FUNCTION
#
#       That is, it does not indicate which FIGfam was used to
#       determine the function.  This allows higher-performance
#       alternatives for cases in which multiple FIGfams implement the
#       same function.  The algorithm supported utilizes the underlying
#       FIGfams, but characterizes sets that implement the same
#       function and does not support distinguishing which FIGfam
#       is actually the right subgrouping.
#
#	6. ASSIGN-FUNCTIONS-TO-DNA takes as input a list of DNA
#       sequences.  It returns a list where each element describes 
#       a region of DNA that is believed to be part of a gene encoding
#       a protein sequence that would be placed into a FIGfam
#       successfully, if the whole protein sequence could be
#       determined.  That is, the returned list will contain entrties
#       of either the form
#
#                COULD-NOT-PLACE-ANY-REGIONS-IN-FAMILIES
#       or
#                BEGIN1 END1 FUNCTION1 BEGIN2 END2 FUNCTION2 ...
#
#       where BEGIN and END specify a region (if BEGIN is greater than
#       END, the region described is on the reverse strand) and
#       FUNCTION is the family function of the protein sequence that is
#       believed to be encoded by DNA including the embedded region.
#       Each input sequence can produce an arbitrary number of matched
#       regions, there will be 3 fields for each matched region.  Note
#       that the described region may include frameshifts and embedded
#       stop codons.  The algorithm seeking meaningful sections of DNA
#       assumes that it may have an incomplete, low-quality sequence
#       (and uses an algorithm that attempts to locate meaningful
#       matches even so).

sub myerror
{
    my($cgi, $stat, $msg) = @_;
    print $cgi->header(-status =>  $stat);
    print "$msg\n";
    goto endloop;
}





MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3