[Bio] / FigWebServices / figfam_server_2.cgi Repository:
ViewVC logotype

View of /FigWebServices/figfam_server_2.cgi

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.23 - (download) (annotate)
Fri Apr 16 21:37:09 2010 UTC (9 years, 7 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.22: +1 -1 lines
Push down default max requests.

use strict;
use FIG;

=head1 FIGfam Server 

This file contains the functions and used by the FIGfam Server. This server is
used to access data in the FIGfam database-- a large, complex directory tree of
structured files independent of the L<Sapling> database. It contains some of the
data found in Sapling, including genome names, functional assignments, and
aliases, but is not necessarily as complete or as up-to-date as the real
database. As a result, methods of this server that perform functions similar to
those of the Sapling Server (see L<SAP>) may return different results.

For documentation of this server's functions, see L<FFserver>.

=cut

my $have_fcgi;
eval {
    require CGI::Fast;
    $have_fcgi = 1;
};

use Data::Dumper;
use FFs;
use FF;
use KmersOld;
use FIG_Config;

use YAML;

my $ffdir = $FIG_Config::FigfamsData;

my $kmer_dir = $FIG_Config::KmerDataOld;

my $rna_tool = "/vol/search_for_rnas-2007-0625/search_for_rnas";

my $fig = new FIG;
my $ffs = new FFs($ffdir);


if ($kmer_dir eq '')
{
    die "Kmer directory not specified";
}
elsif (! -d $kmer_dir)
{
    die "Kmer directory $kmer_dir does not exist";
}

my $kmers = Kmers->new_using_C($kmer_dir);


$| = 1;

my $clean_up = 0;

my $header = "Content-type: text/plain\n\n";

my $max_requests = 50;

#
# If no CGI vars, assume we are invoked as a fastcgi service.
#
my $n_requests = 0;
if ($have_fcgi && $ENV{REQUEST_METHOD} eq '')
{
    #
    # Make mysql autoreconnect.
    #
    if ($FIG_Config::dbms eq 'mysql')
    {
        my $dbh = $fig->db_handle()->{_dbh};
        $dbh->{mysql_auto_reconnect} = 1;
    }

    while ((my $cgi = new CGI::Fast()) &&
	   ($max_requests == 0 || $n_requests++ < $max_requests))
    {
	eval {
	    &process_request($cgi);
	};
	if ($@)
	{
	    if (ref($@) ne 'ARRAY')
	    {
		warn "code died, cgi=$cgi returning error\n";
		print $cgi->header(-status => '500 error in body of cgi processing');
		print $@;
	    }
	}
    endloop:
    }
}
else
{
    my $cgi = new CGI();
    &process_request($cgi);
}

exit;


sub process_request
{
    my($cgi) = @_;

    my $function = $cgi->param('function');
#    print STDERR "got function=$function\n";

    my $arg_str = $cgi->param('args');
    my @args;
    if ($arg_str)
    {
	eval {
	    @args = YAML::Load($arg_str);
	};
	if ($@)
	{
	    myerror($cgi, "500 bad YAML parse", "YAML parse failed");
	    next;
	}
    }

    $function or myerror($cgi, "500 missing argument", "missing function argument");

    #print STDERR "$function\n";
    if ($function eq "members_of_families") {
	print $cgi->header();

	foreach my $famid (@args) {
	    my $fam;
	    eval {$fam = new FF($famid, $ffs->{dir}); };
	    if ($fam) {
		print YAML::Dump([$famid, $fam->family_function, [$fam->list_members()]]);
	    }
	    else
	    {
		print YAML::Dump(undef);
	    }
	}
    }
    elsif ($function eq "families_containing_peg") {
	print $cgi->header();

	foreach my $fid (@args) {
	    my @fams;
	    eval { @fams = $ffs->families_containing_peg($fid); };
	    print YAML::Dump([$fid, \@fams]);
	}
    } elsif ($function eq "function_of") {
	print $cgi->header();

	foreach my $fid (@args) {
	    my $func;
	    eval { $func = $ffs->function_of($fid); };
	    print YAML::Dump([$fid, $func]);
	}
    } elsif ($function eq "org_of") {
	print $cgi->header();

	foreach my $fid (@args) {
	   my $org;
	   eval { $org = $ffs->org_of($fid); };
	   print YAML::Dump([$fid, $org]);
	}
    } elsif ($function eq "seq_of") {
	print $cgi->header();

	foreach my $fid (@args) {
	   my $seq;
	   eval { $seq = $ffs->seq_of($fid); };
	   print YAML::Dump([$fid, $seq]);
	}
    } elsif ($function eq "aliases_of") {
	print $cgi->header();

	foreach my $fid (@args) {
	   my $aliases;
	   eval { $aliases = $ffs->aliases_of($fid); };
	   print YAML::Dump([$fid, $aliases]);
	}
    } elsif ($function eq "families_implementing_role") {
	print $cgi->header();

	foreach my $role (@args) {
	   my @fams;
	   eval { @fams = $ffs->families_implementing_role($role); };
	   print YAML::Dump([$role, \@fams]);
	}
    } elsif ($function eq "families_with_function") {
	print $cgi->header();

	foreach my $function (@args) {
	   my @fams;
	   eval { @fams = $ffs->families_with_function($function); };
	   print YAML::Dump([$function, \@fams]);
	}
    } elsif ($function eq "families_in_genome") {
        print $cgi->header();

        foreach my $genome (@args) {
           my @fams;
           eval { @fams = $ffs->families_in_genome($genome); };
           print YAML::Dump([$genome, \@fams]);
        }
    } elsif ($function eq "get_subsystem_based_figfams") {
        print $cgi->header();
	print YAML::Dump($ffs->get_subsystem_based_figfams());
    } elsif ($function eq "should_be_member") {

	print $cgi->header();

	foreach my $parm (@args) {
		my ($famid, $seq) = @$parm;
		my $fam = new FF($famid, $ffs->{dir});
		my $res;
		if ($fam) {
			my ($bool, $sims) = $fam->should_be_member($seq), "\n";
			$res = $bool ? 1 : 0;
		}
		print YAML::Dump($res);
	    }
    } elsif ($function eq "all_families") {
	print $cgi->header();
	print YAML::Dump($ffs->all_families(1));
    } elsif ($function eq "assign_function_to_prot") {
	print $cgi->header();
	my @id = $cgi->param('id_seq');

	my $blast = $cgi->param('blast');
	my $min_hits = $cgi->param('min_hits');
	my $assign_to_all = $cgi->param('assign_to_all');

	@id or myerror($cgi, "500 missing id_seq", "figfam server missing id_seq argument");
	
	my $extra_file;
	#
	# If we are invoking the blast-based assignments, use
	# Kmers::assign_functions_to_prot_set to accelerate
	# blasting. Otherwise, invoke Kmers::assign_function_to_prot
	# on each input sequence so we use less memory.
	#
	if ($assign_to_all)
	{
	    my $f = "$FIG_Config::KmerData/extra_prok_seqs.fasta";
	    if (-f $f && -f "$f.phr")
	    {
		$extra_file = $f;
	    }
	}
	if ($extra_file)
	{
	    #
	    # Reformat input to match expected layout.
	    #
	    my @inp = map { my($id, $seq) = split(/,/, $_);  [$id, undef, $seq] } @id;
	    my @res = $kmers->assign_functions_to_prot_set(\@inp, $blast, $min_hits, $extra_file);
	    for my $ent (@res)
	    {
		my($id, @rest) = @$ent;
		print YAML::Dump([$id, \@rest]);
	    }
	}
	else
	{
	    foreach my $parm (@id) {
		my ($id, $seq) = split /,/, $parm; 
		my $res = $kmers->assign_function_to_prot($seq, $blast, $min_hits, $extra_file);
		print YAML::Dump([$id,$res]);
	    }
	}

    } elsif ($function eq "call_genes") {
	my @id = $cgi->param('id_seq');

	my $genetic_code = $cgi->param('genetic_code');
	if ($genetic_code =~ /^(\d+)$/)
	{
	    $genetic_code = $1;
	}
	else
	{
	    $genetic_code = 11;
	}
	
	@id or myerror($cgi, "500 missing id_seq", "figfam server missing id_seq argument");

	#
	# Create fasta of the contig data.
	#

	my $fh;
	my $tmp = "$FIG_Config::temp/contigs.$$";
	my $tmp2 = "$FIG_Config::temp/contigs.aa.$$";
	my $tbl = "$FIG_Config::temp/tbl.$$";
	my $tbl2 = "$FIG_Config::temp/tbl2.$$";
	open($fh, ">", $tmp);

	foreach my $parm (@id) {
	    my ($id, $seq) = split /,/, $parm;
	    &FIG::display_id_and_seq($id, \$seq, $fh);
	}
	close($fh);

	my $res = system("$FIG_Config::bin/run_glimmer3 -code=$genetic_code 1.1 $tmp > $tbl");
	if ($res != 0)
	{
	    myerror($cgi, "500 glimmer run failed");
	}

	my $fh2;
	open($fh, "<", $tbl);
	open($fh2, ">", $tbl2);
	my $ctr = 1;
	my $encoded_tbl = [];
	while (<$fh>)
	{
	    chomp;
	    my(@a)  = split(/\t/);
	    $a[0] = sprintf("prot_%05d", $ctr++);
	    push(@a, $a[1]);
	    print $fh2 join("\t", @a), "\n";
	    my ($contig, $beg, $end) = ($a[1] =~ /^(\S+)_(\d+)_(\d+)$/);
	    push @$encoded_tbl, [$a[0], $contig, $beg, $end];
	}
	close($fh);
	close($fh2);

	$res = system("$FIG_Config::bin/get_fasta_for_tbl_entries -code=$genetic_code $tmp < $tbl2 > $tmp2");
	if ($res != 0)
	{
	    myerror($cgi, "500 get_fasta_for_tbl_entries failed");
	}
	
	if (!open($fh,"<", $tmp2))
	{
	    myerror($cgi, "Cannot open output file $tmp2");
	}
	print $cgi->header();
	my $out;
	my $buf;
	while (read($fh, $buf, 4096))
	{
	    $out .= $buf;
	}
	close($fh);
	print YAML::Dump([$out, $encoded_tbl]);
	#unlink($tmp);
	#unlink($tmp2);
	#unlink($tbl);
	#unlink($tbl2);

    } elsif ($function eq "find_rnas") {
	my @id = $cgi->param('id_seq');

	my $genus = get_string_param($cgi, 'genus');
	my $species = get_string_param($cgi, 'species');
	my $domain = get_string_param($cgi, 'domain');
	
	@id or myerror($cgi, "500 missing id_seq", "figfam server missing id_seq argument");

	$genus or myerror($cgi, "500 missing genus", "figfam server missing genus argument");
	$species or myerror($cgi, "500 missing species", "figfam server missing species argument");
	$domain or myerror($cgi, "500 missing domain", "figfam server missing domain argument");

	#
	# Create fasta of the contig data.
	#

	my $fh;
	my $tmp_dir = "$FIG_Config::temp/find_rnas.$$";
	my $log = "$tmp_dir/log";
	&FIG::verify_dir($tmp_dir);
	my $tmp = "$tmp_dir/contigs";
	my $tmp2 = "$tmp_dir/contigs2";
	my $tbl = "$tmp_dir/tbl";
	my $tbl2 = "$tmp_dir/tbl2";

	open($fh, ">", $tmp);

	foreach my $parm (@id) {
	    my ($id, $seq) = split /,/, $parm;
	    &FIG::display_id_and_seq($id, \$seq, $fh);
	}
	close($fh);

	my $cmd = "$rna_tool --tmpdir=$tmp_dir --contigs=$tmp --orgid=1 --domain=$domain --genus=$genus --species=$species";
	my $res = system("$cmd > $tbl 2> $log");
	if ($res != 0)
	{
	    myerror($cgi, "500 $rna_tool run failed");
	}

	my $fh2;
	open($fh, "<", $tbl);
	open($fh2, ">", $tbl2);
	my $ctr = 1;
	my $encoded_tbl = [];
	while (<$fh>)
	{
	    chomp;
	    my(@a)  = split(/\t/);
	    $a[0] = sprintf("rna_%05d", $ctr++);
	    push(@a, $a[1]);
	    print $fh2 join("\t", @a), "\n";
	    my ($contig, $beg, $end) = ($a[1] =~ /^(\S+)_(\d+)_(\d+)$/);
	    push @$encoded_tbl, [$a[0], $contig, $beg, $end];
	}
	close($fh);
	close($fh2);

	$res = system("$FIG_Config::bin/get_dna $tmp < $tbl2 > $tmp2");
	if ($res != 0)
	{
	    myerror($cgi, "500 get_dna failed");
	}
	
	if (!open($fh,"<", $tmp2))
	{
	    myerror($cgi, "Cannot open output file $tmp2");
	}
	print $cgi->header();
	my $out;
	my $buf;
	while (read($fh, $buf, 4096))
	{
	    $out .= $buf;
	}
	close($fh);
	print YAML::Dump([$out, $encoded_tbl]);
	#unlink($tmp);
	#unlink($tmp2);
	#unlink($tbl);
	#unlink($tbl2);

    } elsif ($function eq "assign_functions_to_DNA") {
	print $cgi->header();
	my @id = $cgi->param('id_seq');
	my $min_hits = $cgi->param('min_hits');
	my $max_gap = $cgi->param('max_gap');
	my $blast = $cgi->param('blast');
	@id or myerror($cgi, "500 missing id_seq", "figfam server missing id_seq argument");
	# open(L, ">>/tmp/log");
	# L->autoflush(1);
	# print L Dumper(\@id);
	foreach my $parm (@id) {
	    my ($id, $seq) = split /,/, $parm;
	    # print L "try $id\n$seq\n";
	    my $res;
	    eval {
		# print L Dumper($seq, $min_hits, $max_gap, $blast);
		$res = $kmers->assign_functions_to_PEGs_in_DNA($seq, $min_hits, $max_gap, $blast);
		# print L Dumper($res);
	    };
	    if ($@)
	    {
		myerror($cgi, "500 failure on assign_functions_to_PEGs_in_DNA", $@);
	    }
	    
	    # print L Dumper($res);
	    print YAML::Dump(map { [$id, $_ ] } @$res);
	    # print L "OK\n";
	}
    } else {
	myerror($cgi,  "500 invalid function", "invalid function $function\n");
    }
}

exit;

sub get_string_param
{
    my($cgi, $name) = @_;

    my $str = $cgi->param($name);
    if ($str =~ /^(\S+)/)
    {
	return $1;
    }
    else
    {
	return undef;
    }
    
}

#
#The FIGfam server processes requests of the form:
#
#    1. PLACE-IN-FAMILY takes as input a list of protein sequences.  It
#       returns a list where each element describes the outcome of
#       trying to place the corresponding input sequence into a
#       FIGfam.  Each output can be either
#
#                COULD-NOT-PLACE-IN-FAMILY
#       or
#                ID FUNCTION
#
#       where ID is of the form FIGxxxxxx and FUNCTION is the family
#       function. 
#
#    2. MEMBERS-OF-FAMILIES takes as input a list of FIGfam IDs.  The
#       output is a list of functions for those families
#       (INVALID-FAMILY will be returned for IDs that do not correspond
#       to an active family), as well as a list of the IDs in each family.
#
#    3. SHOULD-BE-MEMBER takes as input a list of 2-tuples
#
#             [FIGfam-ID,protein sequence]
#
#       It returns a list of boolean values indicating whether or not
#       the indicated protein sequence can be placed in the designated
#       family.
#
#    4. ALL-FAMILIES returns a list of [FIGfam-ID,function] tuples.
#
#
#    5. ASSIGN-FUNCTION-TO-PROT is similar to PLACE-IN-FAMILY, except
#       that the returned list contains either
#
#                COULD-NOT-PLACE-IN-FAMILY
#       or
#                ID FUNCTION
#
#       That is, it does not indicate which FIGfam was used to
#       determine the function.  This allows higher-performance
#       alternatives for cases in which multiple FIGfams implement the
#       same function.  The algorithm supported utilizes the underlying
#       FIGfams, but characterizes sets that implement the same
#       function and does not support distinguishing which FIGfam
#       is actually the right subgrouping.
#
#	6. ASSIGN-FUNCTIONS-TO-DNA takes as input a list of DNA
#       sequences.  It returns a list where each element describes 
#       a region of DNA that is believed to be part of a gene encoding
#       a protein sequence that would be placed into a FIGfam
#       successfully, if the whole protein sequence could be
#       determined.  That is, the returned list will contain entrties
#       of either the form
#
#                COULD-NOT-PLACE-ANY-REGIONS-IN-FAMILIES
#       or
#                BEGIN1 END1 FUNCTION1 BEGIN2 END2 FUNCTION2 ...
#
#       where BEGIN and END specify a region (if BEGIN is greater than
#       END, the region described is on the reverse strand) and
#       FUNCTION is the family function of the protein sequence that is
#       believed to be encoded by DNA including the embedded region.
#       Each input sequence can produce an arbitrary number of matched
#       regions, there will be 3 fields for each matched region.  Note
#       that the described region may include frameshifts and embedded
#       stop codons.  The algorithm seeking meaningful sections of DNA
#       assumes that it may have an incomplete, low-quality sequence
#       (and uses an algorithm that attempts to locate meaningful
#       matches even so).

sub myerror
{
    my($cgi, $stat, $msg) = @_;
    print $cgi->header(-status =>  $stat);
    print "$msg\n";
    goto endloop;
}





MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3