use strict; use FIG; =head1 FIGfam Server This file contains the functions and used by the FIGfam Server. This server is used to access data in the FIGfam database-- a large, complex directory tree of structured files independent of the L database. It contains some of the data found in Sapling, including genome names, functional assignments, and aliases, but is not necessarily as complete or as up-to-date as the real database. As a result, methods of this server that perform functions similar to those of the Sapling Server (see L) may return different results. For documentation of this server's functions, see L. =cut my $have_fcgi; eval { require CGI::Fast; $have_fcgi = 1; }; use Data::Dumper; use FFs; use FF; use KmersOld; use FIG_Config; use YAML; my $ffdir = $FIG_Config::FigfamsData; my $kmer_dir = $FIG_Config::KmerDataOld; my $rna_tool = "/vol/search_for_rnas-2007-0625/search_for_rnas"; my $fig = new FIG; my $ffs = new FFs($ffdir); if ($kmer_dir eq '') { die "Kmer directory not specified"; } elsif (! -d $kmer_dir) { die "Kmer directory $kmer_dir does not exist"; } my $kmers = Kmers->new_using_C($kmer_dir); $| = 1; my $clean_up = 0; my $header = "Content-type: text/plain\n\n"; my $max_requests = 50; # # If no CGI vars, assume we are invoked as a fastcgi service. # my $n_requests = 0; if ($have_fcgi && $ENV{REQUEST_METHOD} eq '') { # # Make mysql autoreconnect. # if ($FIG_Config::dbms eq 'mysql') { my $dbh = $fig->db_handle()->{_dbh}; $dbh->{mysql_auto_reconnect} = 1; } while ((my $cgi = new CGI::Fast()) && ($max_requests == 0 || $n_requests++ < $max_requests)) { eval { &process_request($cgi); }; if ($@) { if (ref($@) ne 'ARRAY') { warn "code died, cgi=$cgi returning error\n"; print $cgi->header(-status => '500 error in body of cgi processing'); print $@; } } endloop: } } else { my $cgi = new CGI(); &process_request($cgi); } exit; sub process_request { my($cgi) = @_; my $function = $cgi->param('function'); # print STDERR "got function=$function\n"; my $arg_str = $cgi->param('args'); my @args; if ($arg_str) { eval { @args = YAML::Load($arg_str); }; if ($@) { myerror($cgi, "500 bad YAML parse", "YAML parse failed"); next; } } $function or myerror($cgi, "500 missing argument", "missing function argument"); #print STDERR "$function\n"; if ($function eq "members_of_families") { print $cgi->header(); foreach my $famid (@args) { my $fam; eval {$fam = new FF($famid, $ffs->{dir}); }; if ($fam) { print YAML::Dump([$famid, $fam->family_function, [$fam->list_members()]]); } else { print YAML::Dump(undef); } } } elsif ($function eq "families_containing_peg") { print $cgi->header(); foreach my $fid (@args) { my @fams; eval { @fams = $ffs->families_containing_peg($fid); }; print YAML::Dump([$fid, \@fams]); } } elsif ($function eq "function_of") { print $cgi->header(); foreach my $fid (@args) { my $func; eval { $func = $ffs->function_of($fid); }; print YAML::Dump([$fid, $func]); } } elsif ($function eq "org_of") { print $cgi->header(); foreach my $fid (@args) { my $org; eval { $org = $ffs->org_of($fid); }; print YAML::Dump([$fid, $org]); } } elsif ($function eq "seq_of") { print $cgi->header(); foreach my $fid (@args) { my $seq; eval { $seq = $ffs->seq_of($fid); }; print YAML::Dump([$fid, $seq]); } } elsif ($function eq "aliases_of") { print $cgi->header(); foreach my $fid (@args) { my $aliases; eval { $aliases = $ffs->aliases_of($fid); }; print YAML::Dump([$fid, $aliases]); } } elsif ($function eq "families_implementing_role") { print $cgi->header(); foreach my $role (@args) { my @fams; eval { @fams = $ffs->families_implementing_role($role); }; print YAML::Dump([$role, \@fams]); } } elsif ($function eq "families_with_function") { print $cgi->header(); foreach my $function (@args) { my @fams; eval { @fams = $ffs->families_with_function($function); }; print YAML::Dump([$function, \@fams]); } } elsif ($function eq "families_in_genome") { print $cgi->header(); foreach my $genome (@args) { my @fams; eval { @fams = $ffs->families_in_genome($genome); }; print YAML::Dump([$genome, \@fams]); } } elsif ($function eq "get_subsystem_based_figfams") { print $cgi->header(); print YAML::Dump($ffs->get_subsystem_based_figfams()); } elsif ($function eq "should_be_member") { print $cgi->header(); foreach my $parm (@args) { my ($famid, $seq) = @$parm; my $fam = new FF($famid, $ffs->{dir}); my $res; if ($fam) { my ($bool, $sims) = $fam->should_be_member($seq), "\n"; $res = $bool ? 1 : 0; } print YAML::Dump($res); } } elsif ($function eq "all_families") { print $cgi->header(); print YAML::Dump($ffs->all_families(1)); } elsif ($function eq "assign_function_to_prot") { print $cgi->header(); my @id = $cgi->param('id_seq'); my $blast = $cgi->param('blast'); my $min_hits = $cgi->param('min_hits'); my $assign_to_all = $cgi->param('assign_to_all'); @id or myerror($cgi, "500 missing id_seq", "figfam server missing id_seq argument"); my $extra_file; # # If we are invoking the blast-based assignments, use # Kmers::assign_functions_to_prot_set to accelerate # blasting. Otherwise, invoke Kmers::assign_function_to_prot # on each input sequence so we use less memory. # if ($assign_to_all) { my $f = "$FIG_Config::KmerData/extra_prok_seqs.fasta"; if (-f $f && -f "$f.phr") { $extra_file = $f; } } if ($extra_file) { # # Reformat input to match expected layout. # my @inp = map { my($id, $seq) = split(/,/, $_); [$id, undef, $seq] } @id; my @res = $kmers->assign_functions_to_prot_set(\@inp, $blast, $min_hits, $extra_file); for my $ent (@res) { my($id, @rest) = @$ent; print YAML::Dump([$id, \@rest]); } } else { foreach my $parm (@id) { my ($id, $seq) = split /,/, $parm; my $res = $kmers->assign_function_to_prot($seq, $blast, $min_hits, $extra_file); print YAML::Dump([$id,$res]); } } } elsif ($function eq "call_genes") { my @id = $cgi->param('id_seq'); my $genetic_code = $cgi->param('genetic_code'); if ($genetic_code =~ /^(\d+)$/) { $genetic_code = $1; } else { $genetic_code = 11; } @id or myerror($cgi, "500 missing id_seq", "figfam server missing id_seq argument"); # # Create fasta of the contig data. # my $fh; my $tmp = "$FIG_Config::temp/contigs.$$"; my $tmp2 = "$FIG_Config::temp/contigs.aa.$$"; my $tbl = "$FIG_Config::temp/tbl.$$"; my $tbl2 = "$FIG_Config::temp/tbl2.$$"; open($fh, ">", $tmp); foreach my $parm (@id) { my ($id, $seq) = split /,/, $parm; &FIG::display_id_and_seq($id, \$seq, $fh); } close($fh); my $res = system("$FIG_Config::bin/run_glimmer3 -code=$genetic_code 1.1 $tmp > $tbl"); if ($res != 0) { myerror($cgi, "500 glimmer run failed"); } my $fh2; open($fh, "<", $tbl); open($fh2, ">", $tbl2); my $ctr = 1; my $encoded_tbl = []; while (<$fh>) { chomp; my(@a) = split(/\t/); $a[0] = sprintf("prot_%05d", $ctr++); push(@a, $a[1]); print $fh2 join("\t", @a), "\n"; my ($contig, $beg, $end) = ($a[1] =~ /^(\S+)_(\d+)_(\d+)$/); push @$encoded_tbl, [$a[0], $contig, $beg, $end]; } close($fh); close($fh2); $res = system("$FIG_Config::bin/get_fasta_for_tbl_entries -code=$genetic_code $tmp < $tbl2 > $tmp2"); if ($res != 0) { myerror($cgi, "500 get_fasta_for_tbl_entries failed"); } if (!open($fh,"<", $tmp2)) { myerror($cgi, "Cannot open output file $tmp2"); } print $cgi->header(); my $out; my $buf; while (read($fh, $buf, 4096)) { $out .= $buf; } close($fh); print YAML::Dump([$out, $encoded_tbl]); #unlink($tmp); #unlink($tmp2); #unlink($tbl); #unlink($tbl2); } elsif ($function eq "find_rnas") { my @id = $cgi->param('id_seq'); my $genus = get_string_param($cgi, 'genus'); my $species = get_string_param($cgi, 'species'); my $domain = get_string_param($cgi, 'domain'); @id or myerror($cgi, "500 missing id_seq", "figfam server missing id_seq argument"); $genus or myerror($cgi, "500 missing genus", "figfam server missing genus argument"); $species or myerror($cgi, "500 missing species", "figfam server missing species argument"); $domain or myerror($cgi, "500 missing domain", "figfam server missing domain argument"); # # Create fasta of the contig data. # my $fh; my $tmp_dir = "$FIG_Config::temp/find_rnas.$$"; my $log = "$tmp_dir/log"; &FIG::verify_dir($tmp_dir); my $tmp = "$tmp_dir/contigs"; my $tmp2 = "$tmp_dir/contigs2"; my $tbl = "$tmp_dir/tbl"; my $tbl2 = "$tmp_dir/tbl2"; open($fh, ">", $tmp); foreach my $parm (@id) { my ($id, $seq) = split /,/, $parm; &FIG::display_id_and_seq($id, \$seq, $fh); } close($fh); my $cmd = "$rna_tool --tmpdir=$tmp_dir --contigs=$tmp --orgid=1 --domain=$domain --genus=$genus --species=$species"; my $res = system("$cmd > $tbl 2> $log"); if ($res != 0) { myerror($cgi, "500 $rna_tool run failed"); } my $fh2; open($fh, "<", $tbl); open($fh2, ">", $tbl2); my $ctr = 1; my $encoded_tbl = []; while (<$fh>) { chomp; my(@a) = split(/\t/); $a[0] = sprintf("rna_%05d", $ctr++); push(@a, $a[1]); print $fh2 join("\t", @a), "\n"; my ($contig, $beg, $end) = ($a[1] =~ /^(\S+)_(\d+)_(\d+)$/); push @$encoded_tbl, [$a[0], $contig, $beg, $end]; } close($fh); close($fh2); $res = system("$FIG_Config::bin/get_dna $tmp < $tbl2 > $tmp2"); if ($res != 0) { myerror($cgi, "500 get_dna failed"); } if (!open($fh,"<", $tmp2)) { myerror($cgi, "Cannot open output file $tmp2"); } print $cgi->header(); my $out; my $buf; while (read($fh, $buf, 4096)) { $out .= $buf; } close($fh); print YAML::Dump([$out, $encoded_tbl]); #unlink($tmp); #unlink($tmp2); #unlink($tbl); #unlink($tbl2); } elsif ($function eq "assign_functions_to_DNA") { print $cgi->header(); my @id = $cgi->param('id_seq'); my $min_hits = $cgi->param('min_hits'); my $max_gap = $cgi->param('max_gap'); my $blast = $cgi->param('blast'); @id or myerror($cgi, "500 missing id_seq", "figfam server missing id_seq argument"); # open(L, ">>/tmp/log"); # L->autoflush(1); # print L Dumper(\@id); foreach my $parm (@id) { my ($id, $seq) = split /,/, $parm; # print L "try $id\n$seq\n"; my $res; eval { # print L Dumper($seq, $min_hits, $max_gap, $blast); $res = $kmers->assign_functions_to_PEGs_in_DNA($seq, $min_hits, $max_gap, $blast); # print L Dumper($res); }; if ($@) { myerror($cgi, "500 failure on assign_functions_to_PEGs_in_DNA", $@); } # print L Dumper($res); print YAML::Dump(map { [$id, $_ ] } @$res); # print L "OK\n"; } } else { myerror($cgi, "500 invalid function", "invalid function $function\n"); } } exit; sub get_string_param { my($cgi, $name) = @_; my $str = $cgi->param($name); if ($str =~ /^(\S+)/) { return $1; } else { return undef; } } # #The FIGfam server processes requests of the form: # # 1. PLACE-IN-FAMILY takes as input a list of protein sequences. It # returns a list where each element describes the outcome of # trying to place the corresponding input sequence into a # FIGfam. Each output can be either # # COULD-NOT-PLACE-IN-FAMILY # or # ID FUNCTION # # where ID is of the form FIGxxxxxx and FUNCTION is the family # function. # # 2. MEMBERS-OF-FAMILIES takes as input a list of FIGfam IDs. The # output is a list of functions for those families # (INVALID-FAMILY will be returned for IDs that do not correspond # to an active family), as well as a list of the IDs in each family. # # 3. SHOULD-BE-MEMBER takes as input a list of 2-tuples # # [FIGfam-ID,protein sequence] # # It returns a list of boolean values indicating whether or not # the indicated protein sequence can be placed in the designated # family. # # 4. ALL-FAMILIES returns a list of [FIGfam-ID,function] tuples. # # # 5. ASSIGN-FUNCTION-TO-PROT is similar to PLACE-IN-FAMILY, except # that the returned list contains either # # COULD-NOT-PLACE-IN-FAMILY # or # ID FUNCTION # # That is, it does not indicate which FIGfam was used to # determine the function. This allows higher-performance # alternatives for cases in which multiple FIGfams implement the # same function. The algorithm supported utilizes the underlying # FIGfams, but characterizes sets that implement the same # function and does not support distinguishing which FIGfam # is actually the right subgrouping. # # 6. ASSIGN-FUNCTIONS-TO-DNA takes as input a list of DNA # sequences. It returns a list where each element describes # a region of DNA that is believed to be part of a gene encoding # a protein sequence that would be placed into a FIGfam # successfully, if the whole protein sequence could be # determined. That is, the returned list will contain entrties # of either the form # # COULD-NOT-PLACE-ANY-REGIONS-IN-FAMILIES # or # BEGIN1 END1 FUNCTION1 BEGIN2 END2 FUNCTION2 ... # # where BEGIN and END specify a region (if BEGIN is greater than # END, the region described is on the reverse strand) and # FUNCTION is the family function of the protein sequence that is # believed to be encoded by DNA including the embedded region. # Each input sequence can produce an arbitrary number of matched # regions, there will be 3 fields for each matched region. Note # that the described region may include frameshifts and embedded # stop codons. The algorithm seeking meaningful sections of DNA # assumes that it may have an incomplete, low-quality sequence # (and uses an algorithm that attempts to locate meaningful # matches even so). sub myerror { my($cgi, $stat, $msg) = @_; print $cgi->header(-status => $stat); print "$msg\n"; goto endloop; }