[Bio] / FigWebServices / rest_seed.cgi Repository:
ViewVC logotype

View of /FigWebServices/rest_seed.cgi

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (annotate)
Wed Jul 1 01:01:18 2009 UTC (10 years, 5 months ago) by redwards
Branch: MAIN
CVS Tags: rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2010_0118, rast_rel_2009_07_09
Adding the cgi, js, html, and Web services files for the real time metagenomics pages.

#__perl__

# NOTE THAT use strict will break this!!
use lib '/home/redwards/perl/lib/perl5/site_perl/5.8.7/i686-linux/';
use CGI;
use CGI::Carp qw/fatalsToBrowser/;
use JSON::XS;

use FIG;
use FIG_Config;
use Data::Dumper;
use FigFams;
use FigFam;
use IPC::Open3;
use PinnedRegions;
use URI::Escape;


=pod 

=head1 rest_seed.cgi

YAWS - Yet another web service!

Why: We're using rpc encoding which is basically URL encoding. In this, I call something like 

http://bioseed.mcs.anl.gov/~redwards/FIG/rest_seed.cgi/multiply/2/3/4/5 

and get a response. Why do we need another web service? Mainly because of the Google work. Google pretty much exclusively deals with http requests, and eschews SOAP and other encodings as being too complex. 

The data returned is all in JSON format (http://www.json.org/) which is the Javascript object notation format. JSON is a really light weight markup language that cna handle complex objects quite easily. 

I am also aiming for lightweight code. In this case, we're not going to instantiate anything until we need it. Hopefully.

NOTE: I can't figure out how to encode forward slashes in URLs. We cant escape them with two slashes (\\) since the browser concatenates them, and we cant encode them with url_encode cos the server craps out. For now I am using the regexp forwardslash (case insensitive)

=cut

# a really simple RESTful web service that returns seed data

my $cgi=new CGI qw/:standard/;
my $json= new JSON::XS;

print $cgi->header('text/plain');

# get the query with path so we get the RESTful information
my $abs = $cgi->url(-absolute=>1); # this will be the cgi-bin/rest.cgi stuff
my $rest = $cgi->url(-path_info=>1);
$rest =~ s/^.*$abs\///;

# we need to escape double slashes so that we don't split on them

my @rest=split m#/#, $rest;

#my @rest=split m#(?<!/)/(?!/)#, $rest; # this matches a '/' that is neither preceeded nor followed by a '/', so '//' does not match

map {$rest[$_] =~ s#forwardslash#/#gi} (0 .. $#rest);

my $method = shift @rest;

# there is no good way of passing a null value!!
map {undef $rest[$_] if ($rest[$_] eq "undef" || $rest[$_] eq "null")} (0..$#rest);

#my $result =  $json->encode({url=>$rest, query=>\@rest, result => &{$method}(@rest) });
my $result =  $json->encode({result => &{$method}(@rest) });

print $result, "\n";



=pod

=head1 multiply.

This is a really simple method that just multiplies two numbers! It's great for testing things out

=cut

sub multiply {
        
	my $x = 1;
        map {$x = $x * $_} @_;
        return $x;
}

=head1 returnNumbers

And this method returns a reference to an array, so you can test JSON parsing

=cut

sub returnNumbers {
	return [21, 23];
}




=begin WSDL
_IN alias $string
_RETURN $string
_DOC Retrieve the protein sequence for a given identifier. Input is an alias, output is a sequence
=cut
sub ali_to_seq {
	my ($arg) = @_;
	return stdin_caller("ali_to_seq", $arg);
}

=begin WSDL
_IN pegs $string
_RETURN $string
_DOC Retrieve the set of pegs in order along the chromosome. Input is a comma separated list of pegs, and output is the pegs in order along the genome.
=cut
sub adjacent {
	my ($arg) = @_;
	$arg =~ s/\,\s*/\n/g;
	return stdin_caller("adjacent", $arg);
}

=begin WSDL
_IN peg $string
_RETURN $string
_DOC Get the clusters for a peg by bidirectional best hits. Input is a peg, output is two column table of [peg, cluster]
=cut
sub cluster_by_bbhs {
	my ($arg) = @_;
	return stdin_caller("cluster_by_bbhs", $arg);
}

=begin WSDL
_IN peg $string
_RETURN $string
_DOC Get the clusters for a peg by similarity. Input is a peg, output is two column table of [peg, cluster]
=cut
sub cluster_by_sim {
	my ($arg) = @_;
	return stdin_caller("cluster_by_sim", $arg);
}

=begin WSDL
_IN peg $string
_RETURN $string
_DOC Get the annotations for a peg from all other known sources. Input is a peg, output is two column table of [peg, other function]
=cut
sub external_calls {
	my ($arg) = @_;
	return stdin_caller("external_calls", $arg);
}

=begin WSDL
_IN peg $string
_RETURN $string
_DOC Get the functional annotation of a given protein identifier. Input is a peg, output is a function
=cut
sub function_of {
	my ($arg) = @_;
	my $fig=new FIG;
	return scalar($fig->function_of($arg));
}

=begin WSDL
_IN peg $string
_RETURN $string
_DOC Get the genome(s) that a given protein identifier refers to. Input is a peg, output is a single column table of genomes
=cut
sub genomes_of {
	my ($arg) = @_;
	return stdin_caller("genomes_of", $arg);
}


=begin WSDL
_IN genomeid $string
_RETURN $string
_DOC Get the genus and species of a genome identifier. Input is a genome ID, output is the genus and species of the genome
=cut
sub genus_species {
	my ($arg) = @_;
	my $fig=new FIG;
	return $fig->genus_species($arg);
}


=begin WSDL
_IN peg $string
_RETURN $string
_DOC Get the DNA sequence for a given protein identifier. Input is a peg, output is the DNA sequence in fasta format.
=cut
sub fid2dna {
	my ($arg) = @_;
	return stdin_caller("fid2dna", $arg);
}

=begin WSDL
_IN peg $string
_RETURN $string
_DOC Get the DNA sequence for a set of protein identifiers. Input is a comma-joined list of pegs, output is the DNA sequence in fasta format.
=cut
sub fids2dna {
	my ($arg) = @_;
	my $seq;
	foreach my $peg (split /\,/, $arg) {
		$seq .= stdin_caller("fid2dna", $peg);
	}
	return $seq;
}
		

=begin WSDL
_IN genomeid $string
_RETURN @string
_DOC Get a comma-separated list of all the contigs in a genome
=cut
sub contigs_of {
	my ($genome)=@_;
	my $fig = new FIG;
	return join(",", $fig->contigs_of($genome));
}


=begin WSDL
_IN genomeid $string
_IN location1 $string
_IN location2 $string
_RETURN @string
_DOC Get the DNA sequence for a region in a genome. Input is a genome ID and one or more locations in the form contig_start_stop, output is the DNA sequence in fasta format.
=cut
sub dna_sequence {
	my ($genome, @locations)=@_;
	my $fig = new FIG;
	my $seq=$fig->dna_seq($genome,@locations);
	return $seq;
}



=begin WSDL
_IN genomeid $string
_RETURN $string
_DOC Get all the protein identifiers associated with a genome. Input is a genome id, output is a list of pegs in that genome
=cut
sub pegs_of {
	my ($arg) = @_;
	my $fig = new FIG;
	return (join ",", $fig->pegs_of($arg));
}

=begin WSDL
_IN genomeid $string
_IN contig $string
_RETURN $string
_DOC Get the length of the DNA sequence in a contig in a genome. Input is a genome id and a contig name, return is the length of the contig
=cut
sub contig_ln {
	my $fig = new FIG;
	return $fig->contig_ln(@_);
}

=begin WSDL
_IN genomeid $string
_RETURN $string
_DOC Test whether an organism is Archaeal. Input is a genome identifier, and output is true or false (or 1 or 0)
=cut
sub is_archaeal {
	my ($arg) = @_;
	return stdin_caller("is_archaeal", $arg);
}

=begin WSDL
_IN genomeid $string
_RETURN $string
_DOC Test whether an organism is Bacterial. Input is a genome identifier, and output is true or false (or 1 or 0)
=cut
sub is_bacterial {
	my ($arg) = @_;
	return stdin_caller("is_bacterial", $arg);
}

=begin WSDL
_IN genomeid $string
_RETURN $string
_DOC Test whether an organism is Eukaryotic. Input is a genome identifier, and output is true or false (or 1 or 0)
=cut
sub is_eukaryotic {
	my ($arg) = @_;
	return stdin_caller("is_eukaryotic", $arg);
}

=begin WSDL
_IN genomeid $string
_RETURN $string
_DOC Test whether an organism is a Prokaryote. Input is a genome identifier, and output is true or false (or 1 or 0)
=cut
sub is_prokaryotic {
	my ($arg) = @_;
	return stdin_caller("is_prokaryotic", $arg);
}

=begin WSDL
_IN peg $string
_RETURN $string
_DOC Get the translation (protein sequence) of a peg. Input is a peg, output is the protein sequence
=cut
sub translation_of {
	my ($arg) = @_;
	return stdin_caller("translation_of", $arg);
}

=begin WSDL
_IN peg $string
_RETURN $string
_DOC Get the translation (protein sequence) of a peg. Input is a peg, output is translation
=cut
sub get_translation {
	my ($arg1) = @_;
	my $fig = new FIG;
	my $result = $fig->get_translation($arg1);
	return $result;
}

=begin WSDL
_IN peg $string
_RETURN @string
_DOC Get the location of a peg on its contig. Input is a peg, output is list of loc on contig
=cut
sub feature_location {
	my ($arg1) = @_;
	my $fig = new FIG;
	my @result = ($fig->feature_location($arg1));
	return @result;
}

=begin WSDL
_IN peg $string
_RETURN $string
_DOC Get the aliases of a peg. These are the identifiers that other databases use. Input is a peg, output is tab separated string of aliases
=cut
sub aliases_of {
	my ($arg) = @_;
	return stdin_caller("aliases_of", $arg);
}

=begin WSDL
_IN peg $string
_RETURN $string
_DOC Get the corresponding ids of a peg. These are the identifiers that other databases use. Input is a peg, output is tab separated string of aliases
=cut
sub get_corresponding_ids {
	my ($arg) = @_;
	my $fig = new FIG;
	my @result = $fig->get_corresponding_ids($arg, 1);
	return join("\t", map {join(":", $_->[1], $_->[0])} @result);
}

=begin WSDL
_IN alias $string
_RETURN $string
_DOC Get the FIG ID (peg) for a given external identifier. Input is an identifier used by another database, output is our identifier
=cut
sub alias2fig {
	my ($arg) = @_;
	return stdin_caller("alias2fig", $arg);
}



=begin WSDL
_IN EC_number $string ec code
_RETURN $string ec name
_DOC Get the name for a given E.C. number. Input is an EC number, output is the name
=cut
sub ec_name {
	my $fig = new FIG;
	my $result = $fig->ec_name(@_);
	return $result;
}


=begin WSDL
_IN reaction_number $string reaction code number
_IN genomeid $string
_RETURN $string
_DOC Get a tab-separated list of [subsystem name, functional role, peg, subsystem variant code for that genome] for any given reaction id and genome id. Maps the reaction id to peg, peg to genome, and genome to variant code
=cut
sub reaction_to_role {
	my ($rxn, $genomeid) = @_;
	my $fig = new FIG;
	my @ecs = $fig->catalyzed_by($rxn);
	
	my @return;
	foreach my $ec (@ecs)
	{
		my @ssr = grep {$fig->genome_of($_->[2]) eq $genomeid} $fig->subsystems_for_ec($ec);
		foreach my $ss (@ssr)
		{
			my $sub = $fig->get_subsystem($ss->[0]);
			push @$ss, $sub->get_variant_code_for_genome($fig->genome_of($ss->[2])) if ($sub);
			push @return, join("\t", @$ss);
		}
	}
	return SOAP::Data->type('string')->name('reaction_to_roleReturn')->value(join("\n", @return));
}



=begin WSDL
_IN peg $string
_RETURN $string
_DOC Get the pegs that are coupled to any given peg. Input is a peg, output is list of [protein, score] for things that are coupled to this peg
=cut
sub coupled_to {
	my $fig = new FIG;
	my $return=undef;
	my @result = $fig->coupled_to(@_);
	if (@result)
	{
		$return = join("\n", map {$_->[0].",".$_->[1]} @result);
	}
	return $return;
}

=begin WSDL
_IN peg $string
_RETURN $string
_DOC Get the pegs that may be coupled to this peg through abstract coupling. Input is a peg, output is list of [protein, score] for things that are coupled to this peg
=cut
sub abstract_coupled_to {
	my $fig = new FIG;
	my $return=undef;
	my @result = $fig->abstract_coupled_to(@_);
	if (@result)
	{
		$return = join("\n", map {$_->[0].",".$_->[1]} @result);
	}
	return $return;
}

=begin WSDL
_IN peg_id $string
_IN n_pch_pins $string
_IN n_sims $string
_IN sim_cutoff $string
_IN color_sim_cutoff $string
_IN sort_by $string
_RETURN $string
_DOC Input is a FIG (PEG) ID and ..., output is the pinned regions data
=cut 
sub pinned_region_data {
    my ($peg, $n_pch_pins, $n_sims, $sim_cutoff, $color_sim_cutoff, $sort_by, $fast_color, $sims_from, $region_size) = @_;
    
    my $fig = new FIG;
    
    defined($n_pch_pins)       or $n_pch_pins = 5;
    defined($n_sims)           or $n_sims = 0;
    defined($sim_cutoff)       or $sim_cutoff = 1e-20;
    defined($color_sim_cutoff) or $color_sim_cutoff = 1e-20;
    defined($sort_by)          or $sort_by = '';
    
    defined($fast_color)       or $fast_color = 0;
    defined($sims_from)        or $sims_from = 'blast';
    defined($region_size)      or $region_size = 16000;

     my $pin_desc = {
                     'pegs'                   => [$peg],
                     'collapse_close_genomes' => 0,
                     'n_pch_pins'             => $n_pch_pins,
                     'n_sims'                 => $n_sims, 
                     'show_genomes'           => '',
                     'sim_cutoff'             => $sim_cutoff,
                     'color_sim_cutoff'       => $color_sim_cutoff,
                     'sort_by'                => $sort_by,
                     'show_genomes'           => [],
                   };
                   
    my $maps = &PinnedRegions::pinned_regions($fig, $pin_desc, $fast_color, $sims_from, $region_size);
    my $txt  = Dumper($maps);
    return $txt;
}   


=begin WSDL
_IN complete $string
_IN restrictions $string
_IN domain $string
_RETURN @string
_DOC Get a set of genomes. The inputs are a series of constraints - whether the sequence is complete, other restrictions, and a domain of life (Bacteria, Archaea, Eukarya, Viral, Environmental Genome). Output is a comma separated list of genomes
=cut
sub all_genomes {
	my $fig=new FIG;
	my @genomes=$fig->genomes(@_);
	return \@genomes;
}

=begin WSDL
_IN complete $string
_IN restrictions $string
_IN domain $string
_RETURN @string
_DOC Get a set of genomes. The inputs are a series of constraints - whether the sequence is complete, other restrictions, and a domain of life (Bacteria, Archaea, Eukarya, Viral, Environmental Genome). Output is a comma separated list of genomes.
=cut
sub genomes {
	my $fig = new FIG;
	my @result = $fig->genomes(@_);
	my %genomes = map {($_ => $fig->genus_species($_))}  $fig->genomes(@_);  
	
	return \%genomes;
}


=begin WSDL
_IN pattern1 $string
_IN pattern2 $string
_RETURN @string
_DOC Search and grep through the database. Input is two patterns, first one is used in search_index, second used to grep the results to restrict to a smaller set.
=cut
sub search_and_grep {
	my ($arg1, $arg2) = @_;
	
	my $fig = new FIG;
	
	my ($pegs, $roles) =  $fig->search_index($arg1);
	
	my (@result_list, $entry);
	
	for $entry (@$pegs) {
		push (@result_list, grep(/$arg2/, @$entry));
	}	
	push (@result_list, grep(/$arg2/, @$roles));
	chomp @result_list;
	my $return_value = join ("\n", @result_list); 
	return $return_value;
}



sub search_genome {
	my ($genome, $term)=@_;
	my $fig = new FIG;
	my ($pegs, $roles) =  $fig->search_index($term);
	my $result;
	# For each feature, there is a tuple consisting of the (0) feature ID, (1) the organism name (genus and species), (2) the aliases, (3) the functional role, and (4) the relevant annotator

	foreach my $ent (grep {$_->[0] =~ /fig\|$genome\./} @$pegs) {
		$result->{$ent->[0]}=$ent->[3];
	}
	return $result;
}

=begin WSDL
_IN pattern $string
_RETURN @string
_DOC Search the database. Input is a pattern to search for, output is tab separated list of pegs and roles
=cut
sub simple_search {
	my ($arg1)=@_;
	
	my $fig = new FIG;
	
	my ($pegs, $roles) =  $fig->search_index($arg1);
	
	my (@result_list, $entry);
	
	for $entry (@$pegs) {
		push (@result_list, (join("\t", @$entry)));
	}
	
	# push (@result_list, (join("\t", @$roles)));
	chomp @result_list;
	my $return_value = join ("\n", @result_list);
	return $return_value;
}        


=begin all_subsystem_classifications

Get all the subsystems classifications. Tested with JSON.

=cut
sub all_subsystem_classifications {
	my $fig = new FIG;
	my $output;
	
	my %found;
	map {   
		my @classification=@{$fig->subsystem_classification($_)};
		$#classification=1;
		push @classification, $_;
		push @$output, \@classification;
	} sort {$a cmp $b} ($fig->all_subsystems());
	return $output;
}

=begin all_subsystems_with_roles

Get all the subsystems and their roles. Tested with JSON

=cut
sub all_subsystems_with_roles {
	my $fig = new FIG;
	my $output = $fig->all_subsystems_with_roles();
	return $output;
}


=begin functions_to_subsystems

Get the subsystems for several functions. Tested with JSON

=cut
sub functions_to_subsystems {
	my $fig=new FIG;
	my %fns;
	foreach my $fn (@_) {
		$fn = uri_unescape($fn);
		my @arr = $fig->function_to_subsystems($fn);
		$fns{$fn} = \@arr;
	}
	return \%fns;
}

=begin function_to_subsystems

Get the subsystems for a function. Tested with JSON

=cut
sub function_to_subsystems {
	my $fn = shift;
	$fn =~ s/\%20/ /g;
	my $fig=new FIG;
	my @arr = $fig->function_to_subsystems($fn);
	return \@arr;
}


=begin WSDL
_RETURN $string list of families
_DOC Get all the FIG protein families (FIGfams). No input needed, it just returns a list of all families
=cut
sub all_families {
	my $fig = new FIG;
	my $figfams = new FigFams($fig);
	
	my @out = $figfams->all_families;
	print STDERR Dumper(@out);
	return @out;
}

=begin WSDL
_RETURN $string list of families and funcs
_DOC Get all the FIG protein families (FIGfams) with their assigned functions. No input needed, it just returns a list of all the families and their functions.
=cut
sub all_families_with_funcs {
	my $fig = new FIG;
	my $figfams = new FigFams($fig);
	
	my @out =$figfams->all_families_with_funcs;
	return @out;
}

=begin WSDL
_IN families $string list of famids
_RETURN $string 2 col table, famid, peg
_DOC Get all the pegs in some FIGfams. The input is a tab-separated list of family IDs, and the output is a two column table of [family id, peg]
=cut
sub list_members {
	my ($famids) = @_;
	my $fig = new FIG;
	my $figfams = new FigFams($fig);
	my @in = split(/\t/, $famids);
	warn("Starting 2 list members $famids\n");
    my @out = ();
    foreach my $famid (@in)
    {
		my $famO = new FigFam($fig,$famid);
		foreach my $peg ($famO->list_members)
		{
			push(@out,[$famid,$peg]);
		}
    }
    return @out;
}

=begin WSDL
_IN families $string list of pegs
_RETURN $string returns a 3-column table [PEG,Function,AliasesCommaSeparated]
_DOC Get all the pegs in some FIGfams, their functions, and aliases. Input is a tab-separated list of families, returns a 3-column comma separated table [PEG, Function, Aliases]
=cut
sub CDS_data {
	my ($pegs) = @_;
	my $fig = new FIG;
	my $figfams = new FigFams($fig);
	my @in = split(/\t/, $pegs);
	
	#warn("Starting CDS data $pegs\n");
	#print STDERR &Dumper($pegs);
	
    my @out = ();
    foreach my $peg (@in)
    {
		my @famids = $figfams->families_containing_peg($peg);
		foreach my $famid (@famids)
		{
			push(@out,[$peg,scalar $fig->function_of($peg),[$fig->feature_aliases($peg)]]);
		}
    }
    return @out;
}

=begin WSDL
_IN families $string list of pegs
_RETURN $string a 2-column table [PEG,Sequence]
_DOC Get the protein sequences for a list of FIGfams. Input is a tab-separated list of families, returns a 2-column comma separated table of [PEG, sequence] 
=cut
sub CDS_sequences {
	my ($pegs) = @_;
	my $fig = new FIG;
	my $figfams = new FigFams($fig);
	my @in = split(/\t/, $pegs);
	
	#warn("Starting CDS seq $pegs\n");
	#print STDERR &Dumper($pegs);
    my @out = ();
    foreach my $peg (@in)
    {
		push(@out,[$peg,$fig->get_translation($peg)]);
    }
    return @out;
}

=begin WSDL
_IN sequences $string list of id seq pairs
_RETURN $string returns a 2-column table [Id,FamilyID]
_DOC Tries to put a protein sequence in a family. Input is a tab-separated id and sequence, delimited by new lines. The output is a comma-separated 2-column table [your sequence id, FamilyID] if the sequence is placed in a family.
=cut
sub is_member_of {
	my ($id_seqs) = @_;
	my $fig = new FIG;
	my $figfams = new FigFams($fig);
	#warn("Doing is member $id_seqs\n");
	#print STDERR &Dumper($id_seqs);
	
	my @in = split(/\n/, $id_seqs);
    my @out = ();
    foreach my $pair (@in)
    {
		my($id,$seq) = split(/\t/, $pair);
		my($famO,undef) = $figfams->place_in_family($seq);
		if ($famO)
		{
			push(@out,[$id,$famO->family_id]);
		}
    }
    return @out;
}

=begin WSDL
_IN peg $string
_IN maxN $string
_IN maxP $string
_RETURN $string
_DOC Retrieve the sims (precomputed BLAST hits) for a given protein sequence. Input is a peg, an optional maximum number of hits (default=50), and an optional maximum E value (default=1e-5). The output is a list of sims in modified tab separated (-m 8) format. Additional columns include length of query and database sequences, and method used.
=cut
sub sims {
	my ($peg, $maxN, $maxP)=@_;
	unless (defined $maxN) {$maxN=50}
	unless (defined $maxP) {$maxP=1e-5}
	my $fig=new FIG;
	my $return=undef;
	foreach my $sim ($fig->sims($peg, $maxN, $maxP, 'figx'))
	{
		$return .= join("\t", @$sim). "\n";
	}
	
	return $return;
}



##### INTERNAL METHODS

sub stdin_caller {
	my ($name, $arg) = @_;
	my($rd, $wr, $err, $pid, $std_err, $return_value, @std_out);
	if (!($pid = open3($wr, $rd, $err, "$FIG_Config::bin/$name")))
	{
		die "Cannot run open3 $name: $!";
	}
	
	$wr->write($arg);
	close($wr);
	
	@std_out= <$rd>;
	close($rd);
	waitpid $pid, 0;
	$return_value = join ("", @std_out); 
	return $return_value;
}



MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3