[Bio] / FigWebServices / webservices_mgrast.cgi Repository:
ViewVC logotype

View of /FigWebServices/webservices_mgrast.cgi

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (annotate)
Sat Aug 29 22:00:03 2009 UTC (10 years, 5 months ago) by redwards
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.2: +83 -0 lines
updated web services

#__perl__

use strict;
use Carp;
use CGI::Carp qw(fatalsToBrowser); # this makes debugging a lot easier by throwing errors out to the browser
use SOAP::Lite;
use SOAP::Transport::HTTP;

SOAP::Transport::HTTP::CGI   
-> dispatch_to('MgRastWebServices')     
-> handle;


package MgRastWebServices;

use FIGV;
use FIG;
use DBMaster;
use Job48;
use Data::Dumper;
use GenomeMeta;
use Data::Dumper;
use raelib;



# This is a series of methods I added for accessing the mg-RAST. User authentication is currently based only on username (not password), and is handled
# by a separate routine towards the end of the file. If you want to change the user access control, just change that method.
# 

my $dbmaster; # a globally scoped dbmaster so only need to construct it once


=begin WSDL
_IN username $string
_RETURN $string
_DOC Find all of a users jobs. Input is a username and output is a comma separated list of the users jobs. Note that this list includes both the users jobs and jobs of their organization
=cut
sub mg_rast_user_jobs {
	my ($class, $login_name)  = @_;
	my ($userj, $orgj)=$class->_user_mg_rast_jobs($login_name);
	return join(",", @$userj, @$orgj);
}

=begin WSDL
_IN job $string
_IN username $string
_RETURN $string
_DOC Find information about a job. Input is a job id and a username, output is a tuple of [id, user, genome id, genome name, size, number of contigs, number of pegs]
=cut
sub mg_rast_job_info {
	my ($class, $job_id, $login_name)  = @_;
	
	my $job=$class->_validate_mg_rast_user($job_id, $login_name, 'getjob');
	if ($job =~ /Access Error/) {return $job}
	
	my ($sz, $numcontigs, $npegs)=(0,0,0);
	my $orgdir=$class->_validate_mg_rast_user($job_id, $login_name);
	my $figv=FIGV->new($orgdir);
	$npegs=$figv->genome_pegs($job->genome_id);
	
	my $contig_lens = $figv->contig_lengths($job->genome_id);
	while ( my($contig,$len) = each %$contig_lens)
	{
		$sz += $len;
		$numcontigs++
	}
	
	return join(", ", $job->id, $job->user, $job->genome_id, $job->genome_name, $sz, $numcontigs, $npegs);
}

=begin WSDL
_IN username $string
_RETURN $string
_DOC Find information about all the jobs a user has access to. Input is a username and output is a list of tuples of [job id, user, genome id, and genome name]
=cut
sub mg_rast_users_job_info {
	my ($class, $login_name)  = @_;
	my ($userj, $orgj)=$class->_user_mg_rast_jobs($login_name);
	my $output;
	foreach my $jobid (@$userj, @$orgj)
	{
		my $job=$class->_validate_mg_rast_user($jobid, $login_name, 'getjob');
		next if ($job =~ /Access Error/);
		
		my ($coords, $time, $meta);
		eval {$meta = GenomeMeta->new(undef, "/vol/metagenome-48-hour/Jobs.prod/$jobid/meta.xml");};
		if ($@) {$coords = $time = "Error with xml"}
		else {
			eval {$coords = $class->mg_rast_lat_lon($jobid, $login_name)};
			eval {$time   = $meta->get_metadata("optional_info.time");};
			$coords =~ s/^\s+//; $coords =~ s/\s+$//; $coords =~ s/^\s*\,\s*$//;
			$time =~ s/^\s+//; $time =~ s/\s+$//;
		}
		
		$output .= join("\t", $job->id, $job->user, $job->genome_id, 
		$job->genome_name,  $coords, $time). "\n";
	}
	chomp($output);
	return $output;
}


=begin WSDL
_IN job $string
_IN username $string
_IN searchterm $string
_RETURN $string search results
_DOC Search for text in a job. Input is a job, a username, and a term to search for, and the results are the search results from querying that metagenome
=cut
sub mg_search_metagenome {
	my ($class, $job_id, $login_name, $search)  = @_;
	return undef unless ($search);
	
	my $orgdir=$class->_validate_mg_rast_user($job_id, $login_name);
	if ($orgdir =~ /Access Error/ || !(-e $orgdir)) {return $orgdir}
	
	my $figv=FIGV->new($orgdir);
	my $res=$figv->search_features_by_annotation($search, 1);
	my $return;
	
	foreach my $fn (keys %$res)
	{
		$return .= join("\n", map {join("\t", $_, $fn)} @{$res->{$fn}});
	}
	return  $return;
	
}

=begin WSDL
_IN username $string
_IN searchterm $string
_RETURN $string search results
_DOC Search for text in all jobs a user has access to. Input is a username, and a term to search for, and the results are the search results from querying all metagenomes that user has access to
=cut
sub mg_search_users_metagenomes {
	my ($class, $login_name, $search)  = @_;
	return undef unless ($search);
	
	my ($userj, $orgj)=$class->_user_mg_rast_jobs($login_name);
	my $return;
	foreach my $job_id (@$userj, @$orgj)
	{
		
		my $orgdir=$class->_validate_mg_rast_user($job_id, $login_name);
		if ($orgdir =~ /Access Error/ || !(-e $orgdir)) {return $orgdir}
		
		my $figv=FIGV->new($orgdir);
		my $res=$figv->search_features_by_annotation($search, 1);
		
		foreach my $fn (keys %$res)
		{
			$return .= join("\n", map {join("\t", $_, $fn)} @{$res->{$fn}})."\n";
		}
	}
	chomp($return);
	return  $return;
	
}


=begin WSDL
_IN job $string
_IN username $string
_RETURN $string bindings
_DOC Retrieve the data that binds a job to a set of subsystems. Input is a RAST job id and a username, output is the bindings that join that job to the subsystems. This is a tple of subsystem name, protein function, and sequence within the metagenome.
=cut
sub mg_rast_subsystems {
	my ($class, $job_id, $login_name)  = @_;
	
	my $orgdir=$class->_validate_mg_rast_user($job_id, $login_name);
	if ($orgdir =~ /Access Error/ || !(-e $orgdir)) {return $orgdir}
	
	open(IN, "$orgdir/Subsystems/bindings") || return "No subsystems found\n";
	return join("", <IN>);
}

=begin WSDL
_IN job $string
_IN username $string
_RETURN $string counts
_DOC Retrieve the counts of each subsystem in a metagenome. Input is a RAST job id and a username, output is a list of tuples of subsystem name and number of occurences
=cut
sub mg_rast_subsystem_counts {
	my ($class, $job_id, $login_name)  = @_;
	
	my $orgdir=$class->_validate_mg_rast_user($job_id, $login_name);
	if ($orgdir =~ /Access Error/ || !(-e $orgdir)) {return $orgdir}
	
	open(IN, "$orgdir/Subsystems/bindings") || return "No subsystems found\n";

	my $fig=new FIG;
	my $sscount; my $peg;
	while (<IN>) {
		chomp;
		my @a=split /\t/;
		next unless ($fig->usable_subsystem($a[0]));
		$sscount->{$a[0]}->{$a[2]}=1;
		$peg->{$a[2]}->{$a[0]}=1;
	}


# the contribution of a peg to a subsystem is 1/the number of subsystems it is in.
# If a peg is in one subsystem, report it once. If it is in 5 subsystems, give it 0.2

	my %count;
	foreach my $ss (keys %$sscount) {
		foreach my $p (keys %{$sscount->{$ss}}) {
			$count{$ss}+= 1/(scalar(keys %{$peg->{$p}}));
		}
	}


	return map {"$_\t$count{$_}\n"} sort {$a cmp $b} keys %count;
}


=begin WSDL
_IN job $string
_IN username $string
_RETURN $string counts
_DOC Retrieve the counts of each subsystem in a metagenome. Input is a RAST job id and a username, output is a list of tuples of subsystem name and number of occurences. Note that this is an overcount. The count is the number of things something is similar to that is in a subsystem. Therefore, if a single DNA sequence hits many different things, each of which are in subsystems, each will get counted.
=cut
sub mg_rast_subsystem_counts_over {
	my ($class, $job_id, $login_name)  = @_;
	
	my $orgdir=$class->_validate_mg_rast_user($job_id, $login_name);
	if ($orgdir =~ /Access Error/ || !(-e $orgdir)) {return $orgdir}
	
	open(IN, "$orgdir/Subsystems/bindings") || return "No subsystems found\n";
	my %count;
	while (<IN>)
	{
		my @a=split /\t/;
		$count{$a[0]}++;
	}
	
	return map {"$_\t$count{$_}\n"} sort {$a cmp $b} keys %count;
}

=begin WSDL
_IN job $string
_IN username $string
_IN number $string
_RETURN $string bindings
_DOC Retrieve the sequences from a job. Input is a RAST job id and a username, output is the DNA sequences in the job in fasta format. The number is a limit in how many sequences will be returned.
=cut
sub mg_rast_sequences {
	my ($class, $job_id, $login_name, $number)  = @_;
	
	my $orgdir=$class->_validate_mg_rast_user($job_id, $login_name);
	if ($orgdir =~ /Access Error/ || !(-e $orgdir)) {return $orgdir}
	
	my $fasta;
	eval {$fasta = raelib->read_fasta("$orgdir/contigs")};
	if ($@) {return "Error: $@"}
	my $keys = raelib->rand([keys %$fasta]); # randomize the order of the sequences returned. 
	if ($number) {@$keys = splice(@$keys, 0, $number)}
	
	#return join("\n", "keys", @$keys);	
	return join("", map {$_ = ">$_\n".$fasta->{$_}."\n"} @$keys);
}

=begin WSDL
_IN job $string
_IN username $string
_RETURN $string
_DOC Get the location (GPS coordinates) where a sample was taken (if available). This is a single point for every sample, and if there is more than one location for a sample it is the average of all locations. Input is a RAST job id and a username, output is a tuple of latitude and longitude where the sample was taken, if it is known. 
=cut
sub mg_rast_lat_lon {
	my ($class, $job_id, $login_name)  = @_;
	
	my $orgdir=$class->_validate_mg_rast_user($job_id, $login_name);
	if ($orgdir =~ /Access Error/ || !(-e $orgdir)) {return $orgdir}
	
	my $meta = GenomeMeta->new(undef, "/vol/metagenome-48-hour/Jobs.prod/$job_id/meta.xml");
	my $ret = $meta->get_metadata("optional_info.latitude") .",".$meta->get_metadata("optional_info.longitude");
	$ret =~ s/^\s*\,\s*$//;
	return $ret;
}

=begin WSDL
_IN job $string
_IN username $string
_RETURN $string
_DOC Get the location (GPS coordinates) where a sample was taken (if available). These are semi-colon separated tuples of lat-lon. Input is a RAST job id and a username, output is the coordinates where the sample was taken. 
=cut
sub mg_rast_coordinates {
	my ($class, $job_id, $login_name)  = @_;
	
	my $orgdir=$class->_validate_mg_rast_user($job_id, $login_name);
	if ($orgdir =~ /Access Error/ || !(-e $orgdir)) {return $orgdir}
	
	my $meta = GenomeMeta->new(undef, "/vol/metagenome-48-hour/Jobs.prod/$job_id/meta.xml");
	return $meta->get_metadata("optional_info.coordinates");
}

=begin WSDL
_IN job $string
_IN username $string
_RETURN $string
_DOC Get the time that a sample was taken (if available). Input is a RAST job id and a username, output is the date or time that the sample was taken.
=cut
sub mg_rast_time {
	my ($class, $job_id, $login_name)  = @_;
	
	my $orgdir=$class->_validate_mg_rast_user($job_id, $login_name);
	if ($orgdir =~ /Access Error/ || !(-e $orgdir)) {return $orgdir}
	
	my $meta = GenomeMeta->new(undef, "/vol/metagenome-48-hour/Jobs.prod/$job_id/meta.xml");
	return $meta->get_metadata("optional_info.time");
}



=begin WSDL
_IN job $string
_IN genome_id $string
_IN cutoff $string
_IN username $string
_RETURN $string
_DOC Get the number of hits to a given genome in a metagenome. Requires a metagenome id, a genome id, an E value cutoff, and a username.
=cut
sub mg_rast_genome_hits {
	my ($class, $job_id, $genome, $cutoff, $login_name)  = @_;
	
	my $orgdir=$class->_validate_mg_rast_user($job_id, $login_name);
	if ($orgdir =~ /Access Error/ || !(-e $orgdir)) {return $orgdir}


	# we're going to map all the sequences in the genome, and then search for those in the metagenome - quicker this way
	# than looking through all the metagenome hits using mapped_prot_ids
	my $fig = new FIG;
	my %need;
	foreach my $peg ($fig->pegs_of($genome)) {map {$need{$_->[0]}=1} $fig->mapped_prot_ids($peg)}
	
	my $meta = GenomeMeta->new(undef, "/vol/metagenome-48-hour/Jobs.prod/$job_id/meta.xml");
	my $simf = "/vol/metagenome-48-hour/Jobs.prod/$job_id/proc/sims.seed.raw";
	unless (-e $simf) {return "No sims file found for $job_id"}
	my %seen; # only count each hit once
	open(IN, "/vol/metagenome-48-hour/Jobs.prod/$job_id/proc/sims.seed.raw") || die "Can't open /vol/metagenome-48-hour/Jobs.prod/$job_id/proc/sims.seed.raw";
	while (<IN>)
	{
		chomp;
		my @a=split /\t/;
		$seen{$a[0]}++ if ($need{$a[1]} && $a[10] <= $cutoff);
	}
	close IN;
	return scalar(keys %seen);
}




sub _validate_mg_rast_user {
# a common method for validating a user. Please note that at the moment this does not use the password (but it should!)
# this is shared by several things above.

# If the user is valid will return a path to the job directory, otherwise will return "Access Error"

# by default returns the org dir, but if the boolean job is set will return the reference to the job

	my ($class, $dir, $login_name, $getjob)=@_;
		
	my $dbm = $class->_get_dbmaster();

	my $user = $dbm->User->init({ login => $login_name });
	my $ids;
	map {$ids->{$_->_id}=1} @{$user->scopes};


	my $job_id=$FIG_Config::fortyeight_jobs.$dir; # not sure whether config dirs always end in /, but they should!
		next unless (-d $job_id && -e "$job_id/DONE" && !-e "$job_id/DELETE");

	my $job;
	eval {$job = Job48->new($job_id)};
	die $@ if ($@);
	next if ($job->to_be_deleted);

	# we shouldn't do this, because it takes away user control :)
	return $job if ($getjob);
	return $job->orgdir;

	my $rights=$dbm->Rights->get_objects( { name => 'view',
			data_type => 'genome',
			data_id => $job->genome_id,
			});

	foreach my $r (@$rights) {
		if ($ids->{$r->scope->_id})
		{
			return $job if ($getjob);
			return $job->orgdir;
		}
	}
	return "Access Error";
}



sub _user_mg_rast_jobs {
	# a method to extract all the jobs for a user. Returns references to two arrays. 
	# The first is user jobs, and the second is jobs of the users organization
	my ($class, $login_name)=@_;
		
	my $dbm = $class->_get_dbmaster();

	my $user = $dbm->User->init({ login => $login_name });
	my $ownid = $user->get_user_scope->_id;
	my $ids;
	map {$ids->{$_->_id}=1} @{$user->scopes};

	my $ownjobs; my $otherjobs;

	opendir(DIR, $FIG_Config::fortyeight_jobs) || die "Can't open ".$FIG_Config::fortyeight_jobs;
	foreach my $dir (grep {m/^\d+$/} readdir(DIR))
	{

		my $job_id=$FIG_Config::fortyeight_jobs.$dir; # not sure whether config dirs always end in /, but they should!
		next unless (-d $job_id && -e "$job_id/DONE" && !-e "$job_id/DELETE");

		my $job;
		eval {$job = Job48->new($job_id)};
		die $@ if ($@);
		next if ($job->to_be_deleted);

		my $rights=$dbm->Rights->get_objects( { name => 'view',
				data_type => 'genome',
				data_id => $job->genome_id,
				});

		foreach my $r (@$rights) {
			$ownjobs->{$dir}=$job if ($r->scope->_id eq $ownid);
			$otherjobs->{$dir}=$job if ($ids->{$r->scope->_id});
		}


	}
	
	# delete own jobs from list of all jobs
	map {delete $otherjobs->{$_}} keys %$ownjobs;
	return ([keys %$ownjobs], [keys %$otherjobs]);

}

sub _get_dbmaster {
	# an internal method to connect to the database and return the dbmster
	my ($class)=@_;
	return $dbmaster if (defined $dbmaster);
	
	$ENV{DBHOST} = 'bioseed.mcs.anl.gov';

	$dbmaster = DBMaster->new(-database => $FIG_Config::webapplication_db,
			-backend => $FIG_Config::webapplication_backend,
			-host => $FIG_Config::webapplication_host,
			-user => $FIG_Config::webapplication_user || "rast",
			);
	
	return $dbmaster;
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3