[Bio] / FigKernelPackages / RAST_submission.pm Repository:
ViewVC logotype

View of /FigKernelPackages/RAST_submission.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (download) (as text) (annotate)
Wed Oct 21 21:45:03 2009 UTC (10 years, 5 months ago) by olson
Branch: MAIN
Changes since 1.5: +28 -3 lines
Add more details to output of get_contigs_from_entrez:

$VAR1 = {
          'length' => '16660',
          'project' => '15760',
          'name' => 'Mycobacterium gilvum PYR-GCK',
          'contents' => '<genbank file>',
          'id' => 'NC_009341',
          'taxonomy_id' => '350054'
        };

package RAST_submission;


use strict;
use Job48;
use JobUpload;
use Data::Dumper;
use FIG;
use FIG_Config;
use gjoseqlib;

use LWP::UserAgent;
use Bio::DB::RefSeq;
use Bio::SeqIO;

use base 'Class::Accessor';

__PACKAGE__->mk_accessors(qw(rast_dbmaster user_dbmaster user_obj project_cache_dir
			     contig_cache_dir max_cache_age ua));

sub new
{
    my($class, $rast_dbmaster, $user_dbmaster, $user_obj) = @_;

    my $self = {
	rast_dbmaster => $rast_dbmaster,
	user_dbmaster => $user_dbmaster,
	user_obj => $user_obj,
	project_cache_dir => "$FIG_Config::var/ncbi_project_cache",
	contig_cache_dir => "$FIG_Config::var/ncbi_contig_cache",
	max_cache_age => 86400,
	ua => LWP::UserAgent->new(),
    };

    &FIG::verify_dir($self->{project_cache_dir});
    &FIG::verify_dir($self->{contig_cache_dir});


    return bless $self, $class;
}

sub get_contig_ids_in_project_from_entrez
{
    my($self, $params) = @_;

    #
    # Determine the project ID to use. Which one we take depends on if
    # we were passed a project id, a tax id, or a contig id.
    #

    my $proj;
    if ($params->{-tax_id})
    {
    }
    elsif ($params->{-contig_id})
    {
	$proj = $self->determine_project_of_contig($params->{-contig_id});
    }
    elsif ($params->{-project_id})
    {
	$proj = $params->{-project_id};
    }

    print STDERR "project is $proj\n";
    my $project_data = $self->retrieve_project_data($proj);

    return $self->check_project_for_redundancy($project_data);
}

sub get_contigs_from_entrez
{
    my($self, $params) = @_;

    my $id_list = $params->{-id};
    if (!ref($id_list))
    {
	$id_list = [$id_list];
    }

    my @ret;
    for my $id (@$id_list)
    {
	my $ent = { id => $id };
	
	my $file = $self->retrieve_contig_data($id);
	
	open(F, "<", $file);
	my $txt = <F>;
	if ($txt =~ /^LOCUS.*?(\d+)\s+bp/)
	{
	    $ent->{length} = $1;
	}

	while (<F>)
	{
	    $txt .= $_;
	    if (/^\s+ORGANISM\s+(.*)/)
	    {
		$ent->{name} = $1;
	    }
	    elsif (/^DBLINK\s+Project:(\d+)/)
	    {
		$ent->{project} = $1;
	    }
	    elsif (/db_xref="taxon:(\d+)"/)
	    {
		$ent->{taxonomy_id} = $1;
	    }
	}
	$ent->{contents} = $txt;

	close(F);

	push(@ret, $ent);
    }

    return \@ret;
}

sub determine_project_of_contig
{
    my($self, $contig_id) = @_;

    my $file = $self->retrieve_contig_data($contig_id);
    open(F, "<", $file) or die "cannot open contig data $file: $!";

    my $proj;
    while (<F>)
    {
	if (/DBLINK\s+Project:(\d+)/)
	{
	    $proj = $1;
	    last;
	}
    }
    close(F);
    return $proj;
    
}

sub check_project_for_redundancy
{
    my($self, $file) = @_;

    my $seqio_object = Bio::SeqIO->new(
				       -file => $file ,
				       -format => "genbank",
				      );

    my @seqs;
    my @ids;
    while ( my $seq = $seqio_object->next_seq ) {
	push(@seqs, [$seq->accession_number, $seq->seq]);
	push(@ids, $seq->accession_number);
    }

    my @redundancy = $self->test_for_redundancy(\@seqs);
    return { ids => \@ids, redundancy_report => \@redundancy };
}

sub test_for_redundancy {
    my($self, $seqs) = @_;

    if (@$seqs < 2)
    {
	return ();
    }
    
    my %lens = map { $_->[0] => length($_->[1]) } @$seqs;
    my $tmp = "tmp.$$.fasta";
    &gjoseqlib::print_alignment_as_fasta($tmp,$seqs);
    system "formatdb -i $tmp -pF";
    my @blastout = `blastall -m8 -i $tmp -d $tmp -p blastn -FF -e 1.0e-100`;
    system "rm $tmp $tmp\.*";
    my @tuples = ();
    my %seen;
    foreach my $hit (map { chomp; [split(/\t/,$_)] } @blastout)
    {
	my($id1,$id2,$iden,undef,undef,undef,$b1,$e1,$b2,$e2) = @$hit;
	if ((! $seen{"$id1/$id2"}) && ($id1 ne $id2))
	{
	    $seen{"$id1/$id2"} = 1;
	    if (($iden >= 98) &&
		(abs($e1 - $b1) > (0.9 * $lens{$id1})))
	    {
		push(@tuples,[$id1,$lens{$id1},$id2,$lens{$id2}]);
	    }
	}
    }
    
    return @tuples;
}

sub retrieve_project_data
{
    my($self, $project) = @_;
    
    my $cached_file = $self->project_cache_dir() . "/$project.gbff";
    if (my(@stat) = stat($cached_file))
    {
	my $last_mod = $stat[9];
	if (time - $last_mod < $self->max_cache_age)
	{
	    return $cached_file;
	}
    }

    my $url = "http://www.ncbi.nlm.nih.gov/sites/entrez?Db=genomeprj&Cmd=Retrieve&list_uids=";
    my $res = $self->ua->get($url.$project);
    if (!$res->is_success)
    {
	die "error retrieving project data: " . $res->status_line;
    }
    my $search_result = $res->content;
  
    my @lines = split ( "\n" , $search_result);
  
    my $nr_seq = 0;
    my $nr_proj = 0;
    my $url_seq = "";
    my $url_proj = "";
    my $genome_name = "";
    
    my $found_genome_information_table = 0;
    
    my $next = "";
    my $id_list  = "";
    foreach my $line ( @lines )
    {
	
	if ($line =~/Genome information:/){
	    $found_genome_information_table = 1;
	    next; # skip table line
	};
	
	$found_genome_information_table = 0 if ( $found_genome_information_table and $line =~ /<\/table>/);
	$id_list .= $line if ( $found_genome_information_table ); # collect id entries
	
	# print $line , "\n"  if (  $found_genome_information_table );
    }
    
    my @ids;
    my @blocks = split "<\/tr>" , $id_list ; 
    foreach my $block (@blocks)
    {
	my @local_ids = $block =~/([^>]+)<\/a><\/td>/gc ; 
	# print join "\t" , @local_ids  , "\n";
	push @ids , $local_ids[0] if ($local_ids[0]);
    }

    my $id_list = join(",", @ids);
    my $query = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=" . $id_list . "&rettype=gb" ;
    print STDERR $query , "\n";
    my $resp = $self->ua->get($query);
    if ($resp->is_success())
    {
	open(F, ">", $cached_file) or die "Cannot open $cached_file for writing: $!";
	print F $resp->content;
	close(F);
	return $cached_file;
    }
    else
    {
	die "Error retrieving data: " . $resp->status_line;
    }
}

sub retrieve_contig_data
{
    my($self, $contig) = @_;
    
    my $cached_file = $self->contig_cache_dir() . "/$contig.gbff";
    if (my(@stat) = stat($cached_file))
    {
	my $last_mod = $stat[9];
	if (time - $last_mod < $self->max_cache_age)
	{
	    return $cached_file;
	}
    }

    my $query = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=" . $contig . "&rettype=gb" ;
    print STDERR $query , "\n";
    my $resp = $self->ua->get($query);
    if ($resp->is_success())
    {
	open(F, ">", $cached_file) or die "Cannot open $cached_file for writing: $!";
	print F $resp->content;
	close(F);
	return $cached_file;
    }
    else
    {
	die "Error retrieving data: " . $resp->status_line;
    }
}


sub submit_RAST_job
{
    my($self, $params) = @_;

    my $res=  { val => [ "params were", $params, $self->user_obj->firstname, $self->user_obj->lastname] } ;
    print STDERR Dumper($res);
    return $res;
}

sub status_of_RAST_job
{
    my($self, $params) = @_;

    my @job_nums;
    my $job_num_param = $params->{-job};
    if (ref($job_num_param) eq 'ARRAY')
    {
	@job_nums = @$job_num_param;
    }
    else
    {
	@job_nums = ($job_num_param);
    }

    my $res = {};
    for my $job_num (@job_nums)
    {
	my $job = $self->rast_dbmaster->Job->init({ id => $job_num });
	if (!ref($job))
	{
	    $res->{$job_num} = { status => 'error', error_msg => 'Job not found'};
	    next;
	}
	
	if (!$self->user_may_access_job($job))
	{
	    $res->{$job_num} = { status => 'error', error_msg => 'Access denied' };
	    next;
	}

	my $dir = $job->dir;
	if (open(E, "<$dir/ERROR"))
	{
	    local $/;
	    undef $/;
	    my $emsg = <E>;
	    close(E);
	    $res->{job_num} = { status => 'error', error_msg => $emsg };
	    next;
	}

	#
	# Retrieve status flags from the meta file (not the database,
	# so that we can get the very latest state).
	#

	#
	# For now we only check status.export because that is what the
	# bulk API cares about.
	#
	
	my $status_list = [];
	my $cur_stage;
	my $stages = $job->stages();
	my %status;
	for my $stage (@$stages)
	{
	    my $status = $job->metaxml->get_metadata($stage) || 'not_started';
	    $status{$stage} = $status;
	    push(@$status_list, [$stage => $status]);
	    if ($status ne 'complete')
	    {
		$cur_stage = $stage;
	    }
	}

	$res->{$job_num} = { status => $status{'status.export'}, verbose_status => $status_list };
    }
    return $res;
}

sub retrieve_RAST_job
{
    my($self, $params) = @_;

    my $job_id = $params->{-job};
    my $format = $params->{-format};

    my $job = $self->rast_dbmaster->Job->init({ id => $job_id });

    if (!ref($job))
    {
	return { status => 'error', error_msg => 'Job not found'};
    }
    
    if (!$self->user_may_access_job($job))
    {
	return { status => 'error', error_msg => 'Access denied' };
    }

    #
    # Map the given output format to a file.
    #

    my %type_map = (genbank => "%s.gbk",
		    genbank_stripped => "%s.ec-stripped.gbk",
		    embl => "%s.embl",
		    embl_stripped => "%s.ec-stripped.embl",
		    gff3 => "%s.gff",
		    gff3_stripped => "%s.ec-stripped.gff",
		    gtf => "%s.gtf",
		    rast_tarball => "%s.tgz",
		    );

    my $file_pattern = $type_map{lc($format)};
    if (!defined($file_pattern))
    {
	return { status => 'error', error_msg => "Format $format not found" };
    }

    #
    # Find the download file.
    #

    my $dir = $job->download_dir();
    my $file = sprintf($file_pattern, $job->genome_id);
    my $path = "$dir/$file";
    if (!open(F, "<", $path))
    {
	return { status => 'error', error_msg => "Cannot open download file $path"}; 
    }

    local $/;
    undef $/;
    my $txt = <F>;
    return { status => 'ok', contents => $txt };
}

sub user_may_access_job
{
    my($self, $job_id) = @_;

    return 1;
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3