[Bio] / FigKernelScripts / get_NCBI_genome_attributes.pl Repository:
ViewVC logotype

View of /FigKernelScripts/get_NCBI_genome_attributes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Thu Mar 13 20:44:26 2008 UTC (11 years, 11 months ago) by arodri7
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
get_NCBI_genome_attributes searches the NCBI internet sites for genome attributes such as project id, taxonomy id, genbank and refseq acc, gold card id and more sequence information

#!/usr/bin/env /home/arodri7/FIGdisk/bin/run_perl

use Data::Dumper;
use Carp;
use FIG_Config;
use LWP::Simple;

# Remote endpoints queried by this script.
#
# The genome search URL is split in two: the taxonomy id is inserted between
# the "txid" prefix and the URL-encoded "[Organism:noexp]" suffix.
# (An older, free-text form of the search used "...db=genome&cmd=search&term=".)
my $ncbi_genome_search_url  = 'http://www.ncbi.nlm.nih.gov/sites/entrez?db=genome&cmd=Search&dopt=DocSum&term=txid';
my $ncbi_genome_search_url2 = '%5BOrganism%3Anoexp%5D';
my $ncbi_genome_project_url = 'http://www.ncbi.nlm.nih.gov/sites/entrez?Db=genomeprj&Cmd=Retrieve&list_uids=';
my $gold_card_url           = 'http://genomesonline.org/GOLD_CARDS/';

# Maps the internal field names used while scraping to the attribute keys
# that are written to the NCBI_ATTRIBUTES file.
my $attrib_names = {
    ncbi_org_name  => 'INSTITUTE_ORGANISM_NAME',
    type           => 'GENOME_SEQ_TYPE',
    ncbi_taxon     => 'NCBI_TAXONOMY_ID',
    refseq         => 'REFSEQ_ACC',
    gbk            => 'GENBANK_ACC',
    genome_project => 'GENOME_PROJECT_ID',
    pubmed         => 'PUBMED_ID',
    gold_card_id   => 'GOLD_CARD',
    institute      => 'GENOME_INSTITUTE',
    contacts       => 'GENOME_CONTACT',
};

my $table_lines = {};    # NOTE(review): not referenced anywhere in the visible script

# The organism directory is the first (and only) command-line argument.
my $orgD = $ARGV[0];
die "usage: $0 <organism_directory>\n" unless defined $orgD && length $orgD;

# Read a metadata file from the organism directory without going through the
# shell (the path comes from the command line, so interpolating it into
# backticks would allow shell injection).
#
#   $path            - file to read
#   $first_line_only - true: return only the first line; false: whole file
#
# Returns the text with one trailing newline removed, or '' (after a warning)
# when the file cannot be read -- the same value an unsuccessful `cat` yielded.
sub _read_org_file {
    my ($path, $first_line_only) = @_;

    my $fh;
    unless (open($fh, '<', $path)) {
        warn "cannot read $path: $!\n";
        return '';
    }

    my $text;
    if ($first_line_only) {
        $text = <$fh>;
    }
    else {
        local $/;    # slurp mode
        $text = <$fh>;
    }
    close($fh);

    $text = '' unless defined $text;
    chomp $text;
    return $text;
}

# get the organism name and taxonomy id from the organism directory
my ($fig_GenomeId, $orgName, $orgTax);
if ($orgD =~ /Organisms/) {
    # Path inside an "Organisms" tree: the genome id is the last path
    # component and the taxonomy id lives in its own file.
    $orgD =~ s{/\z}{};
    ($fig_GenomeId) = $orgD =~ m{.*/(.*)$};

    $orgTax = _read_org_file("$orgD/TAXONOMY_ID");
}
else {
    # Any other directory: the genome id is the first line of GENOME_ID and
    # the taxonomy id is everything before the last "." of that id.
    $fig_GenomeId = _read_org_file("$orgD/GENOME_ID", 1);

    ($orgTax) = $fig_GenomeId =~ /(.*)\./;
}

$orgName = _read_org_file("$orgD/GENOME");

    

# Verify that the tax id in the directory for the organism name matches that at
# the NCBI site for that taxonomy name, then collect the attributes and append
# them to <organism dir>/Attributes/NCBI_ATTRIBUTES.
#
# Flow:
#   1. Search the NCBI genome database by taxonomy id.
#   2. Follow the first result section whose detail page can be fetched and
#      read the genome project id from it.
#   3. Read the organism name from the genome project page. If it equals the
#      local name, collect attributes by taxonomy id, otherwise by name.
#   4. Append one "genome id <TAB> attribute <TAB> value" line per attribute,
#      skipping lines already written during this run.
# Whenever the initial search cannot be fetched, or no attributes were found,
# "genome id <TAB> organism name" is printed to STDOUT instead.
if ( my $form = LWP::Simple::get($ncbi_genome_search_url . (defined $orgTax ? $orgTax : '') . $ncbi_genome_search_url2) )
{
    $form =~ s/\n//g;
    my ($search_result) = $form =~ /begin Results - - - - - -(.*)- - - - - - - - end Results/;
    my @sections = defined $search_result ? $search_result =~ /<dl>(.*?)<\/dl>/ig : ();

    foreach my $section (@sections) {
        # Only the link is needed here; the pattern is kept in full so the same
        # sections qualify as before.
        my ($link) = $section =~ /(http:\/\/www\.ncbi\.nlm\.nih\.gov\/sites\/entrez\?Db=genome\&amp\;Cmd=ShowDetailView\&amp\;TermToSearch=.*?)\">(.*?)<.*Replicon Type: <b>(.*?)</;
        next unless defined $link;

        # The link was taken from an HTML attribute, so "&" is still written as
        # the entity "&amp;"; decode it before using it as a URL.
        $link =~ s/&amp;/&/g;

        my $form2 = LWP::Simple::get($link);
        next unless $form2;
        $form2 =~ s/\n//g;

        my ($genome_project) = $form2 =~ /http:\/\/www\.ncbi\.nlm\.nih\.gov\/sites\/entrez\?Db=genomeprj\&amp\;Cmd=Retrieve\&amp\;list_uids=(.*?)\">Genome Project/;
        $genome_project = '' unless defined $genome_project;

        if (my $form3 = LWP::Simple::get($ncbi_genome_project_url . $genome_project)) {
            $form3 =~ s/\n//g;
            my ($ncbi_org_name) = $form3 =~ /<tr bgcolor=\"\#CCCCFF\"><td class=\"arial\" height=\"16\" nowrap=\"nowrap\">\&\#160\;\&\#160\;<b>Genome Projects<\/b>.*?<img src=\"http:\/\/www\.ncbi\.nlm\.nih\.gov\/Gifs\/head\.gif\" \/>.*?<i>(.*?)</;

            # Names differ (or NCBI name not found): look the organism up by
            # name. Names agree: the taxonomy id is trusted and used directly.
            my $names_differ = !defined $ncbi_org_name || $ncbi_org_name ne $orgName;
            my $data = get_GOLD_info($names_differ ? $orgName : $orgTax);

            if (keys %$data > 0) {
                my $attr_file = "$orgD/Attributes/NCBI_ATTRIBUTES";
                open(my $out, '>>', $attr_file)
                    or die "Cannot append to $attr_file: $!\n";

                my %seen;    # lines already written, to avoid duplicates
                foreach my $index (sort { $a <=> $b } keys %$data) {
                    foreach my $attribute_key (sort keys %{ $data->{$index} }) {
                        my $value = $data->{$index}->{$attribute_key};
                        $value = '' unless defined $value;
                        my $printLine = "$fig_GenomeId\t" . $attrib_names->{$attribute_key} . "\t" . $value . "\n";
                        print {$out} $printLine unless $seen{$printLine}++;
                    }
                }

                close($out) or die "Error while closing $attr_file: $!\n";
            }
            else {
                print "$fig_GenomeId\t$orgName\n";
            }
        }

        # Only the first section with a reachable detail page is processed.
        last;
    }
}
else {
    print "$fig_GenomeId\t$orgName\n";
}


# Fetch a URL with LWP::Simple and return the body with all newlines removed,
# so that the single-line patterns used below can match across line breaks.
# Returns undef (in scalar context) when the URL is missing or the request
# returns nothing.
sub _fetch_flat_page {
    my ($url) = @_;
    return unless defined $url && length $url;

    my $page = LWP::Simple::get($url);
    return unless $page;

    $page =~ s/\n//g;
    return $page;
}

# Collect genome attributes from the NCBI genome / genome project pages and,
# when a GOLD card is referenced, from the GOLD card page.
#
#   $input - either a taxonomy id (starts with a digit and contains no letters
#            or underscore) or an organism name (anything else).
#
# Returns a hash reference keyed by the position (0-based) of each matching
# result section on the NCBI search page. Each value is a hash reference that
# may contain: type, genome_project, ncbi_taxon, ncbi_org_name, gold_card_id,
# gbk, refseq, pubmed, contacts, institute. gbk/refseq and pubmed/contacts are
# ";"-joined lists that stay positionally aligned (an empty slot is kept when
# one of the two ids is missing). Sections that do not contain a genome detail
# link produce no entry. An empty hash reference is returned when the search
# page cannot be fetched or parsed.
sub get_GOLD_info {
    my ($input) = @_;

    # state the ncbi and gold urls
    my $ncbi_genome_name_search_url = "http://www.ncbi.nlm.nih.gov/sites/entrez?db=genome&cmd=search&term=";
    my $ncbi_genome_search_url = "http://www.ncbi.nlm.nih.gov/sites/entrez?db=genome&cmd=Search&dopt=DocSum&term=txid";
    my $ncbi_genome_search_url2 = "%5BOrganism%3Anoexp%5D";
    my $ncbi_genome_project_url = "http://www.ncbi.nlm.nih.gov/sites/entrez?Db=genomeprj&Cmd=Retrieve&list_uids=";
    my $gold_card_url = "http://genomesonline.org/GOLD_CARDS/";

    my $data = {};
    return $data unless defined $input && length $input;

    # decide what type of input (tax or org name)
    my $search_url;
    if (($input =~ /^\d+/) && ($input !~ /[a-zA-Z_]/)) {    # this is a taxonomy id
        $search_url = $ncbi_genome_search_url . $input . $ncbi_genome_search_url2;
    }
    else {                                                   # this is an organism name
        my $name = $input;
        utf8::encode($name) if utf8::is_utf8($name);
        # Any whitespace becomes a plain space (sent as %20, as before); every
        # other character outside the URL "unreserved" set is percent-encoded
        # so that names containing &, #, + or : do not corrupt the query.
        $name =~ s/\s/ /g;
        $name =~ s/([^A-Za-z0-9\-_.~])/sprintf('%%%02X', ord($1))/ge;
        $search_url = $ncbi_genome_name_search_url . $name;
    }

    my $form = _fetch_flat_page($search_url);
    return $data unless defined $form;

    my ($search_result) = $form =~ /begin Results - - - - - -(.*)- - - - - - - - end Results/;
    return $data unless defined $search_result;

    my @sections = $search_result =~ /<dl>(.*?)<\/dl>/ig;

    for my $qty (0 .. $#sections) {
        my ($link, undef, $type) = $sections[$qty] =~ /(http:\/\/www\.ncbi\.nlm\.nih\.gov\/sites\/entrez\?Db=genome\&amp\;Cmd=ShowDetailView\&amp\;TermToSearch=.*?)\">(.*?)<.*Replicon Type: <b>(.*?)</;

        # A section without a genome detail link carries no information; do
        # not create an (empty) entry for it.
        next unless defined $link;

        my $entry = $data->{$qty} = { type => $type };

        # The link comes from an HTML attribute: turn "&amp;" back into "&".
        $link =~ s/&amp;/&/g;

        my $form2 = _fetch_flat_page($link);
        next unless defined $form2;

        ($entry->{genome_project}) = $form2 =~ /http:\/\/www\.ncbi\.nlm\.nih\.gov\/sites\/entrez\?Db=genomeprj\&amp\;Cmd=Retrieve\&amp\;list_uids=(.*?)\">Genome Project/;

        # Without a project id there is no project page to read.
        next unless defined $entry->{genome_project} && length $entry->{genome_project};

        my $form3 = _fetch_flat_page($ncbi_genome_project_url . $entry->{genome_project});
        next unless defined $form3;

        ($entry->{ncbi_taxon}, $entry->{ncbi_org_name}) = $form3 =~ /Lineage:.*PopUpMenu2_Set\(GPLineage(.*?)\)\; return false\;\" onmouseout=\"PopUpMenu2_Hide\(\)\; return false\;\"><i>(.*?)<\/i><\/a><table/;

        ($entry->{gold_card_id}) = $form3 =~ /http:\/\/genomesonline\.org\/GOLD_CARDS\/(.*?)\.html/;
        my ($table_info) = $form3 =~ /Genome information:.*?(<table.*?<\/table>)/;
        my ($pubs_info)  = $form3 =~ /Publications:.*?(<ul>.*?<\/ul>)/;

        # get the genbank and refseq info
        my (@gbk_ids, @refseq_ids);
        if (defined $table_info) {
            my @table_rows = $table_info =~ /<tr.*?>(.*?)<\/tr>/ig;
            shift(@table_rows);    # throw out the header row
            foreach my $row (@table_rows) {
                my @cols = $row =~ /<td.*?>(.*?)<\/td>/ig;
                # Column 1 is read as the RefSeq link, column 2 as the GenBank link.
                my ($refseqid) = (defined $cols[1] ? $cols[1] : '') =~ /<a.*?>(.*?)<\/a>/;
                my ($gbkid)    = (defined $cols[2] ? $cols[2] : '') =~ /<a.*?>(.*?)<\/a>/;
                # Keep both lists the same length so positions stay paired.
                push(@refseq_ids, defined $refseqid ? $refseqid : '');
                push(@gbk_ids,    defined $gbkid    ? $gbkid    : '');
            }
        }
        $entry->{gbk}    = join(";", @gbk_ids);
        $entry->{refseq} = join(";", @refseq_ids);

        # get the pubmed info
        my (@pubmed_ids, @contacts);
        if ($pubs_info) {
            my @pubs = $pubs_info =~ /<li>(.*?)<\/li>/ig;
            foreach my $pub (@pubs) {
                my ($pubid, $contact) = $pub =~ /site=entrez\&cmd=Retrieve\&db=PubMed\&list_uids=(.*?)\&dopt=Abstract\">(.*?)</;
                push(@pubmed_ids, defined $pubid   ? $pubid   : '');
                push(@contacts,   defined $contact ? $contact : '');
            }
        }
        $entry->{pubmed}   = join(";", @pubmed_ids);
        $entry->{contacts} = join(";", @contacts);

        # get the information from GOLD card; without a card, fall back to the
        # institute named on the NCBI genome project page
        if ($entry->{gold_card_id}) {
            my $form4 = _fetch_flat_page($gold_card_url . $entry->{gold_card_id} . ".html");
            if (defined $form4) {
                ($entry->{institute}) = $form4 =~ /INSTITUTIONS<\/td><td align=left><font color=00688b><a href=\".*?>(.*?)</;
            }
        }
        else {
            ($entry->{institute}) = $form3 =~ /<tr bgcolor=\"\#CCCCFF\"><td class=\"arial\" height=\"16\" nowrap=\"nowrap\">\&\#160\;\&\#160\;<b>Genome Projects<\/b>.*?<img src=\"http:\/\/www\.ncbi\.nlm\.nih\.gov\/Gifs\/head\.gif\" \/>.*?\(Project ID:.*?at (.*?)</;
        }
    }

    return $data;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3