[Bio] / FigKernelScripts / get_journals_for_frole.pl Repository:
ViewVC logotype

View of /FigKernelScripts/get_journals_for_frole.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Wed Feb 28 15:44:21 2007 UTC (13 years, 3 months ago) by hwang
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Files to get journal information from PubMed

#!/usr/local/bin/perl
# Looks up PubMed articles for a functional role; this is similar to getting
# PubMed IDs for PEGs. The name of the functional role is used as the search
# query, and a filter removes papers that appear to be genome papers.


use strict;
use LWP;
use XML::LibXML; 
use FigWebServices::SeedComponents::PubMed;

# Shared HTTP state: the subs below read and assign these file-level lexicals.
my $request;
my $response;

my $numArgs = scalar @ARGV;

# Numeric comparison: $numArgs is a count, not a string.
if ($numArgs == 0) {
    print "Provide a functional role (separated by tabs)\n";
    print "usage: get_journals_for_frole functional_role\n";
    exit;
}

# A single argument may carry several functional roles separated by tabs.
my @query_array  = split(/\t/, $ARGV[0]);

# The following are urls to search PubMed/Entrez.
# Apart from $search_term_url, none of these is referenced by the code
# visible in this file; they are kept so nothing that expects them breaks.
# NOTE(review): NCBI documents HTTPS for E-utilities requests -- confirm that
# these http:// URLs are still served (or redirected) in your environment.
my $entrez_base = "http://eutils.ncbi.nlm.nih.gov/entrez/";
my $ncbi_base = "http://www.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?";
my $gi_url = $ncbi_base.
             "cmd=llink&db=Pubmed&dbFrom=Protein&retmax=1&usehistory=y&from_uid=";
my $sp_url = "http://us.expasy.org/cgi-bin/get-sprot-entry?";
my $uni_url = "http://www.ebi.uniprot.org/uniprot-srv/uniProtView.do?proteinAc=";
my $journal_url = "$entrez_base"."eutils/esummary.fcgi?db=pubmed&id=";
my $url_format = "&retmode=xml";
# $entrez_base already ends in "/", so no leading slash here.
my $gene_id_url = $entrez_base."eutils/efetch.fcgi?db=gene&dopt=gene_pubmed&id=";
my $search_term_url;

# PubMed IDs that survived the genome-paper filter (filled by add_to_uniq_pubmed).
my %uniq_pubmed = ();

foreach my $functional_role (@query_array) {
    # Consecutive tabs produce empty fields; there is nothing to search for.
    next unless length $functional_role;
    next if ($functional_role =~ /hypothetical protein/i);

    # Percent-encode everything outside the RFC 3986 unreserved set so that
    # spaces, '&', '#', '+', etc. in a role name cannot corrupt the query string.
    (my $encoded_role = $functional_role)
        =~ s/([^A-Za-z0-9\-_.~])/sprintf("%%%02X", ord($1))/ge;

    $search_term_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$encoded_role&retmode=xml";
    if (test_url_results($search_term_url)) {
        get_search_by_frole($search_term_url);
    }
}


# Emit every PubMed ID collected across all roles.
print_pubmed();

# Fetch a URL and report whether it produced a usable response.
#
# Arguments:
#   $_[0] - the URL to request.
# Returns:
#   The response body when the request succeeded and the body is non-empty;
#   otherwise an empty return (false in boolean context).
# Side effects:
#   Assigns a fresh LWP::UserAgent to the file-level $request and the
#   HTTP response to the file-level $response.
sub test_url_results {

    my ($url) = @_;

    # Created before any early return so $request is always set after a call,
    # exactly as before.
    $request  = LWP::UserAgent->new();
    return unless defined $url && length $url;

    $response = $request->get($url);

    # A failed request still carries a non-empty error body; without this
    # check it would be mistaken for search results.
    return unless $response->is_success;

    my $results = $response->content;
    return unless defined $results && $results ne "";

    return $results;
}

# Run a PubMed search URL, extract the PubMed IDs from the returned XML and
# hand them to get_filtered_pubmed_links.
#
# Arguments:
#   $_[0] - the search URL; a false value makes the sub return immediately.
# Returns:
#   Nothing useful when the URL is false or the request fails; otherwise
#   whatever get_filtered_pubmed_links returns.
# Side effects:
#   Assigns the file-level $response, and creates the file-level $request
#   user agent if no earlier call has set it.
sub get_search_by_frole {

    my ($results_url) = @_;
    return if (!$results_url);

    # Do not depend on another sub having initialised the shared user agent.
    $request ||= LWP::UserAgent->new();

    $response = $request->get($results_url);
    return unless $response->is_success;

    my $results = $response->content;
    my @pubmed_numbers;

    # Match one numeric ID per <Id> element. The previous greedy (.*) would
    # merge several IDs into one capture if they shared a line.
    while ($results =~ m{<Id>\s*(\d+)\s*</Id>}g) {
        push @pubmed_numbers, $1;
    }

    get_filtered_pubmed_links(\@pubmed_numbers);

}

# Filter a list of PubMed IDs, recording those that do not look like genome
# papers via add_to_uniq_pubmed.
#
# For each PubMed ID the sub:
#   1. asks elink for the gene records linked to the paper;
#   2. skips the paper if its title (from pmid_to_title) contains "Genome";
#   3. skips the paper if 80 or more <Id> entries come back;
#   4. keeps the paper outright if fewer than 5 <Id> entries come back;
#   5. otherwise counts the large gaps between the numerically sorted IDs
#      and keeps the paper when there are at most 5 such gaps.
#
# Arguments:
#   $_[0] - reference to an array of PubMed IDs; anything else is ignored.
# Returns:
#   Nothing meaningful.
# Side effects:
#   Assigns the file-level $response (and $request if unset) and performs
#   one HTTP request per PubMed ID.
sub get_filtered_pubmed_links {

    my ($pubmed_array) = @_;
    return unless ref($pubmed_array) eq 'ARRAY';

    my $max_links     = 80;   # this many <Id> entries => treat as a genome paper
    my $few_links     = 5;    # fewer than this => keep without further checks
    my $gap_size      = 5;    # neighbouring IDs further apart than this form a gap
    my $max_gap_count = 5;    # keep the paper when it has at most this many gaps

    # Creates the URL to search Pubmed
    my $baseurl = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&db=gene&id=";

    # One user agent serves every request; there is no need for one per ID.
    $request ||= LWP::UserAgent->new();

    foreach my $pubmed_number (@{$pubmed_array}) {

        # Searches Pubmed to parse out Genome papers.
        $response = $request->get($baseurl . $pubmed_number);
        next unless $response->is_success;
        my $results = $response->content;

        # Check to see if the word Genome is in the title. If it is, we don't include that paper.
        my $pmid_title = FigWebServices::SeedComponents::PubMed::pmid_to_title($pubmed_number);
        next if (defined $pmid_title && $pmid_title =~ m/Genome/i);

        # Collect at most $max_links IDs; beyond that the exact number is irrelevant.
        my @all_links;
        while ($results =~ m{<Id>\s*(\d+)\s*</Id>}g) {
            push @all_links, $1;
            last if @all_links >= $max_links;
        }
        my $link_count = scalar @all_links;

        # Filtering out papers that may be genome papers.
        # If there are a lot of genes associated with the paper, then it is most likely a genome paper.
        next if ($link_count >= $max_links);

        # If there are fewer than 5 genes associated with it, then most likely it is not a genome paper.
        if ($link_count < $few_links) {
            add_to_uniq_pubmed($results, $pubmed_number);
            next;
        }

        # Now evaluate the linked IDs to see how many large gaps separate them.
        my $gap_count = _count_id_gaps(\@all_links, $gap_size);

        # NOTE(review): the original comment says the paper is kept when there
        # are fewer than 5 *sequential* IDs, yet the code counts *gaps* and
        # keeps the paper when gaps are few. The condition is preserved as
        # written; confirm which behaviour is intended.
        if ($gap_count <= $max_gap_count) {
            add_to_uniq_pubmed($results, $pubmed_number);
        }

    }
}

# Count neighbouring pairs, in numeric order, whose IDs differ by more than $gap_size.
sub _count_id_gaps {

    my ($ids, $gap_size) = @_;

    # IDs are numbers: a plain sort would order "9" after "10".
    my @sorted = sort { $a <=> $b } @{$ids};

    my $gaps = 0;
    # Start at 1 so every comparison stays inside the array.
    for my $i (1 .. $#sorted) {
        $gaps++ if ($sorted[$i - 1] + $gap_size < $sorted[$i]);
    }

    return $gaps;
}

# Record a PubMed ID in the file-level %uniq_pubmed hash, keyed by itself, so
# that the same ID added more than once is stored only once.
#
# Arguments:
#   $_[0] - accepted for the existing call sites; not used.
#   $_[1] - the PubMed ID to remember.
sub add_to_uniq_pubmed {

    my $pubmed_number = $_[1];
    $uniq_pubmed{$pubmed_number} = $pubmed_number;

}

# Write every key of the file-level %uniq_pubmed hash to the currently
# selected output handle, each followed by a single space, in whatever order
# keys() yields them.
sub print_pubmed {

    print "$_ " for keys %uniq_pubmed;

}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3