[Bio] / FigKernelScripts / get_pubmed_journals.pl Repository:
ViewVC logotype

View of /FigKernelScripts/get_pubmed_journals.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Tue Mar 20 18:23:33 2007 UTC (12 years, 8 months ago) by hwang
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.4: +3 -3 lines
update comment that this is for peg curation

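# This script takes one or more gi/uni/sp (or GeneID:) ids and outputs the ids of the
# associated PubMed articles. Intended filters: PubMed articles are not more than 10 years
# old, and each PubMed article has fewer than 80 links attached to it.
# NOTE: the year filter lives in check_year, which is not currently called by this script.
# Use this script for peg curation.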
#!/usr/local/bin/perl
use strict;
use LWP;
use XML::LibXML; 
use FigWebServices::SeedComponents::PubMed;

# File-level holders for an LWP user agent and its response.
my $request;
my $response;
# Not used at file level; the main loop declares its own $query.
my $query;

my $debug=1;
# PubMed entries with this many <Link> tags or more are skipped (see get_pubmed_links).
my $link_cutoff=80;
# Upper bound on the number of papers linked to one gi number before that gi
# is ignored (see parse_gi_results). This is only for gi numbers.
my $pubmed_cutoff=50;

# Number of command-line arguments; the ids are expected in the first one.
my $numArgs = scalar @ARGV;

if ($numArgs == 0) {
    print "Provide a gi or sp id\n";
    print "usage: get_pubmed_journals \"gi|xxxx sp|xxxx\"\n";
    exit;
}

# All ids arrive as one whitespace-separated string in the first argument.
my @query_array = split(/\s+/, $ARGV[0]);

# The following are urls to search PubMed
# Earliest publication year accepted by check_year.
my $accepted_year = 1900;
# NOTE: $entrez_base ends with a slash, so paths appended to it must not start with one.
my $entrez_base = "http://eutils.ncbi.nlm.nih.gov/entrez/";
my $ncbi_base = "http://www.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?";
my $gi_url = $ncbi_base.
             "cmd=llink&db=Pubmed&dbFrom=Protein&retmax=1&usehistory=y&from_uid=";
# Date-range filter fragment; not referenced elsewhere in this script.
my $filter = "&filters=on&pmfilter_PDatRange_MinYear=1995&pmfilter_PDatRange_MinMonth=02".
    "&pmfilter_PDatRange_MinDay=05&pmfilter_PDatRange_MaxYear=2006";
my $sp_url = "http://us.expasy.org/cgi-bin/get-sprot-entry?";
my $uni_url = "http://www.ebi.uniprot.org/uniprot-srv/uniProtView.do?proteinAc=";
my $journal_url = $entrez_base."eutils/esummary.fcgi?db=pubmed&id=";
my $url_format = "&retmode=xml";
my $gene_id_url = $entrez_base."eutils/efetch.fcgi?db=gene&dopt=gene_pubmed&id=";

# PubMed ids collected so far, keyed by id so that duplicates collapse.
my %uniq_pubmed = ();


# Loop through each query to get the PubMed journal.
# Each query is expected to look like "gi|NNN", "sp|ACC", "uni|ACC" or
# "GeneID:NNN"; anything else is skipped without issuing a request.
foreach my $query (@query_array) {
    my $id;
    my $id_num;

    # Split the query into a database tag ($id) and an accession ($id_num).
    # For pipe-separated ids only the first two fields are used, so a
    # multi-field id such as "gi|123|ref|NP_1|" still yields "gi" and "123".
    if ($query =~ m/(GeneID:)(.*)/) {
        ($id, $id_num) = ($1, $2);
    }
    elsif ($query =~ m/([gsu][^|]*)\|([^|]*)/) {
        ($id, $id_num) = ($1, $2);
    }
    next unless defined $id;

    # Build the lookup URL for the database named by the tag.
    my $url;
    $url = "$gi_url$id_num"                  if $id =~ m/gi/;
    $url = "$sp_url$id_num"                  if $id =~ m/sp/;
    $url = "$uni_url$id_num"                 if $id =~ m/uni/;
    $url = "$gene_id_url$id_num$url_format"  if $id =~ m/Gene/;

    # A tag that matches none of the known databases has nothing to fetch.
    next unless defined $url;

    # Make sure there is a response for the url
    my $results = test_url_results($url);

    # Hand the reply to the parser that understands this database's format.
    if ($id =~ m/gi/) {
        $results = test_gi_results($results);
        parse_gi_results($results);
    }
    if ($id =~ m/Gene/) {
        parse_geneid_results($results);
    }
    if (($id =~ m/sp/) || ($id =~ m/uni/)) {
        parse_results($results);
    }
}

# Emit every PubMed id that survived the filters.
print_pubmed();
    
# Pass-through check for the reply to a gi lookup.
#
# Parameters:
#   $_[0] - reply text to inspect.
# Returns:
#   the same text when it contains at least one "<Link>" tag; otherwise
#   nothing (undef in scalar context, an empty list in list context).
sub test_gi_results {
    my ($reply_text) = @_;

    # Without a <Link> tag there is nothing for the caller to parse.
    return unless $reply_text =~ m/<Link>/;

    return $reply_text;
}

# Fetch a URL over HTTP and return the body of the reply.
#
# Parameters:
#   $_[0] - the URL to request.
# Returns:
#   the body text when the request succeeds and the body is non-empty;
#   otherwise nothing (undef in scalar context, an empty list in list
#   context) after writing "no url" to STDERR.
sub test_url_results {
    my ($url) = @_;

    my $body;
    if (defined($url) && $url ne "") {
        my $agent = LWP::UserAgent->new();
        my $reply = $agent->get($url);

        # A failed request carries an error page rather than data, so its
        # content must not be handed to the parsers.
        $body = $reply->content if $reply->is_success;
    }

    return $body if defined($body) && $body ne "";

    # STDOUT carries the list of PubMed ids, so the diagnostic goes to STDERR.
    print STDERR "no url\n";
    return;
}


# Pull the PubMed ids out of the XML reply for a gi lookup and pass them on
# to get_pubmed_links, unless the gi has more papers than $pubmed_cutoff.
#
# Parameters:
#   $_[0] - XML text of the reply; undef or "" means there is nothing to do.
# Returns:
#   nothing; accepted ids are handed to get_pubmed_links.
# Errors:
#   XML parse failures raised by XML::LibXML are not caught here.
sub parse_gi_results {
    my ($results) = @_;
    return if !defined($results) || $results eq "";

    # Uses a DOM based XML parser to process returned results
    my $domtree = XML::LibXML->new->parse_string($results);

    # Only the first <LinkSet> element is inspected.
    my ($link_set) = $domtree->getElementsByTagName("LinkSet");
    return unless defined $link_set;

    # Collect the whitespace-separated tokens of each <Link> separately, so
    # that ids from adjacent <Link> elements can never run together.
    my @pubmed_numbers;
    foreach my $link ($link_set->getElementsByTagName("Link")) {
        push @pubmed_numbers, split(" ", $link->textContent);
    }

    # Compare the actual paper count (not the last index) with the cutoff.
    my $pubmed_count = scalar @pubmed_numbers;
    if ($pubmed_count <= $pubmed_cutoff) {
        get_pubmed_links(\@pubmed_numbers);
    }
    return;
}

# Keep only the PubMed ids whose publication year is at least $accepted_year.
#
# Parameters:
#   $_[0] - reference to an array of PubMed ids.
# Returns:
#   the list of ids that passed the year check, in their original order.
#   Ids whose summary cannot be fetched, or whose summary shows no PubDate
#   year, are dropped.
sub check_year {
    my ($pubmed_ref) = @_;
    my @pubmed_year_out;

    foreach my $pubmed_id (@{$pubmed_ref}) {
        my $url = $journal_url . $pubmed_id . $url_format;
        my $esearch_results = test_url_results($url);
        next unless defined $esearch_results;

        # Read $1 only after a successful match; after a failed match it
        # would still hold the capture of some earlier, unrelated match.
        next unless $esearch_results =~ m/<*PubDate.*>(\d+)(.*)<\/Item>/;
        my $year = $1;

        push(@pubmed_year_out, $pubmed_id) if $year >= $accepted_year;
    }
    return @pubmed_year_out;
}


# Collect every "PubMed=<digits>" reference found in the text of an sp/uni
# entry and pass the ids on to get_pubmed_links.
#
# Parameters:
#   $_[0] - text of the fetched entry; may be undef when the fetch failed.
# Returns:
#   nothing; the ids (possibly none) are handed to get_pubmed_links.
sub parse_results {
    my ($results) = @_;

    my @sp_array;
    if (defined $results) {
        # \d+ rather than \d*: a bare "PubMed=" carries no id and must not
        # produce an empty entry that would later be looked up.
        @sp_array = $results =~ m/PubMed=(\d+)/g;
    }

    get_pubmed_links(\@sp_array);
    return;
}

# Gather the contents of every <PubMedId>...</PubMedId> element in a GeneID
# reply and forward them, in document order, to get_pubmed_links.
#
# Parameters:
#   $_[0] - text of the reply for a GeneID lookup.
sub parse_geneid_results {
    my ($gene_reply) = @_;

    # A global match in list context yields each captured id in turn.
    my @pubmed_ids = $gene_reply =~ m/<PubMedId>([^<]*)<\/PubMedId>/g;

    get_pubmed_links(\@pubmed_ids);
}

# Filter a list of PubMed ids and record the survivors via add_to_uniq_pubmed.
#
# An id is dropped when
#   - it is undefined or empty,
#   - the lookup of its linked protein entries fails,
#   - its title (from pmid_to_title) contains the word "genome", or
#   - the lookup reply contains $link_cutoff or more <Link> tags.
#
# Parameters:
#   $_[0] - reference to an array of PubMed ids.
# Returns:
#   nothing; accepted ids are stored through add_to_uniq_pubmed.
sub get_pubmed_links {
    my ($pubmed_array) = @_;

    # Base URL for looking up the protein entries linked to a PubMed id.
    my $baseurl = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?db=protein&dbfrom=pubmed&id=";

    # One user agent serves every request made in the loop.
    my $agent = LWP::UserAgent->new();

    foreach my $pubmed_number (@{$pubmed_array}) {
        # An empty id would only produce a meaningless request.
        next unless defined($pubmed_number) && $pubmed_number ne "";

        my $reply = $agent->get($baseurl . $pubmed_number);
        next unless $reply->is_success;
        my $results = $reply->content;

        # Check to see if the word Genome is in the title. If it is, we don't include that paper.
        my $pmid_title = &FigWebServices::SeedComponents::PubMed::pmid_to_title($pubmed_number);
        next if defined($pmid_title) && $pmid_title =~ m/genome/i;

        # Count the <Link> tags; the empty-list assignment forces list
        # context so that the global match returns every occurrence.
        my $total_links = () = $results =~ m/<Link>/g;
        next if $total_links >= $link_cutoff;

        add_to_uniq_pubmed($results, $pubmed_number);
    }
    return;
}


# Record a PubMed id in the file-level %uniq_pubmed hash, using the id as
# both key and value so that a repeated id collapses into a single entry.
#
# Parameters:
#   $_[0] - reply text for the id; accepted but not used.
#   $_[1] - the PubMed id to record.
sub add_to_uniq_pubmed {
    my $pmid = $_[1];
    return $uniq_pubmed{$pmid} = $pmid;
}

# Write every key of the file-level %uniq_pubmed hash to STDOUT, each one
# followed by a single space, in whatever order the hash yields them.
# No newline is printed.
sub print_pubmed {
    print "$_ " foreach keys %uniq_pubmed;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3