[Bio] / FigKernelScripts / get_journals_for_frole.pl Repository:
ViewVC logotype

View of /FigKernelScripts/get_journals_for_frole.pl

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.1 - (download) (as text) (annotate)
Wed Feb 28 15:44:21 2007 UTC (13 years, 4 months ago) by hwang
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Files to get journal information from PubMed

# This is similar to getting PubMed IDs for pegs, except that papers are
# retrieved using the name of the functional role as the search query.
# A filter removes genome papers from the results.

use strict;
use LWP;
use XML::LibXML; 
use FigWebServices::SeedComponents::PubMed;

# NOTE(review): this listing appears to be truncated -- several closing braces
# and statements are not visible (e.g. the end of the usage `if` block and the
# bodies at the bottom of the main loop). Comments below describe only what the
# visible lines show.

# File-scoped HTTP request/response holders. The subs below read
# $response->content and $response->is_success; the code that assigns these
# variables is not visible in this listing.
my $request;
my $response;

# $#ARGV is the last index of @ARGV, so this is the number of command-line arguments.
my $numArgs = $#ARGV + 1;

# Print usage text when no argument was given.
# NOTE(review): `eq` is the string comparison operator; it works here because
# the count stringifies to "0", but `==` is the numeric comparison.
# NOTE(review): the closing brace of this block (and any exit) is not visible.
if ($numArgs eq 0) {
    print "Provide a functional role (separated by tabs)\n";
    print "useage: get_journals_for_frole functional_role";

# The first argument is split on tab characters; each field is treated as one
# functional role by the loop below.
my @query_array  = split(/\t/, $ARGV[0]);

# The following are urls to search PubMed/Entrez
my $entrez_base = "http://eutils.ncbi.nlm.nih.gov/entrez/";
my $ncbi_base = "http://www.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?";
# NOTE(review): the next statement ends with a concatenation operator and no
# right-hand operand in this listing -- its continuation appears to be missing.
my $gi_url = $ncbi_base.
my $sp_url = "http://us.expasy.org/cgi-bin/get-sprot-entry?";
my $uni_url = "http://www.ebi.uniprot.org/uniprot-srv/uniProtView.do?proteinAc=";
my $journal_url = "$entrez_base"."eutils/esummary.fcgi?db=pubmed&id=";
my $url_format = "&retmode=xml";
# NOTE(review): $entrez_base already ends in "/", so this value contains "entrez//eutils".
my $gene_id_url = $entrez_base."/eutils/efetch.fcgi?db=gene&dopt=gene_pubmed&id=";
my $search_term_url;
# PubMed ids collected so far: written by add_to_uniq_pubmed, read by print_pubmed.
my %uniq_pubmed = ();

# Main loop: build an esearch URL for each functional role and test it.
foreach my $functional_role (@query_array) {
    # Roles containing "hypothetical protein" (any case) are skipped.
    next if ($functional_role =~ /hypothetical protein/i);
    # NOTE(review): the role is interpolated into the query string without URI
    # escaping, so spaces, "&" or "#" in a role name would change the request.
    $search_term_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$functional_role&retmode=xml";
    # NOTE(review): the body of this `if` and the end of the loop are not visible.
    if (&test_url_results($search_term_url)) {


# test_url_results($url)
#
# Checks whether a search produced any content.
#   $url - the search URL (first positional argument).
# In the visible lines, returns the content of the file-scoped $response when
# it is a non-empty string.
# NOTE(review): this listing appears truncated -- the code that issues the
# request for $url and fills $response, the body of the `else` branch and the
# closing braces are not visible.
sub test_url_results {

    my $url = $_[0];
    # Searches PubMed for $url.
    # NOTE(review): the visible code returns the response body itself, not a
    # count of results.
    my $results= $response->content;
    #die unless 
    if ($results ne "") {
	return $results;	
    else {

# get_search_by_frole($results_url)
#
# Collects the ids found in <Id>...</Id> elements of the search response.
#   $results_url - the search URL; the sub returns immediately when it is false.
# Also returns early (bare return) when $response does not report success.
# NOTE(review): this listing appears truncated -- the request for $results_url,
# the end of the while loop, whatever is done with @pubmed_numbers and the
# closing braces are not visible.
sub get_search_by_frole {
    my $results_url = $_[0];
    return if (!$results_url);

    #print STDERR "made it past $results_url\n";
    # NOTE(review): $parser is not used in any visible line; ids are extracted
    # with a regular expression instead.
    my $parser=XML::LibXML->new;

    # NOTE(review): the content is read before the success check on the next line.
    my $results= $response->content;
    return unless $response->is_success;
    my @pubmed_numbers;
    # Each match captures the text between <Id> and </Id>.
    # NOTE(review): `.*` is greedy, so several <Id> elements on one line would be
    # captured as a single value; `[^<]*` or `.*?` would avoid that.
    while($results =~ m/<Id>(.*)<\/Id>/g) {
	push (@pubmed_numbers, $1);

# get_filtered_pubmed_links($pubmed_array)
#
# Filters out PubMed ids that look like genome papers and records the rest
# through add_to_uniq_pubmed.
#   $pubmed_array - reference to an array of PubMed ids.
# A paper is skipped when $response is unsuccessful, when its title contains
# "genome" (any case), or when the gene-link count reaches the cutoff of 80.
# NOTE(review): this listing appears truncated -- the request for $url, the
# updates to $links_cutoff and $genome_num, and the closing braces of the loops,
# branches and the sub itself are not visible. Comments describe visible lines only.
sub get_filtered_pubmed_links { 
    my $pubmed_array = $_[0];
    my @pubmed_in = @{$pubmed_array};
    foreach my $pubmed_number (@pubmed_in) {
	# Creates the elink URL (dbfrom=pubmed, db=gene) for this PubMed id.
	# NOTE(review): $url is not used in any visible line.
	my $baseurl="http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&db=gene&id=";
	my $url=$baseurl.$pubmed_number;
	# Searches PubMed to parse out genome papers.
	my $results= $response->content;
	next unless $response->is_success;

	# Check whether the word "Genome" is in the title. If it is, the paper is not included.
	my $pmid_title = &FigWebServices::SeedComponents::PubMed::pmid_to_title($pubmed_number);
	next if ($pmid_title =~ m/Genome/i);

	# NOTE(review): $links_cutoff is declared without a value and no increment
	# is visible, so the comparisons below see undef (treated as 0) as shown.
	my $links_cutoff;
	my @all_links;
	# Collect linked ids from <Id> elements while the counter is below 80.
	# NOTE(review): the greedy `.*` would merge several <Id> elements that share a line.
	while ( ($results =~ m/<Id>(.*)<\/Id>/g) && ($links_cutoff < 80) ) {
	    push (@all_links, $1);
	# Filtering out papers that may be genome papers.
	# If there are a lot of genes associated with the paper, then it is most likely a genome paper.
 	next if ($links_cutoff == 80);
	# If there are fewer than 5 genes associated with it, then most likely it is not a genome paper.
	if ($links_cutoff < 5) {
	       &add_to_uniq_pubmed($results, $pubmed_number);

	# NOTE(review): the default sort compares as strings ("10" sorts before "9"),
	# while the loop below compares neighbours numerically; a numeric sort
	# ({ $a <=> $b }) is presumably intended -- confirm.
	@all_links = sort @all_links; 
	my $links_total = scalar (@all_links);
	my $genome_num;
	# Now we have to evaluate the pmid to see if the genes are sequential.
	# NOTE(review): with `$i <= $links_total` and the `$i+1` lookup, the loop
	# reads up to two positions past the last element (undef values).
	for(my $i=0; $i<= $links_total; $i++) {
	    my $prev_links_num = $all_links[$i];
	    my $current_links_num = $all_links[$i+1];

	    # True when the next id is more than 5 above the current one;
	    # the body of this branch is not visible.
	    if ($prev_links_num +5 < $current_links_num) {

	# If there are 5 or fewer sequential genome numbers, then we keep the paper.
	if ($genome_num <= 5) {
	    &add_to_uniq_pubmed($results, $pubmed_number);

# add_to_uniq_pubmed($results_in, $pubmed_number)
#
# Records a PubMed id in the file-scoped %uniq_pubmed hash.
#   $results_in    - not used in the visible lines.
#   $pubmed_number - the id to record.
# NOTE(review): the closing brace of this sub is not visible in this listing.
sub add_to_uniq_pubmed {

    my ($results_in, $pubmed_number) = @_;
    # Key and value are both the id, so recording the same id again is a no-op.
    $uniq_pubmed{$pubmed_number} = $pubmed_number;

# print_pubmed()
#
# Prints every key of %uniq_pubmed to standard output, each followed by a
# single space. Keys come back in hash order, so the output order is not defined.
# NOTE(review): the closing braces of the loop and the sub are not visible in
# this listing.
sub print_pubmed {
    foreach my $k (keys %uniq_pubmed) {
	print "$k ";

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3