[Bio] / FigKernelScripts / get_journals_for_frole.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/get_journals_for_frole.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : hwang 1.1 #!/usr/local/bin/perl
2 :     #This is similar to getting pubmeds for pegs.
3 :     #There is a filter to remove Genome papers and to retrieve papers using the name of the functional
4 :     #role as the search query.
5 :    
6 :    
7 :     use strict;
8 :     use LWP;
9 :     use XML::LibXML;
10 :     use FigWebServices::SeedComponents::PubMed;
11 :    
# Shared HTTP state. These file-scoped lexicals are read and written by the
# subroutines below, so they have to stay declared at file level.
my $request;
my $response;

my $numArgs = scalar @ARGV;

# Numeric comparison: the count is a number, not a string.
if ($numArgs == 0) {
    print "Provide a functional role (separated by tabs)\n";
    print "useage: get_journals_for_frole functional_role\n";
    exit;
}

# One command-line argument may carry several functional roles, tab-separated.
my @query_array = split(/\t/, $ARGV[0]);

# The following are urls to search PubMed/Entrez.
# Only the esearch URL built in the loop below is used by the code in this
# file; the other URLs are kept as they were.
my $entrez_base = "http://eutils.ncbi.nlm.nih.gov/entrez/";
my $ncbi_base = "http://www.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?";
my $gi_url = $ncbi_base.
    "cmd=llink&db=Pubmed&dbFrom=Protein&retmax=1&usehistory=y&from_uid=";
my $sp_url = "http://us.expasy.org/cgi-bin/get-sprot-entry?";
my $uni_url = "http://www.ebi.uniprot.org/uniprot-srv/uniProtView.do?proteinAc=";
my $journal_url = "$entrez_base"."eutils/esummary.fcgi?db=pubmed&id=";
my $url_format = "&retmode=xml";
my $gene_id_url = $entrez_base."/eutils/efetch.fcgi?db=gene&dopt=gene_pubmed&id=";
my $search_term_url;

# PubMed IDs that survived the filters; filled by add_to_uniq_pubmed and
# printed by print_pubmed.
my %uniq_pubmed = ();

foreach my $functional_role (@query_array) {
    next if ($functional_role =~ /hypothetical protein/i);

    # Consecutive tabs produce empty fields; there is nothing to search for.
    next if ($functional_role eq '');

    # Percent-encode everything outside the RFC 3986 unreserved set so that
    # characters such as '&', '#', '+' or '=' inside a role name cannot
    # terminate or alter the "term" query parameter.
    my $escaped_role = $functional_role;
    $escaped_role =~ s/([^A-Za-z0-9\-_.~])/sprintf("%%%02X", ord($1))/ge;

    $search_term_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$escaped_role&retmode=xml";
    if (test_url_results($search_term_url)) {
        get_search_by_frole($search_term_url);
    }
}


print_pubmed();
49 :    
# Fetch a URL and report whether it produced a usable response.
#
# Parameters:
#   $_[0] - the URL to fetch.
# Returns:
#   The response body when the HTTP request succeeded and the body is not
#   empty; otherwise nothing (bare return).
# Side effects:
#   Stores a new LWP::UserAgent in the file-level $request and the response
#   in the file-level $response.
sub test_url_results {

    my ($url) = @_;

    $request  = LWP::UserAgent->new();
    $response = $request->get($url);

    # A failed request typically still has a non-empty body (an error page or
    # error message), so the body alone cannot signal success.
    return unless $response->is_success;

    my $results = $response->content;
    return unless (defined $results && $results ne "");

    return $results;
}
67 :    
# Run a PubMed search URL, collect the IDs found in the response and hand
# them to get_filtered_pubmed_links.
#
# Parameters:
#   $_[0] - the search URL; the sub returns immediately when it is false.
# Returns:
#   Nothing useful; results are accumulated by get_filtered_pubmed_links.
# Side effects:
#   Updates the file-level $response, and creates the file-level $request
#   user agent if no earlier call has done so.
sub get_search_by_frole {

    my ($results_url) = @_;
    return if (!$results_url);

    # Do not rely on another sub having created the user agent first.
    $request ||= LWP::UserAgent->new();

    $response = $request->get($results_url);
    return unless $response->is_success;

    my $results = $response->content;

    # Capture digits only: a greedy ".*" would swallow everything between the
    # first "<Id>" and the last "</Id>" if several Ids shared one line.
    my @pubmed_numbers = ($results =~ m{<Id>\s*(\d+)\s*</Id>}g);

    get_filtered_pubmed_links(\@pubmed_numbers);

    return;
}
89 :    
# Filter a list of PubMed IDs, keeping those that do not look like genome
# papers, and record the survivors via add_to_uniq_pubmed.
#
# A paper is dropped when:
#   * the link request for it fails,
#   * its title (from pmid_to_title) contains "Genome" (case-insensitive),
#   * 80 or more <Id> entries are found in the link response, or
#   * it has 5..79 entries and more than 5 "gaps" between them (see
#     _count_id_gaps).
# A paper with fewer than 5 entries is kept without further checks.
#
# Parameters:
#   $_[0] - reference to an array of PubMed IDs.
# Returns:
#   Nothing useful.
# Side effects:
#   Performs one HTTP request per ID, updates the file-level $request and
#   $response, and adds kept IDs through add_to_uniq_pubmed.
sub get_filtered_pubmed_links {

    my ($pubmed_array) = @_;
    return unless $pubmed_array;

    my $baseurl = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&db=gene&id=";

    my $max_links      = 80;    # this many entries => treated as a genome paper
    my $few_links      = 5;     # fewer entries than this => kept outright
    my $max_id_step    = 5;     # neighbouring ids further apart than this form a gap
    my $max_gaps       = 5;     # keep the paper when it has at most this many gaps

    # One user agent is enough for all requests.
    $request ||= LWP::UserAgent->new();

    foreach my $pubmed_number (@{$pubmed_array}) {

        # Look up the gene entries linked to this PubMed ID.
        $response = $request->get($baseurl . $pubmed_number);
        next unless $response->is_success;
        my $results = $response->content;

        # Check to see if the word Genome is in the title. If it is, we don't
        # include that paper.
        my $pmid_title = FigWebServices::SeedComponents::PubMed::pmid_to_title($pubmed_number);
        next if (defined $pmid_title && $pmid_title =~ m/Genome/i);

        # Collect at most $max_links ids. Digits only, so the numeric
        # comparisons below are well defined.
        # NOTE(review): every <Id> element is counted; presumably the response
        # also echoes the queried PubMed ID in one - verify against the
        # ELink output format.
        my @all_links;
        while ($results =~ m{<Id>\s*(\d+)\s*</Id>}g) {
            push(@all_links, $1);
            last if (scalar(@all_links) >= $max_links);
        }
        my $links_count = scalar(@all_links);

        # If there are a lot of genes associated with the paper, then it is
        # most likely a genome paper.
        next if ($links_count >= $max_links);

        # With only a few genes it is most likely not a genome paper.
        if ($links_count < $few_links) {
            add_to_uniq_pubmed($results, $pubmed_number);
            next;
        }

        # NOTE(review): the original comment said the paper is kept when
        # there are fewer than 5 *sequential* ids, but the code has always
        # counted NON-sequential neighbours (gaps) and kept the paper when
        # that count is <= 5. The code's behaviour is preserved here; confirm
        # which of the two was intended.
        my $genome_num = _count_id_gaps(\@all_links, $max_id_step);
        if ($genome_num <= $max_gaps) {
            add_to_uniq_pubmed($results, $pubmed_number);
        }
    }

    return;
}

# Count how many neighbouring pairs, after sorting the ids numerically, are
# more than $max_step apart.
#
# Parameters:
#   $_[0] - reference to an array of numeric ids.
#   $_[1] - largest difference that still counts as "adjacent".
# Returns:
#   The number of gaps (0 for fewer than two ids).
sub _count_id_gaps {

    my ($ids, $max_step) = @_;

    # Numeric sort: a plain sort would order "10" before "9".
    my @sorted = sort { $a <=> $b } @{$ids};

    my $gaps = 0;
    # Start at 1 so every index pair ($i - 1, $i) lies inside the array.
    for my $i (1 .. $#sorted) {
        $gaps++ if ($sorted[$i - 1] + $max_step < $sorted[$i]);
    }

    return $gaps;
}
156 :    
# Remember a PubMed ID in the file-level %uniq_pubmed hash. The ID is used as
# both key and value, so adding the same ID twice leaves a single entry.
#
# Parameters:
#   $_[0] - accepted for the callers' sake but not used here.
#   $_[1] - the PubMed ID to record.
sub add_to_uniq_pubmed {
    my $pmid = $_[1];
    $uniq_pubmed{$pmid} = $pmid;
}
163 :    
# Write every collected PubMed ID to standard output, each followed by a
# single space. The IDs come out in hash order and no newline is printed.
sub print_pubmed {
    print "$_ " for keys %uniq_pubmed;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3