[Bio] / FigKernelScripts / subsys2pubmed.pl Repository:
ViewVC logotype

View of /FigKernelScripts/subsys2pubmed.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Wed Mar 21 20:13:13 2007 UTC (12 years, 7 months ago) by hwang
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.3: +18 -6 lines
change script to add new attribute after each subsystem versus after all subsystems have been computed

#The purpose of this script is to retrieve literature for one subsystem or for all subsystems.
#Information is stored in a temporary file in case the system goes down, because PUBMED limits
#the amount of information that can be retrieved.
#This script averages 10,000 retrievals a day. Currently it is purposely slow because Entrez has a
#regulation of 300 retrievals followed by a wait of 3 seconds. You may change the script to make it go
#faster, but be aware of Entrez's guidelines.
#The last part of the script reads the temporary file and stores the information in the attribute
#database.
#
#This script should be run on the biologin machine. Currently the bio-ppc* attribute server is not connected
#to the attribute database.

#!/usr/bin/perl
use FIG;
use strict;
use FIG_Config;
use FigWebServices::SeedComponents::PubMed;

# FIG object shared (as a file-level lexical) by the subroutines below for
# subsystem, alias, function and attribute queries.
my $fig = FIG->new;

# Optional first command-line argument: the subsystem to process.
my $subsys  = $ARGV[0];
my $numArgs = scalar @ARGV;

# Subsystems to process on this run.
my @all_subs;
if ($numArgs == 0) {

    # No argument: fall back to every subsystem the FIG object reports as usable.
    print "No subsystem was provided. So using all subsystems\n";
    @all_subs = grep { $fig->usable_subsystem($_) } $fig->all_subsystems;

}
else {
    push(@all_subs, $subsys);
}

# Working state for the subsystem currently being processed; set_hash resets
# both hashes at the start of every subsystem.
#   %all_roles : functional role => functional role
#   %all_pegs  : peg             => peg
my %all_roles = ();
my %all_pegs  = ();

# Name (relative to $FIG_Config::temp) of the temporary file that receives the
# literature gathered for the current subsystem. Assigned inside set_hash.
my $literature_file;

# Explicit empty argument list: "&set_hash;" would implicitly forward @_.
set_hash();


# Drive the literature harvest for every subsystem in @all_subs.
#
# For each subsystem this sub:
#   1. resets %all_roles / %all_pegs and (re)creates the temporary literature
#      file, leaving it open on the OUT filehandle;
#   2. runs the pegs_in_subsystem tool and records each "role<TAB>peg" line
#      it returns in %all_roles and %all_pegs;
#   3. calls get_lit_for_pegs (writes to and closes OUT) and then
#      save_lit_as_attributes;
#   4. sleeps 3 seconds before the next subsystem (the file header cites
#      Entrez's request guidelines as the reason the script is kept slow).
#
# Takes no arguments; works entirely through the file-level variables
# @all_subs, %all_roles, %all_pegs and $literature_file.
sub set_hash{

    foreach my $subsys (@all_subs) {

        # $$ is the process id: one file name per run, truncated and reused
        # for every subsystem.
        $literature_file = "literature$$";
        %all_roles = ();
        %all_pegs = ();

        my $literature_path = "$FIG_Config::temp/$literature_file";

        # OUT has to stay a bareword handle: get_lit_for_pegs prints to it and
        # closes it. If the file cannot be opened, skip this subsystem instead
        # of carrying on with whatever a previous subsystem left in the file.
        unless (open(OUT, '>', $literature_path)) {
            warn "can't write to file $literature_path: $!\n";
            next;
        }

        my @output = &FIG::run_gathering_output("$FIG_Config::bin/pegs_in_subsystem", $subsys);
        print "subsystem $subsys\n";

        foreach my $line (@output) {
            chomp $line;
            my ($role, $peg) = split(/\t/, $line);

            # Ignore blank or malformed lines that carry no peg column.
            next unless defined $role && defined $peg && length $peg;

            $all_roles{$role} = $role;
            $all_pegs{$peg} = $peg;
        }

        get_lit_for_pegs();
        save_lit_as_attributes();
        sleep 3;
    }

}


# Gather journal information for every functional role in %all_roles.
#
# Each role name is handed to the get_journals_for_frole tool. Non-empty
# output is echoed to STDOUT and written to the OUT filehandle as one
# "role<TAB>output" record.
#
# Nothing in the visible part of this script calls this sub.
# NOTE(review): the skip test below compares the tool's *output* with a role
# name; presumably the role name itself was meant to be tested - confirm.
sub get_lit_for_roles {
    foreach my $role_name (keys %all_roles) {
        my $journals = &FIG::run_gathering_output("$FIG_Config::bin/get_journals_for_frole", $role_name);

        next if $journals eq "L-asparaginase (EC 3.5.1.1)";
        next unless $journals;

        my $record = "$role_name\t$journals\n";
        print $record;
        print OUT $record;
    }
}



# Gather PubMed journal information for every peg in %all_pegs.
#
# For each peg, the aliases that contain "gi", "sp" or "GeneID" are joined
# with single spaces and passed as ONE argument to the get_pubmed_journals
# tool. When the tool returns output, a "function<TAB>output" record (function
# as reported by $fig->function_of) is echoed to STDOUT and written to the OUT
# filehandle.
#
# OUT is opened by set_hash and is closed here once all pegs are processed.
# Takes no arguments.
sub get_lit_for_pegs{
    foreach my $peg (keys %all_pegs) {

        my @aliases = $fig->feature_aliases($peg);

        # Same grouping and order as before: all "gi" matches, then "sp", then
        # "GeneID".
        # NOTE(review): these are substring matches, so one alias can match
        # more than one pattern and appear twice in the query; presumably the
        # intent is to pick aliases of the gi|..., sp|... and GeneID:... kind -
        # confirm the alias format before anchoring the patterns.
        my @all_ids = (
            (grep { /gi/ }     @aliases),
            (grep { /sp/ }     @aliases),
            (grep { /GeneID/ } @aliases),
        );

        # No usable identifiers: nothing to query for this peg.
        next unless @all_ids;

        my $all_ids_query = join(" ", @all_ids);
        my $peg_output = &FIG::run_gathering_output("$FIG_Config::bin/get_pubmed_journals", $all_ids_query);
        next unless $peg_output;

        # Look the function up only when there is something to record.
        my $fr_name = $fig->function_of($peg);
        print "$fr_name\t$peg_output\n";
        print OUT "$fr_name\t$peg_output\n";
    }

    # Buffered write errors only surface when the handle is closed.
    close(OUT) or warn "can't close $FIG_Config::temp/$literature_file: $!\n";
}


# Store the literature gathered for the current subsystem as role attributes.
#
# For every role in %all_roles the explore_subsys_pmid tool is run with
# "-s <literature file name> -r <role>". Its output is split on whitespace
# into tokens (presumably PubMed ids - the format is not visible here), and
# each token is passed to
# FigWebServices::SeedComponents::PubMed::get_author_date_title. The text
# that comes back is stored on "Role:<role>" under the key
# ROLE_PUBMED_NOTCURATED, unless get_attributes already returns a matching
# attribute.
#
# Reads the file-level variables %all_roles, $literature_file and $fig.
# Takes no arguments; returns early when the literature file is not readable.
sub save_lit_as_attributes {
    my $literature_path = "$FIG_Config::temp/$literature_file";

    # This sub never uses the file's contents itself (explore_subsys_pmid is
    # only given the file name), so just make sure the file is there instead
    # of slurping it into an unused array.
    unless (-r $literature_path) {
        warn "can't find file $literature_path: $!\n";
        return;
    }

    foreach my $role (keys %all_roles) {
        my @arg = ("-s", $literature_file, "-r", $role);
        my $literature = &FIG::run_gathering_output("$FIG_Config::bin/explore_subsys_pmid", @arg);

        next if (! $literature);

        # split ' ' (unlike /\s+/) drops leading whitespace, so no empty
        # token is looked up.
        foreach my $pmid (split(' ', $literature)) {
            print "Role:$role\tROLE_PUBMED_NOTCURATED\t";
            my $output_table = &FigWebServices::SeedComponents::PubMed::get_author_date_title($pmid);
            print $output_table;

            # Nothing came back from the lookup: do not store an empty value.
            next unless defined $output_table && length $output_table;

            # Only add the attribute when it is not already recorded.
            my @existing = $fig->get_attributes("Role:$role", "ROLE_PUBMED_NOTCURATED", $output_table);
            if (! @existing) {
                $fig->add_attribute("Role:$role", "ROLE_PUBMED_NOTCURATED", $output_table);
            }
        }
    }
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3