[Bio] / FigKernelScripts / run_model_generation_v3.pl Repository:
ViewVC logotype

View of /FigKernelScripts/run_model_generation_v3.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Wed Nov 12 18:29:31 2008 UTC (11 years ago) by dejongh
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2009_07_09, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, HEAD
Changes since 1.2: +2 -2 lines
switch to scenario_directory

# _*_ Perl _*_
#
# This script is a wrapper for the RAST server. It will call all scripts required
#  to generate a model automatically based on scenarios. In addition it will
#  create a tar file of information that the user can receive.
#

use strict;
use FIGV;
use Scenario;

my $orgdir;
while ((@ARGV > 0) && ($ARGV[0] =~ /^-/))
{
    my $arg = shift @ARGV;
    if ($arg =~ /^-orgdir/)
    {
	$orgdir = shift @ARGV;
    }
    else
    {
	die "Unknown option $arg\n";
    }
}
my $fig;
if($orgdir)
{
    $fig = new FIGV($orgdir);
}
else
{
    $fig = new FIGV;
}
my ($genome_id) = @ARGV;

chomp $genome_id;

unless (scalar(@ARGV) == 1)
{
    print STDERR "\nusage: run_model_generation <genome_id>\n";
    exit -1;
}

# We need to do a couple of steps
# 1. Generate input, output, biomass and cofactor files.
# 2. Check the model to generate information files, and trim biomass
# 3. Generate spreadsheet files of the reaction network.
# 4. Generate MPS file of reaction network.
# 5. Tarball the information files


#Lets make a check here. If the scenarios haven't been run yet for this genome run them.
if(!opendir(GENO_SCEN, $fig->scenario_directory($genome_id)))
{ 
    system(("$FIG_Config::bin/run_scenarios",$genome_id));
}
else { closedir(GENO_SCEN);  }

#Step 1
system(("$FIG_Config::bin/find_model_information_v2",$genome_id,"outputs",$orgdir));
system(("$FIG_Config::bin/find_model_information_v2",$genome_id,"biomass",$orgdir));
#finding inputs is now dependent on the biomass file for increased return
system(("$FIG_Config::bin/find_model_information_v2",$genome_id,"inputs",$orgdir));
system(("$FIG_Config::bin/find_model_cofactors",$genome_id,$orgdir));
system(("$FIG_Config::bin/find_missing_lib_inputs_v2",$genome_id,1,$orgdir));


#Step 2
system(("$FIG_Config::bin/check_model_v2",$genome_id,$orgdir));

#Step 3
system(("$FIG_Config::bin/make_model_spreadsheets",$genome_id,$orgdir));

#Step 4
system(("$FIG_Config::bin/make_MPS_model_v2",$genome_id,$orgdir));

system(("$FIG_Config::bin/run_flux_optimization",$genome_id,$orgdir));


### Statistics Step
# Some stats for the user 

my $filebase = $fig->scenario_directory($genome_id)."/Analysis/";
my %active_inputs;
open(ACTIVE,">".$filebase."inputs_active.txt");
open(FLUX,$filebase."flux.output");
while(<FLUX>){
    chomp($_);
    my @parts = split(" ",$_);
    if($parts[1] =~/^T\d+$/){
	if($parts[3] > 0){	
	    my $cid = $parts[1];
	    $cid =~s/T/C/;
	    print ACTIVE "$cid\t$parts[3]\n"; 
	}
    }
}
close(ACTIVE);

my %counts; 
open(BVAL,$filebase."biomass_valid.txt");
my @lines = <BVAL>;
close(BVAL);
$counts{"biomass_valid"} = scalar(@lines);
undef @lines;
open(BTOT,$FIG_Config::global."/Models/ec_biomass.txt");
@lines = <BTOT>;
close(BTOT);
$counts{"biomass_total"} = scalar(@lines);
undef @lines;
open(SVAL,$filebase."scenarios_valid.txt");
@lines = <SVAL>;
close(SVAL);
$counts{"scenarios_valid"} = scalar(@lines);
undef @lines;
open(STOT,$filebase."info_scenarios.txt");
while(<STOT>)
{
    if(!/^Scenario/) {last;}
    else  { push @lines, $_; }
}
close(STOT);
$counts{"scenarios_total"} = scalar(@lines)+$counts{"scenarios_valid"};
undef @lines;
open(UINS,$filebase."scenario_inputs_not_transported_or_produced.txt");
@lines = <UINS>;
close(UINS);
$counts{"unknown_inputs"} = scalar(@lines);
undef @lines;
open(INS,$filebase."inputs.txt");
@lines = <INS>;
close(INS);
$counts{"inputs"} = scalar(@lines);
undef @lines;

open(OUT, ">".$filebase."readme.txt");
print OUT "Readme.txt - Automatic Model Generation\n\n";

print OUT "Overview\n";
print OUT "\tThis readme describes the information provided from the \n";
print OUT "\tgeneration of a metabolic model for the genome, $genome_id.\n";
print OUT "\tThe metabolic model is provided in two formats:\n";
print OUT "\tMPS and a series of tab-separated text files.\n";
print OUT "\tPlease see below for more info on the files produced.\n";
print OUT "\tAll compound (C#####) and reaction (R#####) identifiers are from the KEGG database (www.genome.jp/kegg/).\n"; 
print OUT "\tThe model is not intended to be complete but will reflect the state of annotation for the genome in the SEED and NMPDR.\n";
print OUT "\tFor those unfamiliar with Scenarios and Flux Balance Modeling please refer to\n";
print OUT "\t'Toward the automated generation of genome-scale metabolic networks in the SEED' at http://www.biomedcentral.com/1471-2105/8/139/abstract\n";
print OUT "\t and Bernard Palsson's in silico organism publications at http://gcrg.ucsd.edu/publications/index.html respectively.\n\n"; 
print OUT "Statistics\n";
print OUT "\tBiomass produced/standard model:".$counts{biomass_valid}."/".$counts{biomass_total}."\n";
print OUT "\twhere 'standard_model' is the number of biomass compounds in the published JR904 E. coli model that can be mapped to KEGG ids.\n";
print OUT "\tNumber of Scenarios that define reaction network: $counts{scenarios_total}\n";
print OUT "\tInputs of predicted scenarios not in the medium or produced:".$counts{unknown_inputs}. "\n\n";
print OUT "Output Files\n";
print OUT "\t-reactions_spreadsheet.txt : reactions, reversibility, associated genes, scenarios and subystems\n"; 
print OUT "\t-compounds_spreadsheet.txt : compounds in reaction network\n";
print OUT "\t-transports_spreadsheet.txt : input and output compounds and evidence\n";   
print OUT "\t-inputs.txt : primary compounds in the medium\n";
print OUT "\t-secondary_compounds.txt : mostly water-soluble vitamins,minerals and ions in the medium\n";
print OUT "\t-inputs_info.txt : compounds not in medium required for fully connected reaction network\n";
print OUT "\t-inputs_active.txt : input fluxes from optimization solution \n"; print OUT "\t-outputs.txt : compounds associated with excretion fluxes\n";
print OUT "\t-biomass_valid.txt : compounds in biomass function of standard model and also reachable outputs of predicted scenarios\n";
print OUT "\t-biomass_lib_compounds_not_produced.txt : compounds in biomass function of standard model not produced by model\n";
print OUT "\t-biomass_candidate_compounds_not_produced.txt : compounds producedby predicted scenarios for genome but not reachable with current inputs\n";
print OUT "\t-scenarios_valid.txt : scenarios predicted for genome\n";
print OUT "\t-scenario_info.txt : possible dead end scenarios\n";
print OUT "\t-flux_info.txt : compounds produced in excess and compounds from the medium that are not utilized  \n";
print OUT "\t-flux.output : solution to optimization problem\n";
print OUT "\t-MPS Model ($genome_id.mps) : input to glpsol\n\n";
print OUT "Evidence\n";
print OUT "\tAll reactions listed in reactions_spreadsheet.txt defining the core of the reaction network are associated with scenarios and roles in subsystems in the SEED and NMPDR.\n";
print OUT "\tCompounds associated with input and output fluxes are assigned a class type based on the available evidence.\n";
print OUT "\tInput fluxes with annotation evidence in the model's genome are designated 'evidence_for_input'.\n";
print OUT "\tInput and output fluxes found in other published models but without annotation evidence in this genome are designated 'common_input_primary','common_output_primary' or 'common_input_secondary'.\n";
print OUT "\tInput and output fluxes without annotation evidence or not present in other published models are designated as 'abstract'.\n\n";     
print OUT "Optimizing Growth of Model\n";
print OUT "\tA flux-balance optimization problem maximizing biomass generation using the MPS file as input is solved using glpsol.\n";
print OUT "\tglpsol can be found at http://www.gnu.org/software/glpk/glpk.html\n";
print OUT "\tThe solution to the optimization problem is in flux.output.\n";
print OUT "\tThe first line contains the flux value for the biomass growth.\n\n";
print OUT "Expanding/Troubleshooting the Model\n";
print OUT "\tA number of the output files will help you in determining where to expand the model or for discovery of errors.\n";
print OUT "\tIf the file flux_info.txt indicates that 0 biomass was generated there is a conflict in the biomass equation.\n";
print OUT "\tAn example of this situation occurs if both NAD and NADH are components of biomass.\n";
print OUT "\tProducing one of these compounds consumes the other causing flux thru the biomass to be impossible.\n"; 
print OUT "\tTo resolve a conflict like this you must place both conflicting compound ids in the file trouble_cpds.txt (/FIGdist/FIG/Data/Global/Models/trouble_cpds.txt).\n";
print OUT "\tContact your SEED admin to edit this file.\n";
print OUT "\tCompounds listed in the second section of flux_info.txt are compounds produced in excess but without an output (excretion) flux like those in outputs.txt.\n";
print OUT "\tThese compounds provide guidance on what areas of metabolism curators could explore to improve the genome's annotation state and expand the model.\n\n";
print OUT "Acknowledgements\n";
print OUT "\tThe system for generating metabolic models was developed at Hope College, Holland, MI and\n";
print OUT "\tArgonne National Laboratory. Individuals involved are Matt DeJongh, Mike Kubal, Kevin Formsma,\n";
print OUT "\tRoss Overbeek and undergraduate students from Hope College.\n";

close(OUT);

#Step 5
# Tar archive consists of the following files - update list
#

chdir $filebase;
system(("tar","cfz","model_files.tgz","inputs.txt","secondary_compounds.txt","outputs.txt","biomass_valid.txt","scenarios_valid.txt","biomass_candidate_compounds_not_produced.txt","biomass_lib_compounds_not_produced.txt","inputs_info.txt","inputs_active.txt","reactions_spreadsheet.txt","compounds_spreadsheet.txt","transports_spreadsheet.txt", "scenarios_info.txt","readme.txt","scenario_inputs_not_transported_or_produced.txt","flux_info.txt",$genome_id.".mps","flux.output"));

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3