[Bio] / FigKernelScripts / run_model_generation.pl Repository:
ViewVC logotype

View of /FigKernelScripts/run_model_generation.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.17 - (download) (as text) (annotate)
Wed Nov 12 18:29:31 2008 UTC (11 years ago) by dejongh
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2009_07_09, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, HEAD
Changes since 1.16: +2 -2 lines
switch to scenario_directory

# _*_ Perl _*_
#
# This script is a wrapper for the RAST server. It will call all scripts required
#  to generate a model automatically based on scenarios. In addition it will
#  create a tar file of information that the user can receive.
#

use strict;
use FIGV;
use Scenario;

my $orgdir;
while ((@ARGV > 0) && ($ARGV[0] =~ /^-/))
{
    my $arg = shift @ARGV;
    if ($arg =~ /^-orgdir/)
    {
	$orgdir = shift @ARGV;
    }
    else
    {
	die "Unknown option $arg\n";
    }
}
my $fig;
if($orgdir)
{
    $fig = new FIGV($orgdir);
}
else
{
    $fig = new FIGV;
}
my ($genome_id) = @ARGV;

chomp $genome_id;

unless (scalar(@ARGV) == 1)
{
    print STDERR "\nusage: run_model_generation <genome_id>\n";
    exit -1;
}

# We need to do a couple of steps
# 1. Generate input, output, biomass and cofactor files.
# 2. Check the model to generate information files, and trim biomass
# 3. Generate SBML file of the reaction network.
# 4. Generate MPS file of reaction network.
# 5. Tarball the information files


#Lets make a check here. If the scenarios haven't been run yet for this genome run them.
if(!opendir(GENO_SCEN, $fig->scenario_directory($genome_id)))
{ 
    system(("$FIG_Config::bin/run_scenarios",$genome_id));
}
else { closedir(GENO_SCEN);  }

#Step 1
system(("$FIG_Config::bin/find_model_information",$genome_id,"outputs",$orgdir));
system(("$FIG_Config::bin/find_model_information",$genome_id,"biomass",$orgdir));
#finding inputs is now dependent on the biomass file for increased return
system(("$FIG_Config::bin/find_model_information",$genome_id,"inputs",$orgdir));
system(("$FIG_Config::bin/find_model_cofactors",$genome_id,$orgdir));
system(("$FIG_Config::bin/find_missing_lib_inputs",$genome_id,1,$orgdir));

#Step 2
system(("$FIG_Config::bin/check_model",$genome_id,$orgdir));

#Step 3
system(("$FIG_Config::bin/make_SBML_model",$genome_id,$orgdir));

#Step 4
system(("$FIG_Config::bin/make_MPS_model",$genome_id,$orgdir));

system(("$FIG_Config::bin/run_flux_optimization",$genome_id,$orgdir));

### Statistics Step
# Some stats about coverage for the user 
#  and for looking at testing information as well 

my %counts; 
my $filebase = $fig->scenario_directory($genome_id)."/Analysis/";

open(BVAL,$filebase."biomass_valid.txt");
my @lines = <BVAL>;
close(BVAL);
$counts{"biomass_valid"} = scalar(@lines);
undef @lines;
open(BTOT,$filebase."biomass.txt");
@lines = <BTOT>;
close(BTOT);
$counts{"biomass_total"} = scalar(@lines);
undef @lines;
open(SVAL,$filebase."scenarios_valid.txt");
@lines = <SVAL>;
close(SVAL);
$counts{"scenarios_valid"} = scalar(@lines);
undef @lines;
open(STOT,$filebase."info_scenarios.txt");
while(<STOT>)
{
    if(!/^Scenario/) {last;}
    else  { push @lines, $_; }
}
close(STOT);
$counts{"scenarios_total"} = scalar(@lines)+$counts{"scenarios_valid"};
undef @lines;
open(UINS,$filebase."inputs_to_scenarios_not_transported_or_produced.txt");
@lines = <UINS>;
close(UINS);
$counts{"unknown_inputs"} = scalar(@lines);
undef @lines;
open(INS,$filebase."inputs.txt");
@lines = <INS>;
close(INS);
$counts{"inputs"} = scalar(@lines);
undef @lines;

#calc a timestamp to indicate when the model was created.
my $time = gmtime();

my $readme_txt = "Readme.txt - Automatic Model Generation\n" .
    "Data generated on $time GMT.\n\n" .
    "Overview\n".
    "\tThis readme describes the information provided from the generation of a metabolic model for\n" .
    "\tyour genome, $genome_id. Your metabolic model which was generated is provided in two formats:\n" .
    "\tSBML(www.sbml.org) and MPS. Please see the other sections for more information on the model. \n" .
    "\tAll compound (C#####) and reaction (R#####) identifiers are from the KEGG database (www.genome.jp/kegg/).\n". 
    "\tThe model is not intended to be complete but should be accurate for the scope of\n" .
    "\tmetabolism covered. For those unfamiliar with Scenarios and Flux Balance Modeling please refer to\n".
    "\t'Toward the automated generation of genome-scale metabolic networks in the SEED' at http://www.biomedcentral.com/1471-2105/8/139/abstract\n" .
    "\t and Bernard Palsson's in silico organism publications at http://gcrg.ucsd.edu/publications/index.html respectively.\n\n". 
    "Statistics\n".
    "\tSecond column is percent and third column is valid/predicted.\n".
    "\tFor biomass 'valid' is how many biomass compounds are reachable given the computed set of inputs,\n".
    "\tand 'predicted' is the number of compounds capable of being produced by the scenarios computed for this genome\n".
    "\tand also in the library of biomass compounds based on the biomass definitions of published models.\n".
    "\tFor scenarios 'valid' is the number of scenarios reachable from the computed set of inputs,\n".
    "\tand 'predicted' is the total number of scenarios computed for this genome.\n";
$readme_txt = $readme_txt . "\tBiomass\t\t".(100*$counts{"biomass_valid"}/$counts{"biomass_total"})."%\t\t".$counts{"biomass_valid"}."/".$counts{"biomass_total"}."\n" if($counts{"biomass_total"});
$readme_txt = $readme_txt . "\tScenarios\t".(100*$counts{"scenarios_valid"}/$counts{"scenarios_total"})."%\t\t".$counts{"scenarios_valid"}."/".$counts{"scenarios_total"}."\n" if($counts{"scenarios_total"});
$readme_txt = $readme_txt . "\n\tInputs of predicted scenarios not transported in or produced\t\t".$counts{"unknown_inputs"}."\n\n" .
    "Output Files\n" .
    "\t-inputs.txt\n".
    "\t-secondary_compounds.txt\n".
    "\t-outputs.txt\n".
    "\t-biomass_valid.txt\n".
    "\t-scenarios_valid.txt\n".
    "\t-inputs_to_scenarios_not_transported_or_produced.txt\n".
    "\t-info_biomass.txt\n".
    "\t-info_scenarios.txt\n".
    "\t-info_flux.txt\n".
    "\t-SBML Model ($genome_id\_FBA_Model.xml)\n".
    "\t-MPS Model ($genome_id.mps)\n\n".
    "Model Environment Interactions\n".
    "\tThe compounds transported in from the extracellular environment into the cell are listed in\n".
    "\tinputs.txt and secondary_compounds.txt. The compounds in inputs.txt are considered primary or main\n".
    "\tcompounds. Compounds in secondary_compounds.txt are non-primary metabolites that are capable of\n".
    "\tdiffusing or being transported into most bacterial cells.\n".
    "\tThe compounds potentially transported out of the cell are listed in outputs.txt.\n".
    "\tHowever, the model actually contains sink reactions for all compounds.\n".
    "\tThe sink reactions allow the model obtain zero net gain/loss for every compound.\n".
    "\tThe sink reactions  also serve a diagnostic purpose and indicate possible starting points for new scenarios\n".
    "\t or suggest potentially new excretion transporters.\n\n".
    "Reaction and Biomass\n".
    "\tThe reactions that are included in the model come from scenarios that have been determined to be reachable\n".
    "\tbased upon the computed set of inputs and given secondary compounds. A scenario\n".
    "\tis a small defined sub-section of a larger metabolic pathway. Each scenario is implemented\n".
    "\tby a number of reactions. The list of reachable(valid) scenarios used is located in the file scenarios_valid.txt.\n".
    "\tLocated in info_scenarios.txt is the list of scenarios predicted for this genome but unreachable with computed inputs.\n".
    "\tThe reachable compounds constituting the biomass are located in biomass_valid.txt.\n".
    "\tThe file info_biomass.txt list unreachable biomass compounds found in the biomass library and produced by predicted scenarios.\n\n".
    "Models\n".
    "\tThe models are created in two formats SBML ($genome_id\_FBA_Model.xml) and MPS ($genome_id.mps).\n".
    "\tThe SBML format includes information for each reaction about the associated gene, scenario and subsystem.\n".
    "\tUsing the MPS formatted file as input, an flux-balance optimization problem seeking to maximize flux through the biomass reaction\n".
    "\twas solved use glpsol. More info on glpsol can be found at http://www.gnu.org/software/glpk/glpk.html .\n".
    "\tParsed output from glpsol is located in info_flux.txt. The first line contains the flux value for the biomass growth.\n".
    "\tThe second section lists the compound sinks used but not listed in outputs.txt. The third section lists\n".
    "\tinput compounds that were not needed to generate any biomass but included in the model.\n".
    "\tThe raw output from glpsol is in the file flux.output. The ATP Synthesis reaction(ATPSYN) is included in each model due to\n".
    "\tKEGG lacking a precise reaction id.\n\n".
    "Expanding/Troubleshooting the Model\n".
    "\tA number of the output files will identify opportunities to expand the model or for discovery\n".
    "\tof errors. If the file info_flux.txt indicates that 0 biomass was generated then\n".
    "\ta conflict exists in the biomass equation. For example, if both NAD and NADH are included as\n".
    "\tbiomass components, the flux through the biomass reaction will be zero since maximizing NAD minimizes NADH.\n".
    "\tTo resolve a conflict like this, put both compound IDs in the file trouble_cpds.txt (/FIGdist/FIG/Data/Global/Models/trouble_cpds.txt).\n".
    "\tContact your SEED admin to edit this file. Any compounds listed in the second section of\n".
    "\tinfo_flux.txt are compounds produced in excess that are not known to be excreted like the compounds in output.txt.\n".
    "\tThese compounds should be examined as potential starting points for new scenarios or for addition to outputs.txt .\n". 
    "\tThe file inputs_to_scenarios_not_transported_or_produced.txt lists compounds that can not be generated by any scenario (valid or not),".
    "\tor be transported in and are therefore NOT included in the model. Scenarios may need to be added to generate these scenario inputs.\n".
    "\tAlso these compounds should be considered for addition to inputs.txt. The lists of unreachable scenarios and biomass compounds\n".
    "\tprovide areas of metabolism to focus on for developing new subsystems and scenarios, and\n";
    "\tultimately expanding the metabolic reconstruction and the model.\n\n".
    "Acknowledgements\n".
    "\tThe system for generating metabolic models was developed at Hope College, Holland, MI and\n".
    "\tArgonne National Laboratory. Individuals involved are Matt DeJongh, Mike Kubal, Kevin Formsma,\n".
    "\tRoss Overbeek and undergraduate students from Hope College.\n";
open(OUT, ">".$filebase."readme.txt");
print OUT $readme_txt;
close(OUT);

#Step 5
# Tar archive consists of the following files
#
# inputs.txt - Model inputs (based on library of known inputs)
# secondary_compounds.txt - non-primary compounds transported in by most bacteria
# outputs.txt - Model outputs (based on library of known outputs)
# biomass_valid.txt - Model biomass that is reachable/connected
# scenarios_valid.txt - Scenarios used in the reaction model
# info_biomass.txt - Information about detected but unreachable biomass
# info_scenarios.txt - Information about unused or dead-end scenarios
# inputs_to_scenarios_not_transported_or_produced.txt - Inputs not in library but needed for scenarios
# flux.output - Results of the glpsolver maximization
# info_flux.txt - Information on flux output of model
# readme.txt -Stats and information for the end-user
# Two model files, .xml and .mps formats
#
chdir $filebase;
system(("tar","cfz","model_files.tgz","inputs.txt","secondary_compounds.txt","outputs.txt",
	"biomass_valid.txt","scenarios_valid.txt","info_biomass.txt",
	"info_scenarios.txt","readme.txt",
	"inputs_to_scenarios_not_transported_or_produced.txt","info_flux.txt",$genome_id."_FBA_Model.xml",
	$genome_id.".mps","flux.output"));

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3