[Bio] / Sprout / LoadShrubGenomes.pl Repository:
ViewVC logotype

View of /Sprout/LoadShrubGenomes.pl

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.2 - (download) (as text) (annotate)
Thu Feb 5 10:03:33 2015 UTC (5 years, 1 month ago) by parrello
Branch: MAIN
Changes since 1.1: +0 -0 lines
checkpoint before a big update from GIT

#!/usr/bin/perl -w

# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
# This file is part of the SEED Toolkit.
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.

=head1 Shrub Genome Loader

	LoadShrubGenomes directory genome1 genome2 ...

This script loads one or more genomes from repository directories into the
Shrub database. The genome data will be assembled into load files for
each table, and then the tables are loaded directly from the files.

The positional parameters are the name of the directory containing the
genome exchange directories, plus the names of the genomes to be loaded.
The command-line parameters listed in L<Shrub/new_for_script> are accepted
as well as the following.

=over 4

=item all

Load all genomes in the directory.

=item missing

Load only genomes that are not already in the database.

=item clear

Truncate the tables before loading.

=back

=cut

	# Main script body: connect to the Shrub database, work out which genomes
	# to process from the command-line arguments, and prepare for the table loads.
	# NOTE(review): in this copy of the file the closing braces of the blocks
	# opened below are absent, and the text stops before any load logic appears.
	# The script cannot compile as shown -- recover the full text from the
	# repository before running or modifying it.
	use strict;
	use Shrub;
	use ShrubLoader;

	# Declare the option variables. Each one is handed to new_for_script by
	# reference; presumably it is filled in from the matching command-line
	# option -- confirm against Shrub::new_for_script.
	my ($missing, $all, $clear);
	# NOTE(review): "Connnecting" in the message below is misspelled.
	print "Connnecting to database.\n";
	# Create the database object.
	# NOTE(review): the meaning of the leading 0 argument is not visible here.
	my $shrub = Shrub->new_for_script(0, missing => \$missing, all => \$all, clear => \$clear);
	if (! $shrub) {
		# No database object came back, so show the command syntax instead.
		print "LoadShrubGenomes [ options ] genomeDirectory genome1 genome2 ...";
	} else {
		# We are connected. Create the loader utility object.
		my $loader = ShrubLoader->new($shrub);
		# Get the statistics object.
		my $stats = $loader->stats;
		# Get the positional parameters: the first is the genome directory and
		# everything after it is treated as a genome ID.
		my ($genomeDir, @genomes) = @ARGV;
		# Verify the genome directory: it must be specified and must be an
		# existing directory.
		if (! $genomeDir) {
			die "No genome directory specified.";
		} elsif (! -d $genomeDir) {
			die "Invalid genome directory $genomeDir.";
		# Get the list of genomes to load.
		if ($all) {
			# The "all" option and an explicit genome list are mutually exclusive.
			if (scalar @genomes) {
				die "ALL option specified along with a list of genome IDs. Use one or the other.";
			# Keep only the entries whose names look like genome IDs
			# (digits, a period, digits).
			@genomes = grep { $_ =~ /^\d+\.\d+$/ } $loader->OpenDir($genomeDir);
		# This will be the list of genomes we actually process.
		my $genomeList;
		# In normal mode, it's the incoming list.
		if (! $missing) {
			$genomeList = \@genomes;
		} else {
			# The "missing" option is specified, so create a list of only the
			# genomes not already in the database.
			print "Checking for existing genomes in genome list.\n";
			$genomeList = [];
			# Genomes that are filtered out are collected here for reporting.
			my @skipList;
			for my $genome (@genomes) {
				# Presumably Exists reports whether a Genome record with this ID
				# is already in the database -- confirm against Shrub.
				if ($shrub->Exists(Genome => $genome)) {
					$stats->Add(genomeSkipped => 1);
					push @skipList, $genome;
				} else {
					push @$genomeList, $genome;
			# Display the genomes skipped, at most ten IDs per output line.
			my $n = scalar(@skipList);
			if (@skipList) {
				for (my $i = 0; $i < $n; $i += 10) {
					# $i1 is the index of the last genome on this line, clamped
					# to the end of the skip list.
					my $i1 = $i + 9;
					if ($i1 >= $n) {
						$i1 = $n - 1;
					print "Skipping genomes: " . join(", ", @skipList[$i .. $i1]) . "\n";
		# Get the list of objects we are loading.
		my @tables = qw(Genome Contig Feature Function Protein Role
						Feature2Contig Function2Feature Function2Role Genome2Contig
						Genome2Feature Protein2Feature Protein2Function Protein2Role);
		if ($clear) {
			# Here we need to clear the tables first.
			# NOTE(review): only the message below is visible in this copy; no
			# call that actually clears the tables appears here.
			print "CLEAR option specified.\n";
		# Prepare for loading.
		# NOTE(review): the comments below describe two hashes, but their
		# declarations are not present in this copy of the file.
		# We must ensure we only load one of each function into the database, but functions are keyed
		# by ID, not name. This is a hash of functions by MD5.
		# Proteins are shared objects, and we only want one copy of each protein. In addition, for
		# each protein, we want to associate a single function. This is a two-dimensional hash that
		# tracks the number of times a function is associated with a protein. The functions are keyed by
		# an MD5 on the function name and the value is the function ID.

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3