[Bio] / FigKernelScripts / difference_between_two_databases.pl Repository:
ViewVC logotype

View of /FigKernelScripts/difference_between_two_databases.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Sat Jan 3 23:33:46 2009 UTC (10 years, 10 months ago) by redwards
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2009_07_09, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, HEAD
code for generating the delta between two databases

#__perl__
#

=pod

=head1 difference_between_two_databases

Take two directories that contain seed formatted nr data, find the differences between them, and create a new directory with just the delta. We read the size of the database using fastacmd, and write upgrade information to XML, so that it can be added to databases.xml. (Although at the moment I didn't write directly to databases.xml). Finally we formatdb the database.

=cut

use strict;
use Getopt::Std;
use FortyEightMeta::SimDB;
my %opts;

getopts('n:f:t:d:', \%opts);

unless ($opts{n} && $opts{f} && $opts{t} && $opts{d}) {
	die <<EOF;
$0
-n name of the database (SEED, rdp, etc)
-f source version (from)
-t destination version (to)
-d directory to store the output

EOF
}

my $simdb = FortyEightMeta::SimDB->new($FIG_Config::mgrast_database_def);
$simdb or die "Cannot open database description file $FIG_Config::mgrast_database_def";

# check that we have the versions requested 
my $allversions;
map {$allversions->{$_->[0]}->{$_->[1]}=1} $simdb->get_all_versions();
die "We do not have a $opts{n} version $opts{f}. Available versions are: ". join(" ", keys %{$allversions->{$opts{n}}}) unless ($allversions->{$opts{n}}->{$opts{f}});
die "We do not have a $opts{n} version $opts{t}. Available versions are: ". join(" ", keys %{$allversions->{$opts{n}}}) unless ($allversions->{$opts{n}}->{$opts{t}});


# now we need to get the peg.synonyms file and process the from and to.
# we are just going to hash the xxx ids and see what is missing. This maybe 
# an issue as we move away from xxx ids to md5 sums, so we may just need to
# rewrite this regexp here.

my $from_synonyms = $simdb->get_pegsyn($opts{n}, $opts{f});
unless (-e $from_synonyms) {die "$from_synonyms not found\n"}
my $to_synonyms = $simdb->get_pegsyn($opts{n}, $opts{t});
unless (-e $to_synonyms) {die "$to_synonyms not found\n"}

print STDERR "Comparing the synonyms in $from_synonyms and $to_synonyms\n";

my %ori; my %diff;
open(IN, $from_synonyms) || die "Can't open from_synonyms";
while (<IN>) {
	/^(xxx\d+)/;
	unless ($1) {die "$_ is not an xxx id\n"}
	$ori{$1}=1;
}
close IN;
open(IN, $to_synonyms) || die "can't open $to_synonyms";
while (<IN>) {
	/^(xxx\d+)/;
	unless ($1) {die "$_ is not an xxx id\n"}
	unless ($ori{$1}) {$diff{$1}=$_}
}

unless (scalar(keys %diff)) {die "There are no differences between $opts{f} and $opts{t}\n"}
print STDERR "Creating a database of ", scalar(keys %diff), " proteins\n";

# figure out the fasta file information
my @dbinfo = $simdb->databases({$opts{n}=>$opts{t}});
my $fastaf = $dbinfo[0]->{'files'}->[0]->{'fasta'};
my $uspib  =  $dbinfo[0]->{'files'}->[0]->{'uspib'};

die "Can't find a fasta file for $opts{n} version $opts{t} (expected it to be $fastaf)" unless (-e $fastaf);

# figure out the fasta file
my @odbinfo = $simdb->databases({$opts{n}=>$opts{f}});
my $ofastaf = $odbinfo[0]->{'files'}->[0]->{'fasta'};
print STDERR "Can't find a fasta file for $opts{n} version $opts{f} (expected it to be $ofastaf)" unless (-e $ofastaf);

# make the directory and write the appropriate proteins to the output file
mkdir $opts{d}, 0755 unless (-d $opts{d});
open(OUT, ">$opts{d}/nr") || die "can't write to $opts{d}/nr";
open(IN, "$fastaf") || die "Can't read from $fastaf";
my $print; 
while (<IN>)
{
	if (/^>(\S+)/ && $diff{$1} && delete $diff{$1}) {$print = 1} # the delete is there because in 018c there are some duplicates
	elsif (/^>/) {$print = 0}
	print OUT $_ if ($print);
}
close IN;
close OUT;

# format the database
my $formatdb=`which formatdb`;
chomp($formatdb);
if ($formatdb) 
{
	my $err = `$formatdb -pT -i $opts{d}/nr -t '$opts{f}-to-$opts{t}'`;
	chomp $err;
	print STDERR "$err\n" if ($err);
}
else
{
	print STDERR "formatdb not found. Database not formatted\n";
}




# make the xml
#

# We need to get the sizes of the fasta files. We'll just use fastacmd as is easier than counting ourselves



my ($fsize, $nsize, $tsize)=(0, 0, 0); # from size, new size, to size
eval {
	my $fastacmd=`fastacmd -I -d $opts{d}/nr`;
	if ($fastacmd =~ /([\d\,]+)\s*total letters/) {$nsize=$1}
	$nsize =~ s/\,//g;
};
unless ($nsize > 0) {print STDERR "We couldn't figure out the database size of $opts{d}/nr. Please check the appropriate uspib\n"}

eval {
	my $fastacmd=`fastacmd -I -d $fastaf`;
	if ($fastacmd =~ /([\d\,]+)\s*total letters/) {$tsize=$1}
	$tsize =~ s/\,//g;
};
unless ($tsize > 0) {print STDERR "We couldn't figure out the database size of $fastaf. Please add that to the XML\n"}

eval {
	my $fastacmd=`fastacmd -I -d $ofastaf`;
	if ($fastacmd =~ /([\d\,]+)\s*total letters/) {$fsize=$1}
	$fsize =~ s/\,//g;
};
unless ($fsize > 0) {print STDERR "We couldn't figure out the database size of $fastaf. Please add that to the XML\n"}

$uspib = int(($uspib/$tsize)*$nsize);

if (open(OUT, ">$opts{d}/description.xml")) 
{
print OUT <<EOF;
<?xml version="1.0"?>
    <db_version version="$opts{f}-to-$opts{t}">
      <db_file name="nr"
               fasta="$opts{d}/nr"
               peg_synonyms="$to_synonyms"
               type="protein"
               dir="sims.seed"
	       uspib="$uspib"
               abbr="u">
      </db_file>
      <db_upgrade from="$opts{f}"
                to="$opts{t}"
                size_of_new="$tsize"
		size_of_old="$fsize"
                />
    </db_version>
EOF
close OUT;
print STDERR "The XML description was written to $opts{d}/description.xml, please add that to databases.xml\n";
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3