[Bio] / FigKernelScripts / install_features_batch.pl Repository:
ViewVC logotype

View of /FigKernelScripts/install_features_batch.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Tue Mar 11 21:26:06 2008 UTC (11 years, 8 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Add new script to install a batch of genomid.tbl / genomeid.fasta files. Initial use
was to install Mark D'Souza's batch of RFAM-computed RNAs.

Does a ton of validation before allowing the install to continue.

#
# Install the RFAM-computed RNA files.
#
# We are given a directory containing files of the form genomeid.tbl and genomeid.fasta.
#
# Collect them up, make sure we have matching sets, and then walk the list of genomes,
# checking to be sure we don't smack existing feature ids.
#

use strict;
use FIG;
use FIG_Config;
use File::Copy;

@ARGV == 2 or die "Usage: $0 newdata-dir org-dir\n";

my $dir = shift;
my $org_dir = shift;

-d $dir or die "Source directory $dir does not exist\n";
-d $org_dir or die "Organism directory $org_dir does not exist\n";

my %data;

opendir(D, $dir) or die "Cannot opendir $dir: $!";

while (my $f = readdir(D))
{
    my $p = "$dir/$f";
    next unless -f $p;
    if ($f =~ /^(\d+\.\d+)\.(fasta|tbl)/)
    {
	$data{$1}->{$2} = $p;
    }
}
closedir(D);

#
# Check to see we have matched pairs.
#

my @genomes = sort { &FIG::by_genome_id($a, $b) } keys %data;

my $errors;
for my $genome (@genomes)
{
    my $tbl = $data{$genome}->{tbl};
    my $fasta = $data{$genome}->{fasta};

    if (not($fasta and $tbl))
    {
	warn "Incomplete data for $genome\n";
	$errors++;
    }
}

if ($errors)
{
    die "Aborting, errors found during initial scan\n";
}

my %deleted;
my $n_genomes = @genomes;
print "Validating data for $n_genomes genomes\n";

for my $genome (@genomes)
{
    my $tbl = $data{$genome}->{tbl};
    my $fasta = $data{$genome}->{fasta};

    #
    # See if we have existing features.
    #

    my ($ftype, $nmin, $nmax, $nfeats);
    eval {
	($ftype, $nmin, $nmax, $nfeats) = validate_tbl($tbl, $fasta, $genome);
    };
    if ($@)
    {
	warn "Validation error for new data for $tbl $fasta:\n$@\n";
	$errors++;
	next;
    }

    if (-f "$org_dir/$genome/DELETED")
    {
	warn "$genome is deleted\n";
	$deleted{$genome}++;
	next;
    }

    my $ofeat_dir = "$org_dir/$genome/Features/$ftype";

    my($otbl, $ofasta);

    $data{$genome}->{ftype} = $ftype;
    
    if (-d $ofeat_dir)
    {
	$otbl = "$ofeat_dir/tbl";
	$ofasta = "$ofeat_dir/fasta";

	$data{$genome}->{otbl} = $otbl;
	$data{$genome}->{ofasta} = $ofasta;

	my ($otype, $omin, $omax, $ofeats);	
	eval {
	    ($otype, $omin, $omax, $ofeats) = validate_tbl($otbl, $ofasta, $genome);
	};
	if ($@)
	{
	    warn "Validation error for existing data for $otbl $ofasta:\n$@\n";
	    $errors++;
	    next;
	}

	if ($otype ne $ftype)
	{
	    warn "Validated type for $otbl does not match expected $ftype\n";
	    $errors++;
	}

	if ($nmin <= $omax)
	{
	    warn "Range for new data in $tbl overlaps range for old data: old: $omin-$omax new: $nmin-$nmax\n";
	    $errors++;
	}
	print "$genome $ftype: old range $omin-$omax, new range $nmin-$nmax\n";
    }
    else
    {
	print "$genome $ftype: new feature range $nmin-$nmax\n";
    }
}

if ($errors)
{
    die "Aborting, errors found during validation\n";
}

#
# We are in good shape. We can go ahead and install by
# concatenating the tbl and fasta files where ones exist currently,
# or by creating the new feature dir and copying otherwise.
#

for my $genome (@genomes)
{
    if ($deleted{$genome})
    {
	warn "Skipping deleted genome $genome\n";
	next;
    }
    my $tbl = $data{$genome}->{tbl};
    my $fasta = $data{$genome}->{fasta};

    my $ftype = $data{$genome}->{ftype};

    if ($tbl eq '' or $fasta eq '' or $ftype eq '')
    {
	die "Bad data for $genome\n" . Dumper(\%data);
    }

    my $ofeat_dir = "$org_dir/$genome/Features/$ftype";

    my $ftype = $data{$genome}->{ftype} = $ftype;

    print "Install $genome\n";
    if (-d $ofeat_dir)
    {
	my $otbl = $data{$genome}->{otbl};
	my $ofasta = $data{$genome}->{ofasta};

	if ($otbl eq '' or $ofasta eq '')
	{
	    die "Bad data for $genome\n" . Dumper(\%data);
	}
	my $time = time;

	copy($otbl, "$otbl.$time") or die "Error copying $otbl to $otbl.$time: $!";
	copy($ofasta, "$ofasta.$time") or die "Error copying $ofasta to $ofasta.$time: $!";

	&run("cat $tbl >> $otbl");
	&run("cat $fasta >> $ofasta");
    }
    else
    {
	&FIG::verify_dir($ofeat_dir);
	copy($tbl, "$ofeat_dir/tbl") or die "Error copying $tbl to $ofeat_dir/tbl: $!";
	copy($fasta, "$ofeat_dir/fasta") or die "Error copying $fasta to $ofeat_dir/fasta: $!";
    }
}

print "compute counts...\n";
&run("compute_genome_counts", @genomes);
print "load features...\n";
&run("load_features", @genomes);

sub validate_tbl
{
    my($tbl, $fasta, $genome) = @_;

    my($type, $max, $min);
    my %ids;
    my @feats;
    open(T, "<$tbl") or die "cannot open $tbl: $!";
    while (<T>)
    {
	chomp;
	my($id, $loc, $rest) = split(/\t/, $_, 3);

	if ($id =~ /^fig\|(\d+\.\d+)\.([^.]+)\.(\d+)$/)
	{
	    my($g, $t, $num) = ($1, $2, $3);

	    $max = ($num > $max or !defined($max)) ? $num : $max;
	    $min = ($num < $min or !defined($min)) ? $num : $min;

	    $ids{$id}++;
	    if ($g ne $genome)
	    {
		warn "$tbl: genome in fid $id mismatch to genome in file $genome\n";
		$errors++;
		next;
	    }
	    $type = $t unless defined($type);
	    if ($type ne $t)
	    {
		die "Inconsistent types in $tbl ($t != $type)\n";
	    }
	    push(@feats, [$id, $loc, $rest]);
	}
	else
	{
	    die "$tbl: cannot parse id $id\n";
	}
    }
    close(T);
    #
    # Scan the fasta looking for the matching ids.
    #
    my $errors;
    open(F, "<$fasta") or die "cannot open $fasta: $!";
    while (<F>)
    {
	chomp;
	if (/^>(\S+)/)
	{
	    my $id = $1;
	    if ($ids{$id})
	    {
		delete $ids{$id};
	    }
	    else
	    {
		warn "$id from $fasta not present in $tbl\n";
		$errors++;
	    }
	}
    }
    close(F);
    if (%ids)
    {
	my $str = join(" ", sort keys %ids);
	warn "IDs from tbl $tbl not found in fasta $fasta: $str\n";
	$errors++;
    }

    if ($errors)
    {
	die "validation of $tbl $fasta failed with $errors errors\n";
    }
	

    return($type, $min, $max, [@feats]);
}

sub run
{
    my $rc = system(@_);
    if ($rc != 0)
    {
	die "Error $rc running cmd: @_\n";
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3