[Bio] / FigKernelScripts / flatten_attribute_files.pl Repository:
ViewVC logotype

View of /FigKernelScripts/flatten_attribute_files.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Fri Jan 21 20:39:29 2011 UTC (9 years, 1 month ago) by redwards
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, mgrast_release_3_0, mgrast_dev_03252011, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.1: +1 -1 lines
working on all subdirectories, not just a set of them

# __perl__

=pod

=head1 flatten attributes

There is a problem that the transaction_log files for attributes get so bloated it takes for ever to parse them for a new load. This script flattens them without doing any db access, so is quick, and will significantly reduce your load time!

We are going to do this two ways: 

(i) If there are only ADD commands in the transaction_log, we're just going to hash them to get the unique values and then dump them out.

(ii) If there  are DELETE commands too, then we'll parse out all the ADDs and DELETEs and deal with them separately. This means that we have two copies of the data, but most of the files are redundant, and so it should not be that burdensome.

=cut




use strict;
use File::Find;

for (my $dircount=9; $dircount>0; $dircount--) {
	print STDERR "Directories $dircount*\n";
	my @tls;
	my @dirs = `ls -d $FIG_Config::organisms/$dircount*`;
	chomp @dirs;
	print STDERR "Found ", scalar(@dirs), " dirs\n";
	foreach my $d (@dirs) {
		my @tltemp = `find $d -name transaction_log`;
		chomp @tltemp;
		push @tls, @tltemp;
	}
	print STDERR "Found ", scalar(@tls), " tls\n";

	foreach my $file (@tls) {
		print STDERR "$file\n";
		# we make a backup of the file, and then operate on that, overwriting the original :)
		&backup_file($file);
		my $addonly = 1;
		my %all;
		my $data;
		my @comments;
		open(IN, "${file}~") || die "can't open ${file}~";
		while (<IN>) {
			next if (/^\s+$/); # skip blank lines.
			$all{$_}=1; # the hash of every line, including newlines and comments. If there are only adds, this is what we'll dump
			if (/^\s*\#/) {push @comments, $_; next}

			chomp;
			my @a=split /\t/;
			if ($#a > 4) {print STDERR "Too many columns at $_?\n"}
			# hash the data, setting the value to +1 for an add, -1 for a delete
			# whatever we get to at the end is the final command to print
			if ($a[0] eq "ADD") {
				$data->{$a[1]}->{$a[2]}->{$a[3]}->{$a[4]}=1;
			}
			elsif ($a[0] eq "DELETE") {
				$data->{$a[1]}->{$a[2]}->{$a[3]}->{$a[4]}=-1;
				$addonly=0;
			}
			else {
				print STDERR "What is a '$a[0]' at $_\n";
			}
		}

		open(OUT, ">$file") || die "can't write to $file";
		if ($addonly) {
			print OUT join("", keys %all);
		}
		else {
			foreach my $i (keys %$data) {
				foreach my $j (keys %{$data->{$i}}) {
					foreach my $k (keys %{$data->{$i}->{$j}}) {
						foreach my $l (keys %{$data->{$i}->{$j}->{$k}}) {
							if ($data->{$i}->{$j}->{$k}->{$l} == 1) {
								print OUT join("\t", "ADD", $i, $j, $k, $l), "\n";
							}
							elsif ($data->{$i}->{$j}->{$k}->{$l} == -1) {
								print OUT join("\t", "DELETE", $i, $j, $k, $l), "\n";
							}
							else {
								print STDERR "What is ", $data->{$i}->{$j}->{$k}->{$l}, " at $i and $j and $k in $file?\n";
							}
						}
					}
				}
			}
		}
		close OUT;
	}

}



sub backup_file {
	my $file=shift;
	`mv $file ${file}~`;
	return 1;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3