[Bio] / FigKernelScripts / build_nr.pl Repository:
ViewVC logotype

View of /FigKernelScripts/build_nr.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.22 - (download) (as text) (annotate)
Mon Feb 18 22:32:20 2008 UTC (12 years, 1 month ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.21: +42 -15 lines
Add -no-duplicates support for build_anno_clearinghouse and build_nr.

########################################################################
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

# usage: build_nr

use DB_File;
use FileHandle;

my $tmpD = "$FIG_Config::temp";
my $tmp  = "$tmpD/tmp$$";
#my $tmp  = "$tmpD/tmp28968";

my $sort_size;

my $usage = <<END;
usage: build_nr [opts] SourcesInFile OldNR OldPegSyns NR PEGsynonyms";
	-sort-size size		Max amount of memory to devote to sorting
	-skip-duplicates	Remove duplicates (otherwise dups are a fatal error)
	-no-duplicates		Assert no duplicates in input - saves RAM, can be dangerous
				if dups do exist
	-synonyms syns-file	Haven't a clue. Doesn't seem to be used.
	-emit-singletons	Assign peg.synonyms ids to singleton sequences
	-rev			Sort candidate sets into reverse length order (produces
				synonym sets that are not all the same length).
	-singleton-file file	Write singletons to the given file
	-singleton-index file	Create singleton btree in the given file
	-index file		Create indexed peg synonyms in file.t and file.f.
END

my($sources,$nrF,$synF,$oldsyns,$oldnr);

my $syns_file;
my $skip_duplicates;
my $no_duplicates;
my $emit_singleton_sets;
my $dup_show_max = 100;
my $rev_block_sort;
my($singleton_file, $singleton_fh, $singleton_index, %singleton);
my($pegsyn_index, %indext, %indexf, $pegsyn_tie_t, $pegsyn_tie_f);
my($pegsyn_tmp_file_f, $pegsyn_tmp_fh_f);
my($pegsyn_tmp_file_t, $pegsyn_tmp_fh_t);

while ((@ARGV > 0) && ($ARGV[0] =~ /^-/))
{
    my $arg = shift @ARGV;
    if ($arg =~ /^-syn/)
    {
	$syns_file = shift;
    }
    elsif ($arg =~ /^-sort-size/)
    {
	my $sz = shift;
	$sort_size = "-S $sz";
    }
    elsif ($arg =~ /^-rev/)
    {
	$rev_block_sort++;
    }
    elsif ($arg =~ /^-skip-dup/)
    {
	$skip_duplicates++;
    }
    elsif ($arg =~ /^-no-dup/)
    {
	$no_duplicates++;
    }
    elsif ($arg =~ /^-emit-sin/)
    {
	$emit_singleton_sets++;
    }
    elsif ($arg =~ /^-singleton-file/)
    {
	$singleton_file = shift;
	$singleton_fh = new FileHandle(">$singleton_file");
	$singleton_fh or die "Cannot write singleton file $singleton_file: $!\n";
    }
    elsif ($arg =~ /^-singleton-index/)
    {
	$singleton_index = shift;
	unlink $singleton_index if $singleton_index;
	my $tie = tie %singleton, 'DB_File', $singleton_index, O_RDWR | O_CREAT, 0666, $DB_BTREE;
	$tie or die "Cannot write singleton index $singleton_index: $!\n";
    }
    elsif ($arg =~ /^-index/)
    {
	$pegsyn_index = shift;
	unlink "$pegsyn_index.t" if -f "$pegsyn_index.t";
	unlink "$pegsyn_index.f" if -f "$pegsyn_index.f";

	$pegsyn_tie_t = tie %indext, 'DB_File', "$pegsyn_index.t", O_RDWR | O_CREAT, 0666, $DB_BTREE;
	$pegsyn_tie_t or die "Cannot create $pegsyn_index.t: $!\n";
	$pegsyn_tie_f = tie %indexf, 'DB_File', "$pegsyn_index.f", O_RDWR | O_CREAT, 0666, $DB_BTREE;
	$pegsyn_tie_f or die "Cannot create $pegsyn_index.f: $!\n";

	#
	# We also open two temp files, one for the forward index, one for the reverse.
	# We sort into the files, then use the sorted files to insert the
	# index entries into the btree.
	#
	# Open the files for writing, to ensure we can do so. Then reopen as a
	# pipe into a sort.
	#

	$pegsyn_tmp_file_f = "$FIG_Config::temp/pegsyn_tmp_f";
	$pegsyn_tmp_fh_f = new FileHandle(">$pegsyn_tmp_file_f");
	$pegsyn_tmp_fh_f or die "Cannot open $pegsyn_tmp_file_f for writing: $!";
	close($pegsyn_tmp_fh_f);
	$pegsyn_tmp_fh_f = new FileHandle("| sort > $pegsyn_tmp_file_f");
	$pegsyn_tmp_fh_f or die "Cannot open sort pipe to $pegsyn_tmp_file_f: $!";
	
	$pegsyn_tmp_file_t = "$FIG_Config::temp/pegsyn_tmp_t";
	$pegsyn_tmp_fh_t = new FileHandle(">$pegsyn_tmp_file_t");
	$pegsyn_tmp_fh_t or die "Cannot open $pegsyn_tmp_file_t for writing: $!";
	close($pegsyn_tmp_fh_t);
	$pegsyn_tmp_fh_t = new FileHandle("| sort > $pegsyn_tmp_file_t");
	$pegsyn_tmp_fh_t or die "Cannot open sort pipe to $pegsyn_tmp_file_t: $!";

	print "Writing pegsyn index $pegsyn_index\n";
    }
    else
    {
	die "Unknown option $arg\n";
    }
}

(
 ($sources = shift @ARGV) &&
 ($oldnr   = shift @ARGV) &&
 ($oldsyns = shift @ARGV) &&
 ($nrF     = shift @ARGV) &&
 ($synF    = shift @ARGV)
)
    || die $usage;

if ($no_duplicates and $skip_duplicates)
{
    die "The -no-duplicates and -skip-duplicates arguments are mutually exclusive\n";
}

use strict;
use FIG;
my($file,$id,$seq,$n,%type,$curr,$last,@ids,$t1,$t2,$i,%syn,$syns,%same,$extra);

$| = 1;

### One pass through the old peg.synonyms to get the "next available xxx id"
###
print "Reading old syns\n";
my $next_syn = 0;
if (open(OLDSYN,"<$oldsyns"))
{
    while (defined($_ = <OLDSYN>))
    {
	if ($_ !~ /^xxx\d+/)
	{
	    die "BAD peg.synonyms: $oldsyns: SEE $_";
	}

	while ($_ =~ /xxx(\d+)/g)
	{
	    $next_syn = ($1 > $next_syn) ? $1 : $next_syn;
	}
    }
    close(OLDSYN);
}
$next_syn++;

open(ALL,">$tmp.all") || die "could not open $tmp.all: $!";
### Pick up old existing xxx entries
###

print "Reading old NR\n";
open(OLDNR,"<$oldnr") || die "could not open $oldnr: $!";

##############
### Now build a fasta file that sums all of the sources.  We add new xxx entries after this
### first capture the xxx entries from the old one -- they never go away
###
{
    local $/ = "\n>";
    while (defined($_ = <OLDNR>))
    {
	chomp;
	if ($_ =~ /^>?(xxx\d+)[^\n]*\n(.*)/s)
	{
	    $id  =  $1;
	    $seq =  $2;
	    $seq =~ s/\s//gs;
	    print ALL ">$id\n$seq\n";
	}
    }
    close(OLDNR);
}

#
# Check for duplicate ids. These should not happen, but we have
# huge badness if they do.
#
# For large jobs we can't afford the memory for keeping all sequences
# in RAM, and we can't afford the time to keep them in a tie.
#
# Instead, we just watch for dups within the files.
#
# This is pretty imposing as well. A better solution might be to assume
# we have no dups, and build stuff as normal. In parallel with this
# processing, we open a pipe to a sort | uniq -c and. When we finish
# writing the seqs, we spin through the output of the sort to find
# any duplicate entries. We should use much less memory this way.
#

#my %id_seen;
#my $id_file = "$FIG_Config::temp/idseen.$$.tmp";
#tie %id_seen, 'DB_File', $id_file, O_CREAT | O_RDWR, 0666, $DB_HASH;

print "Reading sources\n";
foreach $file (`sort -u $sources`)
{
    chomp $file;

    #
    # We need to check here if the file is a fasta from a live SEED.
    #
    # If it is, we may see multiple entries for any given ID. We need
    # to keep only the last one.
    #
    my $is_seed;
    my %id_seq;
    my %id_seen;
    if ($file =~ m,/Organisms/\d+\.\d+/Features/peg,)
    {
	$is_seed++;
#	print "$file is a seed file\n";
    }

    my $dup_count;
    if (open(TMP,"<$file"))
    {
	local $/ = "\n>";
    sequence:
	while (defined($_ = <TMP>))
	{
	    chomp;
	    if ($_ =~ /^>?(\S+)[^\n]*\n(.*)/s)
	    {
		$id  =  $1;
		$seq =  $2;

		#
		# It's possible, maybe, sometime (if we more extensively
		# parse say the NCBI NR format) that one ID line will
		# turn into multiple ids. Support that here.
		#
		my @pids = reformat_id($id);
			
		$seq =~ s/\s//gs;
		if (&good_seq($seq))
		{
		    if ($seq =~ s/>//g)
		    {
			print STDERR "$file has embedded >s\n";
		    }
		    for my $id (@pids)
		    {
			#
			# We can have SEED duplicates within the file.
			#
			if ($is_seed)
			{
			    $id_seq{$id} = $seq;
			}
			else
			{
			    if (!$no_duplicates)
			    {
				if ($id_seen{$id})
				{
				    if ($skip_duplicates)
				    {
					$dup_count++;
					if ($dup_count == $dup_show_max)
					{
					    warn "Duplicate ID $id  found in $file (Prior was in $id_seen{$id})\n";
					    warn "(Not showing any more duplicates)\n";
					}
					elsif ($dup_count < $dup_show_max)
					{
					    warn "Duplicate ID $id  found in $file (Prior was in $id_seen{$id})\n";
					}
					next sequence;
				    }
				    else
				    {
					die "Duplicate ID $id  found in $file (Prior was in $id_seen{$id})\n";
				    }
				}
				$id_seen{$id}= $file;
			    }
			    
			    print ALL ">$id\n$seq\n";
			}
		    }
		}
	    }
	}
	close(TMP);

	if ($is_seed)
	{
	    for my $id (keys %id_seq)
	    {
		print ALL ">$id\n$id_seq{$id}\n";
	    }
	}
    }
    else
    {
	print STDERR "could not open $file\n";
    }
}
#undef %id_seen;
close(ALL);
###
### End of the build from the given sources -- we still lack some xxx entries, but that comes later
###
### Build entries with seq that is reversed, the first amino acid trimmed, etc.  This will
### allow us to sort them into "groups of equivalent sequences".
###
open(ALL,"<$tmp.all") || die "could not reopen $tmp.all: $!";
my $sort_cmd = " sort $sort_size -T $tmpD >$tmp.normalized";
open(NORMALIZED,"| $sort_cmd")
    || die "could not open sort cmd $sort_cmd: $!";

print "writing normalized sources\n";
while (defined($_ = <ALL>) && ($_ =~ /^>(\S+)/))
{
    $id = $1;
    $seq = <ALL>;
    $n = length($seq);
    $seq =  reverse(substr($seq,1,$n-2));
    $seq =~ tr/a-z/A-Z/;
    print NORMALIZED "$seq\t$id\n";
}
close(ALL);
close(NORMALIZED);
xxx:
print "Reading normalized and processing\n";
(open(NORMALIZED,"<$tmp.normalized") && (defined($last = <NORMALIZED>)))
    || die "could not open $tmp.normalized: $!";

%type  = ( "xxx"   => 0,
	   "sp"    => 1,
	   "uni"   => 2,
	   "pir"   => 3,
	   "pirnr" => 4,
	   "gi"    => 5,
	   "fig"   => 6,
	   "tr"    => 7,
	   "tn"    => 8,
	   "nrl"   => 10
);

### Now we build the new synonyms, adding xxx entries to the NR when necessary
###

open(SYN,">$synF") 
    || die "could not open $synF: $!";

#
# Hash mapping from an existing peg id to a new xxx id.
# Used to add the new sequences to the NR.
#
my %add_to_all;

#
# Size of the tied %add_to_all on the big runs was under 400M, should be able to stay in memory.
#
#my $all_file = "$FIG_Config::temp/all.$$.tmp";
#tie %add_to_all, 'DB_File', $all_file, O_CREAT | O_RDWR, 0666, $DB_HASH;

my $min_ids_in_set = $emit_singleton_sets ? 0 : 1;

while ($last)
{
    ($curr,$id) = split(/\t/,$last);
    chop $id;
    @ids = ([$id,length($curr)+1,$curr]);
    my $last_sequence = undef;
    while (defined($last = <NORMALIZED>) && 
	   (($seq,$id) = split(/\t/,$last)) &&
	   (&FIG::same_seqs($curr,$seq) || ($last_sequence && &FIG::same_seqs($seq,$last_sequence))))
    {
	chomp $id;
	push(@ids,[$id,length($seq)+1,$seq]);
	$last_sequence = $seq;
    }

    if (@ids == 1)
    {
	print $singleton_fh "$ids[0]->[0]\n" if ($singleton_fh);
	$singleton{$ids[0]->[0]} = 1 if $singleton_index;
    }


    if (@ids > $min_ids_in_set)
    {
	&dump_a_set(\@ids,\$next_syn,\*SYN,\%add_to_all,\%same);
    }
}
close(SYN);
close(NORMALIZED);
#unlink("$tmp.normalized");

###
### Now we build the new NR by extracting from the complete set of input sequences.
##
open(ALL,"<$tmp.all") || die "could not open $tmp.all: $!";
open(NR,">$nrF") || die "could not open $nrF: $!";

{
    local $/ = "\n>";
    while (defined($_ = <ALL>))
    {
	chomp;
	if ($_ =~ /^>?(\S+)[^\n]*\n(.*)/s)
	{
	    $id  =  $1;
	    $seq =  $2;
	    if (! $same{$id})   # drop entries from the peg.synonyms that are not principle synonyms
	    {
		&display_id_and_seq($id,\$seq,\*NR);
	    }
	    
	    if (my $new_ps = $add_to_all{$id})
	    {
		&display_id_and_seq($new_ps,\$seq,\*NR);
	    }
	}
    }
}
close(NR);
close(ALL);
#unlink("$tmp.all");

if ($pegsyn_index)
{
    print "Awaiting finish of sort of from-index\n";
    if (!close($pegsyn_tmp_fh_f))
    {
	warn "Error closing pipe to $pegsyn_tmp_file_f: $! $?\n";
    }
    print "Done\n";
    
    print "Awaiting finish of sort of to-index\n";
    if (!close($pegsyn_tmp_fh_t))
    {
	warn "Error closing pipe to $pegsyn_tmp_file_t: $! $?\n";
    }
    print "Done\n";
    
    print "Writing to-index\n";
    open(F, "<$pegsyn_tmp_file_f") or die  "Cannot open $pegsyn_tmp_file_f: $!";
    while (<F>)
    {
	chomp;
	my($k,$v) = split(/\t/);
	$indexf{$k} = $v;
    }
    close(F);
    
    print "Writing from-index\n";
    open(T, "<$pegsyn_tmp_file_t") or die  "Cannot open $pegsyn_tmp_file_t: $!";
    while (<T>)
    {
	chomp;
	my($k,$v) = split(/\t/);
	$indext{$k} = $v;
    }
    close(T);
}


if ($pegsyn_index)
{
    untie %indext;
    untie %indexf;
    undef $pegsyn_tie_t;
    undef $pegsyn_tie_f;
}

if ($singleton_index)
{
    untie %singleton;
}

$singleton_fh->close() if $singleton_fh;


#
# There's a fair bit of history here.
#
#
# In addition, it's possible, maybe, sometime (if we more extensively
# parse say the NCBI NR format) that one ID line will
# turn into multiple ids. Support that here by returning a list of
# reformatted ids.
#
#

sub reformat_id
{
    my($id) = @_;
    
		$id =~ s/\cA.*$//;
#		$id =~ s/^([^\|]+\|[^\|]+)\|.*$/$1/;  allow multiple |s
#		$id =~ s/\|\s*$//;

		#
		# for gi numbers, delete all the stuff after the gi|number
		#

 		$id =~ s/^(gi\|\d+).*$/$1/g;
# 		{
# 		    print "fixed $id\n";
# 		}
# 		else
# 		{
# 		    print "no $id\n";
# 		}
			

#		push(@pids, $id);

		#
		# Recognize NCBI's multi-bar id lines.
		#

#		my @bars = split(/\|/, $id);
#
#		while (@bars)
#		{
#		    #
#		    # Reassemble pairs of elements pulled off the front of the list of split-up elements.
#		    #
#		    push(@pids, join("|", splice(@bars, 0, 2)));
#		}
    return ($id);
}

sub display_id_and_seq {
    my( $id, $seq, $fh ) = @_;
    
    if (! defined($fh) )  { $fh = \*STDOUT; }
    
    print $fh ">$id\n";
    &display_seq($seq, $fh);
}

sub display_seq {
    my ( $seq, $fh ) = @_;
    my ( $i, $n, $ln );
    
    if (! defined($fh) )  { $fh = \*STDOUT; }

    $n = length($$seq);
#   confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
    for ($i=0; ($i < $n); $i += 60)
    {
	if (($i + 60) <= $n)
	{
	    $ln = substr($$seq,$i,60);
	}
	else
	{
	    $ln = substr($$seq,$i,($n-$i));
	}
	print $fh "$ln\n";
    }
}

sub good_seq {
    my($seq) = @_;

    my $xs = ($seq =~ tr/xX//);
    my $ln = length($seq);
    return (($ln - $xs) > 25);
}

#
# $ids is a list of lists [$id, $len, $seq]
#
sub dump_a_set {
    my($ids,$next_synP,$fh_syn,$add_to_all,$same) = @_;

    my @tmp = sort { if     (($a->[0] =~ /^xxx/) && ($b->[0] !~ /^xxx/))    { -1 }
		     elsif  (($b->[0] =~ /^xxx/) && ($a->[0] !~ /^xxx/))    {  1 }
		     elsif ($rev_block_sort)
		     {
		         ($b->[1] <=> $a->[1]) or &by_ids($a->[0],$b->[0]);
		     }
		     else
		     {
		         ($a->[1] <=> $b->[1]) or &by_ids($a->[0],$b->[0]);
		     }
		   } @$ids;


    my @ids = ();
    my %seen;
    foreach my $x (@tmp)
    {
	if (! $seen{$x->[0]})
	{
	    push(@ids,$x);
	    $seen{$x->[0]} = 1;
	}
    }

#    print Dumper(\@ids);
    while (my $tuple = shift @ids)
    {
	my($ps,$ln,$seq1) = @$tuple;
	my @set = ($tuple);
	my @left = ();
	foreach my $tuple2 (@ids)
	{
	    my($id2,$ln2,$seq2) = @$tuple2;
	    if ($ln2 > $ln)
	    {
		push(@left,$tuple2);
	    }
	    elsif (&FIG::same_seqs($seq1,$seq2))
	    {
		push(@set,$tuple2);
	    }
	    else
	    {
		push(@left,$tuple2);
	    }
	}

	#
	# Singletons don't go into peg.synonyms.
	#
	if (@set < 2)
	{
	    if (!$emit_singleton_sets)
	    {
		@ids = @left;
		next;
	    }
	}

	if ($ps !~ /^xxx/)
	{
	    my $new_ps = "xxx" . ('0' x (8 - length($$next_synP))) . $$next_synP;
	    unshift(@set,[$new_ps, $ln, $seq1]);
	    $add_to_all{$ps} = $new_ps;
	    # print $fh_all ">$new_ps\n$seq1\n";
	    $$next_synP++;
	}

	#
	# $set[0] is the xxx identifier for this set. The code directly above
	# allocates a new identifier in the event that we came in without one assigned.
	#

	for (my $i=1; ($i < @set); $i++) { $same->{$set[$i]->[0]} = 1 }

	my $syns = join(";",map { $_->[0] . "," . $_->[1] } @set[1..$#set]);
	print $fh_syn "$set[0]->[0],$set[0]->[1]\t$syns\n";

	if ($pegsyn_index)
	{
	    #$indext{$set[0]->[0]} = "$set[0]->[1]:$syns";
	    #for my $s (@set[1..$#set])
	    #{
	    #$indexf{$s->[0]} = "$s->[1]:$set[0]->[0]";
	    #}
	    #
	    # Defer the actual btree inserts until later. Instead,
	    # write to the sort pipe that goes into the tmpiles.
	    print $pegsyn_tmp_fh_t "$set[0]->[0]\t$set[0]->[1]:$syns\n";
	    for my $s (@set[1..$#set])
	    {
		print $pegsyn_tmp_fh_f "$s->[0]\t$s->[1]:$set[0]->[0]\n";
		
	    }
	    
	}

	@ids = @left;
    }
}

sub by_ids {
    my($id1,$id2) = @_;
    my($t1,$t2);

    $id1 =~ /^([^\|]+)/; $t1 = $1;
    $id2 =~ /^([^\|]+)/; $t2 = $1;
    if (defined($t1) && defined($t1 = $type{$t1}) &&
	defined($t2) && defined($t2 = $type{$t2}))
    {
	return (($t1 <=> $t2) or ($id1 cmp $id2));
    }
    elsif (($id1 =~ /\|/)  && ($b !~ /\|/))   {  return 1 }
    elsif (($id2 =~ /\|/)  && ($a !~ /\|/))   {  return -1 }
    elsif (defined($t1))                      {  return -1 }
    elsif (defined($t2))                      {  return 1 }
    else  { return ($id1 cmp $id2) }
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3