[Bio] / FigKernelScripts / build_nr_md5.pl Repository:
ViewVC logotype

View of /FigKernelScripts/build_nr_md5.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Thu Jun 6 17:29:14 2013 UTC (6 years, 5 months ago) by olson
Branch: MAIN
CVS Tags: rast_rel_2014_0729, rast_rel_2014_0912, HEAD
md5 nr builder

########################################################################
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

#
# Build an NR using the MD5 checksums of the input sequences.
#

# usage: build_nr

use DB_File;

use Digest::MD5 'md5_hex';

my $tmpD = "$FIG_Config::temp";
my $tmp  = "$tmpD/tmp$$";
#my $tmp  = "$tmpD/tmp28968";

my $usage = <<END;
usage: build_nr [opts] SourcesInFile NR PEGsynonyms nr-len-btree fig-ids";
	-index file		Create indexed peg synonyms in file.t and file.f.
END

my($sources,$nrF,$synF,$oldsyns,$oldnr);

my $syns_file;
my $skip_duplicates;
my $no_duplicates;
my $emit_singleton_sets;
my $dup_show_max = 100;
my $rev_block_sort;
my($singleton_file, $singleton_fh, $singleton_index, %singleton);
my($pegsyn_index, %indext, %indexf, $pegsyn_tie_t, $pegsyn_tie_f);
my($pegsyn_tmp_file_f, $pegsyn_tmp_fh_f);
my($pegsyn_tmp_file_t, $pegsyn_tmp_fh_t);
my($len_btree_file, $fig_ids_file);
while ((@ARGV > 0) && ($ARGV[0] =~ /^-/))
{
    my $arg = shift @ARGV;
    if(0)
    {
    }
    else
    {
	die "Unknown option $arg\n";
    }
}

(
 ($sources = shift @ARGV) &&
 ($nrF     = shift @ARGV) &&
 ($synF    = shift @ARGV) &&
 ($len_btree_file    = shift @ARGV) &&
 ($fig_ids_file    = shift @ARGV)
)
    || die $usage;

use strict;
use FIG;

$| = 1;

#
# Read sources. 
#
# We compute a checksum for each, and if we have not seen the checksum before, we
# write to the generated NR.
#
# The peg.synonyms then comes from the list of ids we keep in the hash
# of md5 checksums.
#

my %seq_len;
my %md5_seen;

my $seq_count = 0;

open(NR, ">$nrF") or die;
open(PS, "| sort -S 1G -u > $synF.2col") or die;

my %len_btree;
unlink($len_btree_file);
my $len_btree_tie = tie %len_btree, 'DB_File', $len_btree_file, O_RDWR | O_CREAT, 0666, $DB_BTREE;

$len_btree_tie or die "Cannot create btree $len_btree_file: $!\n";

my %source_seen;
print "Reading sources\n";
open(SRC, "<", $sources) or die "Cannot read NR sources $sources: $!";
foreach my $file (<SRC>)
{
    chomp $file;
    next if $source_seen{$file};
    $source_seen{$file}++;

    print "$file\n";

    #
    # We need to check here if the file is a fasta from a live SEED.
    #
    # If it is, we may see multiple entries for any given ID. We need
    # to keep only the last one.
    #
    my $is_seed;
    my %id_seq;
    my %id_seen;
    if ($file =~ m,/Organisms/\d+\.\d+/Features/peg,)
    {
	$is_seed++;
#	print "$file is a seed file\n";
    }

    my $dup_count;
    if (open(TMP,"<$file"))
    {
	local $/ = "\n>";
    sequence:
	while (defined($_ = <TMP>))
	{
	    chomp;
	    if ($_ =~ /^>?(\S+)[^\n]*\n(.*)/s)
	    {
		my $id  =  $1;
		my $seq =  $2;

		if (!$is_seed && $id =~ /^fig\|/ && $file !~ /rast.*jobs/)
		{
		    die "BAD input: fig identifier in non-SEED data file $file";
		}

		#
		# It's possible, maybe, sometime (if we more extensively
		# parse say the NCBI NR format) that one ID line will
		# turn into multiple ids. Support that here.
		#
		my @pids = reformat_id($id);
			
		$seq =~ s/\s//gs;
		if (&good_seq($seq))
		{
		    if ($seq =~ s/>//g)
		    {
			print STDERR "$file has embedded >s\n";
		    }
		    for my $id (@pids)
		    {
			#
			# We can have SEED duplicates within the file.
			#
			if ($is_seed)
			{
			    $id_seq{$id} = $seq;
			}
			else
			{
			    handle_sequence($id, $seq);
			}
		    }
		}
	    }
	}
	close(TMP);

	if ($is_seed)
	{
	    for my $id (keys %id_seq)
	    {
		handle_sequence($id, $id_seq{$id});
	    }
	}
    }
    else
    {
	print STDERR "could not open $file\n";
    }
}
close(SRC);
#undef %id_seen;
close(ALL);
close(NR);
close(PS);

#
# Process the 2-column table form of the peg syn, and generate the
# peg.synonyms, nr-len.btree, and ids.fig.
#


open(PS2C, "<", "$synF.2col") or die;
open(PS, ">", $synF) or die;
open(FIG, ">", $fig_ids_file) or die;

my($id1, $len, $id2);
my($cur);
my $saw_fig;
while (<PS2C>)
{
    if (($id1, $len, $id2) = /^([^,]+),(\d+)\t(\S+)/)
    {
	if ($cur ne $id1)
	{
	    if ($cur)
	    {
		print PS "\n";
	    }
	    print PS "$id1,$len\t$id2,$len";
	    $len_btree{$id1} = $len;
	    $cur = $id1;
	    undef $saw_fig;
	}
	else
	{
	    print PS ";$id2,$len";
	}

	if (!$saw_fig and $id2 =~ /^fig\|(\d+\.\d+)/)
	{
	    print FIG "$id1\t$1\n";
	    $saw_fig++;
	}
    }
}
print PS "\n";

close(FIG);
close(PS);
untie %len_btree;

exit;

sub handle_sequence
{
    my($id, $seq) = @_;

    my $d = md5_hex(uc($seq));

    my $mid = "gnl|md5|$d";
    if (!exists($md5_seen{$d}))
    {
	$md5_seen{$d} = 1;
	&FIG::display_id_and_seq($mid, \$seq, \*NR);
	$seq_count++;
    }
    my $len = length($seq);
    print PS "$mid,$len\t$id\n";
}

sub good_seq {
    my($seq) = @_;

    my $xs = ($seq =~ tr/xX//);
    my $ln = length($seq);
    return (($ln - $xs) > 25);
}


#
# There's a fair bit of history here.
#
#
# In addition, it's possible, maybe, sometime (if we more extensively
# parse say the NCBI NR format) that one ID line will
# turn into multiple ids. Support that here by returning a list of
# reformatted ids.
#
#

sub reformat_id
{
    my($id) = @_;
    
		$id =~ s/\cA.*$//;
#		$id =~ s/^([^\|]+\|[^\|]+)\|.*$/$1/;  allow multiple |s
#		$id =~ s/\|\s*$//;

		#
		# for gi numbers, delete all the stuff after the gi|number
		#

 		$id =~ s/^(gi\|\d+).*$/$1/g;
# 		{
# 		    print "fixed $id\n";
# 		}
# 		else
# 		{
# 		    print "no $id\n";
# 		}
			

#		push(@pids, $id);

		#
		# Recognize NCBI's multi-bar id lines.
		#

#		my @bars = split(/\|/, $id);
#
#		while (@bars)
#		{
#		    #
#		    # Reassemble pairs of elements pulled off the front of the list of split-up elements.
#		    #
#		    push(@pids, join("|", splice(@bars, 0, 2)));
#		}
    return ($id);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3