[Bio] / StandaloneTools / split_into_taxa.pl Repository:
ViewVC logotype

View of /StandaloneTools/split_into_taxa.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Wed Aug 30 21:31:34 2006 UTC (13 years, 1 month ago) by overbeek
Branch: MAIN
CVS Tags: HEAD
Script to split concatenated RefSeq files into taxa. -- /gdp

#!/usr/bin/perl -w

# usage: split_into_taxa Taxa Files 

# The first command-line argument names the output directory; whatever is
# left in @ARGV afterwards is the list of input files.
$taxon_dir = shift @ARGV;
die "usage: split_into_taxa Taxa Files " unless $taxon_dir;

# Load taxon names from the taxonomy dump into %name_of (tax id => name).
#
# Each row is split on the "\t|\t" field separator into tax id, name,
# unique name and name class.  The dump can hold several rows per tax id
# (scientific name, synonyms, common names, ...), so the row whose class is
# "scientific name" wins; ids that never get such a row keep the last name
# seen.  Within a row the unique name is used when it is non-empty,
# otherwise the plain name.
#
# Dies when the file cannot be opened or yields no names at all.
my $names_file = '/disks/project/overbeek/RefSeq/names.dmp';

print STDERR "Reading TaxDump names...\n";
open(my $names_fh, '<', $names_file)
    || die "could not read taxdump names, $names_file: $!";

my %has_scientific_name;
while (defined(my $line = <$names_fh>))
{
    # Drop the "\t|\n" row terminator so the last field comes out clean.
    $line =~ s/\t\|\s*\z//;

    my ($id, $name, $unique_name, $class) = split /\t\|\t/, $line;
    next unless defined $id;

    my $label = $unique_name ? $unique_name : $name;

    if (defined($class) && $class eq 'scientific name')
    {
        $name_of{$id}             = $label;
        $has_scientific_name{$id} = 1;
    }
    elsif (!$has_scientific_name{$id})
    {
        # Only a fallback: never overwrite a scientific name with a synonym.
        $name_of{$id} = $label;
    }
}
close($names_fh);

die "could not read taxdump names, $names_file" unless %name_of;

# Create the output directory.  An already existing directory is treated as
# an error too; $! in the message tells that case apart from, e.g., a
# permission problem.
mkdir($taxon_dir, 0777) || die "could not make $taxon_dir: $!";

# Return the first run of digits found in a file name, or -1 when the name
# contains none.  Checking the match result avoids picking up a stale $1
# left over from an earlier successful match.
sub file_number
{
    my ($name) = @_;
    return ($name =~ m/(\d+)/) ? $1 : -1;
}

# Return the distinct taxon ids that appear in a record as ="taxon:<digits>,
# in order of first appearance.  Returns an empty list when there are none.
sub taxa_of_record
{
    my ($record) = @_;
    my (%seen, @ids);
    while ($record =~ /=\"taxon:(\d+)/g)
    {
        push @ids, $1 unless $seen{$1}++;
    }
    return @ids;
}

# Split every input file into per-taxon files under $taxon_dir.
#
# Files are processed in order of the first number in their names, ties
# broken by plain string comparison.  Each file is decompressed through
# zcat and read one record at a time, a record ending at a line that holds
# only "//".  A record is appended to $taxon_dir/<id>, where <id> is the
# first taxon id it mentions; records that mention no taxon id are dropped.
# Records mentioning more than one distinct taxon are additionally reported
# on STDERR and copied to STDOUT.
foreach my $file (sort { (file_number($a) <=> file_number($b)) || ($a cmp $b) } @ARGV)
{
    print STDERR "splitting $file\n";

    # List-form pipe open: the file name goes to zcat as a single argument
    # and is never interpreted by a shell.
    open(my $in, '-|', 'zcat', $file) || die "could not open $file: $!";

    {
        # Record separator for this read loop only.
        local $/ = "\n//\n";

        while (defined(my $record = <$in>))
        {
            my @taxa = taxa_of_record($record);

            if (@taxa > 1)
            {
                my @labels = map {
                    my $name = defined($name_of{$_}) ? $name_of{$_} : '';
                    "$_ ($name)";
                } @taxa;
                print STDERR "Multiple-taxon record in $file: ", join(", ", @labels), "\n";
                print STDOUT $record;
            }

            next unless @taxa;

            # The record is filed under the first taxon it mentions.  The
            # handle is opened and closed per record, so the number of
            # simultaneously open files stays constant.
            my $taxon = $taxa[0];
            open(my $out, '>>', "$taxon_dir/$taxon")
                || die "could not open $taxon_dir/$taxon: $!";
            print {$out} $record
                or die "could not write $taxon_dir/$taxon: $!";
            close($out) || die "could not write $taxon_dir/$taxon: $!";
        }
    }

    # close() on a pipe reports the child's failure; warn rather than die so
    # that one bad input file does not abort the remaining ones.
    close($in) || warn "zcat failed on $file (wait status $?)\n";
}



MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3