[Bio] / Babel / bin / build_nr_from_md5.pl Repository:
ViewVC logotype

View of /Babel/bin/build_nr_from_md5.pl

Parent Directory | Revision Log


Revision 1.1 - (download) (as text) (annotate)
Tue Dec 1 15:22:43 2009 UTC (10 years, 5 months ago) by wilke
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_03052011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_release_3_0, mgrast_dev_03252011, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, mgrast_dev_04052011, mgrast_dev_02222011
initial babel checkin

use strict;
use warnings;
use vars qw($opt_f $opt_d $opt_o);
use Getopt::Std;
use Digest::MD5;
#use Bio::SearchIO;
  


# Command line options:
#   -d DIR   directory searched recursively (with find) for "*md5.fasta" files
#   -f FILE  a single fasta file; only consulted when -d is not a usable directory
#   -o DIR   directory that receives the merged "nr" file (default: /tmp/)
getopts('f:d:o:')
    or die "Usage: $0 [-d fasta_dir | -f fasta_file] [-o output_dir]\n";

my @files;

if ($opt_d and -d $opt_d){

    # List-form pipe open runs find without going through a shell, so a
    # directory name containing spaces or shell metacharacters is passed
    # through verbatim instead of being re-parsed (or executed).
    open(my $find_fh , '-|' , 'find' , $opt_d , '-name' , '*md5.fasta')
        or die "Can't run find on $opt_d: $!\n";
    chomp( @files = <$find_fh> );
    close($find_fh)
        or warn "find did not finish cleanly for $opt_d; file list may be incomplete\n";

    print STDERR join "\n" , @files , "\n";

}
elsif($opt_f and -f $opt_f){

    push @files , $opt_f;
}

$opt_o = "/tmp/" unless ($opt_o);

# The run still proceeds (and still creates the output file), but tell the
# user why the result is going to be empty.
warn "No input fasta files found; $opt_o/nr will be empty\n" unless (@files);


# Counts how many times each fasta header id was seen across all input files.
my $id_hash = {};

$id_hash = read_fasta( $id_hash , @files);

# my $md5 = Digest::MD5::md5_hex( uc $sequence );


# read_fasta( $hash , @files )
#
# Merges the given fasta files into one file, "$opt_o/nr", keeping only the
# first record seen for every header id.
#
# Parameters:
#   $hash   hash reference mapping header id => number of times it was seen;
#           it is updated in place (a fresh hash is used when it is undefined).
#   @files  fasta files to read, either as a flat list or as a single array
#           reference.
#
# Each record's header line (named "md5id" here; the caller selects
# "*md5.fasta" files) is used as the key.  The sequence lines of a record are
# concatenated and written on a single line.  Records without any sequence
# are skipped and not counted.
#
# Returns the hash reference.  Dies when the output file or an input file
# cannot be opened, or when the output file cannot be closed.
sub read_fasta{
    my ($hash , @fasta_files) = @_;

    # Also accept the file list as one array reference.
    @fasta_files = @{ $fasta_files[0] }
        if ( @fasta_files == 1 and ref( $fasta_files[0] ) eq 'ARRAY' );

    $hash = {} unless ($hash);

    open(my $nr_fh , '>' , "$opt_o/nr")
        or die "Can't open $opt_o/nr: $!\n";

    foreach my $file (@fasta_files){

        # Progress output: the parent directory name is reported as the
        # source.  The pattern does not match every path (e.g. a bare file
        # name), so fall back to empty strings instead of printing undef.
        my ($path,$source) = $file =~ /([\w\/]+)\/(\w+)\/[\w\.]+$/;
        $path   = '' unless defined $path;
        $source = '' unless defined $source;
        print  "Reading $file\n";
        print  "$source\n$path\n";

        open(my $fasta_fh , '<' , $file)
            or die "Can't open file $file: $!\n";

        # Read one fasta record per iteration.  "local" restores the normal
        # line separator when this loop body is left, even through die.
        local $/ = ">";

        while(my $record = <$fasta_fh>){

            # Remove the trailing ">" separator.  The last record of a file
            # has none, and chomp then leaves it untouched, so its final
            # sequence line is kept.
            chomp $record;

            my ($md5id , @seq_lines) = split "\n" , $record;
            my $fasta = join "" , @seq_lines;

            # Skips the empty chunk before the first ">" and header-only
            # records.
            next unless ($fasta);

            $hash->{ $md5id }++ ;

            # Only the first occurrence of an id is written.
            if ( $hash->{ $md5id }  < 2 ) {
                print {$nr_fh} ">$md5id\n$fasta\n";
            }
        }

        close($fasta_fh);
    }

    # Buffered write errors only surface at close time.
    close($nr_fh) or die "Can't close $opt_o/nr: $!\n";

    return $hash;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3