[Bio] / MGRASTBackend / sequence_dereplication.pl Repository:
ViewVC logotype

View of /MGRASTBackend/sequence_dereplication.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Mon Jul 6 19:40:28 2009 UTC (10 years, 3 months ago) by arodri7
Branch: MAIN
CVS Tags: HEAD
first version, figfams pipeline

use strict;
use warnings;
use vars qw($opt_f $opt_c $opt_d $opt_v $opt_o);
use Getopt::Std;
use Digest::MD5;

my $usage = "$0 -f fasta_file [-c cuttoff] [-v] [-o output_fasta]\n";
$usage .= "-v prints out the unique fasta sequences to a fasta file\n";
  
getopts('f:c:d:vo:');

my $file = $opt_f || die "you did not enter a fasta file.\n$usage\n";

my $outputFasta;
if ($opt_o) {
  $outputFasta = $opt_o;
}
else {
  $outputFasta = "unique.fasta";
}

unless($file and -f $file){
  print STDERR "No file $file\n";
  print STDERR "$0 -f FILE -c PREFIX_LENGHTH -d DESTINATION_DIR\n";
  exit;
}

# set length for prefix
my $cutoff = $opt_c || 100;


# read fasta file and remove duplicates and clusters
my ($md5_hash , $seq , $dupl , $stats) = check_fasta($file , $cutoff);


my @distr;

for my $md5 (keys %$md5_hash){
  unless  ( $distr[  scalar @{ $md5_hash->{ $md5 } } ] ) {
     $distr[  scalar @{ $md5_hash->{ $md5 } } ] = 0;
  }
  $distr[  scalar @{ $md5_hash->{ $md5 } } ]++;
}

if ($opt_v){
  print "Remaining Sequences\t" , scalar ( keys %$seq ) ,"\n";
  print "Removed Sequences\t" , scalar ( keys %$dupl ) ,"\n";
  
  
  print "nr members in md5 set\tnr md5 sets\n";
  for (my $i=1 ; $i < @distr ;  $i++ ){
    print $i , "\t", $distr[$i] , "\n" if (  $distr[$i] );
  }
  
  open (FASTA, ">$outputFasta") || die "could not open file $outputFasta\n";
  foreach my $id (sort {$a cmp $b} keys %$seq) {
    print FASTA ">$id\n".$seq->{$id}."\n";
  }
  close FASTA;

}
else{
  
  print $stats->{ nr_seq_file }  , "\t";
  print $stats->{ nr_seq_unique } , "\t";
  print $stats->{ nr_seq_removed }  , "\t";
  print $stats->{ percent_seq_removed } , "\t";
  print  $stats->{ nr_seq_unique } +  $stats->{ nr_seq_removed } , "\n";

}



# reads a fasta file and returns three hash references
# 1. hash of md5 sums for all prefixes and all Sequences IDs for this prefix
# 2. hash of removed IDs and sequences
# 3. hash of remaining sequences 
# 4. hash of counts

sub check_fasta{
    my ($file , $cutoff) = @_;

    my $hash = {};        # md5 of prefix and IDs having this prefix 
    my $duplicates = {};  # duplicate/clustered sequences
    my $seq = {};         # cleand sequences
    my $nr_seq = 0;       # sequences in fasta file
    my $stats = {};       # statistics

    my $default = $/;     # saving default line end character;

    open(FASTA , $file ) or die "Can't open file $file \n!";
    
    # set line end
    $/=">";
    
    while(my $line = <FASTA>){
      
      my @entries = split "\n" , $line;
      my $end = pop @entries;
      my $id  = shift @entries;
      my $fasta = join "" , @entries;
      next unless ($fasta);
 
      # count entries
      $nr_seq++;

      # set seen sequences
      $seq->{$id} = $fasta;

      # get prefix
      my $prefix = substr( $fasta , 0 , $cutoff);

      unless (length $prefix == $cutoff){   
	# print STDERR $cutoff , "\t" , length $prefix , "\t" , length $fasta, "\t" , $prefix , "\n";
	# next;
      }

      # compute md5 and add ID to the group of sequences with same prefix
      my $md5 = Digest::MD5::md5_hex( lc $prefix );
      push @{ $hash->{ $md5 } } , $id ;
      
      if (scalar @{ $hash->{ $md5 } } > 1 )
	{
	  # check if sequence is a substring for all sequences having the same prefix
	  &compare_seq( $seq , $duplicates, $id ,  $hash->{ $md5 } );
	}
      
    }
    
    
    close(FASTA);
    
    # set line end back to default
    $/=$default;
    
    # get stats
    my $nr_removed    = scalar ( keys %$duplicates );
    my $nr_seq_unique = scalar ( keys %$seq );

    my $percent_removed = "not available";
    if ( $nr_seq and $nr_removed){
      $percent_removed = ($nr_removed*100)/$nr_seq ;
    }
      
    $stats->{ percent_seq_removed }  = $percent_removed;
    $stats->{ nr_seq_unique }        = $nr_seq_unique;
    $stats->{ nr_seq_removed }       = $nr_removed;
    $stats->{ nr_seq_file }          = $nr_seq;

    return ($hash , $seq , $duplicates , $stats);
  }



# compares to sequences and bins them togheter if the shorter sequence is a substring of the longer sequence
sub compare_seq{
  my ($seq , $dupl , $id , $ids) = @_;
  my $seq1 = $seq->{ $id };
  my $len1 = length $seq1;

  # check for all members having the same prefix
  foreach my $tid ( @$ids ){
    
    next if ($tid eq $id);
    next unless ($seq->{ $tid });

    my $seq2 = $seq->{ $tid };
    my $len2 = length $seq2;
    
    my $tseq1 = ''; # temporary sequence
    my $tseq2 = ''; # temporary sequence
    my $sid   = ''; # id for shortest sequence

    # compare sequence from shorter fragment with subsequence from longer fragment
    if ( $len1 < $len2){
      $tseq1 = $seq1 ;
      $tseq2 = substr ($seq2 , 0 , $len1);
      $sid = $id
    }
    else{
      $tseq1 = $seq2 ;
      $tseq2 = substr ($seq1 , 0 , $len2);
      $sid = $tid;
    }

    # delete shorter fragment and add it to duplicates hash
    if ( $tseq1 eq $tseq2){
      #print "Same $id , $tid\n$tseq1\n$tseq2\n";
      #print "Orig $id , $tid\n$seq1\n$seq2\n";
      $dupl->{ $sid } = $seq->{ $sid };
      delete $seq->{ $sid };
  
    }
  }
  
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3