[Bio] / StandaloneTools / reformat_contigs Repository:
ViewVC logotype

View of /StandaloneTools/reformat_contigs

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (annotate)
Wed Oct 26 17:09:18 2005 UTC (13 years, 11 months ago) by overbeek
Branch: MAIN
CVS Tags: HEAD
Changes since 1.2: +1 -0 lines
Lowercased output sequence. (NOTE: there is now also a SEED version of this tool.) -- /gdp

#!/usr/bin/perl -w
($] >= 5.004) || die "version is $] -- need perl 5.004 or greater";


# -*- perl -*-

use Pod::Text;
# use File::Basename;   $this_tool_name = basename($0);
if ((@ARGV == 1) && ($ARGV[0] =~ m/-help/))  {
    pod2text($0);  exit(0);
}

=pod

=over 5

=item Usage:     reformat_contigs [-help] [-v(erbose)] [-split(=len)] [-keep] [-width=line_width] [-[no]renumber] [-min=min_length] < fasta_in  > fasta_out

=item Function:  Reformats a contigs file to a uniform 50 chars per line,
                checking for non-IUPAC characters. Default behavior is to
                zero-pad the digits of names of the form 'Contig' followed
                by one or more digits to four digits.

                Optionally, renumbers
                contigs, and/or discards contigs that are too short.
		
		-split      Split contigs on runs of more than len (def=3) 'N's or 'X's
                -keep       Appends original contig ID to new ID if -renumber
                -width      line width (def=50)
                -renumber   Renumbers contigs starting from Contig0001
                -norenumber Does not renumber contigs
                -min        Minimum acceptable contig length (def=0)
                -max        Maximum acceptable contig length (def=999999999999)
		
=back

=cut

$split       = 0;
$keep        = 0;
$width       = 50;
$renumber    = 0;
$norenumber  = 0;
$min_length  = 0;
$max_length  = 999999999999;

$short_chars = 0;
$long_chars  = 0;
$nx_chars    = 0;

while (@ARGV)
{
    if    ($ARGV[0] =~ m/^-v/)
    {
	$ENV{VERBOSE} = 1;
    }
    elsif ($ARGV[0] =~ m/-split/)
    {
    	$split = 3;
	if ($ARGV[0] =~ m/-split=(\d+)/)   { $split = $1; }
	open(SCAFFOLD_MAP, ">scaffold.map") or die "Could not open scaffold.map";
    }
    elsif ($ARGV[0] =~ m/-keep/)
    {
	$keep = 1; 
    } 
    elsif ($ARGV[0] =~ m/-width=(\d+)/)
    {
	$width = $1;
    }
    elsif ($ARGV[0] =~ m/-(no)?renumber/)
    {
	if ($1) { $norenumber = 1 } else { $renumber = 1; }
    }
    elsif ($ARGV[0] =~ m/-min=(\d+)/)
    {
	$min_length = $1;
    }
    elsif ($ARGV[0] =~ m/-max=(\d+)/)
    {
	$max_length = $1;
    }
    else
    {
	print STDERR "bad arg $ARGV[0]\n\n";
	pod2text($0);
	die "aborted";
    }
    
    shift;
}


$num   = 0;
$bad   = 0;
$short = 0;
$long  = 0;
while ( (! eof(STDIN)) && (($head, $seqP) = &get_a_fasta_record()) )
{
    $num++;
    
    $$seqP = lc($$seqP);
    unless ($head && ($len = length($$seqP)))
    {
	$bad++;
	warn "bad record num=$num, line=$., head=$head, len=$len,";
	next;
    }
    
    if ($renumber)
    {
	$contig_id = "Contig" . ("0" x (4-length($num))) . $num; 
	if ($keep) { $head = "$contig_id\t$head"; } else { $head = $contig_id; }
    }
    else
    {
	unless ($norenumber)
	{
#	    $head =~ s/\./-/g;    #...needed to make CRITICA behave sanely...
	    $head =~ m/^(\S+)/;
	    $head =  $1;
	    
#	    if ($head =~ m/(Contig\d+)/)   { $head = $1; }
#	    if ($head =~ m/Contig(\d+)/)   { $head = "Contig" . ("0" x (4-length($1))) . $1; }
#	    if ($head =~ m/^(\d+)$/)       { $head = "Contig" . ("0" x (4-length($1))) . $1; }
#	    if ($head =~ m/\D+(\d+)\s*$/)  { $head = "Contig" . ("0" x (5-length($1))) . $1; }
	}
    }
    
    $head =~ m/^(\S+)/;
    $contig_id = $1;
    
    @Nruns = ();
    if ($split) {
	$_ =  $$seqP;
#	$_ =~ s/^[nx]{$split,}//i;
#	$_ =~ s/[nx]{$split,}$//i;
	while ($_ =~ m/([nx]{$split,})/gi)  { push(@Nruns, length($1)); }
	if (@Nruns > 1) { $runs = 'runs'; } else { $runs = 'run'; }
	print STDERR "$head contains ", (scalar @Nruns), " long $runs of Ns or Xs separating subcontigs, of lengths "
	    , join(', ', @Nruns), "\n" if (@Nruns && $ENV{VERBOSE});
    }
    
    print SCAFFOLD_MAP "$contig_id\n" if $split;
    if ($split && @Nruns)
    {   
	$last_pos   = 0;
	$subcon_num = 0;
	$prefix = $bridge = $suffix = "";
	while ($$seqP =~ m/[nx]{$split,}/gi)
	{
	    ($prefix, $bridge, $suffix) = ($`, $&, $');
	    # print STDERR "<$prefix> <$bridge> <$suffix>\n";
	    
	    $accept = 1;
	    if (length($prefix))
	    {
		++$subcon_num;
		$subcontig_id = "$contig_id.$subcon_num";
		
		$subcontig = substr($$seqP, $last_pos, length($prefix) - $last_pos);
		print SCAFFOLD_MAP "$subcontig_id\t", $last_pos, "\n";
		
		if (($len = length($subcontig)) < $min_length)
		{		    
		    print STDERR "   skipping len=$len $subcontig_id\n" if ($ENV{VERBOSE});
		    ++$short;
		    $short_chars += $len;
		    $accept = 0;
		}
		
		if (($len = length($subcontig)) > $max_length)
		{		    
		    print STDERR "   skipping len=$len $subcontig_id\n" if ($ENV{VERBOSE});
		    ++$long;
		    $long_chars += $len;
		    $accept = 0;
		}
		
		print STDERR "   accepting len=$len $subcontig_id\n" if $ENV{VERBOSE};
		&display_id_and_seq($subcontig_id, \$subcontig) if $accept;
	    }
	    $last_pos = pos($$seqP);
	}

	$accept = 1;
	if ($suffix)
	{
	    ++$subcon_num;
	    $subcontig_id = "$contig_id.$subcon_num";
	    
	    $subcontig = $suffix;
	    print SCAFFOLD_MAP "$subcontig_id\t", $last_pos,"\n";

	    if (($len = length($subcontig)) < $min_length)
	    {		    
		print STDERR "   skipping len=$len $subcontig_id\n" if ($ENV{VERBOSE});
		++$short;
		$short_chars += $len;
		$accept = 0;
	    }
	    
	    if (($len = length($subcontig)) > $max_length)
	    {		    
		print STDERR "   skipping len=$len $subcontig_id\n" if ($ENV{VERBOSE});
		++$long;
		$long_chars += $len;
		$accept = 0;
	    }
	    
	    print STDERR "   accepting len=$len $subcontig_id\n" if $ENV{VERBOSE};
	    &display_id_and_seq($subcontig_id, \$subcontig) if $accept;
	}	    
    }
    else
    {
	$accept = 1;
	
	if (($len = length($$seqP)) < $min_length)
	{		    
	    print STDERR "   skipping len=$len $head\n" if ($ENV{VERBOSE});
	    ++$short;
	    $short_chars += $len;
	    $accept = 0;
	}
	
	if (($len = length($$seqP)) > $max_length)
	{		    
	    print STDERR "   skipping len=$len $head\n" if ($ENV{VERBOSE});
	    ++$long;
	    $long_chars += $len;
	    $accept = 0;
	}
	
	&display_id_and_seq( $head, $seqP ) if $accept;
    }
    
    print SCAFFOLD_MAP "//\n" if $split;
}

print STDERR "\n" if ($skipped || $bad);
print STDERR "skipped $bad bad contigs.\n" if ($bad);
print STDERR "skipped $nx_chars 'N' or 'X' chars.\n" if ($nx_chars);
print STDERR "skipped $short_chars chars in $short contigs shorter than $min_length bp.\n" if ($short);
print STDERR "skipped $long_chars chars in $long contigs longer than $max_length bp.\n"   if ($long);
print STDERR "\n" if ($skipped || $bad);



sub get_a_fasta_record
{
    my ( $old_eol, $entry, @lines, $head, $seq ); 

    $old_eol = $/;
    $/ = "\n>";
    
    $entry =  <STDIN>;
    chomp $entry;
    @lines =  split( /\n/, $entry );
    $head  =  shift @lines;
    $head  =~ s/^>?//;

    foreach $ln (@lines)
    {
	$_  =  $ln;
	$ln =~ s/\s//g;
	
	print STDERR "$head: contains X's\n"    if ($ln =~ s/x/n/ig);
	print STDERR "$head: contains colons\n" if ($ln =~ s/://g);
	
	while ($ln =~ s/([^ACGTUMRWSYKBDHVN]+)/n/i)
	{
	    print STDERR ">$head:\tbad char $1 at ", pos($ln), "\n";
	}
    }
    
    $seq   =  join( "", @lines );
    $seq   =~ s/\cM//g;
    $seq   =~ tr/a-z/A-Z/;
    
    $/ = $old_eol;
    return ( $head, \$seq );
}


sub display_id_and_seq {
    my( $id, $seq, $fh ) = @_;
    my ( $i, $n, $ln );
    
    if (! defined($fh) )  { $fh = \*STDOUT; }
    
    print $fh ">$id\n";
    
    $n = length($$seq);
#   confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
    for ($i=0; ($i < $n); $i += $width)
    {
	if (($i + $width) <= $n)
	{
	    $ln = substr($$seq,$i,$width);
	}
	else
	{
	    $ln = substr($$seq,$i,($n-$i));
	}
	
	print $fh "$ln\n";
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3