[Bio] / FigKernelScripts / reformat_contigs.pl Repository:
ViewVC logotype

View of /FigKernelScripts/reformat_contigs.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.27 - (download) (as text) (annotate)
Thu Oct 10 12:20:46 2019 UTC (4 weeks, 4 days ago) by gdpusch
Branch: MAIN
CVS Tags: HEAD
Changes since 1.26: +178 -160 lines
Major rewrite, processing sequences using 'gjoseqlib' and arguments using 'Getopt::Long::Descriptive'.

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

use File::Basename;
# $this_tool_name = basename($0);

use strict;
use warnings;
use Data::Dumper;

#use FIG;
use gjoseqlib;


use Getopt::Long::Descriptive;
my ($opt, $usage) = 
describe_options("%c %o",
		 ['help|h',    'Show this help message', { shortcircuit => 1 } ],
		 ['pod',       'Display POD document for this tool'],
		 ['verbose|v', "Run in 'verbose' mode (typically to debug a problem)"],
		 ['logfile=s', 'Optional logfile name' ],
		 ['input=s', ('Input filename '
			      ." (if not provided and there is no positional input-file argument,"
			      ." input defaults to STDIN)"
			      ) ],
		 ['output=s', ('Output filename (if not provided, and there is no positional input-file argument,'
			       .' output defaults to STDOUT)') ],
		 ['split:i', ("Split contigs at runs of IUPAC ambigs longer than split-value;"
			      ." defaults to no split. If --split, but no value is given, defaults to 4."
			      ." (As a side-effect, writes a scaffold-map to the output-path's directory"
			      ." if there is an '--output=file' or positional output-file)"
			      ), ],  
		 ['split-homopol:i', 'Split contigs at homopolymer runs of length >= value', { default => 50 } ],
		 ['max-badchar=i', 'Maximum number of invalid (non-IUPAC) characters allowed before aborting', { default => 500 } ],
		 ['width=i',   'Output-sequence width',            { default => 50 } ],
		 ['min=i',     'Minimum acceptable contig length', { default => 0 } ],
		 ['max=i',     'Maximum acceptable contig length', { default => 999999999999 } ],
		 ['renumber!', "Renumber contigs starting from, e.g., 'Contig0001'" ],
		 ['renumber-map=s',    'Save map from original to renumbered contig-IDs to filename'],
		 ['renumber-digits=i', 'Number of digits used in renumbered contig IDs', { default => 4 } ],
		 ['renumber-prefix=s', 'Prefix to use in renumbered contig names',       { default => 'Contig' } ],
		 ['keep',              'Appends original contig-ID as a FASTA comment to new ID if --renumber'],
		 ['remove-duplicates', 'Remove duplicate contig sequences'],
		 ['duplicates-file=s', 'Save information about removed duplicates in filename'],
		 { show_defaults => 1 }
		 );

if ($opt->help) {
    print STDERR $usage->text;
    exit(0);
}

use Pod::Text;
if ($opt->pod) {
    pod2text($0);
    exit(0);
}

my $verbose   = $opt->verbose;
if ($verbose) { $ENV{VERBOSE} = 1; }

my $logfilename = $opt->logfile;
my $fasta_in  = $opt->input;
my $fasta_out = $opt->output;

my $split = $opt->split || (defined($opt->split) ? 4 : 0);
print STDERR "split=$split\n";
my $longrepeat = $opt->split_homopol;
my $max_bad    = $opt->max_badchar;

my $width = $opt->width;
my $min_length = $opt->min;
my $max_length = $opt->max;

my $renumber = $opt->renumber;
my $renumber_map = $opt->renumber_map;
my $renumber_digits = $opt->renumber_digits;
my $renumber_prefix = $opt->renumber_prefix;
my $keep = $renumber ? $opt->keep : 0;

my $remove_duplicates = $opt->remove_duplicates;
my $duplicates_file   = $opt->duplicates_file;
my $num_duplicates_removed = 0;


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Grandfather in the "two-argument" invocation mode...
#-----------------------------------------------------------------------
my $trouble = 0;

if ((not $fasta_in) && (not $fasta_out) && (@ARGV == 2)) {
    $fasta_in  = shift;
    $fasta_out = shift;
}

my $fh_in;
if ($fasta_in) {
    if (!-s $fasta_in) {
        $trouble = 1;
        warn "Input file '$fasta_in' does not exist or has zero size\n";
    }
    else {
	open($fh_in,  q(<), $fasta_in)
	    || (warn("Could not read-open '$fasta_in' --- ERROR=$!") && ($trouble = 1));
    }
}
else {
    $fh_in = \*STDIN;
}

my $fh_out;
if ($fasta_out) {
    open($fh_out,  q(>), $fasta_out)
	|| (warn("Could not read-open '$fasta_out' --- ERROR=$!") && ($trouble = 1));
}
else {
    $fh_out = \*STDOUT;
}

my $basepath;
if ($fasta_out) {
    $basepath = dirname($fasta_out);
}
else {
    warn "No output-file given, so no scaffold-map will be produced\n" if $verbose;
}


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Open various auxilliary files (if requested)
#-----------------------------------------------------------------------
my $logfile_fh = \*STDERR;
if (defined($logfilename)) {
    if (not open($logfile_fh, q(>), $logfilename)) {
	$trouble = 1;
	warn "ERROR=$!: Could not write-open logfile=\'$logfilename\'";
    }
}

my $renumber_map_fh;
if ($renumber) {
    if (not open($renumber_map_fh, q(>), $renumber_map)) {
	$trouble = 1;
	warn "ERROR=$!: Cannot write-open renumber-map file \'$renumber_map\'";
    }
}

my $duplicates_fh;
if ($duplicates_file) {
    if (not open($duplicates_fh, ">", $duplicates_file)) {
	$trouble = 1;
	warn "ERROR=$1: Cannot write-open duplicates file \'$duplicates_file\'";
    }
}

my $scaffold_map;
if ($split && $basepath) {
    if (not open($scaffold_map, '>', "$basepath/scaffold.map")) {
	$trouble = 1;
	warn "ERROR=$1: Could not write-open scaffold-map file \'$basepath/scaffold.map\'";
    }
}


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Abort if any problems were detected
#-----------------------------------------------------------------------
if ($trouble || @ARGV) {
    die qq(There were invalid arguments: ), join(" ", @ARGV), qq(\n\n);
}
#=======================================================================



#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Main body of code
#-----------------------------------------------------------------------
my $num_contigs   = 0;
my $num_short = 0;
my $num_long  = 0;
my $num_bad_contigs   = 0;
my $max_contig_id_len = 0;

my $short_chars = 0;
my $long_chars  = 0;
my $nx_chars    = 0;
my $badchars  = 0;

my %id_by_content;  #...Used to determine if a contig sequence has already been seen


#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Old way of reading sequences reacord by record...
#=======================================================================
#while (my($head, $seqP) = &get_fasta_record($fh_in)) {
#-----------------------------------------------------------------------

while (my @record = &gjoseqlib::read_next_fasta_seq($fh_in)) {
    ++$num_contigs;
    my ($contig_id, $comment, $seq) = @record;
    
    $seq = lc($seq);
    my $len = length($seq);
    
    unless (defined($contig_id) && $len) {
	++$num_bad_contigs;
	print $logfile_fh "No contig-ID for record num=$num_contigs, line=$., len=$len" unless (defined($contig_id));
	print $logfile_fh "Zero-length record num=$num_contigs, line=$., contig_id=$contig_id" unless ($len);
	next;
    }
    
    my $prev_id = $id_by_content{$seq};
    
    my $head;
    if ($renumber) {
	my $orig = $contig_id . ($comment ? qq(\t$comment) : q());
	my $contig_id = $renumber_prefix . ("0" x ($renumber_digits - length($num_contigs))) . $num_contigs; 
	if ($keep) {
	    $head = "$contig_id\t$orig";
	}
	else {
	    $head = $contig_id;
	}
	print $renumber_map_fh "$orig\t$head\n" if $renumber;
    }
    else {
	if (not $contig_id) {
	    $trouble = 1;
	    ++$num_bad_contigs;
	    print $logfile_fh "Record $. has leading whitespace or no contig name\n";
	    next;
	}
    }
    
    
    if ($contig_id =~ m/\,/o) {
	++$num_bad_contigs;
	$trouble = 1;
	print STDERR "Record $. has a comma embedded in the contig name\n";
	next;
    }
    
    if ((my $l = length($contig_id)) > $max_contig_id_len) {
	$max_contig_id_len = $l;
    }

    if ($prev_id) {
	if ($remove_duplicates) {
	    print $duplicates_fh "$prev_id\t$contig_id\n" if $duplicates_fh;
	    ++$num_duplicates_removed;
	    next;
	}
    }
    else {
	$id_by_content{$seq} = $contig_id;
    }
    
    my $accept;
    my @ambig_runs = ();
    if ($split) {
	$_ =  $seq;
	while ($_ =~ m/([nbdhvrykmswx]{$split,}|a{$longrepeat,}|c{$longrepeat,}|g{$longrepeat,}|t{$longrepeat,})/gio) {
	    my $run = $1;
	    $nx_chars += length($run);
	    push @ambig_runs, $run;
	}

	my $runs;
	if (@ambig_runs > 1) { $runs = 'runs'; } else { $runs = 'run'; }
	
	if (defined($ENV{VERBOSE}) && @ambig_runs) {
	    if ($ENV{VERBOSE} == 1) {
		print STDERR "$head contains "
		    , (scalar @ambig_runs)
		    , " long $runs of ambiguity characters separating subcontigs, with run-lengths "
		    , join(qq(, ), (map { length($_) } @ambig_runs)), "\n"
		    if (@ambig_runs && $ENV{VERBOSE});
	    }
	    elsif ($ENV{VERBOSE} > 1) {
		print STDERR "$head contains "
		    , (scalar @ambig_runs)
		    , " long $runs of ambiguity characters separating subcontigs: "
		    , join(qq(, ), @ambig_runs), "\n"
		    if (@ambig_runs && $ENV{VERBOSE});
	    }
	}		
    }
    
    print $scaffold_map ($contig_id, "\n") if $scaffold_map;
    if ($split && @ambig_runs) {
	my $last_pos   = 0;
	my $subcon_num = 0;
	my($subcontig, $subcontig_id);
	my($prefix, $bridge, $suffix) = ("", "", "");
	while ($seq =~ m/[nbdhvrykmswx]{$split,}|a{$longrepeat,}|c{$longrepeat,}|g{$longrepeat,}|t{$longrepeat,}/gio) {
	    ($prefix, $bridge, $suffix) = ($`, $&, $');
	    print STDERR "$bridge\n";
	    
	    $accept = 1;
	    if (length($prefix)) {
		++$subcon_num;
		$subcontig_id = "$contig_id.$subcon_num";
		
		$subcontig = substr($seq, $last_pos, length($prefix) - $last_pos);
		print $scaffold_map ($subcontig_id, "\t", $last_pos, "\n") if $scaffold_map;
		
		if (($len = length($subcontig)) < $min_length) {		    
		    print STDERR "   skipping len=$len $subcontig_id\n" if ($ENV{VERBOSE});
		    ++$num_short;
		    $short_chars += $len;
		    $accept = 0;
		}
		
		if (($len = length($subcontig)) > $max_length) {		    
		    print STDERR "   skipping len=$len $subcontig_id\n" if ($ENV{VERBOSE});
		    ++$num_long;
		    $long_chars += $len;
		    $accept = 0;
		}
		
		print STDERR "   accepting prefix len=$len $subcontig_id\n" if $ENV{VERBOSE};
		&display_id_and_seq($subcontig_id, \$subcontig, $fh_out) if $accept;
	    }
	    $last_pos = pos($seq);
	}

	$accept = 1;
	if ($suffix) {
	    ++$subcon_num;
	    $subcontig_id = "$contig_id.$subcon_num";
	    
	    $subcontig = $suffix;
	    print $scaffold_map ($subcontig_id, "\t", $last_pos, "\n") if $scaffold_map;
	    
	    if (($len = length($subcontig)) < $min_length) {		    
		print STDERR "   skipping len=$len $subcontig_id\n" if ($ENV{VERBOSE});
		++$num_short;
		$short_chars += $len;
		$accept = 0;
	    }
	    
	    if (($len = length($subcontig)) > $max_length) {		    
		print STDERR "   skipping len=$len $subcontig_id\n" if ($ENV{VERBOSE});
		++$num_long;
		$long_chars += $len;
		$accept = 0;
	    }
	    
	    print STDERR "   accepting suffix len=$len $subcontig_id\n" if $ENV{VERBOSE};
	    &display_id_and_seq($subcontig_id, \$subcontig, $fh_out) if $accept;
	}
    }
    else {
	$accept = 1;
	
	if (($len = length($seq)) < $min_length) {		    
	    print STDERR "   skipping len=$len $head\n" if ($ENV{VERBOSE});
	    ++$num_short;
	    $short_chars += $len;
	    $accept = 0;
	}
	
	if (($len = length($seq)) > $max_length) {		    
	    print STDERR "   skipping len=$len $head\n" if ($ENV{VERBOSE});
	    ++$num_long;
	    $long_chars += $len;
	    $accept = 0;
	}
	
	&display_id_and_seq( $contig_id, \$seq, $fh_out ) if $accept;
    }
    print $scaffold_map "//\n" if $scaffold_map;


#...Abort if error-thresholds exceeded...    
    if ($badchars > $max_bad) {
	warn "Aborting reformat: reached $badchars bad characters\n";
	last;
    }
    
    if ($num_bad_contigs > $max_bad) {
	warn "Aborting reformat: reached $num_bad_contigs bad contigs\n";
	last;
    }
}

my ($s, $sa, $sb);
print STDERR "\n" if ($num_bad_contigs);
print STDERR "max id length $max_contig_id_len\n";

$s = ($num_duplicates_removed == 1) ? qq() : qq(s);
print STDERR "removed $num_duplicates_removed duplicate$s.\n" if ($num_duplicates_removed);

$s = ($num_bad_contigs == 1) ? qq() : qq(s);
print STDERR "skipped $num_bad_contigs bad contig$s.\n" if ($num_bad_contigs);

$s = ($badchars == 1) ? qq() : qq(s);
print STDERR "skipped $badchars invalid char$s.\n"   if ($badchars);

$s = ($nx_chars == 1) ? qq() : qq(s);
print STDERR "skipped $nx_chars ambiguity char$s.\n" if ($nx_chars);

$sa = ($short_chars == 1) ? qq() : qq(s);
$sb = ($num_short == 1) ? qq() : qq(s);
print STDERR "skipped $short_chars char$sa in $num_short contig$sb shorter than $min_length bp.\n" if ($num_short);

$sa = ($long_chars == 1) ? qq() : qq(s);
$sb = ($num_long == 1) ? qq() : qq(s);
print STDERR "skipped $long_chars char$sa in $num_long contig$sb longer than $max_length bp.\n"   if ($num_long);
print STDERR "\n" if ($num_bad_contigs);

if (defined($logfilename)) {
    close $logfile_fh;
}

exit($trouble || $num_bad_contigs);

#=======================================================================

sub display_id_and_seq {
    my( $id, $seq, $fh ) = @_;
    my ( $i, $n, $ln );
    
    if (! defined($fh) )  { $fh = \*STDOUT; }
    
    print $fh ">$id\n";
    
    $n = length($$seq);
#   confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
    for ($i=0; ($i < $n); $i += $width) {
	if (($i + $width) <= $n) {
	    $ln = substr($$seq,$i,$width);
	}
	else {
	    $ln = substr($$seq,$i,($n-$i));
	}
	
	print $fh "$ln\n";
    }
}


my $first_line  = 1;
my $input_eol_marker = $/;
sub get_fasta_record {
    my ( $fh ) = @_;
    my ( $old_eol, $entry, @lines, $head, $seq);
    
    if (not defined($fh))  { $fh = \*STDIN; }
    $old_eol = $/;
    $/ = "$input_eol_marker>";
    
    my @record = ();
    if (defined($entry = <$fh>)) {
	chomp $entry;
	@lines =  split( /$input_eol_marker/, $entry );
	while (@lines and (not defined($lines[0])))  { shift @lines; }
	
	$head  =  shift @lines;
	if ($first_line) {
	    $first_line = 0;
	    if (not ($head  =~ s/^\s*>//)) {
		$trouble = 1;
		warn $head;
		die "ERROR: File does not appear to be in FASTA format\n";
	    }
	}
	else {
	    if ($head  =~ s/^\s*>//) {
		$trouble = 1;
		warn $head;
		die "Spurious beginning-of record mark found in record $.\n";
	    }
	}
	
	foreach my $ln (@lines) {
	    $_  =  $ln;
	    $ln =~ s/\s//g;
	    
	    print STDERR "$head: contains X's\n"    if ($ln =~ s/x/n/ig);
	    print STDERR "$head: contains colons\n" if ($ln =~ s/://g);
	    
	    while ($ln =~ s/([^ACGTUMRWSYKBDHVN]+)/n/i) {
		$trouble = 1;
		++$badchars;
		print STDERR ">$head:\tbad char $1 at ", pos($ln), " at line $.\n";
	    }
	}
	
	$seq   =  join( "", @lines );
	$seq   =~ s/\cM//g;
	$seq   =~ tr/a-z/A-Z/;
	@record = ($head, \$seq);
    }
    
    $/ = $old_eol;
    return @record;
}


sub file_type {
#...No longer used; Save for archival purposes...
    my ($file_name) = @_;
    
    my $file_type;
    if (($file_type = `file '$file_name' | cut -f2 -d:`) && ($file_type =~ m/\S/o)) {
	my $saved_file_type = $file_type;
	
	$file_type =~ s/^\s+//o;   #...trim leading whitespace
	$file_type =~ s/\s+$//o;   #...trim trailing whitespace
	$file_type =~ s/, with very long lines//;
	
	print STDERR "file_type = $file_type\n" if $ENV{VERBOSE};
	
	if    ($file_type =~ m/^ASCII.*text$/) {
	    print STDERR "ASCII text file\n" if $ENV{VERBOSE};
	}
	elsif ($file_type =~ m/^ASCII.*text, with CR line terminators$/) {
	    print STDERR "CR terminated file\n" if $ENV{VERBOSE};
	    $input_eol_marker = "\cM";
	}
	elsif ($file_type =~ m/^ASCII.*text, with CRLF line terminators$/) {
	    print STDERR "CRLF terminated file\n" if $ENV{VERBOSE};
	    $input_eol_marker = "\cM\cJ";
	}
	elsif ($file_type =~ m/^ASCII.*text, with CR, LF line terminators$/) {
	    print STDERR "CR, LF terminated file\n" if $ENV{VERBOSE};
	    $input_eol_marker = "\cM\cJ";
	}
	elsif ($file_type =~ m/^ASCII.*text, with CRLF, LF line terminators$/) {
	    print STDERR "CRLF, LF terminated file\n" if $ENV{VERBOSE};
	    $input_eol_marker = "\cM\cJ\n";
	}
	else {
	    die "Could not handle file-type $saved_file_type";
	}
    }
}

__DATA__

=pod

=head1 NAME

reformat_contigs

=head1 SYNOPSIS
    
reformat_contigs [-help] [-logfile=logfilename] [-v(erbose)] [-split(=len)] [-keep] [-width=line_width] [-[no]renumber] [-min=min_length]  < fasta_in  > fasta_out

reformat_contigs [-help] [-logfile=logfilename] [-v(erbose)] [-split(=len)] [-keep] [-width=line_width] [-[no]renumber] [-min=min_length]    fasta_in    fasta_out

=head1 DESCRIPTION

Reformats a contigs file to a uniform number of characters per line
(default length of 50), checking for non-IUPAC characters.

Default behavior is to zero-pad the digits of IDs that are of the form
'Contig', immediately followed by one or more digits to four digits.

Optionally renumbers contig IDs to format "PrefixNNNN", where 'Prefix'
defaults to "Contig", "NNNN" are zero-padded digits, and the number of digits
defaults to 4.

Optionally splits contigs at runs of ambig chars (default 4 or more ambigs),
and homopolymer runs (default 50 or more of the same character).
If the script is invoked using the two-argument form or explicit
'-input' and '-output' arguments, a file 'scaffold_map' is written
in the same directory as the output file.

Optionally discards contigs that are shorter or longer than some threshold.

=head1 COMMAND-LINE OPTIONS
		
    --split       Split contigs on runs of ambigs longer than len (def=4)
                    (writes scaffold-map to output path if there is an explicit 'fasta_out' arg)
    --keep        Appends original contig ID as a 'comment' to the to new ID if -renumber
    --width       line width (def=50)
    --renumber    Renumbers contigs starting from Contig0001
    --norenumber  Does not renumber contigs  (Default)
    --renumber-map=filename	Save the a mapping of renumbered contig names to filename
    --renumber-digits=ndigits Number of digits used in renumbered contig names
    --renumber-prefix=prefix  Prefix to use instead of Contig in renumbered contig names
    --remove-duplicates   Remove duplicate sequences
    --duplicates-file     File in which to save information about removed duplicates
    --min         Minimum acceptable contig length (def=0)
    --max         Maximum acceptable contig length (def=999999999999)
    --logfile     Name of the optional logfile

=head1 AUTHORS

L<The SEED Project|http://www.theseed.org>

=cut


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3