# Initial release of programs to check and fix valid FASTA format.



=head1 validate_fasta.pl

Check whether a file is valid FASTA. Returns 0 if the file is valid, or a non-zero error code if it is not.
I have tried to catch most of the common mistakes.


=cut

use strict;
use warnings;

# validate_fasta.pl <fasta filename>
#
# Reads the named file (decompressing it through gunzip when the name ends in
# "gz") and checks it line by line. The first problem found is reported on
# STDERR as "<code>: <message>" and the script exits with that code. If no
# problem is found the script exits with status 0.

my $filename = shift;
# Test definedness/length rather than truth so that a file called "0" is accepted.
die "$0 <fasta filename>" unless defined $filename && length $filename;

# define some error codes. I just made these up. The rough idea is that based
# on the error code here, you can parse the file and correct it automatically.
my %error = (
	"File not found" => 1,
	"File contains mac newlines" => 2,
	"File contains spaces before identifier" => 3,
	"File contains numbers where there should be sequence" => 4,
	"Identifier line is empty" => 5,
	"Sequence is empty" => 6,
	"Duplicate identifiers" => 7,
);

unless (-e $filename) {caught_error("File not found")}

my $parsedfirstline = 0;   # becomes true once the first ">" line has been seen
my $seq = '';              # sequence text collected for the current record
my %identifier;            # identifiers seen so far, for duplicate detection

my $in;
if ($filename =~ /gz$/) {
	# List-form pipe open: the filename is passed to gunzip as a single
	# argument and is never interpreted by a shell.
	open($in, '-|', 'gunzip', '-c', $filename) || die "Can't open a pipe to the file $filename";
}
else {
	open($in, '<', $filename) || die "can't find file even though it exists";
}

while (my $line = <$in>) {
	if ($line =~ /\r/) {caught_error("File contains mac newlines")}
	if ($line =~ /^\s+\>/) {caught_error("File contains spaces before identifier")}

	if ($line =~ /^>/) {
		# A ">" followed only by whitespace carries no identifier at all.
		if ($line =~ /^>\s*$/) {caught_error("Identifier line is empty")}

		if ($parsedfirstline) {
			# there won't be any sequence when we read the first line,
			# so only check records that follow an earlier header
			$seq =~ s/[\s\d]//g;
			unless (length $seq) {caught_error("Sequence is empty")}
		}
		$parsedfirstline = 1;
		$seq = '';

		if ($line =~ /^>(\S+)/) {
			my $id = $1;
			if (exists $identifier{$id}) {caught_error("Duplicate identifiers")}
			$identifier{$id} = 1;
		}
	}
	else {
		if ($line =~ /\d/) {caught_error("File contains numbers where there should be sequence")}
		$seq .= $line;
	}
}
close($in);

# make sure there is some sequence in the last record
$seq =~ s/[\s\d]+//g;
unless (length $seq) {caught_error("Sequence is empty")}

exit 0;


# caught_error($message)
#
# Report a validation failure on STDERR and terminate the script. When
# $message is a key of %error the exit status is the associated code.
# Any other message is reported with the label -100 and the script exits
# with status 100, because a process exit status cannot be negative.
sub caught_error {
	my $err = shift;
	if (defined $error{$err}) {
		print STDERR "$error{$err}: $err\n";
		exit $error{$err};
	}
	print STDERR "-100: Caught an unusual error: $err\n";
	exit 100;
}

