[Bio] / FigKernelScripts / fix_invalid_fasta.pl Repository:
ViewVC logotype

View of /FigKernelScripts/fix_invalid_fasta.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Wed Oct 10 23:39:40 2007 UTC (12 years, 4 months ago) by redwards
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Initial release of programs to check and fix valid fasta format

#__perl__

=pod

=head1 fix_invalid_fasta.pl

This command takes a fasta file as a name, and checks whether it is valid fasta file, and if not, tries to correct the errors. If successful it returns the filename of the correct fasta file on STDOUT. If it can't correct the error it returns an error message on STDERR.

Note that it uses is_valid_fasta to actually check the file, and the exit status of that to determine what needs to be done.
If the file is valid fasta it just returns the same name as the input!

=cut

use strict;
use warnings;
use constant MAX_FIXES => 5; # the maximum number of times we will try and fix a file
use constant VALIDATOR => 'validate_fasta'; # change this for the appropriate command

unless (-x VALIDATOR) {die VALIDATOR . " can not be executed"}

my $filename=shift || die "$0 <fasta file>";
unless (-e $filename) {die "$filename does not exist"}
my $originalfilename = $filename; # this will be kept so we can create an appropriate output filename from it

my $command = VALIDATOR . " $filename";
system $command;
my $error = ($? >> 8); # this gets the return status of the program. See 'perldoc perlvar' for more information

unless ($error) {print "$filename\n"; exit(0)} # there is no error in the file!

my $number_of_tries=0; # we are only going to try 5 times to fix the file



# now loop through the file, while we get errors, and fix it
# note that if there is an unknown error, $? will be -100
while ($number_of_tries < MAX_FIXES && $error > 0) 
{

#### Fixes for specfic errors.


# For a file not found we're screwed, so die
	if ($error == 1) 
	{
		print STDERR "File could not be found. Can not recover from this\n";
		exit(-1);
	}

# For the following errors, we can read and write the file, so we'll open input/output streams

#### Create a new fasta filename
# 
# We are going to add a number to the end of the exisiting filename, but make sure that we don't overwrite another file
# This will also allow us to recursively try to fix the file

	my $count=$$;
	while (-e "$originalfilename.$count.fa") {$count++}
	my $outputfile="$originalfilename.$count.fa";
	if ($filename =~ /gz$/) {open(IN, "gunzip -c $filename|") || die "Can't open a pipe to the file $filename"}
	else {open(IN, $filename) || die "can't find file even though it exists"}
	open(OUT, ">$outputfile") || die "Can't write to $outputfile";

# Warning messages while I am playing with this
# print STDERR "Fixing $filename to $outputfile since error is $error and try is  $number_of_tries\n";


# there are two ways to do this, evaluate the ifs and then loop the file, or loop the file
# and evaluate the ifs. I chose #1 as we only do each if once, rather than once per line.
# We are also not correcting multiple errors at once (at the moment)

	if ($error == 2)
	{
# file contains mac newlines
		while (<IN>)
		{
			s/\r/\n/g;
			s/\n+/\n/g;
			print OUT unless (/^\s+$/);
		}
		close IN; close OUT; # prevent us from trying to correct them further! 
	}
	elsif ($error == 3)
	{
# File contains spaces before identifier
		while (<IN>)
		{
			s/^\s+\>/>/;
			print OUT unless (/^\s+$/);
		}
		close IN; close OUT; # prevent us from trying to correct them further! 
	}
	elsif ($error == 4)
	{
# File contains numbers where there should be sequence
		while (<IN>)
		{
			unless (/^>/) # correct spaces too, even though this doesn't usually cause an error
			{
				s/[\s\d]+//g;
				$_ .= "\n";
			}
			print OUT unless (/^\s+$/);
		}
	}
	elsif ($error == 5)
	{
# Identifier line is empty
		my $linecount=1;
		while (<IN>)
		{
			if (/^>$/) {$_ = ">$linecount\n"; $linecount++}
			print OUT unless (/^\s+$/);
		}
		close IN; close OUT; # prevent us from trying to correct them further! 
	}
	elsif ($error == 6)
	{
# Sequence is empty
		my $seq; my $id;
		while (<IN>)
		{
			if (/^>/)
			{
				if ($seq)
				{
					$seq =~ s/[\s\d]//g;
					$seq =~ s/(.{60})/$1\n/g;
# just avoid adding unnecessary blank lines
					chomp($seq);
					chomp($id);
					if ($seq) {print OUT "$id\n$seq\n"}
				}
				$id=$_;
				undef $seq;
			}
			else {$seq .= $_}
		}
		if ($seq)
		{
			$seq =~ s/[\s\d]+//g;
			$seq =~ s/(.{60})/$1\n/g;
			chomp($seq);
			if ($seq) {print OUT "$id\n$seq\n"}
		}
		close IN; close OUT; # prevent us from trying to correct them further!
	}
	elsif ($error == 7)
	{
	# Duplicate Identifiers
		my $count=1;
		my %seen;
		while (<IN>)
		{
			if (/^>(\S+)/)
			{
				if ($seen{$1})
				{
					while ($seen{$count}) {$count++}
					s/^>/>$count /;
					$seen{$count}=1;
				}
				else {$seen{$1}=1}
			}
			print OUT unless (/^\s+$/);
		}
		close IN; close OUT; # prevent us from trying to correct them further!
	}

	$filename=$outputfile;
	$command = VALIDATOR . " $filename";
	system $command;
	$error = ($? >> 8); # this gets the return status of the program. See 'perldoc perlvar' for more information
		$number_of_tries++;

}

if ($error) {print STDERR "Can't fix the fasta, even after ", MAX_FIXES, " attempts\n"}
else  {print "$filename\n"}









MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3