[Bio] / FigKernelScripts / valid_fasta_file.pl Repository:
ViewVC logotype

View of /FigKernelScripts/valid_fasta_file.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Thu Feb 18 22:08:09 2010 UTC (9 years, 8 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Check to ensure a fasta file is valid.

#
# Determine if the given file is a valid fasta file and attempt to
# determine if it contains protein or DNA data.
#
# This is a SAS Component
#

@ARGV == 1 or @ARGV == 2 or die "usage: valid_fasta_file filename [normalized]\n";
$file = shift;
$norm = shift;

open(F, "<", $file) or die "cannot open $file: $!";

my $clean_fh;
if ($norm)
{
    open($clean_fh, ">", $norm) or die "cannot write normalized file $norm: $!";
}

my $state = 'expect_header';
my $cur_id;
my $dna_chars;
my $prot_chars;

{
    while (<F>)
    {
	chomp;
	
	if ($state eq 'expect_header')
	{
	    if (/^>(\S+)/)
	    {
		$cur_id = $1;
		$state = 'expect_data';
		print $clean_fh ">$cur_id\n" if $clean_fh;
		next;
	    }
	    else
	    {
		die "Invalid fasta: Expected header at line $.\n";
	    }
	}
	elsif ($state eq 'expect_data')
	{
	    if (/^>(\S+)/)
	    {
		$cur_id = $1;
		$state = 'expect_data';
		print $clean_fh ">$cur_id\n" if $clean_fh;
		next;
	    }
	    elsif (/^([acgtumrwsykbdhvn]*)\s*$/i)
	    {
		print $clean_fh uc($1) . "\n" if $clean_fh;
		$dna_chars += length($1);
		next;
	    }
	    elsif (/^([*abcdefghijklmnopqrstuvwxyz]*)\s*$/i)
	    {
		print $clean_fh uc($1) . "\n" if $clean_fh;
		$prot_chars += length($1);
		next;
	    }
	    else
	    {
		my $str = $_;
		if (length($_) > 100)
		{
		    $str = substr($_, 0, 50) . " [...] " . substr($_, -50);
		}
		die "Invalid fasta: Bad data at line $.\n$str\n";
	    }
	}
	else
	{
	    die "Internal error: invalid state $state\n";
	}
    }
}

my $what = ($prot_chars > 0) ? "protein" : "dna";
print "$what\n";


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3