[Bio] / FigKernelScripts / validate_fasta.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/validate_fasta.pl

Parent Directory | Revision Log


Revision 1.1 - (view) (download) (as text)

1 : redwards 1.1 #__perl__
2 :    
3 :    
4 :     =pod
5 :    
6 :     =head1 validate_fasta.pl
7 :    
8 :     Check a file and see whether it is valid fasta. Returns 0 if valid, or an error code if it is not valid.
9 :     I have tried to catch most of the common mistakes.
10 :    
11 :     =cut
12 :    
use strict;
use warnings;

# Usage: validate_fasta.pl <fasta filename>
#
# Reads the named file (transparently through gunzip when the name ends in
# "gz") and exits 0 when no problem is detected. Problems are reported via
# caught_error(), which prints a message on STDERR and exits with the
# matching code from %error.
my $filename = shift;

# Test for defined/non-empty rather than truth so a file literally named "0"
# is still accepted.
die "$0 <fasta filename>" unless defined $filename && length $filename;

# Define some error codes. These are arbitrary; the rough idea is that a
# caller can look at the exit code, then parse the file and correct the
# problem automatically.
my %error = (
    "File not found"                                       => 1,
    "File contains mac newlines"                           => 2,
    "File contains spaces before identifier"               => 3,
    "File contains numbers where there should be sequence" => 4,
    "Identifier line is empty"                             => 5,
    "Sequence is empty"                                    => 6,
    "Duplicate identifiers"                                => 7,
);

unless (-e $filename) { caught_error("File not found") }

my $in;
if ($filename =~ /gz$/) {
    # List-form pipe open: the filename reaches gunzip as one argument and is
    # never interpreted by a shell. "--" stops gunzip from treating a name
    # that starts with "-" as an option.
    open($in, '-|', 'gunzip', '-c', '--', $filename)
        or die "Can't open a pipe to the file $filename";
}
else {
    open($in, '<', $filename)
        or die "can't find file even though it exists";
}

my $parsedfirstline = 0;    # true once the first ">" header has been seen
my $seq             = '';   # sequence text collected for the current record
my %identifier;             # identifiers seen so far, for duplicate detection

while (my $line = <$in>) {
    caught_error("File contains mac newlines")
        if $line =~ /\r/;
    caught_error("File contains spaces before identifier")
        if $line =~ /^\s+>/;

    if ($line !~ /^>/) {
        # Sequence line: digits are not allowed anywhere in it.
        # NOTE(review): sequence lines that appear before the first ">"
        # header are accumulated here and then discarded when that header
        # arrives; no error code exists for that case.
        caught_error("File contains numbers where there should be sequence")
            if $line =~ /\d/;
        $seq .= $line;
        next;
    }

    # Header line. A ">" followed only by whitespace carries no identifier.
    caught_error("Identifier line is empty")
        if $line =~ /^>\s*$/;

    # There won't be any sequence to check when we read the first header;
    # every later header closes the previous record, which must have
    # contributed some sequence.
    if ($parsedfirstline) {
        $seq =~ s/[\s\d]//g;
        caught_error("Sequence is empty") unless length $seq;
    }
    $seq             = '';
    $parsedfirstline = 1;

    # The identifier is the run of non-whitespace directly after ">".
    if (my ($id) = $line =~ /^>(\S+)/) {
        caught_error("Duplicate identifiers") if exists $identifier{$id};
        $identifier{$id} = 1;
    }
}

# Make sure the last record in the file has some sequence.
$seq =~ s/[\s\d]+//g;
caught_error("Sequence is empty") unless length $seq;

# For a piped open, close() returns false when the child (gunzip) exits with
# a non-zero status, so a corrupt archive is not silently reported as valid.
# The message is not a key of %error, so caught_error() reports it through
# its "unusual error" branch.
close($in)
    or caught_error("Error while reading $filename: input stream did not close cleanly");

exit(0);
66 :    
67 :    
68 :    
# caught_error($err)
#
# Report a validation failure on STDERR and terminate the script.
#
#   $err - the error description; when it is a key of %error the script
#          exits with the associated code, otherwise it is reported as an
#          unusual error and the script calls exit(-100).
#
# This sub never returns: both paths end in exit().
sub caught_error {
    my ($err) = @_;

    # Unknown message: no code is registered for it in %error.
    # NOTE(review): process exit statuses are commonly limited to 8 bits, so
    # the caller probably does not observe -100 literally - confirm on the
    # target platform.
    unless (defined $error{$err}) {
        print STDERR "-100: Caught an unusual error: $err\n";
        exit(-100);
    }

    # Known message: print "<code>: <message>" and exit with that code.
    my $code = $error{$err};
    print STDERR "$code: $err\n";
    exit($code);
}
82 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3