[Bio] / StandaloneTools / verify_genome_directory_standalone Repository:
ViewVC logotype

View of /StandaloneTools/verify_genome_directory_standalone

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (annotate)
Tue Jan 25 19:22:57 2005 UTC (14 years, 8 months ago) by olson
Branch: MAIN
CVS Tags: HEAD
Rename file.

#!/usr/bin/perl -w

# usage: verify_genome_directory Dir

($dir = shift @ARGV)
    || die "\n   usage: verify_genome_directory Dir\n\n";

if (!-d $dir)
{
    die "Organism directory $dir does not exist";
}

if (! (-s "$dir/GENOME"))    { print "WARNING: missing GENOME file\n" }   else { $genome   = `cat $dir/GENOME`;   chomp $genome; }
if (! (-s "$dir/PROJECT"))   { print "WARNING: missing PROJECT file\n" }
if (! (-s "$dir/TAXONOMY"))  { print "WARNING: missing TAXONOMY file\n" } else { $taxonomy = `cat $dir/TAXONOMY`; chomp $taxonomy; }

$taxonomy =~ m/^([^;]+)/;
$domain   =  $1;

if (!-d "$dir/Features")
{
    die "ERROR: Features directory $dir/Features does not exist";
}
else
{
    if (! opendir(TMP,$dir))
    {
	++$bad{no_org_dir};
	print "ERROR: could not open $dir";
    }
    else
    {
	@contigs = grep { $_ =~ /contig/ } readdir(TMP);
	closedir(TMP) || warn "Could not close directory $dir";
	
	if (@contigs == 0)
	{
	    print "missing contigs\n";
	}
	else
	{
	    $size = 0;
	    foreach $file (@contigs)
	    {
		if (!open(TMP, "<$dir/$file"))
		{
		    ++$bad{unopenable_contigs};
		    print "ERROR: Could not read-open $dir/$file";
		}
		else
		{
		    while (($id, $seqP) = &read_fasta_record(\*TMP))
		    {
			$size += $len_of_contig{$id} = length($$seqP);
		    }
		}
	    }
	    
	    if ((-s "$dir/COMPLETE") && ($size < 300000))
	    {
		print "WARNING: contigs for $dir contain only $size bp of data\n";
	    }
	    
	    if (!-d "$dir/Features/peg")
	    {
		++$bad{peg_no_directory};
		print "ERROR: No peg directory $dir/Features/peg\n";
	    }
	    else
	    {
		if (!-e "$dir/Features/peg/tbl")
		{
		    ++$bad{peg_no_tbl};
		    print "ERROR: No peg tbl file $dir/Features/peg/tbl\n";
		}
		else
		{
		    if (!-s "$dir/Features/peg/tbl")
		    {
			++$bad{peg_zero_sz_tbl};
			print "ERROR: Zero-size peg tbl file $dir/Features/peg/tbl\n";
		    }
		    else
		    {
			if (!open(TMP, "<$dir/Features/peg/tbl"))
			{
			    ++$bad{peg_unopenable_tbl};
			    print "ERROR: Could not read-open $dir/Features/peg/tbl";
			}
			else
			{
			    while (defined($entry = <TMP>))
			    {
				chomp $entry;
				($id, $loc) = split(/\t/, $entry);
				$loc_of_peg{$id} = $loc;
				foreach $exon (split(/,/, $loc))
				{
				    $exon =~ m/^(\S+)_(\d+)_(\d+)$/;
				    ($contig, $beg, $end) = ($1, $2, $3);
				    
				    if ($taxonomy !~ m/Eukaryota/)
				    {
					if (not defined($len_of_contig{$contig}))
					{
					    ++$bad{peg_undef_contig_ref};
					    print "ERROR: for $id, $loc refers to undefined contig $contig\n";
					}
					else
					{
					    if (&max($beg,$end) > $len_of_contig{$contig})
					    {
						++$bad{peg_out_of_bounds};
						print "ERROR: for $id, $exon in $loc exceeds contig length $len_of_contig{$contig}\n";
					    }
					}
				    }
				}
			    }
			    close(TMP) || warn "could not close $dir/Features/peg/tbl";
			}
		    }

		    if (!open(TMP, "<$dir/Features/peg/fasta"))
		    {
			++$bad{peg_unopenable_fasta};
			print "ERROR: Could not read-open $dir/Features/peg/fasta";
		    }
		    else
		    {
			while (($id, $seqP) = &read_fasta_record(\*TMP))
			{
			    $len_of_peg{$id} = length($$seqP);
			    
			    if (!defined($loc_of_peg{$id}))
			    {
				++$bad{peg_fasta_no_tbl};
				print "ERROR: $id is in peg fasta, but not in tbl\n";
			    }
			}
			close(TMP) || warn "could not close $dir/Features/peg/fasta";
			print STDERR "Read ", (scalar keys %len_of_peg), " pegs from $dir/Features/peg/fasta\n" 
			    if $ENV{SEED_VERBOSE};
		    }
		    
		    foreach $id (sort keys %loc_of_peg)
		    {
			if (!defined($len_of_peg{$id}))
			{
			    ++$bad{peg_tbl_no_fasta};
			    print "ERROR: $id is in peg tbl, but not in fasta\n";
			}
		    }
		}
		
		if (!-d "$dir/Features/rna")
		{
		    print "WARNING: no rna directory for $dir\n";
		}
		else
		{
		    if (!-e "$dir/Features/rna/tbl")
		    {
			++$bad{rna_no_tbl};
			print "ERROR: No rna tbl file $dir/Features/rna/tbl\n";
		    }
		    else
		    {
			if (!-s "$dir/Features/rna/tbl") 
			{
			    ++$bad{rna_zero_sz_tbl};
			    print "ERROR: Zero-size rna tbl file $dir/Features/rna/tbl\n";
			} 
			else 
			{
			    if (!open(TMP, "<$dir/Features/rna/tbl"))
			    {
				++$bad{rna_unopenable_tbl};
				warn "Could not read-open $dir/Features/rna/tbl";
			    }
			    else
			    {
				while (defined($entry = <TMP>))
				{
				    chomp $entry;
				    ($id, $loc) = split(/\t/, $entry);
				    $loc_of_rna{$id} = $loc;
				    foreach $exon (split(/,/, $loc))
				    {
					$exon =~ m/^(\S+)_(\d+)_(\d+)$/;
					($contig, $beg, $end) = ($1, $2, $3);
					if (!defined($len_of_contig{$contig}))
					{
					    ++$bad{rna_undef_contig_rna};
					    print "ERROR: for $id, $loc refers to undefined contig $contig\n";
					}
					else
					{
					    if (&max($beg,$end) > $len_of_contig{$contig})
					    {
						++$bad{rna_out_of_bounds};
						print "ERROR: for $id, $exon in $loc exceeds contig length $len_of_contig{$contig}\n";
					    }
					}
				    }
				}
				close(TMP) || warn "could not close $dir/Features/rna/tbl";
			    }
			    
			    open(TMP, "<$dir/Features/rna/fasta") || warn "Could not read-open $dir/Features/rna/fasta";
			    while (($id, $seqP) = &read_fasta_record(\*TMP))
			    {
				$len_of_rna{$id} = length($$seqP);
				
				if (!defined($loc_of_rna{$id}))
				{
				    ++$bad{rna_fasta_no_tbl};
				    print "ERROR: $id is in rna fasta, but not in tbl\n";
				}
			    }
			    close(TMP) || warn "could not close $dir/Features/rna/fasta";
			    print STDERR "Read ", (scalar keys %len_of_rna), " rnas from $dir/Features/rna/fasta\n" 
				if $ENV{SEED_VERBOSE};
			    
			    foreach $id (sort keys %loc_of_rna)
			    {
				if (!defined($len_of_rna{$id}))
				{
				    ++$bad{rna_tbl_no_fasta};
				    print "ERROR: $id is in rna tbl, but not in fasta\n";
				}
			    }
			}
		    }
		}
	    }
	}
    }
}

if (keys %bad)
{
    print STDERR "$dir is corrupt (", join(", ", map { "$_=$bad{$_}" } sort keys %bad), ")\t$domain\t$genome\n";
}
else
{
    print STDERR "$dir is O.K.\n";
}
# print STDERR "$0 has completed\n";

exit(scalar keys %bad);


sub min {
    my(@x) = @_;
    my($min,$i);

    (@x > 0) || return undef;
    $min = $x[0];
    for ($i=1; ($i < @x); $i++)
    {
        $min = ($min > $x[$i]) ? $x[$i] : $min;
    }
    return $min;
}

sub max {
    my(@x) = @_;
    my($max,$i);

    (@x > 0) || return undef;
    $max = $x[0];
    for ($i=1; ($i < @x); $i++)
    {
        $max = ($max < $x[$i]) ? $x[$i] : $max;
    }
    return $max;
}


sub read_fasta_record
{
    my ($file_handle) = @_;
    my ( $old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record );

    if (not defined($file_handle))  { $file_handle = \*STDIN; }

    $old_end_of_record = $/;
    $/ = "\n>";

    if (defined($fasta_record = <$file_handle>))
    {
        chomp $fasta_record;
        @lines  =  split( /\n/, $fasta_record );
        $head   =  shift @lines;
        $head   =~ s/^>?//;
        $head   =~ m/^(\S+)/;
        $seq_id = $1;

        if ($head  =~ m/^\S+\s+(.*)$/)  { $comment = $1; } else { $comment = ""; }

        $sequence  =  join( "", @lines );

        @parsed_fasta_record = ( $seq_id, \$sequence, $comment );
    }
    else
    {
        @parsed_fasta_record = ();
    }

    $/ = $old_end_of_record;

    return @parsed_fasta_record;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3