[Bio] / FigKernelScripts / gather_genome_info.pl Repository:
ViewVC logotype

View of /FigKernelScripts/gather_genome_info.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Mon Oct 12 16:45:02 2009 UTC (10 years, 1 month ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.1: +16 -6 lines
Add more data gathering

#
# Gather all information about the complete prokaryotic genomes.
#
# This includes
#
# Taxon id
# accession #s from genbank file
# project id from genbank file
# RAST job id if present
#
# one line for each contig:
# contig id, size, md5sum
#

use strict;
use FIG;

my $fig = new FIG;

#my @genomes = sort { &FIG::by_genome_id($a, $b) } $fig->genomes();
#my @genomes = qw(585057.4);
my $genome_info = $fig->genome_info;

foreach my $entry (sort { &FIG::by_genome_id($a->[0], $b->[0]) } @$genome_info)
{
    my($genome,$name,$szdna,$maindomain,$pegs,$rnas,$complete,$tax) = @$entry;

    next unless $fig->is_prokaryotic($genome);
    next unless $complete;

    my $rrnas = 0;
    my $trnas = 0;
    foreach my $fid ($fig->all_features($genome,'rna'))
    {
	my $func = $fig->function_of($fid);
	if ($func =~ /trna/i) { $trnas++ }
	if ($func =~ /rrna/i) { $rrnas++ }
    }
    
    my $dir = $fig->organism_directory($genome);

    #
    # Gather RAST info.
    #

    my $rast_job;
    if (open(R, "<", "$dir/RAST"))
    {
	while (<R>)
	{
	    if (/RAST job number (\d+)/)
	    {
		$rast_job = $1;
		last;
	    }
	}
	close(R);
    }

    #
    # Genbank info, if available.
    #
    my(@genbank);
    my $cur;
    if (open(G, "<", "$dir/genbank_file"))
    {
	#
	# Crude parse for the data we're interested in.
	#
	while (<G>)
	{
	    chomp;
	    if (/^LOCUS\s+(.*)/)
	    {
		$cur = [];
		push(@$cur, $_);
		push(@genbank, $cur);
	    }
	    elsif (/^ACCESSION\s+(\S+)/)
	    {
		push(@$cur, $_);
	    }
	    elsif (/^DBLINK\s+Project:(\d+)/)
	    {
		push(@$cur, $_);
	    }
	}
	close(G);
    }

    print join("\t", $genome, $name, $maindomain, $rast_job, $complete, $szdna, $pegs, $rnas, $rrnas, $trnas), "\n";
    my $contigs = $fig->contig_lengths($genome);
    for my $id (sort keys %$contigs)
    {
	my $md5 = $fig->contig_md5sum($genome, $id);
	print join("\t", $id, $contigs->{$id}, $md5), "\n";
    }
    print "///\n";
    for my $g (@genbank)
    {
	print "$_\n" for @$g;
    }
    print "///\n";
    print "//\n";
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3