[Bio] / FigKernelScripts / generate_gene_blocks.pl Repository:
ViewVC logotype

View of /FigKernelScripts/generate_gene_blocks.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Mon Dec 5 18:56:37 2005 UTC (13 years, 11 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.2: +17 -0 lines
Add license words.

# -*- perl -*-
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#


use FIG;
my $fig = new FIG;

use strict;
my($dir,$set,@sets,%in_sets,%genomes,@genomes,$genome,$blockid,$peg,$gs,$fam_prefix);

my $usage = "usage: generate_gene_blocks Dir FamilyPrefix < OrderedSets";

(
 ($dir         = shift @ARGV) &&
 ($fam_prefix  = shift @ARGV)
)
    || die $usage;

if (-d $dir)    { die "$dir already exists" }

&FIG::verify_dir($dir);
open(GENOMES,">$dir/genome.tbl")  || die "could not open $dir/genome.tbl";
open(BLOCKS,">$dir/block.tbl")   || die "could not open block.tbl";
open(REGIONS,">$dir/region.tbl") || die "could not open region.tbl";

@sets = map { chop; [sort split(/\t/,$_)] } <STDIN>;
foreach $set (@sets)
{
    foreach $peg (@$set)
    {
	$in_sets{$peg} = 1;
	$genomes{&FIG::genome_of($peg)}++;
    }
}

@genomes = sort { $a <=> $b } keys(%genomes);
print STDERR "building blocks for the following genomes:\n";
foreach $genome (@genomes)
{
    $gs = $fig->genus_species($genome);
    print STDERR "\t$genome\t$gs\n";
    print GENOMES "$genome\t$gs\n";
}

$blockid = 1;
&write_sets($fig,\$blockid,$fam_prefix,\@sets,\*BLOCKS,\*REGIONS);

#foreach $genome (@genomes)
#{
#    &write_remaining_pegs($fig,$genome,\$blockid,$fam_prefix,\%in_sets,\*BLOCKS,\*REGIONS);
#}

close(GENOMES);
close(BLOCKS);
close(REGIONS);

sub write_sets {
    my($fig,$blockidP,$fam_prefix,$sets,$fh_blocks,$fh_regions) = @_;
    my($set,@aligned,$tuple);

    foreach $set (@$sets)
    {
	@aligned = &align_DNA_for_a_set_of_pegs($fig,$set); # return tuples: [$id,$ali_dna,$loc]
	foreach $tuple (@aligned)
	{
	    &write_tuple($$blockidP,$fh_regions,$tuple);
	}
	my $fam_id = $fam_prefix . substr(100000+$blockid,-5);
	print $fh_blocks "$$blockidP\t$fam_id\n";
	$$blockidP++;
    }
}

sub write_tuple {
    my($blockid,$fh,$tuple) = @_;

    my($peg,$ali_seq,$loc) = @$tuple;
    my $genome = &FIG::genome_of($peg);
    if ($loc =~ /^(\S+)_(\d+)_(\d+)$/)
    {
	print $fh join("\t",($peg,$genome,$1,$2,$3,$blockid,$ali_seq)),"\n";
    }
}

sub align_DNA_for_a_set_of_pegs {
    my($fig,$set) = @_;
    my($loc,$dseq);

    my $tuples = [];
    foreach $peg (@$set)
    {
	if (($loc = $fig->feature_location($peg)) && ($dseq = $fig->dna_seq(&FIG::genome_of($peg),$loc)))
	{
	    push(@$tuples,[$peg,$dseq,$loc]);
	}
    }
    return &align_DNA_for_a_set_of_seqs($fig,$tuples);
}

sub align_DNA_for_a_set_of_seqs {
    my($fig,$tuples) = @_;
    my($tuple,$id,$dseq,$hdr,$aseq,%aligned);

    my $tmpS = "/tmp/seq$$";

    open(SEQS,">$tmpS.fasta") || die "could not open $tmpS";
    foreach $tuple (@$tuples)
    {
	($id,$dseq) = @$tuple;
	print SEQS ">$id\n$dseq\n";
    }
    close(SEQS);
    system "$FIG_Config::ext_bin/clustalw -infile=$tmpS.fasta -align -outorder=aligned > /dev/null";
    my @ali = `$FIG_Config::bin/clustal_to_fasta < $tmpS.aln`;
    while (($hdr = shift @ali) && ($aseq = shift @ali))
    {
	if ($hdr =~ /^>(\S+)/)
	{
	    chop $aseq;
	    $aligned{$1} = $aseq;
	}
    }
    return map { my($peg,undef,@rest) = @$_; $aligned{$peg} ? [$peg,$aligned{$peg},@rest] : () } @$tuples;
}

sub write_remaining_pegs {
    my($fig,$genome,$blockidP,$fam_prefix,$in_sets,$fh_blocks,$fh_regions) = @_;
    my($peg,$loc,$dseq);

    foreach $peg ($fig->all_features($genome,"peg"))
    {
	if (! $in_sets->{$peg})
	{
	    if (($loc = $fig->feature_location($peg)) && ($dseq = $fig->dna_seq(&FIG::genome_of($peg),$loc)))
	    {
		if ($loc =~ /^(\S+)_(\d+)_(\d+)$/)
		{
		    print $fh_regions join("\t",($peg,$genome,$1,$2,$3,$$blockidP,$dseq)),"\n";
		}

		my $fam_id = $fam_prefix . substr(100000+$blockid,-5);
		print $fh_blocks "$$blockidP\t$fam_id\n";
		$$blockidP++;
	    }
	}
    }
}
	

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3