[Bio] / FigKernelScripts / update_OTUs.pl Repository:
ViewVC logotype

View of /FigKernelScripts/update_OTUs.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Tue Feb 22 20:31:10 2011 UTC (9 years ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_release_3_0, mgrast_dev_03252011, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_10262011, HEAD
Changes since 1.4: +91 -229 lines
fix OTU update to use gary olsen sequence grouping script

use strict;
use Data::Dumper;
use gjoseqlib;
use POSIX;

use FIG;
my $fig = new FIG;

my $rRNA_fastaF   = "$FIG_Config::global/genome_rRNA.fasta";
my $genome_setsF  = "$FIG_Config::global/genome.sets";
my $clusteredF    = "$FIG_Config::global/clustered.rRNA";
my $rep_rRNA      = "$FIG_Config::global/representative_rRNA.fasta";

$rRNA_fastaF   = "tmp.rRNA.fasta";
$genome_setsF  = "tmp.genome.sets";
$clusteredF    = "clustered.rRNA";
$rep_rRNA      = "representative_rRNA.fasta";

my %old_reps;
my $max_set = 0;
open(SETS,"<",$genome_setsF) || die "could not open $genome_setsF";
my $last = <SETS>;
while ($last && ($last =~ /^(\d+)\t(\d+\.\d+)/))
{
    $old_reps{$2} = $1;
    my $set = $1;
    $max_set = &FIG::max($max_set,$set);
    while ($last && ($last =~ /^(\d+)/) && ($1 == $set))
    {
	$last = <SETS>;
    }
}
close(SETS);

my @all_genomes = grep { $fig->is_prokaryotic($_) } $fig->genomes('complete');
my %allH        = map { $_ => 1 } @all_genomes;
my @old_rnas    = grep { $allH{$_->[0]} } &gjoseqlib::read_fasta($rRNA_fastaF);
my %got_already = map { $_->[0] => 1 } @old_rnas;
my @need        = grep { ! $got_already{$_} } @all_genomes;

my @to_add;
foreach my $genome (@need) 
{
    my @rna_fids = grep { my $func = $fig->function_of($_);
			  ($func =~ /(SSU\s+rRNA|Small\s+Subunit\s+(Ribosomal\s+r)?RNA|ssuRNA|16S\s+(r(ibosomal\s+)?)?RNA)/io) 
			}
                   $fig->all_features($genome,'rna');
    my @seqs = sort { length($b->[2]) <=> length($a->[2]) } 
               grep { length($_->[2]) >= 1000}
               map { [&FIG::genome_of($_),'',$fig->get_dna_seq($_)] } @rna_fids;
    if (@seqs > 0)
    {
	push(@to_add,$seqs[0]);
    }
}

&backup($rRNA_fastaF);
&backup($genome_setsF);

&gjoseqlib::print_seq_list_as_fasta($rRNA_fastaF,[@old_rnas,@to_add]);
&FIG::run("svr_representative_sequences -b -f $clusteredF -s 0.97 < $rRNA_fastaF > $rep_rRNA");

open(CLUST,"<",$clusteredF)  || die "could not open $clusteredF";
open(SETS,">",$genome_setsF) || die "could not open $genome_setsF";
my $nxt_set = $max_set + 1;
my @sets;
while (defined($_ = <CLUST>))
{
    chomp;
    my @members = split(/\t/,$_);
    my @pairs   = sort { $a->[1] <=> $b->[1] } map { [$_,$old_reps{$_} ? $old_reps{$_} : $nxt_set] } @members;
    my $set = $pairs[0]->[1];
    if ($set == $nxt_set) { $nxt_set++ }
    push(@sets,[$set,[map { $_->[0] } @pairs]]);
}
close(CLUST);
foreach $_ (sort { $a->[0] <=> $b->[0] } @sets)
{
    my($set,$members) = @$_;
    foreach my $mem (@$members)
    {
	print SETS join("\t",($set,$mem,$fig->genus_species($mem))),"\n";
    }
}
close(SETS);

sub backup {
    my($file) = @_;

    my $bak = strftime(".bak.%Y-%m-%d-%H-%M-%S", localtime);

    my $bf = $file . $bak;
    if (-e $bf) 
    {
	unlink($bf) || die "could not make backup for $file: $!";
    }
    rename($file,$bf) || die "could not rename $file $bf: $!";
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3