[Bio] / FigKernelScripts / update_SSU_rRNAs.pl Repository:
ViewVC logotype

View of /FigKernelScripts/update_SSU_rRNAs.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Sun Feb 22 14:46:22 2009 UTC (11 years, 3 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2009_07_09, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, HEAD
update of SSUs

use FIG;
my $fig = new FIG;
use gjoseqlib;
use strict;

my $function = "SSU rRNA";
my $user     = "SSU_updates";
my $genome;
foreach $genome (grep { $fig->is_prokaryotic($_) } $fig->genomes('complete'))
{
    print STDERR "processing $genome\n";
    my @hits = map { ($_ =~ /^(\d+\.\d+)\t(\S+)/) ? $2 : () } `find_SEED_SSU_rRNA_genes $genome`;
    if (@hits > 0)
    {
	my @tbl = map { chomp; [split(/\t/,$_)] } `cat $FIG_Config::organisms/$genome/Features/rna/tbl`;
	my $next = &next(\@tbl);

	my @seqs = &gjoseqlib::read_fasta("$FIG_Config::organisms/$genome/Features/rna/fasta");
	my($tuple,@no_overlap,@overlaps);
	foreach $tuple (@tbl)
	{
	    if (&overlap($tuple->[1],\@hits))
	    {
		push(@overlaps,$tuple);
	    }
	    else
	    {
		push(@no_overlap,$tuple);
	    }
	}

	my($hit);
	my($manual) = 0;
	my $altered = 0;
	foreach $hit (@hits)
	{
	    my @tmp = grep { &overlap($_->[1],[$hit]) } @overlaps;
	    if (@tmp == 0)
	    {
		my $new_id = "fig\|$genome\.rna\.$next";
		$next++;
		push(@no_overlap,[$new_id,$hit,$function]);
		push(@seqs,[$new_id,'',$fig->dna_seq($genome,$hit)]);
		$altered = 1;
	    }
	    elsif (@tmp == 1)
	    {
		if ($hit ne $tmp[0]->[1]) { $altered = 1 }
		push(@no_overlap,[$tmp[0]->[0],$hit,$function]);
		my $i;
		for ($i==0; ($i < @seqs) && ($seqs[$i]->[0] ne $tmp[0]->[0]); $i++) {}
		if ($i == @seqs) { print STDERR &Dumper($genome,$hit,$tmp[0],\@seqs); 
				   die "something is wrong"; }
		$seqs[$i]->[2] = $fig->dna_seq($genome,$hit);
	    }
	    else
	    {
		print STDERR "Handle manually: ",&Dumper($genome,$hit,\@tmp);
		$manual = 1;
	    }
	}

	if ((! $manual) && $altered)
	{
	    @no_overlap = sort { &FIG::by_fig_id($a->[0],$b->[0]) } @no_overlap;
	    print STDERR "Installing update for $genome\n";
	    &update_data($fig,$genome,\@no_overlap,\@seqs);
	}
    }
}
&FIG::run("load_features");

foreach $genome (grep { $fig->is_prokaryotic($_) } $fig->genomes('complete'))
{
    &check_assertions($fig,$genome,$function,$user);
}

sub overlap {
    my($loc1,$locs) = @_;

    my $i;
    for ($i=0; ($i < @$locs) && (! &overlap1($loc1,$locs->[$i])); $i++) {}
    return ($i < @$locs);
}

sub overlap1 {
    my($loc1,$loc2) = @_;
    
    if ($loc1 =~ /^(\S+)_(\d+)_(\d+)$/)
    {
	my($c1,$b1,$e1) = ($1,$2,$3);
	if ($loc2 =~ /^(\S+)_(\d+)_(\d+)$/)
	{
	    my($c2,$b2,$e2) = ($1,$2,$3);
	    return (($c1 eq $c2) && (&FIG::between($b1,$b2,$e1) || &FIG::between($b2,$b1,$e2)));
	}
    }
    return 0;
}

sub next {
    my($tbl) = @_;

    my $next = 1;
    foreach $_ (@$tbl)
    {
	if (($_->[0] =~ /(\d+)$/) && ($1 >= $next))
	{
	    $next = $1+1;
	}
    }
    return $next;
}

sub update_data {
    my($fig,$genome,$tbl,$seqs) = @_;

    &update_tbl($fig,$genome,$tbl);
    &update_fasta($fig,$genome,$seqs);
    &cleanup_assigned_functions($fig,$tbl);
}

sub update_tbl {
    my($fig,$genome,$tbl) = @_;

    my $file = "$FIG_Config::organisms/$genome/Features/rna/tbl";
    if (! -s $file)
    {
	print STDERR "CAREFUL: you are missing $file\n";
    }
    else
    {
	my $time = time;
	rename($file,"$file.$time");
    }
    open(TMP,">$file") || die "could not open $file";
    foreach my $tuple (@$tbl)
    {
	if (@$tuple < 3) { $#{$tuple} = 2 }
	print TMP join("\t",@$tuple),"\n";
    }
    close(TMP);
    system "chmod 777 $file";
}

sub update_fasta {
    my($fig,$genome,$seqs) = @_;

    my $file = "$FIG_Config::organisms/$genome/Features/rna/fasta";
    if (! -s $file)
    {
	print STDERR "CAREFUL: you are missing $file\n";
    }
    else
    {
	my $time = time;
	rename($file,"$file.$time");
    }

    open(TMP,">$file") || die "could not open $file";
    foreach my $tuple (@$seqs)
    {
	my($id,undef,$seq) = @$tuple;
	print TMP ">$id\n$seq\n";
    }
    close(TMP);
    system "chmod 777 $file";
}

sub cleanup_assigned_functions {
    my($fig,$tbl) = @_;

    my $file = "$FIG_Config::organisms/$genome/assigned_functions";

    if (-s $file)
    {
	my %rnas = map { $_->[0] => 1 } @$tbl;
	my @bad_entries = grep { ($_ =~ /^(fig\|\d+\.\d+\.rna\.\d+)/) && (! $rnas{$1}) } `cat $file`;
	if (@bad_entries > 0)
	{
	    my $time = time;
	    rename($file,"$file.$time");
	    open(IN,"<$file.$time") || die "could not open $file.$time";
	    open(OUT,">$file") || die "could not open $file";
	    while (defined($_ = <IN>))
	    {
		if (($_ =~ /^(\S+)/) && (! (($_ =~ /^(fig\|\d+\.\d+\.rna\.\d+)/) && (! $rnas{$1}))))
		{
		    print OUT $_;
		}
	    }
	    close(IN);
	    close(OUT);
	    system "chmod 777 $file";
	}
    }
}

sub check_assertions {
    my($fig,$genome,$function,$user) = @_;

    my $tbl = "$FIG_Config::organisms/$genome/Features/rna/tbl";
    if (-s $tbl )
    {
	my @ssu = `grep '$function' $tbl`;
	foreach $_ (@ssu)
	{
	    if ($_ =~ /^(fig\|$genome\.rna\.\d+)/)
	    {
		my $fid = $1;
		my $func = $fig->function_of($fid);
		if (! $func)
		{
		    $fig->assign_function($fid,$user,$function);
		}
	    }
	}
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3