[Bio] / FigWebServices / ribosomal_rnas.cgi Repository:
ViewVC logotype

View of /FigWebServices/ribosomal_rnas.cgi

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (annotate)
Sat Aug 29 22:04:48 2009 UTC (10 years, 2 months ago) by redwards
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
new cgi for getting data

#__perl__

=pod

=head1 ribosomal_rnas.cgi

This is a Web services script that extracts the 16S sequences, the 23S sequences, and the iternal spacer region (ITS) that we use for developing the ADAPTdb database. The database is for ARISA data analysis, and is built by Robert Schmieder and Rob Edwards. 

This script is designed to be called whenever we want to get new data, and to return a tab-separated text data stream that we can parse locally.

=author

Rob Edwards, March 2009
Robert Schmieder, April 2009

=cut


use strict;
use warnings;
use CGI qw(header);
print header('text/plain');

use CGI::Carp qw(fatalsToBrowser);
use FIG;
my $fig=new FIG;
use lib '/home/redwards/MIG/Modules';
use Rob;
my $rob=new Rob;


$| = 1; # Do not buffer output

my $maxdist = 3000;
my $countall = 0;
foreach my $kingdom ("Bacteria","Archaea") {
    my $count = 0; #count for 16S-ITS-23S regions found
    #foreach my $g (@{$fig->all_genomes("complete", undef, $kingdom)->{item}}) {
    foreach my $g ($fig->genomes("complete", undef, $kingdom)) {
	my (%rrnas,$keep,@need,$number,$check);
	$number = 0;
	foreach my $rna ($fig->rnas_of($g)) {
	    my $fn = scalar($fig->function_of($rna));
	    if ($fn =~ m/([LS])SU rRNA/i) {
		my ($contig,$start,$stop) = $fig->boundaries_of($fig->feature_location($rna));
		die "Error getting the boundaries of $rna. We returned Contig: $contig Beginning: $start End: $stop\n" unless(defined $contig && defined $start && defined $stop);
		# print STDERR "  location: $contig $start $stop";
		push(@{$rrnas{($1 eq 'S' ? '16S' : '23S')}},[$contig,$start,$stop,$rna]);
	    }
	    # print STDERR "\n";
	}
	($keep,$check) = &checkRegionPairs(&checkForOverlaps(\%rrnas)); #values: number,contig,strand,kind,start,stop,kind,start,stop
	if($keep) {
	    @need = ($g, $fig->taxonomy_of($g), $fig->genus_species($g), ($fig->replaces($g) || '')); #values: id,taxon,orgn,replace
	    foreach(@$keep) {
		#values needed: id,taxon,orgn,replace,number,contig,start16S,stop16S,start23S,stop23S,length,strand,seq
		#get start and stop
		my $number = $_->[0];
		my $contig = $_->[1];
		my $strand = $_->[2];
		my $start = $strand ? ($_->[4] < $_->[7] ? $_->[4] : $_->[7]) : ($_->[4] < $_->[7] ? $_->[5] : $_->[8]);
		my $stop = $strand ? ($_->[4] < $_->[7] ? $_->[8] : $_->[5]) : ($_->[4] < $_->[7] ? $_->[7] : $_->[4]);
		my $length = $stop-$start+1;
		#get seq (rest from need and keep arrays)
		my $loc = join("_", $contig,$start,$stop); #contig,start,stop
		my $seq = $fig->dna_seq($g, $loc);
		$seq = &rc($seq) unless($strand);
		my $start16S = ($_->[3] eq '16S' ? $_->[4] : $_->[7]);
		my $stop16S = ($_->[3] eq '16S' ? $_->[5] : $_->[8]);
		my $start23S = ($_->[3] eq '16S' ? $_->[7] : $_->[4]);
		my $stop23S = ($_->[3] eq '16S' ? $_->[8] : $_->[5]);
		print join("\t", @need,$number,$contig,$start16S,$stop16S,$start23S,$stop23S,$length,$strand,$seq)."\n";
		$number = $_->[0];
	    }
	}
	$count += $number;
	# print STDERR "Warning: found overlapping rRNAs for genome ($g)\n" if($check);
	# print STDERR "Found $number valid 16S-ITS-23S region(s) for genome ($g)\n";
    }
    # print STDERR "\nNumber of valid 16S-ITS-23S regions found for $kingdom: $count\n";
    $countall += $count;
}
# print STDERR "\nNumber of valid 16S-ITS-23S regions found: $countall\n";


# reverse complement a sequence
sub rc {
    my $seq = shift;
    $seq =~ tr/GATCgatc/CTAGctag/;
    $seq = reverse $seq;
    return $seq;
}

#discard overlapping regions on same strand and contig (16S with 16S, 23S with 23S, 16S with 23S)
#remove all overlapping regions bc we do not know which one is correct one
sub checkForOverlaps {
    my $rrnas = shift;
    my (%pos,$strand,$check);
    #separate regions by strand
    while(my ($kind,$array) = each(%$rrnas)) {
	foreach my $rrna (@$array) {
	    my ($contig,$start,$stop,$fig) = @$rrna;
	    $strand = ($start < $stop ? 1 : 0);
	    push(@{$pos{$contig}->{$strand}},[$start,$stop,$kind,$fig]);
	}
    }
    #find overlapping regions and remove them
    while(my ($contig,$hash) = each(%pos)) {
	while(my ($strand,$array) = each(%$hash)) {
	    my (%overlapindex,$n,$c16S,$c23S,@sort);
	    $n = scalar(@$array)-1;
	    if($n) { #more than one region
		#sort array by start position
		@sort = sort {$a->[0] <=> $b->[0]} @$array;
		foreach my $i (0..($n-1)) {
		    foreach my $j (($i+1)..$n) {
			#check if start_i <= start_j and start_j <= stop_i on forward strand -> overlap of rrna_i and rrna_j
			#same for reverse strand, just change start and stop
			$overlapindex{$i} = $overlapindex{$j} = 1 if($sort[$i]->[abs($strand-1)] <= $sort[$j]->[abs($strand-1)] && $sort[$j]->[abs($strand-1)] <= $sort[$i]->[$strand]);
		    }
		}
		#remove overlapping regions
		splice(@sort,$_,1) foreach(reverse sort {$a <=> $b} keys %overlapindex);
	    }
	    $check = 1 if(scalar(keys %overlapindex));
	    #remove strand entries without at least one 16S and 23S region on same strand
	    ($_->[2] eq '16S' ? $c16S = 1 : $c23S = 1) foreach(@sort);
	    if($c16S && $c23S) { #found 16S and 23S regions
		$pos{$contig}->{$strand} = \@sort;
	    } else {
		delete $pos{$contig}->{$strand};
	    }
	}
	#remove contig entries without at least one strand entry
	delete $pos{$contig} unless(scalar(keys %{$pos{$contig}}));
    }
    return (\%pos,$check);
}

#check for each 16S region in contig: 
# - 16S region before 23S on forward strand
# - 16S region behind 23S on on reverse strand
# - region between 16S and 23S region is less than 5000bp long
sub checkRegionPairs {
    my $pos = shift;
    my $check = shift;
    my (@keep,$starta,$stopa,$startb,$stopb,$lengtha,$lengthb);
    my $number = 1;
    return 0 unless($pos && scalar(keys %$pos)); #empty hash from previous function
    while(my ($contig,$hash) = each(%$pos)) {
	while(my ($strand,$array) = each(%$hash)) {
	    my $n = scalar(@$array);
	    foreach(0..($n-2)) {
		next unless($array->[$_]->[2] eq ($strand ? '16S' : '23S') && #check for 16S/23S region
			    $array->[$_+1]->[2] eq ($strand ? '23S' : '16S') && #check for following 16S/23S region
			    ($array->[$_+1]->[abs($strand-1)] - $array->[$_]->[$strand] - 1) < $maxdist); #check for distance between 16S/23S
		
		#check for sequence length
		$starta = $array->[$_]->[0];
		$stopa = $array->[$_]->[1];
		$lengtha = $array->[$_]->[$strand]-$array->[$_]->[abs($strand-1)]+1;
		$startb = $array->[$_+1]->[0];
		$stopb = $array->[$_+1]->[1];
		$lengthb = $array->[$_+1]->[$strand]-$array->[$_+1]->[abs($strand-1)]+1;;
		next unless($lengtha > 300 && $lengthb > 300);
		#values: number,contig,strand,kind,start,stop,kind,start,stop
		push(@keep,[$number,$contig,$strand,$array->[$_]->[2],$starta,$stopa,$array->[$_+1]->[2],$startb,$stopb]);
		$number++;
	    }
	}
    }
    return (\@keep,$check);
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3