[Bio] / FigKernelScripts / rep_sets.pl Repository:
ViewVC logotype

View of /FigKernelScripts/rep_sets.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Thu Oct 29 19:43:16 2009 UTC (10 years ago) by disz
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Old code finally checked in to create representative sets

use strict;
use FIG;
use FIGV;

use Carp;
use Data::Dumper;

if (@ARGV != 1) {
	die "Usage rep_sets genome\n";
}
my $genome = @ARGV[0];
my $genome_sets = "$FIG_Config::data/Global/genome.sets";
my $fasta = "$FIG_Config::data/Global/genome_rRNA.fasta";
my $genomeDir = "$FIG_Config::data/Organisms/$genome";

my @sets = `cat $genome_sets`;
my $next_num = scalar(@sets) - 1;
my ($next_num) = split(/\s/, $sets[$next_num]);
$next_num++;

my $fig = FIG->new();
#my $fig = new FIGV($genomeDir);

my $max = 0;
my %fids;

my @a =  `grep ".rna" $genomeDir/assigned_functions | grep -i -e "SSU" -e "small subunit"`;
my $ln;
#find largest Small Subunit Rna 
foreach $a (@a) {
	my ($id, undef) = split(/\s/, $a);
	my $contig = $fig->feature_location($id);
	my (undef, $a, $b) = split(/_/, $contig);
	$ln = abs($a - $b);
	$fids{$ln} = $id;
	$max = $max > $ln ? $max:$ln;

}

if (! $max) { print "No MAX\n";exit};
if ( $ln < 1000) {print "No 1000\n";exit};

#blast against the reference fasta file
# formatdb must have been run
my $id = $fids{$max};
my $contig = $fig->feature_location($id);
my $dna = ">$id\n" . $fig->dna_seq("$genome", $contig) . "\n";
my @b = `echo "$dna" | blastall -p blastn -d $fasta -m 8 -FF -e 1.0e-20`; 

#no matches
#if (scalar (@b) == 0) {exit};

my $save_b;
my $sz = 0;

#find best match >= 97% and > 1000 in the reference fasta file
$max = 0;
foreach my $b (@b) {
	my ($id1, $id2, $percent, $ln, undef, undef, $b1, $e1, $b2, $e2) = split(/\s/, $b);
	if ($ln > 1000 && $percent >= 97) {
		#print PAT $id2, "\n";
		#print $b;
	#	print "$id1, $id2, $percent, $b1, $e1, $b2, $e2, $ln\n";
		if ($max < $percent) {
			$max = $percent;
			$save_b = $b;
		}
	}
}


open PAT,  ">Tmp";
my ($set, $id1, $id2, $percent, $ln, $b1, $e1, $b2, $e2);
if ($max ) {
	# if we found one, add this new id to the right set
	($id1, $id2, $percent, $ln, undef, undef, $b1, $e1, $b2, $e2) = split(/\s/, $save_b);
	my $set_id = $id2;
	foreach my $line (@sets) {
		print PAT $line;
		($set, $id) = split(/\s/, $line);
		if ($id == $set_id) {
			my $gs = $fig->genus_species($fig->genome_of($id1));
			print PAT "$set\t$gs\n";
			$set_id = undef;
		}
	}
} else {
	#create a new set, add to the reference fasta file
	print PAT join ("", @sets);
	my $gs = $fig->genus_species($fig->genome_of($id));
	print PAT "$next_num\t$gs\n";
	open FASTA,  ">>$fasta";
	print FASTA $dna;
	close FASTA;
}
close PAT;


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3