[Bio] / FigMetagenomeTools / renumber_sequences.pl Repository:
ViewVC logotype

View of /FigMetagenomeTools/renumber_sequences.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Mon Feb 19 17:15:26 2007 UTC (13 years, 3 months ago) by olson
Branch: MAIN
Branch point for: x
Initial revision

#!/usr/bin/perl -w

# renumber sequences and remove any exact duplicate sequences

use strict;
use DBI;
use lib "$ENV{HOME}/MIG/Modules";
use Rob;

my $dbh=DBI->connect("DBI:mysql:sequences", "rob", "forestry") or die "Can't connect to database sequences\n";

my $usage=<<EOF;

$0
-f fasta file
-q quality file

OPTIONAL

-n number of genome. If this is provided, then the data will not be added to the database, and only the files will be written.

EOF

my ($fastaf, $qualf, $genomeno);
while (@ARGV) {
 my $t=shift;
 if ($t eq "-f") {$fastaf=shift}
 elsif ($t eq "-q") {$qualf=shift}
 elsif ($t eq "-n") {$genomeno=shift}
}

unless ($qualf) {$qualf = $fastaf.".qual"}

die $usage unless (-e $fastaf && -e $qualf);
my ($count, $max);
if ($genomeno)
{
	$count=$genomeno;
	my $min=1E99;
	$max=0;
	my $exc=$dbh->prepare("select count, genome from DNA where genome = $genomeno") || die $dbh->errstr;
	$exc->execute||die $dbh->errstr;
	while (my @ret=$exc->fetchrow_array)
	{
		($ret[0] < $min) ? ($min=$ret[0]) : 1;
		($ret[0] > $max) ? ($max=$ret[0]) : 1;
	}
	unless ($min && $max) {die "$genomeno does not appear to be in the database. Please confirm why we couldn't get min and max?"}
	print STDERR "Reading files starting writing at sequence $min. There were ", $max-$min+1, " sequences\n";
	$max=$min;
}
else
{
# set the information for the genome table
	print "This is the information for the Genome table in the sequences database\nPlease enter the name of the genome:\n";
	my $name=<STDIN>;
	chomp($name);
	my $exc=$dbh->prepare("select count from genome where name like '$name'");
	$exc->execute || die $dbh->errstr;
	($count)=$exc->fetchrow_array;
	die "The genome $name already appears in the database as number $count\n" if ($count);

	print "If the sequence is for a phage, and the host is known, please enter the host (or leave blank):\n";
	my $host=<STDIN>;
	chomp($host);

	print "If the sequence has been assembled, please enter the path to the assembled sequence (or leave blank):\n";
	my $path=<STDIN>;
	chomp($path);

	$exc=$dbh->prepare("insert into genome (count, name, host, assembled_sequence) values ('', '$name', '$host', '$path')");
	$exc->execute || die $dbh->errstr;
	$exc=$dbh->prepare("select count from genome where name like '$name'");
	$exc->execute || die $dbh->errstr;
	($count)=$exc->fetchrow_array;

	$exc=$dbh->prepare("select count from DNA");
	$exc->execute || die $dbh->errstr;
	$max=0;
	while (my @ret=$exc->fetchrow_array) {$max=$ret[0] if ($ret[0] > $max)}
	$max++;

	print "The new genome is number $count and the first sequence will be $max\nReading and dereplicating the fasta file\n";
}

# read and de replicate the fasta file
my $fasta=Rob->read_fasta($fastaf);
{
	my $seqs; 
	map {$fasta->{$_} =~ s/\s+//g; $seqs->{$fasta->{$_}}=$_} keys %$fasta;
	undef $fasta;
	map {$fasta->{$seqs->{$_}}=$_} keys %$seqs;
}

my $qual=Rob->read_fasta($qualf, 1);
map {$qual->{$_} =~ s/\s+/ /g} keys %$qual;

print "Inserting data into database and rewriting files\n";
open (FA, ">$count.fa") || die "Can't open $count.fa for writing\n";
open (QU, ">$count.qual") || die "Can't open $count.qual for writing\n";

foreach my $s (keys %$fasta)
{
	unless ($qual->{$s}) {print STDERR "No quality for $s. Skipped\n"; next}
	print FA ">$max\n", $fasta->{$s}, "\n";
	print QU ">$max\n", $qual->{$s}, "\n";
	unless ($genomeno)
	{
		my $exc=$dbh->prepare("insert into DNA (count, genome, original_name, trace_file, sequence_file, quality_file) values 
				($max, '$count', '$s', '', '$count.fa', '$count.qual')");
		$exc->execute || die $dbh->errstr;
	}
	$max++;
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3