[Bio] / FigMetagenomeTools / renumber_sequences.pl Repository:
ViewVC logotype

View of /FigMetagenomeTools/renumber_sequences.pl

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.2 - (download) (as text) (annotate)
Fri Mar 16 20:51:25 2007 UTC (13 years, 3 months ago) by olson
Branch: MAIN
Changes since 1.1: +63 -32 lines
initial tweaks

#!/usr/bin/perl -w

# renumber sequences and remove any exact duplicate sequences

use strict;
use DBI;
use Rob;

my @opts;

my $dbh=DBI->connect($FIG_Config::metagenome_dsn, $FIG_Config::metagenome_dbuser, $FIG_Config::metagenome_dbpass,
		    { RaiseError => 1 })
	or die "Can't connect to database sequences\n";

my $usage=<<EOF;

-f fasta file
-q quality file
-g genome-name
-h host-name
-p assembled-path

-n number of genome. If this is provided, then the data will not be added to the database, and only the files will be written.


my ($fastaf, $qualf, $genomeno, $genome_name, $genome_host, $asm_path);
while (@ARGV) {
 my $t=shift;
 if ($t eq "-f") {$fastaf=shift}
 elsif ($t eq "-q") {$qualf=shift}
 elsif ($t eq "-n") {$genomeno=shift}
 elsif ($t eq "-g") {$genome_name = shift}
 elsif ($t eq "-h") {$genome_host = shift}
 elsif ($t eq "-p") {$asm_path = shift}


unless ($qualf) {$qualf = $fastaf.".qual"}

die $usage unless (-e $fastaf && -e $qualf);
my ($count, $max);
if ($genomeno)
	my $min=1E99;
	my $exc=$dbh->prepare("select count, genome from DNA where genome = $genomeno") || die $dbh->errstr;
	$exc->execute||die $dbh->errstr;
	while (my @ret=$exc->fetchrow_array)
		($ret[0] < $min) ? ($min=$ret[0]) : 1;
		($ret[0] > $max) ? ($max=$ret[0]) : 1;
	unless ($min && $max) {die "$genomeno does not appear to be in the database. Please confirm why we couldn't get min and max?"}
	print STDERR "Reading files starting writing at sequence $min. There were ", $max-$min+1, " sequences\n";
    # set the information for the genome table#
    # Assume that if the genome name was passed on the command line that genome_host
    # was also passed (meaning that an undefined value for genome_host means it is not defined,
    # not that we need to ask the user).

    my $ask_user;
    if (!defined($genome_name))
	print "This is the information for the Genome table in the sequences database\nPlease enter the name of the genome:\n";
	$genome_name = <STDIN>;
    my $res = $dbh->selectall_arrayref("select count from genome where name = ?", undef, $genome_name);
    if (@$res)
	die "The genome $genome_name already appears in the database as number $count\n";
    $count = $res->[0]->[0];

    if ($ask_user)
	print "If the sequence is for a phage, and the host is known, please enter the host (or leave blank):\n";
	my $genome_host=<STDIN>;

	print "If the sequence has been assembled, please enter the path to the assembled sequence (or leave blank):\n";
	my $asm_path=<STDIN>;

    $dbh->do("insert into genome (name, host, assembled_sequence) values (?, ?, ?)", undef,
	     $genome_name, $genome_host, $asm_path);

    my $res = $dbh->selectcol_arrayref("select count from genome where name = ?", undef, $genome_name);
    $res or die $dbh->errstr;
    $count = $res->[0];

    $res = $dbh->selectcol_arrayref("select max(count) from DNA");
    $res or die $dbh->errstr;
    $max = $res->[0];
    print "The new genome is number $count and the first sequence will be $max\nReading and dereplicating the fasta file\n";

# read and de replicate the fasta file
my $fasta=Rob->read_fasta($fastaf);
	my $seqs; 
	map {$fasta->{$_} =~ s/\s+//g; $seqs->{$fasta->{$_}}=$_} keys %$fasta;
	undef $fasta;
	map {$fasta->{$seqs->{$_}}=$_} keys %$seqs;

my $qual=Rob->read_fasta($qualf, 1);
map {$qual->{$_} =~ s/\s+/ /g} keys %$qual;

print "Inserting data into database and rewriting files\n";
open (FA, ">$count.fa") || die "Can't open $count.fa for writing\n";
open (QU, ">$count.qual") || die "Can't open $count.qual for writing\n";

my $exc=$dbh->prepare(qq(insert into DNA (count, genome, original_name, trace_file, sequence_file, quality_file)
			 values (?, ?, ?, ?, ?, ?)));

foreach my $s (keys %$fasta)
	unless ($qual->{$s}) {print STDERR "No quality for $s. Skipped\n"; next}
	print FA ">$max\n", $fasta->{$s}, "\n";
	print QU ">$max\n", $qual->{$s}, "\n";
	unless ($genomeno)
	    $exc->execute($max, $count, $s, '', "$count.fa", "$count.qual")  || die $dbh->errstr;


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3