[Bio] / FigMetagenomeTools / renumber_sequences.pl Repository:
ViewVC logotype

View of /FigMetagenomeTools/renumber_sequences.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Sat Apr 14 22:35:31 2007 UTC (13 years, 3 months ago) by olson
Branch: MAIN
Changes since 1.3: +18 -10 lines
finish fixign renumber stuff

#!/usr/bin/perl -w

# renumber sequences and remove any exact duplicate sequences

use strict;
use DBI;
use Rob;

my @opts;

my $dbh=DBI->connect($FIG_Config::metagenome_dsn, $FIG_Config::metagenome_dbuser, $FIG_Config::metagenome_dbpass,
		    { RaiseError => 1 })
	or die "Can't connect to database sequences\n";


my $usage=<<EOF;

$0
-f fasta file
-q quality file
-g genome-name
-h host-name
-p assembled-path
OPTIONAL

-n number of genome. If this is provided, then the data will not be added to the database, and only the files will be written.

EOF

my ($fastaf, $qualf, $genomeno, $genome_name, $genome_host, $asm_path);
while (@ARGV) {
 my $t=shift;
 if ($t eq "-f") {$fastaf=shift}
 elsif ($t eq "-q") {$qualf=shift}
 elsif ($t eq "-n") {$genomeno=shift}
 elsif ($t eq "-g") {$genome_name = shift}
 elsif ($t eq "-h") {$genome_host = shift}
 elsif ($t eq "-p") {$asm_path = shift}
}

unless ($qualf) {$qualf = $fastaf.".qual"}

die $usage unless (-e $fastaf && -e $qualf);
my ($count, $max);

$dbh->begin_work();

if ($genomeno)
{
	$count=$genomeno;
	my $min=1E99;
	$max=0;
	my $exc=$dbh->prepare("select count, genome from DNA where genome = $genomeno") || die $dbh->errstr;
	$exc->execute||die $dbh->errstr;
	while (my @ret=$exc->fetchrow_array)
	{
		($ret[0] < $min) ? ($min=$ret[0]) : 1;
		($ret[0] > $max) ? ($max=$ret[0]) : 1;
	}
	unless ($min && $max) {die "$genomeno does not appear to be in the database. Please confirm why we couldn't get min and max?"}
	print STDERR "Reading files starting writing at sequence $min. There were ", $max-$min+1, " sequences\n";
	$max=$min;
}
else
{
    # set the information for the genome table#
    #
    # Assume that if the genome name was passed on the command line that genome_host
    # was also passed (meaning that an undefined value for genome_host means it is not defined,
    # not that we need to ask the user).
    #

    my $ask_user;
    if (!defined($genome_name))
    {
	$ask_user++;
	print "This is the information for the Genome table in the sequences database\nPlease enter the name of the genome:\n";
	$genome_name = <STDIN>;
	chomp($genome_name);
    }
    my $res = $dbh->selectall_arrayref("select count from genome where name = ?", undef, $genome_name);
    if (@$res)
    {
	die "The genome $genome_name already appears in the database as number $count\n";
    }
    
    $count = $res->[0]->[0];

    if ($ask_user)
    {
	print "If the sequence is for a phage, and the host is known, please enter the host (or leave blank):\n";
	my $genome_host=<STDIN>;
	chomp($genome_host);

	print "If the sequence has been assembled, please enter the path to the assembled sequence (or leave blank):\n";
	my $asm_path=<STDIN>;
	chomp($asm_path);
    }

    $dbh->do("insert into genome (name, host, assembled_sequence) values (?, ?, ?)", undef,
	     $genome_name, $genome_host, $asm_path);

    my $res = $dbh->selectcol_arrayref("select count from genome where name = ?", undef, $genome_name);
    $res or die $dbh->errstr;
    $count = $res->[0];

    $res = $dbh->selectcol_arrayref("select max(count) from DNA");
    $res or die $dbh->errstr;
    $max = $res->[0];
    
    $max++;
    
    print "The new genome is number $count and the first sequence will be $max\nReading and dereplicating the fasta file\n";
}
$dbh->commit();

# read and de replicate the fasta file
my $fasta=Rob->read_fasta($fastaf);
{
	my $seqs; 
	map {$fasta->{$_} =~ s/\s+//g; $seqs->{$fasta->{$_}}=$_} keys %$fasta;
	undef $fasta;
	map {$fasta->{$seqs->{$_}}=$_} keys %$seqs;
}

my $qual=Rob->read_fasta($qualf, 1);
map {$qual->{$_} =~ s/\s+/ /g} keys %$qual;

print "Inserting data into database and rewriting files\n";
open (FA, ">$count.fa") || die "Can't open $count.fa for writing\n";
open (QU, ">$count.qual") || die "Can't open $count.qual for writing\n";

my $exc=$dbh->prepare(qq(insert into DNA ( genome, original_name, trace_file, sequence_file, quality_file)
			 values (?, ?, ?, ?, ?)));

foreach my $s (keys %$fasta)
{
    my $id;

    unless ($qual->{$s}) {print STDERR "No quality for $s. Skipped\n"; next}
    
    if ($genomeno)
    {
	$id = $max++;
    }
    else
    {
	$exc->execute($count, $s, '', "$count.fa", "$count.qual")  || die $dbh->errstr;
	$id = $dbh->{mysql_insertid};
    }
    
    print FA ">$id\n", $fasta->{$s}, "\n";
    print QU ">$id\n", $qual->{$s}, "\n";
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3