[Bio] / FigMetagenomeTools / renumber_sequences.pl Repository:
ViewVC logotype

View of /FigMetagenomeTools/renumber_sequences.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Tue Jul 10 16:39:58 2007 UTC (12 years, 5 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_rel_2008_0806, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_rel_2008_0923, mgrast_release_3_0, mgrast_dev_03252011, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, mgrast_rel_2008_0919, mgrast_rel_2008_1110, myrast_33, mgrast_rel_2008_0917, mgrast_dev_04052011, mgrast_dev_02222011, HEAD
Changes since 1.4: +38 -14 lines
Fixes for not using a qual file

#!/usr/bin/perl -w

# renumber sequences and remove any exact duplicate sequences

use strict;
use DBI;
use Rob;

my @opts;

my $dbh=DBI->connect($FIG_Config::metagenome_dsn, $FIG_Config::metagenome_dbuser, $FIG_Config::metagenome_dbpass,
		    { RaiseError => 1 })
	or die "Can't connect to database sequences\n";


my $usage=<<EOF;

$0
-f fasta file
-q quality file
-g genome-name
-h host-name
-p assembled-path
OPTIONAL

-n number of genome. If this is provided, then the data will not be added to the database, and only the files will be written.

EOF

my ($fastaf, $qualf, $genomeno, $genome_name, $genome_host, $asm_path);
while (@ARGV) {
 my $t=shift;
 if ($t eq "-f") {$fastaf=shift}
 elsif ($t eq "-q") {$qualf=shift}
 elsif ($t eq "-n") {$genomeno=shift}
 elsif ($t eq "-g") {$genome_name = shift}
 elsif ($t eq "-h") {$genome_host = shift}
 elsif ($t eq "-p") {$asm_path = shift}
}

#unless ($qualf) {$qualf = $fastaf.".qual"}

die $usage unless (-e $fastaf);
my ($count, $max);

$dbh->begin_work();

if ($genomeno)
{
	$count=$genomeno;
	my $min=1E99;
	$max=0;
	my $exc=$dbh->prepare("select count, genome from DNA where genome = $genomeno") || die $dbh->errstr;
	$exc->execute||die $dbh->errstr;
	while (my @ret=$exc->fetchrow_array)
	{
		($ret[0] < $min) ? ($min=$ret[0]) : 1;
		($ret[0] > $max) ? ($max=$ret[0]) : 1;
	}
	unless ($min && $max) {die "$genomeno does not appear to be in the database. Please confirm why we couldn't get min and max?"}
	print STDERR "Reading files starting writing at sequence $min. There were ", $max-$min+1, " sequences\n";
	$max=$min;
}
else
{
    # set the information for the genome table#
    #
    # Assume that if the genome name was passed on the command line that genome_host
    # was also passed (meaning that an undefined value for genome_host means it is not defined,
    # not that we need to ask the user).
    #

    my $ask_user;
    if (!defined($genome_name))
    {
	$ask_user++;
	print "This is the information for the Genome table in the sequences database\nPlease enter the name of the genome:\n";
	$genome_name = <STDIN>;
	chomp($genome_name);
    }

    #
    # Allow duplicate genome names.
    #
    # my $res = $dbh->selectall_arrayref("select count from genome where name = ?", undef, $genome_name);
    # if (@$res)
    # {
# 	die "The genome $genome_name already appears in the database as number $count\n";
 #    }
    
  #   $count = $res->[0]->[0];

    if ($ask_user)
    {
	print "If the sequence is for a phage, and the host is known, please enter the host (or leave blank):\n";
	my $genome_host=<STDIN>;
	chomp($genome_host);

	print "If the sequence has been assembled, please enter the path to the assembled sequence (or leave blank):\n";
	my $asm_path=<STDIN>;
	chomp($asm_path);
    }

    $dbh->do("insert into genome (name, host, assembled_sequence) values (?, ?, ?)", undef,
	     $genome_name, $genome_host, $asm_path);

    my $res = $dbh->selectcol_arrayref("select count from genome where name = ?", undef, $genome_name);
    $res or die $dbh->errstr;
    $count = $res->[0];

    $res = $dbh->selectcol_arrayref("select max(count) from DNA");
    $res or die $dbh->errstr;
    $max = $res->[0];
    
    $max++;
    
    print "The new genome is number $count and the first sequence will be $max\nReading and dereplicating the fasta file\n";
}
$dbh->commit();

# read and de replicate the fasta file
my $fasta=Rob->read_fasta($fastaf);
{
	my $seqs; 
	map {$fasta->{$_} =~ s/\s+//g; $seqs->{$fasta->{$_}}=$_} keys %$fasta;
	undef $fasta;
	map {$fasta->{$seqs->{$_}}=$_} keys %$seqs;
}

my $qual;
if ($qualf)
{
    $qual=Rob->read_fasta($qualf, 1);
    map {$qual->{$_} =~ s/\s+/ /g} keys %$qual;
}

print "Inserting data into database and rewriting files\n";
open (FA, ">$count.fa") || die "Can't open $count.fa for writing\n";
if ($qualf)
{
    open (QU, ">$count.qual") || die "Can't open $count.qual for writing\n";
}

$dbh->begin_work();
my $exc=$dbh->prepare(qq(insert into DNA ( genome, original_name, trace_file, sequence_file, quality_file)
			 values (?, ?, ?, ?, ?)));

foreach my $s (keys %$fasta)
{
    my $id;

    if ($qualf)
    {
	if (!$qual->{$s})
	{
	    print STDERR "No quality for $s. Skipped\n";
	    next;
	}
    }
    
    if ($genomeno)
    {
	$id = $max++;
    }
    else
    {
	$exc->execute($count, $s, '', "$count.fa", "$count.qual")  || die $dbh->errstr;
	$id = $dbh->{mysql_insertid};
    }
    
    print FA ">$id\n", $fasta->{$s}, "\n";
    if ($qualf)
    {
	print QU ">$id\n", $qual->{$s}, "\n";
    }
}
close(FA);
close(QU) if $qualf;
$dbh->commit();

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3