[Bio] / FigKernelPackages / CallBlast.pm Repository:
ViewVC logotype

View of /FigKernelPackages/CallBlast.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Mon Jul 16 19:44:35 2007 UTC (12 years, 8 months ago) by parrello
Branch: MAIN
CVS Tags: HEAD
Changes since 1.2: +0 -0 lines
FILE REMOVED
Moved code to SHToolSearch.

#!/usr/bin/perl -w

package CallBlast;

    require Exporter;
    @ISA = ('Exporter');
    @EXPORT = qw(ExecBlast);
    @EXPORT_OK = qw();

    use strict;
    use Tracer;
    use Sim;

=head1 Blast Caller

=head2 Introduction

This package exports the B<ExecBlast> method, which can be used to perform a
BLAST from a FASTA file against one or more genomes.

=cut

=head2 Data Structures

=head3 ToolTable

The tool table is a hash that provides useful information about each blast tool.
The hash maps each tool name to a hash reference. The various fields in the
hashes are as follows.

=over 4

=item db_type

Type of database against which the tool runs: C<prot> for a protein database and
C<dna> for a DNA database.

=back

=cut

my %ToolTable = (   blastp => {     db_type => 'prot'
                              },
                    blastx => {     db_type => 'prot'
                              },
                );

=head2 Public Methods

=head3 ExecBlast

C<< my @sims = ExecBlast($fig, $seqFile, \@genomes, $tool, $options); >>

Call BLAST to compute the similarities for a sequence.

=over 4

=item fig

A FIG-like object for accessing the data store.

=item seqFile

Name of a file containing the sequence to blast in FASTA format.

=item genomes

A list of the IDs for the target genomes of the blast.

=item tool

Name of the blast tool to use.

=item options

Options to pass to the blast tool, formatted for the command line.

=item RETURN

Returns a list of similarity objects, each representing a segment similar to
the incoming sequence.

=back

=cut

sub ExecBlast {
    # Get the parameters.
    my ($fig, $seqFile, $genomes, $tool, $options) = @_;
    # Declare the return variable.
    my @retVal = ();
    # Insure the blast tools can find the blast matrix directory.
    if (! $ENV{"BLASTMAT"}) { $ENV{"BLASTMAT"} = "$FIG_Config::blastmat" }
    # Get the location of blastall.
    my $blastall = "$FIG_Config::ext_bin/blastall";
    Trace("CallBlast for file $seqFile, tool => $tool, options => $options.") if T(2);
    # Get this tool's parameters.
    my $toolData = $ToolTable{$tool};
    # Determine whether or not this is a multi-genome call.
    my $genomeCount = scalar(@$genomes);
    Trace("$genomeCount genomes in list.") if T(2);
    my $multiGenome = ($genomeCount > 1 ? 1 : 0);
    # Loop through the genome list.
    for my $genome (@$genomes) {
        Trace("Blasting against $genome.") if T(3);
        # Get the target database location for this genome.
        my $db = ($toolData->{db_type} eq 'prot' ?
                  "$FIG_Config::organisms/$genome/Features/peg/fasta" :
                  "$FIG_Config::organisms/$genome/contigs");
        # Only proceed if the database has data in it.
        if (-s $db) {
            # Verify the database.
            VerifyDB($db, $toolData->{db_type});
            # Compute the BLAST parameters.
            my @args = ('-i', $seqFile, '-d', $db, '-m', 8, '-FF', '-p', $tool);
            my $command = join(" ", $blastall, @args);
            # Call blastall.
            Trace("Blasting: $command") if T(3);
            my @data = TICK($command);
            Trace(scalar(@data) . " lines returned from BLAST.") if T(3);
            # If this is a multiple-genome situation, limit the output
            # to the configured number of lines.
            if ($multiGenome && @data > $FIG_Config::blast_limit) {
                Trace("Limiting blast output.") if T(3);
                $#data = $FIG_Config::blast_limit - 1;
            }
            # Process the lines.
            for my $line (@data) {
                # Convert this line into a Sim object.
                my $sim = Sim->new_from_line($line);
                # Push it onto the result list.
                push @retVal, $sim;
            }
        }
    }
    # Sort the results by score.
    @retVal = sort { $a->psc <=> $b->psc } @retVal;
    # Return the resulting list.
    return @retVal;
}

=head2 Blast Utilities

=head3 VerifyDB

C<< CallBlast::VerifyDB($db, $type); >>

Verify that the specified FASTA file has BLAST databases. If the databases
do not exist, they will be created. If they are older than the FASTA file,
they will be regenerated.

=over 4

=item db

Name of the FASTA file.

=item type

Type of database desired: C<prot> for protein and C<dna> for DNA.

=back

=cut

sub VerifyDB {
    # Get the parameters.
    my ($db, $type) = @_;
    # Process according to the data type.
    if ($type eq 'prot') {
        if ((! -s "$db.psq") || (-M "$db.psq" > -M $db)) {
            Trace("Building protein FASTA database for $db.") if T(3);
            system "$FIG_Config::ext_bin/formatdb -p T -i $db";
        }
    } else {
        if ((! -s "$db.nsq") || (-M "$db.nsq" > -M $db)) {
            Trace("Building DNA FASTA database for $db.") if T(3);
            system "$FIG_Config::ext_bin/formatdb -p F -i $db";
        }
    }
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3