[Bio] / FigKernelScripts / collect_related_sequences.pl Repository:
ViewVC logotype

View of /FigKernelScripts/collect_related_sequences.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Sat Apr 28 22:35:02 2007 UTC (12 years, 7 months ago) by golsen
Branch: MAIN
Changes since 1.1: +68 -9 lines
Improved the functionality for finding related sequences in SEED.

########################################################################
# -*- perl -*-
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

#   collect_related_sequences  [options] dbfile seqfile

# use Data::Dumper;
use strict;
use gjoseqlib;
use collect_related_sequences;

#  $is_SEED is 1 when the FIG_Config module can be loaded, 0 otherwise.
#  The flags -f and -n are accepted only when it is 1.
my $is_SEED = eval { require FIG_Config; 1 } ? 1 : 0;

#  Help text appended to every error message printed by usage().
#  "D=" marks the default value used when an option is not given.
my $usage =<<"End_of_Usage";

Usage:  collect_related_sequences  [options]  dbfile  seqfile

Options:

    -c  min_coverage  # D=0.80
    -d  tmp_dir       # name of temporary directory
    -e  max_e_value   # D=0.001
    -f  'fid ...'     # use sequence(s) from SEED (instead of seqfile)
    -i  min_identity  # D=0.25
    -m                # do NOT merge queries with found sequences
    -n                # use SEED nr database (instead of dbfile)
    -t  tmp           # place for temporary directory
    -x  extra_ends    # extra length at ends (D=10)

End_of_Usage

#  Option values, initialized to their defaults:

my $min_coverage = 0.80;     # -c
my $tmp_dir;                 # -d
my $max_e_value  = 0.001;    # -e
my $fids;                    # -f  (SEED only)
my $min_identity = 0.25;     # -i
my $merge        = 1;        # -m clears this
my $nr           = 0;        # -n sets this (SEED only)
my $tmp;                     # -t
my $extra_ends   = 10;       # -x

#  Consume the leading command-line flags.  A flag's value may be attached
#  to the flag (-c0.9) or supplied as the next argument (-c 0.9).  The
#  attached text is tested with length(), not for truth, so that an attached
#  value of 0 (e.g., -x0) is used rather than discarded in favor of the
#  following argument.

while ( @ARGV && $ARGV[0] =~ /^-/ )
{
    my $flag = shift @ARGV;

    #  Value of the current flag: the text left after the flag letter has
    #  been stripped, or else the next command-line argument.
    my $value = sub { length( $flag ) ? $flag : shift @ARGV };

    if    ( $flag =~ s/^-c// ) { $min_coverage = $value->() }
    elsif ( $flag =~ s/^-d// ) { $tmp_dir      = $value->() }
    elsif ( $flag =~ s/^-e// ) { $max_e_value  = $value->() }
    elsif ( $flag =~ s/^-i// ) { $min_identity = $value->() }
    elsif ( $flag =~ s/^-m// ) { $merge        = 0 }
    elsif ( $flag =~ s/^-t// ) { $tmp          = $value->() }
    elsif ( $flag =~ s/^-x// ) { $extra_ends   = $value->() }
    elsif ( $is_SEED && $flag =~ s/^-f// ) { $fids = $value->() }
    elsif ( $is_SEED && $flag =~ s/^-n// ) { $nr   = 1          }
    elsif ( $flag =~ m/^-[fn]/ )
    {
        #  Reached only when $is_SEED is false; the flag is still intact.
        usage( "-f and -n are only valid in the SEED environment.\n" );
    }
    else
    {
        usage( "Bad command flag '$flag'\n" );
    }
}

#  Database to search: the SEED nr when -n was given, otherwise the next
#  command-line argument (which is removed from @ARGV, so that any sequence
#  file name becomes the first remaining argument).

my $dbfile = $nr ? "$FIG_Config::global/nr"
                 : shift @ARGV;
defined( $dbfile ) or usage( "No database file was specified." );
-f $dbfile         or usage( "Cannot locate database file '$dbfile'." );

#  A FIG object is created only when SEED data are requested (-n or -f).
#  The FIG module is loaded at run time, and only in that case.

my $fig;
if ( $nr || $fids )
{
    require FIG;
    $fig = FIG->new;    # direct method call; avoids indirect object syntax
}

#  Query sequences, as [ id, definition, sequence ] triples.  They come from
#  SEED translations when -f was given, otherwise from the fasta file named
#  by the next command-line argument.

my @seq;
if ( ! $fids )
{
    my $seqfile = shift @ARGV;
    @seq = read_fasta( $seqfile );
    usage( "Failed to read sequences from '$seqfile'." ) unless @seq;
}
else
{
    #  Ids may be separated by commas and/or white space.
    for my $fid ( split /[,\s]+/, $fids )
    {
        my $translation = $fig->get_translation( $fid );
        next unless $translation;
        push @seq, [ $fid, '', $translation ];
    }
    usage( "Failed to get translations for '$fids'." ) unless @seq;
}

#  Assemble the search options.  The four thresholds are always passed;
#  the temporary-file locations are passed only when they were supplied.

my $options = {};
@$options{ qw( min_coverage max_e_value min_identity extra_ends ) }
    = ( $min_coverage, $max_e_value, $min_identity, $extra_ends );

my %optional = ( tmp => $tmp, tmp_dir => $tmp_dir );
foreach my $key ( sort keys %optional )
{
    $options->{ $key } = $optional{ $key } if $optional{ $key };
}

#  Run the search of the database with the query sequences:

my $found = collect_related_sequences::collect_related_sequences( $dbfile, \@seq, $options );

#  SEED nr has xxx_... ids.  We need fig ids:

if ( $nr )
{
    #  Replace each found entry by one entry per id returned from
    #  recast_ids(), keeping the original definition and sequence.
    my @recast;
    foreach my $entry ( @$found )
    {
        my ( $id, $def, $seq ) = @$entry;
        my @fig_ids = $fig->recast_ids( 'fig\|', [ $id ] );
        foreach my $fig_id ( @fig_ids )
        {
            push @recast, [ $fig_id, $def, $seq ];
        }
    }
    @$found = @recast;
}

# Merge exemplars and others:

if ( $merge )
{
    #  Queries first, then found sequences; keep only the first entry
    #  seen for each id.
    my %seen_id;
    my @unique;
    foreach my $entry ( @seq, @$found )
    {
        next if $seen_id{ $entry->[0] }++;
        push @unique, $entry;
    }
    @$found = @unique;
}

#  Write the result in fasta format and finish:

print_alignment_as_fasta( $found );
exit( 0 );


#  Report an error and terminate the program.
#
#      usage( @messages )
#
#  The messages, followed by the usage text, are joined with newlines and
#  written to STDERR.  This routine never returns.  Every call reports a
#  failure, so the process exits with a nonzero status to let calling
#  shells and scripts detect it.

sub usage
{
    print STDERR join( "\n", @_, $usage );
    exit 1;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3