[Bio] / FigKernelScripts / collect_related_sequences.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/collect_related_sequences.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : golsen 1.1 ########################################################################
2 :     # -*- perl -*-
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     # collect_related_sequences [options] dbfile seqfile
21 :    
22 :     # use Data::Dumper;
23 :     use strict;
24 :     use gjoseqlib;
25 :     use collect_related_sequences;
26 :    
27 : golsen 1.2 my $is_SEED = 0;
28 :     eval { require FIG_Config; $is_SEED = 1 };
29 :    
30 :     my $usage =<<"End_of_Usage";
31 : golsen 1.1
32 :     Usage: collect_related_sequences [options] dbfile seqfile
33 :    
34 :     Options:
35 :    
36 :     -c min_coverage # D=0.80
37 :     -d tmp_dir # name of temporary directory
38 : golsen 1.2 -e max_e_value # D=0.001
39 :     -f 'fid ...' # use sequence(s) from SEED (instead of seqfile)
40 : golsen 1.1 -i min_identity # D=0.25
41 : golsen 1.2 -m # do NOT merge queries with found sequences
42 :     -n # use SEED nr database (instead of dbfile)
43 : golsen 1.1 -t tmp # place for temporary directory
44 :     -x extra_ends # extra length at ends
45 :    
46 :     End_of_Usage
47 :    
48 :     my $min_coverage = 0.80;
49 :     my $tmp_dir;
50 : golsen 1.2 my $max_e_value = 0.001;
51 :     my $fids;
52 : golsen 1.1 my $min_identity = 0.25;
53 : golsen 1.2 my $merge = 1;
54 :     my $nr = 0;
55 : golsen 1.1 my $tmp;
56 :     my $extra_ends = 10;
57 :    
58 :     while ( $ARGV[0] =~ /^-/ )
59 :     {
60 :     $_ = shift;
61 :     if ( s/^-c// ) { $min_coverage = $_ || shift }
62 :     elsif ( s/^-d// ) { $tmp_dir = $_ || shift }
63 :     elsif ( s/^-e// ) { $max_e_value = $_ || shift }
64 :     elsif ( s/^-i// ) { $min_identity = $_ || shift }
65 : golsen 1.2 elsif ( s/^-m// ) { $merge = 0 }
66 : golsen 1.1 elsif ( s/^-t// ) { $tmp = $_ || shift }
67 :     elsif ( s/^-x// ) { $extra_ends = $_ || shift }
68 : golsen 1.2 elsif ( $is_SEED && s/^-f// ) { $fids = $_ || shift }
69 :     elsif ( $is_SEED && s/^-n// ) { $nr = 1 }
70 :     elsif ( m/^-[fn]/ )
71 :     {
72 :     usage( "-f and -n are only valid in the SEED environment.\n" );
73 :     }
74 : golsen 1.1 else
75 :     {
76 :     usage( "Bad command flag '$_'\n" );
77 :     }
78 :     }
79 :    
80 : golsen 1.2 my ( $dbfile, $seqfile ) = @ARGV;
81 :    
82 :     my $dbfile = $nr ? "$FIG_Config::global/nr"
83 :     : shift @ARGV;
84 :     -f $dbfile || usage( "Cannot locate database file '$dbfile'." );
85 : golsen 1.1
86 : golsen 1.2 my $fig;
87 :     if ( $nr || $fids )
88 :     {
89 :     require FIG;
90 :     $fig = new FIG;
91 :     }
92 : golsen 1.1
93 : golsen 1.2 my @seq;
94 :     if ( $fids )
95 :     {
96 :     foreach my $fid ( split /[,\s]+/, $fids )
97 :     {
98 :     my $seq = $fig->get_translation( $fid );
99 :     push @seq, [ $fid, '', $seq ] if $seq;
100 :     }
101 :     @seq or usage( "Failed to get translations for '$fids'." );
102 :     }
103 :     else
104 :     {
105 :     my $seqfile = shift @ARGV;
106 :     @seq = read_fasta( $seqfile );
107 :     @seq or usage( "Failed to read sequences from '$seqfile'." );
108 :     }
109 : golsen 1.1
110 :     my $options =
111 :     { min_coverage => $min_coverage,
112 :     max_e_value => $max_e_value,
113 :     min_identity => $min_identity,
114 :     extra_ends => $extra_ends
115 :     };
116 :     $options->{ tmp } = $tmp if $tmp;
117 :     $options->{ tmp_dir } = $tmp_dir if $tmp_dir;
118 :    
119 : golsen 1.2 my $found = collect_related_sequences::collect_related_sequences( $dbfile, \@seq, $options );
120 :    
121 :     # SEED nr has xxx_... ids. We need fig ids:
122 :    
123 :     if ( $nr )
124 :     {
125 :     my ( $id, $def, $seq, @ids );
126 :     @$found = map { ( $id, $def, $seq ) = @$_;
127 :     @ids = $fig->recast_ids( 'fig\|', [ $id ] );
128 :     map { [ $_, $def, $seq ] } @ids
129 :     }
130 :     @$found;
131 :     }
132 :    
133 :     # Merge exemplars and others:
134 :    
135 :     if ( $merge )
136 :     {
137 :     my %seen;
138 :     @$found = grep { ! $seen{ $_->[0] }++ } @seq, @$found;
139 :     }
140 : golsen 1.1
141 : golsen 1.2 print_alignment_as_fasta( $found );
142 : golsen 1.1 exit;
143 :    
144 :    
145 :     sub usage
146 :     {
147 :     print STDERR join( "\n", @_, $usage );
148 :     exit;
149 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3