[Bio] / FigKernelScripts / svr_drug_resistance_proteins.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/svr_drug_resistance_proteins.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : fangfang 1.1 #
2 :     # This is a SAS Component
3 :     #
4 :     ########################################################################
5 :     # Copyright (c) 2003-2013 University of Chicago and Fellowship
6 :     # for Interpretations of Genomes. All Rights Reserved.
7 :     #
8 :     # This file is part of the SEED Toolkit.
9 :     #
10 :     # The SEED Toolkit is free software. You can redistribute
11 :     # it and/or modify it under the terms of the SEED Toolkit
12 :     # Public License.
13 :     #
14 :     # You should have received a copy of the SEED Toolkit Public License
15 :     # along with this program; if not write to the University of Chicago
16 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
17 :     # Genomes at veronika@thefig.info or download a copy from
18 :     # http://www.theseed.org/LICENSE.TXT.
19 :     ########################################################################
20 :    
21 :     my $usage = <<'End_of_Usage';
22 :    
23 :     Identify the drug resistance proteins in one or more genomes.
24 :    
25 :     Usage: svr_drug_resistance_proteins [options] < genome_ids > proteins_found
26 : fangfang 1.3 svr_drug_resistance_proteins -q [options] aa_query_file ... > proteins_found
27 : fangfang 1.1 svr_drug_resistance_proteins -g [options] [ genome_id ... ] > proteins_found
28 :    
29 :     Output:
30 :    
31 :     sequence_id \t function_of_most_similar_ref_protein \t sequence \t tag
32 :    
33 :     Options:
34 :    
35 :     -a 'role' # Proposed assignment for sequences whose closest referece match has no assignment
36 :     -b # Add a blank line between genomes (D = no blank)
37 : fangfang 1.3 -c min-cov # Min coverage (D = defined in the module of representatives)
38 : fangfang 1.1 -d domains # One or more of the letters A, B and/or E, run together
39 : fangfang 1.3 -e e-value # Max e-value for calling proteins (D = defined in the module of representatives)
40 : fangfang 1.1 -f # Fasta output format
41 :     -g # Parameters are genome ids. No ids gives all complete genomes.
42 : fangfang 1.3 -i ident # Min percent identity (D = defined in module of representatives)
43 : fangfang 1.1 -m module # Perl module with reference sequences for the RNA type
44 :     -p # Include partial Sapling genomes
45 : fangfang 1.3 -q # Parameters are query file names (D = genome ids from STDIN)
46 : fangfang 1.1 -r reffile # File of reference sequences for the protein type
47 :     -s # Do not show sequence
48 :     -t 'tag' # A short tage to identify the nature of the feature; allows
49 :     # mixing of different types; empty string supresses the
50 :     # field (D = '')
51 :     -u url # url of the desired sapling server
52 :     -v # Send some progress information to STDERR
53 :    
54 :     The reference sequence perl modules have common internal variable names. For
55 :     example, Prot_reps_drug_resistance begins with:
56 :    
57 :     package Prot_reps;
58 :     use strict;
59 :     use gjoseqlib;
60 :    
61 : golsen 1.2 our @prot_reps = gjoseqlib::read_fasta( \*DATA );
62 :     our $assignment = 'Protein matching drug resistance reference sequences';
63 :     our $feature_type = 'protein';
64 :     our $max_expect = 1e-10;
65 :     our $max_extrapolate = 10;
66 :     our $max_rep_sim = 0.87; # identity below which a new rep is needed
67 :     our $min_coverage = 0.70;
68 :     our $min_identity = 0.30;
69 :     our $tag = 'drug_resistance';
70 : fangfang 1.1
71 :     User-supplied command options will override the default values.
72 :    
73 :     End_of_Usage
74 :    
75 :     use SeedEnv;
76 :     use strict;
77 :     use find_homologs;
78 :     use gjoseqlib;
79 :     use gjoalignment;
80 :     use Data::Dumper;
81 :    
82 :     my $assignment = '';
83 :     my $blank = 0;
84 :     my $complete = 1;
85 :     my $queryfiles = 0;
86 :     my $coverage = undef;
87 :     my $domains = '';
88 :     my $expect = 1e-5;
89 :     my $extrapolate = undef;
90 :     my $fasta = 0;
91 :     my $genomeids = 0;
92 :     my $identity = undef;
93 :     my $just_ends = 0;
94 :     my $loc_format = 'SEED';
95 :     my $module = '';
96 :     my $no_seq = 0;
97 :     my $quality = 0;
98 :     my $reffile = '';
99 :     my $tag;
100 :     my $url;
101 :     my $verbose = 0;
102 :    
103 :     while ( @ARGV && $ARGV[0] =~ s/^-// )
104 :     {
105 :     local $_ = shift;
106 :    
107 :     if ( s/^a// ) { $assignment = /\S/ ? $_ : shift; next }
108 : fangfang 1.3 if ( s/^c// ) { $coverage = /\S/ ? $_ : shift; next }
109 : fangfang 1.1 if ( s/^d// ) { $domains = /\S/ ? $_ : shift; next }
110 : fangfang 1.3 if ( s/^e// ) { $expect = /\S/ ? $_ : shift; next }
111 : fangfang 1.1 if ( s/^i// ) { $identity = /\S/ ? $_ : shift; next }
112 :     if ( s/^m// ) { $module = /\S/ ? $_ : shift; next }
113 :     if ( s/^r// ) { $reffile = /\S/ ? $_ : shift; next }
114 :     if ( s/^t// ) { $tag = /\S/ ? $_ : shift; next }
115 :     if ( s/^u// ) { $url = /\S/ ? $_ : shift; next }
116 :    
117 :     if ( s/b//g ) { $blank = 1 }
118 : fangfang 1.3 if ( s/q//g ) { $queryfiles = 1 }
119 : fangfang 1.1 if ( s/g//g ) { $genomeids = 1 }
120 :     if ( s/s//g ) { $no_seq = 1 }
121 :     if ( s/v//g ) { $verbose = 1 }
122 :     if ( s/f//g ) { $fasta = 1 }
123 :    
124 :     if ( m/\S/ )
125 :     {
126 :     print STDERR "Bad flag '$_'\n", $usage;
127 :     exit;
128 :     }
129 :     }
130 :    
131 :     my @prot;
132 :    
133 :     if ( $reffile )
134 :     {
135 :     -f $reffile
136 :     or print STDERR "Invalid reference sequence file '$reffile'.\n", $usage
137 :     and exit;
138 :     @prot = gjoseqlib::read_fasta( $reffile )
139 :     or print STDERR "No sequences found in '$reffile'.\n", $usage
140 :     and exit;
141 :     $assignment ||= "Protein based on similarity to $reffile data";
142 :     }
143 :     else
144 :     {
145 :     $module ||= 'Prot_reps_drug_resistance';
146 :     $module =~ s/\.pm$//;
147 :     eval { require "$module.pm" };
148 :     if ( $@ )
149 :     {
150 :     print STDERR "Failed in require '$module'.\n$@\n", $usage
151 :     and exit;
152 :     }
153 :    
154 :     @prot = @Prot_reps::prot_reps;
155 : golsen 1.2 $assignment = $Prot_reps::assignment if ! $assignment && $Prot_reps::assignment;
156 :     $tag = $Prot_reps::tag if ! $tag && $Prot_reps::tag;
157 : fangfang 1.1
158 : golsen 1.2 $coverage = $Prot_reps::min_coverage if ! defined $coverage && $Prot_reps::min_coverage;
159 :     $identity = $Prot_reps::min_identity if ! defined $identity && $Prot_reps::min_identity;
160 :     $expect = $Prot_reps::max_expect if $Prot_reps::max_expect;
161 :     $extrapolate = $Prot_reps::max_extrapolate if ! defined $extrapolate && $Prot_reps::max_extrapolate;
162 : fangfang 1.1 }
163 :    
164 :     ensure_defined( $assignment, 'Protein matching reference data' );
165 :     ensure_defined( $coverage, 0.7 );
166 :     ensure_defined( $extrapolate, 20 );
167 :     ensure_defined( $identity, 0.3 );
168 :    
169 :     my $sapObject = $queryfiles ? '' : SAPserver->new( $url ? ( url => $url ) : () );
170 :    
171 :     my @genomes = @ARGV;
172 :     if ( ! $genomeids && ! @genomes )
173 :     {
174 :     @genomes = map { /(\d+\.\d+)/ ? $1 : () }
175 :     map { chomp; ( split /\t/ )[-1] }
176 :     <>;
177 :     }
178 :     elsif ( $genomeids && ! @genomes )
179 :     {
180 :     my $prok = $domains =~ m/E/i ? 0 : 1;
181 :     my $genomeH = $sapObject->all_genomes( -complete => $complete, -prokaryotic => $prok );
182 :     @genomes = map { $_->[0] }
183 :     sort { $a->[1] <=> $b->[1] || $a->[2] <=> $b->[2] }
184 :     map { [ $_, split /\./ ] } # [ genome_id, taxon_id, instance ]
185 :     keys %$genomeH;
186 :     }
187 :    
188 :     my $pegsH;
189 :     if ( ! $queryfiles )
190 :     {
191 :     $pegsH = $sapObject->all_features( -ids => \@genomes, -type => [ 'peg' ] );
192 :     }
193 :    
194 :     my $options = { description => $assignment,
195 :     extrapolate => $extrapolate,
196 :     refseq => \@prot
197 :     };
198 :     $options->{ coverage } = $coverage if $coverage;
199 :     $options->{ identity } = $identity if $identity;
200 :     $options->{ seedexp } = $expect if $expect;
201 :     $options->{ loc_format } = $loc_format if $loc_format;
202 :    
203 :     print STDERR " @{[scalar @genomes]} genomes to process\n" if $verbose;
204 :     foreach my $genome ( @genomes )
205 :     {
206 :     print STDERR "Processing $genome.\n" if $verbose;
207 :     my @aaseqs;
208 :     if ( $queryfiles )
209 :     {
210 :     -f $genome
211 :     or print STDERR "Could not find query sequence file '$genome'. Skipping.\n"
212 :     and next;
213 :     @aaseqs = gjoseqlib::read_fasta( $genome )
214 :     or print STDERR "No sequences found in '$genome'. Skipping.\n"
215 :     and next;
216 :     }
217 :     else
218 :     {
219 :     my $fids = $pegsH->{ $genome } || [];
220 :     @$fids
221 :     or print STDERR "Could not find pegs for genome '$genome'. Skipping.\n"
222 :     and next;
223 :     my $seqH = $sapObject->ids_to_sequences( -ids => $fids, -protein => 1 );
224 :     @aaseqs = map { [ $_, '', $seqH->{ $_ } ] } @$fids
225 :     or print STDERR "No sequences found for '$genome'. Skipping.\n"
226 :     and next;
227 :     }
228 :     print STDERR " @{[scalar @aaseqs]} proteins\n" if $verbose;
229 :    
230 :     my $instances = find_protein_homologs( \@aaseqs, $options );
231 :     my @instances = map { my $qid = $_->{ query_id };
232 :     my $seq = $_->{ sequence };
233 :     my $def = $_->{ reference_def } || $assignment;
234 : fangfang 1.3 my $sid = $_->{ reference_id };
235 :     [ $qid, $def, $seq, $sid ]
236 : fangfang 1.1 }
237 :     @$instances;
238 :    
239 :     print STDERR " @{[scalar @instances]} proteins found\n" if $verbose;
240 :     foreach ( @instances )
241 :     {
242 :     if ( ! $fasta )
243 :     {
244 :     print join( "\t", @$_[0..1],
245 : fangfang 1.3 $_->[3],
246 : fangfang 1.1 ( !$no_seq ? $_->[2] : () ),
247 :     ( $tag ? $tag : () )
248 :     ), "\n"
249 :     }
250 :     else
251 :     {
252 :     gjoseqlib::print_seq_as_fasta( @{$_} );
253 :     }
254 :     }
255 :    
256 :     print "\n" if @instances && $blank; # Blank line between genomes
257 :     }
258 :    
259 :     exit;
260 :    
261 :     sub ensure_defined { $_[0] = $_[1] if ! defined $_[0] }
262 :    
263 :     sub next_fasta_seq
264 :     {
265 :     my ( $seqs, $n ) = @_;
266 :     return $seqs->[ $n || 0 ] if ref $seqs eq 'ARRAY';
267 :     gjoseqlib::read_next_fasta_seq( $seqs );
268 :     }
269 :    
270 :    
271 :     sub find_protein_homologs
272 :     {
273 :     my ( $aaseqs, $options ) = @_;
274 :     -f $aaseqs && -s $aaseqs
275 :     or ref $aaseqs eq 'ARRAY' && ref $aaseqs->[0] eq 'ARRAY'
276 :     or print STDERR "find_nucleotide_homologs called with bad \\\@aaseqs\n"
277 :     and return [];
278 :     ref $options eq 'HASH'
279 :     or print STDERR "find_nucleotide_homologs called with bad \\%options\n"
280 :     and return [];
281 :    
282 :     my $blastall = $options->{ blastall } ||= SeedAware::executable_for( 'blastall' );
283 :     my $min_cover = $options->{ coverage } ||= 0.70; # Minimum fraction of reference covered
284 :     my $max_exp = $options->{ expect } ||= 0.01; # Maximum e-value for blast
285 :     my $extrapol = $options->{ extrapol } ||= $options->{ extrapolate } || 0; # Max to extrapolate ends of subj seq
286 :     my $formatdb = $options->{ formatdb } ||= SeedAware::executable_for( 'formatdb' );
287 :     my $ftr_type = $options->{ ftrtype } if exists $options->{ ftrtype };
288 :     my $descr = $options->{ descript } ||= $options->{ description } || "";
289 :     my $loc_form = $options->{ loc_format } ||= 'seed';
290 :     my $min_ident = $options->{ identity } ||= 0.60; # Minimum fraction sequence identity
291 :     my $max_split = $options->{ maxsplit } ||= 0;
292 :     my $max_gap = $options->{ maxgap } ||= 5000; # Maximum gap in genome match
293 :     my $prefix = $options->{ prefix } ||= "";
294 :     my $ref_file = $options->{ reffile };
295 :     my $ref_seq = $options->{ refseq };
296 :     my $seed_exp = $options->{ seedexp } ||= 1e-20; # Maximum e-value to nucleate match
297 :     my $verbose = $options->{ verbose } ||= 0;
298 :    
299 :     my ( $tmp_dir, $save_tmp ) = SeedAware::temporary_directory( $options );
300 :    
301 :     # Build the blast database of reference sequences:
302 :     #
303 :     # Four cases:
304 :     # "$ref_file.nsq" exists
305 :     # use it
306 :     # "$ref_file.nsq" and $ref_seq do not exist, but "$ref_file" exists
307 :     # run formatdb -i $ref_file -n "$tmp_dir/$tmp_name"
308 :     # "$ref_file.nsq" does not exist, but $ref_seq exists
309 :     # write a temp file in $tmp_dir and run formatdb -i "$tmp_dir/$tmp_name"
310 :     # nothing exists
311 :     # bail
312 :    
313 :     my $db;
314 :     if ( $ref_file && -f "$ref_file.psq" )
315 :     {
316 :     $db = $ref_file;
317 :     }
318 :     elsif ( ref $ref_seq eq 'ARRAY' && ref $ref_seq->[0] eq 'ARRAY' )
319 :     {
320 :     $db = "$tmp_dir/ref_seqs";
321 :     print_alignment_as_fasta( $db, $ref_seq );
322 :     my @cmd = ( $formatdb, -p => 'T', -i => $db );
323 :     system( @cmd ) and die join( ' ', 'Failed', @cmd );
324 :     }
325 :     elsif ( -f $ref_file && -s $ref_file )
326 :     {
327 :     my $name = $ref_file;
328 :     $name =~ s/^.*\///; # Remove leading path
329 :     $db = "$tmp_dir/$name";
330 :     my @cmd = ( $formatdb, -p => 'T', -i => $ref_file, -n => $db );
331 :     system( @cmd ) and die join( ' ', 'Failed', @cmd );
332 :     }
333 :     else
334 :     {
335 :     print STDERR "find_protein_homologs cannot locate reference sequence data\n";
336 :     return [];
337 :     }
338 :    
339 :     $options->{ db } = $db;
340 :     $options->{ seqtype } = 'p';
341 :    
342 :     # There are two ways to go for the aaseqs:
343 :     #
344 :     # $aaseqs is a file of aaseqs
345 :     # use it
346 :     # $aaseqs is a reference to an array of sequences
347 :     # write them to a file
348 :    
349 :     my $qfile;
350 :     if ( ref $aaseqs eq 'ARRAY' )
351 :     {
352 :     return [] if ! @$aaseqs;
353 :     return [] if ! ref $aaseqs->[0] eq 'ARRAY'; # Could do better diagnosis
354 :     $qfile = "$tmp_dir/query";
355 :     print_alignment_as_fasta( $qfile, $aaseqs ); # Write them all
356 :     }
357 :     else
358 :     {
359 :     -f $aaseqs
360 :     or print STDERR "Bad aaseqs file '$aaseqs'\n"
361 :     and return [];
362 :     $qfile = $aaseqs;
363 :     }
364 :    
365 :     my @cmd = ( $blastall,
366 :     -p => 'blastp',
367 :     -d => $db,
368 :     -i => $qfile,
369 :     -r => 1,
370 :     -q => -1,
371 :     -F => 'f',
372 :     -e => $max_exp,
373 :     -v => 5,
374 :     -b => 5,
375 : fangfang 1.3 -a => 8
376 : fangfang 1.1 );
377 :    
378 :     my $redirect = { stderr => '/dev/null' };
379 :     my $blastFH = SeedAware::read_from_pipe_with_redirect( @cmd, $redirect )
380 :     or die join( ' ', 'Failed:', @cmd );
381 :    
382 :     # Process blast results one sequence at a time
383 :    
384 :     my @out;
385 :     my $aaseq; # AA sequence data
386 :     my $n = 0;
387 :    
388 :     while ( $aaseq = next_fasta_seq( $aaseqs, $n++ ) )
389 :     {
390 :     my $query_results = find_homologs::next_blast_query( $blastFH ) or next;
391 :     my ( $qid, $qdef, $qlen, $q_matches ) = @$query_results;
392 :    
393 :     # Check the framing between aaseqs and blast queries:
394 :    
395 :     if ( $qid ne $aaseq->[0] )
396 :     {
397 :     die "Sequence data ($aaseq->[0]) and blastp output ($qid) are out of phase.\n";
398 :     }
399 :    
400 :     # A given query may hit zero or more reference sequences
401 :    
402 :     my @matches = ();
403 :     foreach my $subject_results ( @$q_matches )
404 :     {
405 :     my ( $sid, $sdef, $slen, $s_matches ) = @$subject_results;
406 :    
407 :     # Future work: merge aa hsps
408 :     # For now: only consider the hsp with the highest bit score
409 :    
410 :     my @by_score = sort { $b->[0] <=> $a->[0] }
411 :     grep { $_->[1] <= $max_exp
412 :     && ($_->[10] - $_->[9] + 1) >= $min_cover * $qlen
413 :     && gjoalignment::fraction_aa_identity( @$_[11,14] ) >= $min_ident
414 :     }
415 :     @$s_matches;
416 :    
417 :     my ($scr) = @by_score or next;
418 :     push @matches, [ $sid, $sdef, $slen, $scr ];
419 :     }
420 :    
421 :     @matches = sort { $b->[3] <=> $a->[3] } @matches;
422 :     my ($best_match) = @matches or next;
423 :    
424 :     push @out, { query_id => $qid,
425 :     query_def => $qdef,
426 :     reference_id => $best_match->[0],
427 :     reference_def => $best_match->[1],
428 :     sequence => $aaseq->[2]
429 :     };
430 :     }
431 :    
432 :     close( $blastFH );
433 :     system( '/bin/rm', '-r', $tmp_dir ) if ! $save_tmp;
434 :    
435 :     wantarray ? @out : \@out;
436 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3