[Bio] / FigKernelScripts / find_SEED_SSU_rRNA_genes.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/find_SEED_SSU_rRNA_genes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (view) (download) (as text)

1 : golsen 1.3 # -*- perl -*-
2 :     ########################################################################
3 : golsen 1.7 # Copyright (c) 2003-2013 University of Chicago and Fellowship
4 : golsen 1.3 # for Interpretations of Genomes. All Rights Reserved.
5 : overbeek 1.1 #
6 : golsen 1.3 # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 : overbeek 1.1 #
12 : golsen 1.3 # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     ########################################################################
18 :    
19 :     my $usage = <<'End_of_Usage';
20 :     Identify the SSU rRNA genes in one or more SEED genomes.
21 :    
22 :     Usage: find_SEED_SSU_rRNA_genes [options] [ genome_id ... ] > found
23 :    
24 :     Output:
25 :    
26 : golsen 1.4 genome_id \t type \t location \t sequence \t tag \t proposed_assignment
27 :    
28 :     where type is always rna.
29 : golsen 1.3
30 :     Options:
31 :    
32 : golsen 1.7 -a 'role' # Proposed assignment; empty string supresses the output field
33 : golsen 1.3 -b # Add a blank line between genomes (D = no blank)
34 :     -c nt # Condense the sequence to first and last nt nucleotides (D = full seq)
35 :     -e dist # Maximum nucleotides at end to extrapolate a match (D = 10)
36 : golsen 1.5 -f # Fasta output format
37 : golsen 1.3 -s # Do not show sequence
38 :     -t 'tag' # A short tage to identify the nature of the feature; allows
39 :     # mixing of different types; empty string supresses the field
40 :    
41 :     End_of_Usage
42 : overbeek 1.1
43 :     use strict;
44 :     use find_homologs;
45 :     use gjoseqlib;
46 :     use Data::Dumper;
47 :     use FIG;
48 :     my $fig = new FIG;
49 :    
50 : golsen 1.5 my $extrapolate;
51 :     my $assignment;
52 :     my $tag;
53 :     my @rna;
54 : golsen 1.3
55 : golsen 1.7 # This is the module name. The package name in the module is RNA_reps. Other
56 :     # modules use the same internal package structure.
57 :    
58 :     use RNA_reps_SSU_rRNA;
59 : fangfang 1.6 @rna = @RNA_reps::RNA_reps;
60 :     $assignment = $RNA_reps::assignment if $RNA_reps::assignment;
61 :     $tag = $RNA_reps::tag if $RNA_reps::tag;
62 : golsen 1.5
63 :     ensure_defined( $extrapolate, 10 );
64 :     ensure_defined( $assignment, 'SSU rRNA ## 16S rRNA, small subunit ribosomal RNA' );
65 :     ensure_defined( $tag, 'SSU_rRNA' );
66 :    
67 :     my $blank = 0;
68 :     my $condense = 0;
69 :     my $fasta = 0;
70 : golsen 1.3
71 :     while ( @ARGV && $ARGV[0] =~ s/^-// )
72 :     {
73 :     $_ = shift;
74 : golsen 1.5 if ( s/^a// ) { $assignment = /\S/ ? $_ : shift; next }
75 :     if ( s/^c// ) { $condense = /\S/ ? $_ : shift; next }
76 :     if ( s/^e// ) { $extrapolate = /\S/ ? $_ : shift; next }
77 :     if ( s/^t// ) { $tag = /\S/ ? $_ : shift; next }
78 : golsen 1.3
79 : golsen 1.5 if ( s/b// ) { $blank = ! $blank }
80 :     if ( s/f// ) { $fasta = ! $fasta }
81 : golsen 1.3 if ( m/\S/ )
82 :     {
83 :     print STDERR "Bad flag '$_'\n", $usage;
84 :     exit;
85 :     }
86 :     }
87 :    
88 : overbeek 1.1 my @genomes = @ARGV;
89 :     if ( ! @genomes )
90 :     {
91 :     @genomes = grep { $fig->is_prokaryotic($_) } $fig->genomes('complete');
92 :     }
93 :    
94 : golsen 1.3 my $pat;
95 :     $pat = qr/^(.{$condense})....+(.{$condense})$/o if $condense;
96 :    
97 : overbeek 1.1 foreach my $genome ( @genomes )
98 :     {
99 : overbeek 1.2 my $contig_file = "$FIG_Config::organisms/$genome/contigs";
100 : overbeek 1.1 -f $contig_file or print STDERR "Could not find contig file $contig_file\n"
101 :     and exit;
102 : golsen 1.3
103 : golsen 1.5 my $options = { description => $assignment,
104 : overbeek 1.1 extrapolate => 10,
105 : golsen 1.5 refseq => \@rna
106 : overbeek 1.1 };
107 : golsen 1.3
108 : overbeek 1.1 my $instances = find_homologs::find_nucleotide_homologs( $contig_file, $options );
109 : golsen 1.3
110 : overbeek 1.1 my @instances = map { $_->[2] }
111 :     sort { $a->[0] cmp $b->[0] || $a->[1] <=> $b->[1] }
112 :     map { my $loc = $_->{ location };
113 :     my $seq = $_->{ sequence };
114 :     my ( $c, $b, $e ) = $loc =~ /^(.*)_(\d+)_(\d+)$/;
115 : golsen 1.3 $seq =~ s/$pat/$1...$2/ if $condense;
116 : golsen 1.4 [ $c, $b+$e, [ $genome, 'rna', $loc, lc $seq ] ]
117 : overbeek 1.1 }
118 :     @$instances;
119 : golsen 1.3
120 :     foreach ( @instances )
121 :     {
122 : golsen 1.5 if ( ! $fasta )
123 :     {
124 :     print join( "\t", @$_,
125 :     ( $tag ? $tag : () ),
126 :     ( $assignment ? $assignment : () )
127 :     ), "\n"
128 :     }
129 :     else
130 :     {
131 :     # >genome:location assignment
132 :     gjoseqlib::print_seq_as_fasta( "$_->[0]:$_->[2]", $assignment, $_->[3] );
133 :     }
134 : golsen 1.3 }
135 :    
136 :     print "\n" if @instances && $blank; # Blank line between genomes
137 : overbeek 1.1 }
138 : golsen 1.5
139 :     exit;
140 :    
141 :     sub ensure_defined { $_[0] = $_[1] if ! defined $_[0] }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3