[Bio] / FigKernelScripts / fast_project.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/fast_project.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/env perl
2 :     #
3 :     # Copyright (c) 2003-2015 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     #
18 :    
19 :     use strict;
20 :     use warnings;
21 :     use FIG_Config;
22 :     use Data::Dumper;
23 :     use LWP::Simple;
24 :     use SeedUtils;
25 :     use SeedURLs;
26 :     use ScriptUtils;
27 :     use MapToRef;
28 :     use GenomeTypeObject;
29 :     use gjoseqlib;
30 :     use Time::HiRes qw(gettimeofday);
31 : parrello 1.2 use File::Slurp;
32 : parrello 1.1
33 :     =head1 Project Features from a Reference Genome to a Close Strain
34 :    
35 :     fast_project -r RererenceGenomeId [-k kmer-sz] < genome [ options ]
36 :    
37 :     This script compares the DNA in a reference genome to a new genome (the I<target>) and uses
38 :     the features in the reference genome to call features in the new one. The correspondence
39 :     between the DNA in the two genomes is computed using 2-of-3 kmers, that is, kmers that
40 :     use the first two of every three base pairs to form a sequence.
41 :    
42 :     The first phase of the script computes the unique kmers for both genomes and then
43 :     constructs a table of correspondences (called I<pins>). Two locations are considered
44 :     pinned if we are confident they represent the same DNA. A location in this case is
45 :     defined by a contig ID, a strand, and a position. Thus, it is possible for a location
46 :     on the B<+> strand of one genome to be pinned to a location on the B<-> strand of the other.
47 :    
48 :     Once we have computed the pins, we run through the features of the reference genome
49 :     looking for instances where a significant majority of the locations in the feature are
50 :     pinned to contiguous locations on the target genome. If this is the case, we call the
51 :     region so defined on the target genome as a feature with the same function as the
52 :     reference feature.
53 :    
54 :     =head2 Output
55 :    
56 :     The output is a JSON-format L<GenomeTypeObject> containing the newly-called features.
57 :     These will be in addition to any original features that were present.
58 :    
59 :     =head2 Parameters
60 :    
61 :     The input file (either the standard input or specified via L<ScriptUtils/ih_options>)
62 :     contains the target genome. It can either be a L<GenomeTypeObject> in JSON format or
63 :     a FASTA file. The command-line options are as follows.
64 :    
65 :     =over 4
66 :    
67 :     =item seed
68 :    
69 :     The name of the SEED containing the reference genome. The default is the core SEED (C<core>).
70 :    
71 :     =item reference
72 :    
73 : parrello 1.2 The ID of the reference genome that. This genome must exist in the named SEED.
74 :    
75 :     =item refFile
76 :    
77 :     The name of a file containing the reference genome as a L<GenomeTypeObject> in JSON
78 :     format. This parameter will override C<reference>.
79 : parrello 1.1
80 :     =item kmersize
81 :    
82 :     The kmer size to use when computing the pins. The default is C<30>.
83 :    
84 :     =item format
85 :    
86 :     The format of the target genome, either C<fasta> for a FASTA file, or C<gto> for
87 :     a L<GenomeTypeObject> in JSON format. The default is C<gto>.
88 :    
89 :     =item annotator (optional)
90 :    
91 :     The name of the annotator to use when describing the creation of new features in the
92 :     target genome.
93 :    
94 :     =item idprefix (optional)
95 :    
96 :     The prefix to use when assigning new feature IDs in the target genome.
97 :    
98 :     =item gid (optional)
99 :    
100 :     The ID of the target genome. This is only necessary if the target genome is in FASTA format.
101 :    
102 :     =back
103 :    
104 :     =cut
105 :    
106 :     # Get the names of the SEEDs we support.
107 :     my $seeds = SeedURLs::names();
108 :     # Get the command-line parameters.
109 :     my $opt = ScriptUtils::Opts(
110 :     '',
111 :     ScriptUtils::ih_options(),
112 : parrello 1.2 [ 'seed|s=s', "name of a SEED ($seeds), or a SEED URL", { default => 'core' } ],
113 :     [ 'reference|r=s', 'Id of the Reference Genome'],
114 :     [ 'refFile|f=s', 'file containing the reference genome'],
115 : parrello 1.1 [ 'kmersize|k=i', 'Number of base pairs per kmer', { default => 30 }],
116 :     [ 'annotator|a=s', 'string to use for the annotator name in new features'],
117 :     [ 'idprefix=s', 'prefix to use for new IDs, including genome ID or other identifying information'],
118 :     [ 'format|fmt=s', 'format of the input file-- "gto" or "fasta"', { default => 'gto' }],
119 :     [ 'gid|g=s', 'ID of the target genome', { default => 'new_genome' }],
120 :     );
121 : parrello 1.2 my $ref_text;
122 :     if ($opt->reffile) {
123 :     $ref_text = File::Slurp::read_file($opt->reffile);
124 :     } else {
125 :     my $seed = $opt->seed;
126 : parrello 1.1
127 : parrello 1.2 my $where = SeedURLs::url( $seed );
128 :     if ( ! $where )
129 :     {
130 :     die "Specified seed $seed not found.";
131 :     } else {
132 :    
133 :     my $refId = $opt->reference;
134 :     if (! $refId) {
135 :     die "No reference genome specified.";
136 :     } else {
137 :     $ref_text = LWP::Simple::get( "$where/genome_object.cgi?genome=$refId" );
138 :     }
139 :     }
140 :     }
141 :     my $json = JSON::XS->new;
142 :     my $ref_gto = $json->decode($ref_text);
143 : parrello 1.1
144 : parrello 1.2 $ref_gto = GenomeTypeObject->initialize($ref_gto);
145 : parrello 1.1
146 :    
147 : parrello 1.2 my $k = $opt->kmersize;
148 : parrello 1.1
149 : parrello 1.2 my $genetic_code = $ref_gto->{genetic_code};
150 :     my $ih = ScriptUtils::IH($opt->input);
151 :     my $g_gto;
152 :     if ($opt->format eq 'gto') {
153 :     # Create the GTO from a JSON input file.
154 :     $g_gto = GenomeTypeObject->create_from_file($ih);
155 :     $g_gto->setup_id_allocation();
156 :     } elsif ($opt->format eq 'fasta') {
157 :     # Create a new, blank GTO.
158 :     $g_gto = GenomeTypeObject->new();
159 :     $g_gto->set_metadata({id => $opt->gid, genetic_code => $genetic_code});
160 :     # Read the contigs from a FASTA file.
161 :     my @contigs = map { {id => $_->[0], dna => $_->[2]} } read_fasta($ih);
162 :     $g_gto->add_contigs(\@contigs);
163 : parrello 1.1 }
164 : parrello 1.2 $genetic_code = $g_gto->{genetic_code};
165 :     my @ref_tuples = map { [$_->{id},'',$_->{dna}] } $ref_gto->contigs;
166 :     my @g_tuples = map { [$_->{id},'',$_->{dna}] } $g_gto->contigs;
167 :    
168 :     my $map = &MapToRef::build_mapping($k, \@ref_tuples, \@g_tuples );
169 :     my @ref_features = map { my $loc = $_->{location};
170 :     (@$loc == 1) ? [$_->{id}, $_->{type}, $loc->[0], $_->{function} ] : () }
171 :     $ref_gto->features;
172 :    
173 :     my $gFeatures = &MapToRef::build_features($map, \@g_tuples, \@ref_features, $genetic_code);
174 :     # Add the features to the output genome.
175 :     my ($count,$num_pegs) = add_features_to_gto($g_gto, $gFeatures, $opt);
176 :     print STDERR "$count features added to genome.\n";
177 :     print STDERR "$num_pegs pegs added to genome.\n";
178 :     $g_gto->destroy_to_file(\*STDOUT);
179 :    
180 :    
181 : parrello 1.1
182 :     =head2 Internal Subroutines
183 :    
184 :     =head3 add_features_to_gto
185 :    
186 :     my $count = add_features_to_gto($gto, \@newFeatures, $opt);
187 :    
188 :     Add features from a list to a L<GenomeTypeObject>. This will have no
189 :     effect if the feature list is empty.
190 :    
191 :     =over 4
192 :    
193 :     =item gto
194 :    
195 :     L<GenomeTypeObject> to which the features should be added.
196 :    
197 :     =item newFeatures
198 :    
199 :     Reference to a list of features. Each feature is a 5-tuple consisting of (0) the feature type, (1) the location
200 :     tuple [contig, begin, strand, length], (2) the functional assignment, (3) the feature ID from which it was
201 :     projected, and (4) the DNA or protein string.
202 :    
203 :     =item opt
204 :    
205 :     L<Getopt::Long::Descriptive::Opts> of the parameters to this invocation.
206 :    
207 :     =item RETURN
208 :    
209 :     Returns the number of features added.
210 :    
211 :     =back
212 :    
213 :     =cut
214 :    
215 :     sub add_features_to_gto {
216 :     # Get the parameters.
217 :     my ($gto, $newFeatures, $opt) = @_;
218 :     # This will be the return count.
219 :     my $retVal = 0;
220 :     my $num_pegs = 0;
221 :     # Only proceed if we have features.
222 :     if ($newFeatures && @$newFeatures) {
223 :     # Compute the annotator.
224 :     my $annotator = $opt->annotator || "unknown";
225 :     # Get the ID prefix (if any).
226 :     my $idprefix = $opt->idprefix;
227 :     # Get the list of parameters.
228 :     my @parms;
229 :     if ($opt->_specified('reference')) {
230 :     push @parms, "reference => " . $opt->reference;
231 :     }
232 :     if ($opt->_specified('kmersize')) {
233 :     push @parms, "kmersize => " . $opt->kmersize;
234 :     }
235 :     if ($opt->_specified('annotator')) {
236 :     push @parms, "annotator => " . $annotator;
237 :     }
238 :     if ($opt->_specified('idprefix')) {
239 :     push @parms, "idprefix => " . $idprefix;
240 :     }
241 :     # Create the analysis event.
242 :     my %eventDef = (tool_name => 'fast_project_ref_to_GTO', execution_time => scalar gettimeofday,
243 :     parameters => \@parms, hostname => $gto->hostname);
244 :     my $event_id = $gto->add_analysis_event(\%eventDef);
245 :     # Create the quality measure.
246 :     my %quality_measure = (existence_confidence => 0.80, existence_priority => 100);
247 :     # Loop through the incoming features, adding them.
248 :     for my $newFeature (@$newFeatures) {
249 :     my ($type, $loc, $function, $fromFid, $seq) = @$newFeature;
250 :     # Create the feature object as expected by GTO.
251 :     my %featureH = (-type => $type,
252 :     -location => [$loc],
253 :     -function => $function,
254 :     -annotator => $annotator,
255 :     -annotation => "fast_project from $fromFid",
256 : olson 1.3 -analysis_event_id => $event_id,
257 : parrello 1.1 -quality_measure => \%quality_measure);
258 :     # If we have an ID prefix, add it.
259 :     if ($idprefix) {
260 :     $featureH{-id_prefix} = $idprefix;
261 :     }
262 :     # If this is a peg, add the protein translation.
263 :     if ($type eq 'peg' || $type eq 'CDS') {
264 :     $featureH{-protein_translation} = $seq;
265 :     $num_pegs++;
266 :     }
267 :     # Add the feature to the genome.
268 :     $gto->add_feature(\%featureH);
269 :     # Count this new feature.
270 :     $retVal++;
271 :     }
272 :     }
273 :     # Return the count of features added.
274 :     return ($retVal,$num_pegs);
275 :     }
276 :    
277 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3