Parent Directory
|
Revision Log
Revision 1.3 - (view) (download) (as text)
1 : | parrello | 1.1 | #!/usr/bin/perl -w |
2 : | |||
3 : | # -*- perl -*- | ||
4 : | # | ||
5 : | # Copyright (c) 2003-2006 University of Chicago and Fellowship | ||
6 : | # for Interpretations of Genomes. All Rights Reserved. | ||
7 : | # | ||
8 : | # This file is part of the SEED Toolkit. | ||
9 : | parrello | 1.2 | # |
10 : | parrello | 1.1 | # The SEED Toolkit is free software. You can redistribute |
11 : | # it and/or modify it under the terms of the SEED Toolkit | ||
12 : | parrello | 1.2 | # Public License. |
13 : | parrello | 1.1 | # |
14 : | # You should have received a copy of the SEED Toolkit Public License | ||
15 : | # along with this program; if not write to the University of Chicago | ||
16 : | # at info@ci.uchicago.edu or the Fellowship for Interpretation of | ||
17 : | # Genomes at veronika@thefig.info or download a copy from | ||
18 : | # http://www.theseed.org/LICENSE.TXT. | ||
19 : | # | ||
20 : | |||
21 : | =head1 SproutGFF | ||
22 : | |||
23 : | This is a fancy wrapper around B<seed2gff> that can be used to generate the | ||
24 : | GFF3 files for the NMPDR. The single parameter is the output directory name. | ||
25 : | The files will be organized by NMPDR group. | ||
26 : | |||
27 : | The currently-supported command-line options are as follows. | ||
28 : | |||
29 : | =over 4 | ||
30 : | |||
31 : | =item user | ||
32 : | |||
33 : | Name suffix to be used for log files. If omitted, the PID is used. | ||
34 : | |||
35 : | =item trace | ||
36 : | |||
37 : | Numeric trace level. A higher trace level causes more messages to appear. The | ||
38 : | default trace level is 2. Tracing will be directed to the standard output | ||
39 : | as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory, | ||
40 : | where I<User> is the value of the B<user> option above. | ||
41 : | |||
42 : | =item sql | ||
43 : | |||
44 : | If specified, turns on tracing of SQL activity. | ||
45 : | |||
46 : | =item background | ||
47 : | |||
48 : | Save the standard and error output to files. The files will be created | ||
49 : | in the FIG temporary directory and will be named C<err>I<User>C<.log> and | ||
50 : | C<out>I<User>C<.log>, respectively, where I<User> is the value of the | ||
51 : | B<user> option above. | ||
52 : | |||
53 : | =item h | ||
54 : | |||
55 : | Display this command's parameters and options. | ||
56 : | |||
57 : | =item scan | ||
58 : | |||
59 : | If specified, the genomes will be collected and the directories created, but no GFF | ||
60 : | files will be output. This is mostly useful for testing. | ||
61 : | |||
62 : | =item phone | ||
63 : | |||
64 : | Phone number to message when the script is complete. | ||
65 : | |||
66 : | =back | ||
67 : | |||
68 : | =cut | ||
69 : | |||
70 : | use strict; | ||
71 : | use Tracer; | ||
72 : | use Cwd; | ||
73 : | use File::Copy; | ||
74 : | use File::Path; | ||
75 : | use SFXlate; | ||
76 : | |||
77 : | # Get the command-line options and parameters. | ||
# Get the command-line options and parameters. The script-specific options are
# "phone" (SMS notification target), "genome" (restrict the run to one genome)
# and "scan" (create directories and report, but do not write GFF files).
my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
                                           {
                                              phone => ["", "phone number (international format) to call when load finishes"],
                                              genome => ["", "genome to process; the default is to process all NMPDR core genomes"],
                                              scan => ["", "if specified, the output directories will be created but no files will be written"],
                                           },
                                           "<output directory>",
                                           @ARGV);
# Set a variable to contain return type information.
my $rtype;
# Insure we catch errors.
eval {
    # Create a Sprout object.
    my $sprout = SFXlate->new_sprout_only();
    # Insure the output directory exists.
    my $outDir = $parameters[0];
    if (! $outDir) {
        Confess("No output directory specified.");
    } else {
        Insure($outDir, 0777);
        # Create the genome map. This lists all the genomes we want along with the corresponding
        # output file name.
        my %genomes;
        # Check for a single-genome situation.
        if ($options->{genome}) {
            # Get the genome name.
            my $genomeID = $options->{genome};
            my $genomeName = $sprout->GenusSpecies($genomeID);
            # An empty name would make the output file "$outDir/.gff", so treat
            # it as an error instead of silently writing a misnamed file.
            if (! $genomeName) {
                Confess("No genome name found for genome $genomeID.");
            }
            # Compute the file name.
            my $fileName = CleanGenomeName($genomeName);
            $genomes{$genomeID} = "$outDir/$fileName.gff";
        } else {
            # Here we want all the core organisms, split into super-groups. First, we get the
            # genomes for each group in a hash.
            my %baseGroups = $sprout->GetGroups();
            # Fix it into a hash by super-group.
            my %coreGroups = $sprout->Fix(%baseGroups);
            for my $coreGroup (keys %coreGroups) {
                # Compute the directory and insure it exists.
                my $superDirectory = "$outDir/$coreGroup";
                Insure($superDirectory, 0777);
                # Put all of this group's genomes in the output hash.
                for my $coreGenome (@{$coreGroups{$coreGroup}}) {
                    my $fileName = CleanGenomeName($sprout->GenusSpecies($coreGenome));
                    $genomes{$coreGenome} = "$superDirectory/$fileName.gff";
                }
            }
        }
        # Now we loop through %genomes, creating GFF files.
        for my $genome (sort keys %genomes) {
            my $fileName = $genomes{$genome};
            if ($options->{scan}) {
                Trace("$genome would be written to $fileName") if T(2);
            } else {
                Trace("Writing $genome to $fileName.") if T(3);
                # Do the conversion. The list form of the pipe open runs seed2gff
                # directly, without a shell, so quotes or shell metacharacters in
                # the genome ID or file name cannot alter the command.
                my $gffHandle;
                if (! open($gffHandle, '-|', 'seed2gff', '-g', $genome, '-o', $fileName,
                           '-s', '-t', 'all', '-nmpdr')) {
                    # Report the failure but keep going with the remaining genomes.
                    Trace("Could not run seed2gff for $genome: $!") if T(0);
                    next;
                }
                my @output = <$gffHandle>;
                chomp @output;
                # Closing a piped handle waits for the child; a false result
                # means seed2gff exited with a non-zero status or the close failed.
                if (! close($gffHandle)) {
                    Trace("seed2gff did not complete cleanly for $genome (exit status " .
                          ($? >> 8) . ").") if T(0);
                }
                # At trace level 3, we show the output.
                Trace("Output from seed2gff:\n" . join("\n", @output)) if T(3) && scalar(@output);
            }
        }
    }

};
if ($@) {
    Trace("Script failed with error: $@") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
}
# Send the completion message if a phone number was supplied.
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "SproutGFF terminated with $rtype.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}
157 : | |||
158 : | =head3 CleanGenomeName | ||
159 : | |||
160 : | parrello | 1.2 | my $cleaned = CleanGenomeName($name); |
161 : | parrello | 1.1 | |
162 : | Clean up a genome name so it can be used as a file name. | ||
163 : | |||
164 : | =over 4 | ||
165 : | |||
166 : | =item name | ||
167 : | |||
168 : | Name of the genome, for cleaning purposes. | ||
169 : | |||
170 : | =item RETURN | ||
171 : | |||
172 : | Returns the incoming name with its whitespace converted to dots and its parentheses and colons converted to underscores. | ||
173 : | |||
174 : | =back | ||
175 : | |||
176 : | =cut | ||
177 : | |||
# Clean up a genome name so it can be used as a file name. Each run of
# whitespace becomes a dot, every run of consecutive dots is collapsed to a
# single dot, and parentheses and colons become underscores.
#
# name      Name of the genome, for cleaning purposes.
# RETURN    Returns the cleaned copy of the name; the argument is not modified.
sub CleanGenomeName {
    # Get the parameters.
    my ($name) = @_;
    # Declare the return variable.
    my $retVal = $name;
    # Convert each run of whitespace to a single dot.
    $retVal =~ s/\s+/./g;
    # Collapse runs of dots. Matching the whole run is required here: replacing
    # only pairs ("..") would turn four dots into two, leaving a double dot in
    # names such as "sp. . X".
    $retVal =~ s/\.{2,}/./g;
    # Convert other bad guys (parentheses and colons) to underscores.
    $retVal =~ tr/():/___/;
    # Return the result.
    return $retVal;
}
192 : | |||
193 : | |||
194 : | parrello | 1.2 | 1; |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |