[Bio] / FortyEight / create_import_job.pl Repository:
ViewVC logotype

Annotation of /FortyEight/create_import_job.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (view) (download) (as text)

1 : olson 1.1 #
2 :     # Create a SEED-import job.
3 :     #
4 :     # The job initially has a listing of SEED fasta sources, NR fasta sources, and
5 :     # RAST fasta sources. The new NR will not have been built - it will be the first stage
6 :     # in the pipeline.
7 :     #
8 :     # For now we need to pass in the path to the NR and peg.synonyms files we are building from.
9 :     #
10 :     # We create the following files:
11 :     #
12 :     # nr.dirs
13 :     # Tab-delimited data of db-name, source path, size of fasta file
14 :     #
15 :     # nr.sources
16 :     # Listing of all fasta source files from which the nr is to be built
17 :     #
18 :     # We hardcode in the script, for now, the source locations of things. This is an ANL internal
19 :     # application at this point.
20 :     #
21 :     #
22 :     #
23 :    
24 :     use strict;
25 :     use Data::Dumper;
26 :     use DirHandle;
27 :     use ImportJob;
28 :     use Job48;
29 :     use NRTools;
30 :    
# Command-line synopsis; used as the die message for every usage error.
my $usage = "create_import_job [-new-nr-data dir] [-import-biodb] [-from-job jobnum] [prev-nr prev-syn prev-sims]";

#
# Incoming NR data.
#
my $dir_biodb = "/vol/biodb";
my $dir_biodb_nr_input = "$dir_biodb/processed_data/for_build_nr";

#
# Existing SEED data
#

my $dir_reference_seed_data = "/vol/seed-data-anno-mirror/Data.Jan3";
#my $dir_reference_seed_data = "/local/FIGdisk/FIG/Data";

#
# RAST server from which to pull genomes to import.
#

my $dir_rast_jobs = "/vol/48-hour/Jobs.prod.2007-0601";

#
# Startup.
#
# Consume leading arguments that begin with "-". Options are matched by
# prefix, so e.g. "-from-job-id" is accepted as "-from-job".
#
#   -import-biodb      enable the biodb NR import
#   -new-nr-data dir   override the biodb NR input directory; also enables
#                      the biodb NR import
#   -from-job jobnum   take the previous nr/synonyms/sims from an existing job
#
# Anything left in @ARGV afterwards is treated as positional arguments.
#

my $do_biodb_import;
my $from_job_id;

while ((@ARGV > 0) && ($ARGV[0] =~ /^-/))
{
    my $arg = shift @ARGV;
    if ($arg =~ /^-import-biodb/i)
    {
        $do_biodb_import++;
    }
    elsif ($arg =~ /^-new-nr-data/)
    {
        # The option takes a value; without this check a trailing
        # "-new-nr-data" would leave the directory undefined.
        @ARGV > 0 or die $usage;
        $dir_biodb_nr_input = shift @ARGV;
        $do_biodb_import++;
    }
    elsif ($arg =~ /^-from-job/)
    {
        # Same as above: the job number must follow the option.
        @ARGV > 0 or die $usage;
        $from_job_id = shift @ARGV;
    }
    else
    {
        die $usage;
    }
}
80 :    
#
# Work out where the previous NR, peg.synonyms and sims live.
#
# With -from-job they are derived from that job's directory (and no
# positional arguments are allowed); otherwise exactly three positional
# arguments must be given: prev-nr, prev-syn and prev-sims.
#
my $prev_nr_src;
my $prev_syn_src;
my $prev_sim_dir;
my $from_job;

if (defined($from_job_id))
{
    @ARGV == 0 or die $usage;

    $from_job = ImportJob->new($from_job_id);
    $from_job or die "From-job id $from_job_id does not exist";

    my $dir = $from_job->dir();
    $prev_nr_src = "$dir/nr";
    $prev_syn_src = "$dir/peg.synonyms";

    # Pass the directory as a %s argument rather than interpolating it into
    # the format, so a "%" in the path cannot be taken as a conversion.
    # The sims directory is named Sims.NNN (job id zero-padded to 3 digits).
    $prev_sim_dir = sprintf("%s/Sims.%03d", $dir, $from_job_id);
}
else
{
    @ARGV == 3 or die $usage;

    # Take all three positional arguments, leaving @ARGV empty.
    ($prev_nr_src, $prev_syn_src, $prev_sim_dir) = splice(@ARGV, 0, 3);
}
106 : olson 1.1
#
# Validate
#
# Sanity-check the three previous-build inputs by peeking at them:
#   - the NR file's first line must start with ">" (fasta header),
#   - the synonyms file's first line must match xxxNNN,NNN<tab>,
#   - the sims directory must exist and contain at least one sims* entry.
# Any failure is fatal.
#
# Files are opened with 3-arg open and lexical handles so that odd
# characters in a command-line path cannot change the open mode, and the
# first line is read into a lexical rather than the global $_.
#
{
    open(my $nr_fh, '<', $prev_nr_src)
        or die "Cannot open previous NR file $prev_nr_src: $!\n";
    my $first_line = <$nr_fh>;
    close($nr_fh);

    # An empty file (undefined first line) is rejected the same way.
    if (!defined($first_line) || $first_line !~ /^>/)
    {
        die "$prev_nr_src does not look like a fasta file\n";
    }
}

{
    open(my $syn_fh, '<', $prev_syn_src)
        or die "Cannot open previous synonyms file $prev_syn_src: $!\n";
    my $first_line = <$syn_fh>;
    close($syn_fh);

    if (!defined($first_line) || $first_line !~ /^xxx\d+,\d+\t/)
    {
        die "$prev_syn_src does not look like a peg.synonyms file\n";
    }
}

my @sfiles = <$prev_sim_dir/sims*>;
if (not(-d $prev_sim_dir and @sfiles > 0))
{
    die "previous sim dir $prev_sim_dir does not appear to contain sims\n";
}
143 :    
144 :    
# Tell the operator which previous-build inputs this job will be based on.
print "Creating import job\n",
      "\tprev_nr=$prev_nr_src\n",
      "\tprev_syn=$prev_syn_src\n",
      "\tprev_sim=$prev_sim_dir\n";

#
# Initial validation.
#
# The reference SEED data and the RAST jobs directory are always required;
# the biodb NR input directory only when a biodb import was requested.
#
validate_dirs($dir_reference_seed_data, $dir_rast_jobs);
validate_dirs($dir_biodb_nr_input) if $do_biodb_import;
159 :    
#
# Create our jobdir.
#
# create_new_job returns (job number, error); a false job number means the
# creation failed and the second value carries the reason.
#

my ($jobnum, $err) = ImportJob->create_new_job();
#my ($jobnum, $err) = ('002', undef);

if (!$jobnum)
{
    die "Create failed with error: $err\n";
}

my $job = ImportJob->new($jobnum);
# Mirror the check done for -from-job: do not go on with an unusable job.
$job or die "Cannot open newly created import job $jobnum\n";
my $jobdir = $job->dir;

$job->meta->add_log_entry($0, "creating new job");

#
# Symlink to prev_nr and prev_syn in the job directory.
#

my $prev_nr = "$jobdir/prev_nr";
my $prev_syn = "$jobdir/prev_syn";
my $prev_sims = "$jobdir/prev_sims";

# Remove any existing entries first so that symlink() does not fail on them.
unlink($prev_nr, $prev_syn, $prev_sims);

symlink($prev_nr_src, $prev_nr) or die "symlink $prev_nr_src $prev_nr failed: $!";
symlink($prev_syn_src, $prev_syn) or die "symlink $prev_syn_src $prev_syn failed: $!";
symlink($prev_sim_dir, $prev_sims) or die "symlink $prev_sim_dir $prev_sims failed: $!";
190 :    
#
# Build list of NR sources. We start with the directories in the reference
# SEED's NR dir, and override with anything in the biodb NR dir.
#
# %NR_dirs is keyed by database name; each value is a hash ref from which
# the 'path' and 'size' entries are written out below.
#

my %NR_dirs;

scan_NR_dir(\%NR_dirs, "$dir_reference_seed_data/NR");

if ($do_biodb_import)
{
    scan_NR_dir(\%NR_dirs, "$dir_biodb_nr_input");
}

#scan_NR_dir(\%NR_dirs, "$dir_biodb_nr_input", { skip => qr(^(SwissProt|.*\.bak)) });

#
# And write to job dir.
#
# nr.dirs is written before the SEED organisms are scanned, so it lists
# only the NR directories gathered above. Failures to create or flush the
# file are fatal rather than silently leaving it missing or truncated.
#
my $nr_dirs_file = "$jobdir/nr.dirs";
open(my $nr_dirs_fh, '>', $nr_dirs_file) or die "Cannot create $nr_dirs_file: $!";
for my $d (keys %NR_dirs)
{
    print $nr_dirs_fh join("\t", $d, @{$NR_dirs{$d}}{'path', 'size'}), "\n";
}
close($nr_dirs_fh) or die "Error writing $nr_dirs_file: $!";

#
# Scan for SEED organisms.
#

scan_seed_dir(\%NR_dirs, "$dir_reference_seed_data/Organisms");
222 :    
#
# Scan for RAST jobs to import.
#
# We update our NR component list with the peg features from the job,
# and we add the job directory of each to the rast.jobs file. This
# will be used later during the installation of these jobs into the SEED.
#

my $rast_jobs_file = "$jobdir/rast.jobs";
open(my $jobs_fh, '>', $rast_jobs_file) or die "Cannot create $rast_jobs_file: $!";

my @rast_jobs;
scan_rast_jobs(\@rast_jobs, $dir_rast_jobs);

my @new_rast_jobs;
for my $rast_job (@rast_jobs)
{
    my $gid = $rast_job->genome_id;
    my $gname = $rast_job->genome_name;
    my $j = $rast_job->id;

    print "RAST job #$j: $gid $gname\n";

    # A genome id already present in the NR component list is skipped.
    if (exists($NR_dirs{$gid}))
    {
        warn "Rast job $j already exists in SEED server\n";
        next;
    }
    push(@new_rast_jobs, $rast_job);

    print $jobs_fh $rast_job->dir(), "\n";

    my $fasta = $rast_job->orgdir() . "/Features/peg/fasta";
    -f $fasta or die "Job $j has no fasta file in $fasta\n";

    # "-s _" reuses the stat information from the -f test just above.
    $NR_dirs{$gid} = {type => "rast_job", name => $gname, path => $rast_job->orgdir,
                      fasta_path => $fasta, size => -s _ };
}
close($jobs_fh) or die "Error writing $rast_jobs_file: $!";

# From here on @rast_jobs holds only the jobs that will be imported.
@rast_jobs = @new_rast_jobs;

#
# Write the complete component list (all.nr.dirs) and the list of fasta
# source files (nr.sources), both in bydb order. Failures to create or
# flush either file are fatal.
#
my $all_dirs_file = "$jobdir/all.nr.dirs";
my $sources_file = "$jobdir/nr.sources";
open(my $all_dirs_fh, '>', $all_dirs_file) or die "Cannot create $all_dirs_file: $!";
open(my $sources_fh, '>', $sources_file) or die "Cannot create $sources_file: $!";
for my $d (sort bydb keys %NR_dirs)
{
    print $all_dirs_fh join("\t", $d, @{$NR_dirs{$d}}{'path', 'size'}), "\n";
    print $sources_fh $NR_dirs{$d}->{fasta_path} . "\n";
}
close($all_dirs_fh) or die "Error writing $all_dirs_file: $!";
close($sources_fh) or die "Error writing $sources_file: $!";
271 :    
#
# Sort comparator for NR component names (use as: sort bydb LIST).
#
# Compares the package variables $a and $b that sort() sets:
#   - names of the form "digits.digits" sort after all other names;
#   - two such names are ordered numerically by the first number, then
#     numerically by the second;
#   - any other two names are ordered by string comparison.
#
# Returns a negative, zero or positive value as sort() expects.
#
sub bydb
{
    if ($a =~ /^(\d+)\.(\d+)$/)
    {
        my($ga, $ia) = ($1, $2);
        if ($b =~ /^(\d+)\.(\d+)$/)
        {
            my($gb, $ib) = ($1, $2);

            # Use || here, not "or": "return X or Y" parses as
            # "(return X) or Y", which would never compare the second
            # component.
            return (($ga <=> $gb) || ($ia <=> $ib));
        }
        else
        {
            return 1;
        }
    }
    elsif ($b =~ /^\d+\.\d+$/)
    {
        return -1;
    }
    else
    {
        return $a cmp $b;
    }
}
296 :    
297 :    
#
# Check that every path passed in is an existing directory.
#
# A warning is printed for each one that is not; if any were missing the
# program exits with status 1 once all of them have been checked.
#
sub validate_dirs
{
    my @required = @_;

    my $missing;
    foreach my $path (@required)
    {
        next if -d $path;

        warn "Required directory $path is not present\n";
        $missing++;
    }
    exit(1) if $missing;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3