[Bio] / FortyEight / create_import_job.pl Repository:
ViewVC logotype

Annotation of /FortyEight/create_import_job.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : olson 1.1 #
2 :     # Create a SEED-import job.
3 :     #
4 :     # The job initially has a listing of SEED fasta sources, NR fasta sources, and
5 :     # RAST fasta sources. The new NR will not have been built - it will be the first stage
6 :     # in the pipeline.
7 :     #
8 :     # For now we need to pass in the paths to the NR file, peg.synonyms file, and sims directory we are building from.
9 :     #
10 :     # We create the following files:
11 :     #
12 :     # nr.dirs
13 :     # Tab-delimited data of db-name, source path, size of fasta file
14 :     #
15 :     # nr.sources
16 :     # Listing of all fasta source files from which the nr is to be built
17 :     #
18 :     # We hardcode in the script, for now, the source locations of things. This is an ANL internal
19 :     # application at this point.
20 :     #
21 :     #
22 :     #
23 :    
24 :     use strict;
25 :     use Data::Dumper;
26 :     use DirHandle;
27 :     use ImportJob;
28 :     use Job48;
29 :     use NRTools;
30 :    
#
# Command-line synopsis, shown when the wrong number of arguments is given.
#
my $usage = "create_import_job prev-nr prev-syn prev-sims";

#
# Incoming NR data.
#
my $dir_biodb = "/vol/biodb";
#my $dir_biodb_nr_input = "$dir_biodb/processed_data/for_build_nr";
my $dir_biodb_nr_input = "/local/FIGdisk/FIG/Data/NR";


#
# Existing SEED data
#

#my $dir_reference_seed_data = "/vol/seed-data-anno-mirror/Data.Jan3";
my $dir_reference_seed_data = "/local/FIGdisk/FIG/Data";

#
# RAST server from which to pull genomes to import.
#

my $dir_rast_jobs = "/vol/48-hour/Jobs.prod.2007-0601";

#
# Startup.
#
# Exactly three arguments are required: the previous NR fasta file, the
# previous peg.synonyms file, and the directory holding the previous sims.
#

@ARGV == 3 or die "Usage: $usage\n";

my $prev_nr_src = shift;
my $prev_syn_src = shift;
my $prev_sim_dir = shift;

#
# Validate
#
# Each input file is sanity-checked by looking at its first line only.
# An empty file yields an undefined first line and is rejected as well.
#
{
    open(my $nr_fh, '<', $prev_nr_src)
        or die "Cannot open previous NR file $prev_nr_src: $!\n";
    my $first_line = <$nr_fh>;
    close($nr_fh);

    # A fasta file must start with a ">" header line.
    if (!defined($first_line) or $first_line !~ /^>/)
    {
        die "$prev_nr_src does not look like a fasta file\n";
    }
}

{
    open(my $syn_fh, '<', $prev_syn_src)
        or die "Cannot open previous synonyms file $prev_syn_src: $!\n";
    my $first_line = <$syn_fh>;
    close($syn_fh);

    # The first line is expected to start with "xxx<digits>,<digits><TAB>".
    if (!defined($first_line) or $first_line !~ /^xxx\d+,\d+\t/)
    {
        die "$prev_syn_src does not look like a peg.synonyms file\n";
    }
}

#
# The sims directory must exist and contain at least one entry whose name
# starts with "sims". The entries are listed explicitly: a glob evaluated
# in scalar context returns a file name, not a count, so comparing it
# numerically against zero rejects perfectly good directories.
#
my @prev_sim_files;
if (opendir(my $sim_dh, $prev_sim_dir))
{
    @prev_sim_files = grep { /^sims/ } readdir($sim_dh);
    closedir($sim_dh);
}

if (!@prev_sim_files)
{
    die "previous sim dir $prev_sim_dir does not appear to contain sims\n";
}
99 :    
100 :    
#
# Initial validation.
#
# validate_dirs() exits the script if any of these directories is missing.
#
validate_dirs($dir_biodb_nr_input, $dir_reference_seed_data, $dir_rast_jobs);

#
# Create our jobdir.
#

my ($jobnum, $err) = ImportJob->create_new_job();
#my ($jobnum, $err) = ('002', undef);

if (!$jobnum)
{
    die "Create failed with error: $err\n";
}

my $job = ImportJob->new($jobnum);
my $jobdir = $job->dir;

$job->meta->add_log_entry($0, "creating new job");

#
# Symlink to prev_nr, prev_syn and prev_sims in the job directory.
#

my $prev_nr = "$jobdir/prev_nr";
my $prev_syn = "$jobdir/prev_syn";
my $prev_sims = "$jobdir/prev_sims";

# Remove any stale links first; failures (e.g. nothing to remove) are ignored.
unlink($prev_nr, $prev_syn, $prev_sims);

# File::Spec is a core module; it is loaded here because it is only needed
# for the link targets below.
require File::Spec;

for my $link ([$prev_nr_src, $prev_nr],
              [$prev_syn_src, $prev_syn],
              [$prev_sim_dir, $prev_sims])
{
    my ($source, $link_name) = @$link;

    # A relative symlink target is resolved relative to the directory that
    # holds the link, not the current directory, so a source given relative
    # to the cwd would produce a dangling link. Make the target absolute.
    my $target = File::Spec->rel2abs($source);

    symlink($target, $link_name) or die "symlink $target $link_name failed: $!\n";
}
136 :    
#
# Build list of NR sources. We start with the directories in the reference
# SEED's NR dir, and override with anything in the biodb NR dir.
#
# %NR_dirs is keyed by source name; each value is a hash ref from which the
# 'path' and 'size' fields are written out below.
#

my %NR_dirs;

scan_NR_dir(\%NR_dirs, "$dir_reference_seed_data/NR");
scan_NR_dir(\%NR_dirs, $dir_biodb_nr_input);
#scan_NR_dir(\%NR_dirs, "$dir_biodb_nr_input", { skip => qr(^(SwissProt|.*\.bak)) });

#
# And write to job dir.
#
# One tab-delimited line per source: name, path, size. The file is written
# before the SEED organisms are scanned, so it lists the NR sources only.
#
{
    my $nr_dirs_file = "$jobdir/nr.dirs";

    open(my $nr_dirs_fh, '>', $nr_dirs_file)
        or die "Cannot create $nr_dirs_file: $!\n";

    for my $d (keys %NR_dirs)
    {
        print {$nr_dirs_fh} join("\t", $d, @{$NR_dirs{$d}}{'path', 'size'}), "\n";
    }

    # Buffered write errors only surface at close time, so check it.
    close($nr_dirs_fh)
        or die "Error writing $nr_dirs_file: $!\n";
}

#
# Scan for SEED organisms.
#

scan_seed_dir(\%NR_dirs, "$dir_reference_seed_data/Organisms");
163 :    
#
# Scan for RAST jobs to import.
#
# We update our NR component list with the peg features from the job,
# and we add the job directory of each to the rast.jobs file. This
# will be used later during the installation of these jobs into the SEED.
#

my $rast_jobs_file = "$jobdir/rast.jobs";

open(my $jobs_fh, '>', $rast_jobs_file)
    or die "Cannot create $rast_jobs_file: $!\n";

my @rast_jobs;
scan_rast_jobs(\@rast_jobs, $dir_rast_jobs);

for my $rast_job (@rast_jobs)
{
    my $gid = $rast_job->genome_id;
    my $gname = $rast_job->genome_name;
    my $j = $rast_job->id;

    print "RAST job #$j: $gid $gname\n";

    print {$jobs_fh} $rast_job->dir(), "\n";

    # A genome id that is already a known source must not be imported again.
    if (exists($NR_dirs{$gid}))
    {
        die "Rast job $j already exists in SEED server\n";
    }

    my $orgdir = $rast_job->orgdir();
    my $fasta = "$orgdir/Features/peg/fasta";
    -f $fasta or die "Job $j has no fasta file in $fasta\n";

    # Stat the file by name rather than through the "_" handle: a method
    # call between the -f test and the size lookup could overwrite the
    # cached stat buffer.
    $NR_dirs{$gid} = {
        type       => "rast_job",
        name       => $gname,
        path       => $orgdir,
        fasta_path => $fasta,
        size       => -s $fasta,
    };
}

close($jobs_fh)
    or die "Error writing $rast_jobs_file: $!\n";

#
# Write the complete source listing, ordered by bydb:
#   all.nr.dirs - tab-delimited name, path, size for every source
#   nr.sources  - the fasta path of every source, one per line
#
my $all_dirs_file = "$jobdir/all.nr.dirs";
my $sources_file = "$jobdir/nr.sources";

open(my $all_dirs_fh, '>', $all_dirs_file)
    or die "Cannot create $all_dirs_file: $!\n";
open(my $sources_fh, '>', $sources_file)
    or die "Cannot create $sources_file: $!\n";

for my $d (sort bydb keys %NR_dirs)
{
    print {$all_dirs_fh} join("\t", $d, @{$NR_dirs{$d}}{'path', 'size'}), "\n";
    print {$sources_fh} $NR_dirs{$d}->{fasta_path} . "\n";
}

close($all_dirs_fh)
    or die "Error writing $all_dirs_file: $!\n";
close($sources_fh)
    or die "Error writing $sources_file: $!\n";
208 :    
#
# Sort comparator for source names, used as "sort bydb LIST". It compares
# the package variables $a and $b supplied by sort.
#
# Names of the form <digits>.<digits> are ordered numerically, first by the
# part before the dot and then by the part after it. Every other name sorts
# ahead of those numeric names; such names are ordered among themselves by
# plain string comparison.
#
# Returns a negative, zero or positive value, as sort expects.
#
sub bydb
{
    my @parts_a = $a =~ /^(\d+)\.(\d+)$/;
    my @parts_b = $b =~ /^(\d+)\.(\d+)$/;

    if (@parts_a and @parts_b)
    {
        # "||" rather than "or": "return X or Y" parses as "(return X) or Y",
        # which would return before the second key is ever compared.
        return $parts_a[0] <=> $parts_b[0] || $parts_a[1] <=> $parts_b[1];
    }
    elsif (@parts_a)
    {
        # Numeric name versus plain name: the plain name goes first.
        return 1;
    }
    elsif (@parts_b)
    {
        return -1;
    }
    else
    {
        return $a cmp $b;
    }
}
233 :    
234 :    
#
# Check that every path in the argument list is an existing directory.
#
# A warning is printed for each path that is not a directory. If at least
# one is missing the script exits with status 1 after all paths have been
# checked, so every problem is reported in a single run.
#
sub validate_dirs
{
    my $n_missing;

    foreach my $path (@_)
    {
        next if -d $path;

        warn "Required directory $path is not present\n";
        ++$n_missing;
    }

    $n_missing and exit(1);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3