[Bio] / FortyEightMeta / mg_port_job.pl Repository:
ViewVC logotype

Annotation of /FortyEightMeta/mg_port_job.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : olson 1.1 #
2 :     # Port a mgrast version 1 job to mgrast version 2.
3 :     #
4 :     # We assume the job directory has been copied verbatim into the v2 job directory.
5 :     #
6 :     # The following tasks are performed:
7 :     #
8 :     # Database tables created (as in mg_preprocess) and metadata updated to note their
9 :     # names.
10 :     #
11 :     # Taxonomy database list is scanned. If a sims directory in the proc directory
12 :     # exists that matches a taxonomy database, the sims are loaded into the
13 :     # database tables. We use the simsdir.raw file that contains all sims.
14 :     #
15 :     # We special-case sims.seed in order to point the analysis at the SEED
16 :     # database version "OLD".
17 :     #
18 :    
19 :     use strict;
20 :     use Data::Dumper;
21 :     use FortyEightMeta::SimDB;
22 :     use FortyEightMeta::MGDB;
23 :     use JobStage;
24 :     use File::Basename;
25 :     use DBrtns;
26 :    
27 :     my $STAGE = "port_job";    # stage name handed to JobStage below
28 :    
29 :     # Exactly one argument is expected: the job directory (already copied into v2).
30 :     die "Usage: $0 job-dir\n" unless @ARGV == 1;
31 :     my $jobdir = shift @ARGV;
32 :     die "$0: job dir $jobdir does not exist\n" unless -d $jobdir;
33 :    
34 :     # The stage object gives access to both the job and its metadata.
35 :     my $stage = JobStage->new('Job48', $STAGE, $jobdir)
36 :         or die "Cannot create job for $jobdir\n";
37 :    
38 :     my $job_id = basename($jobdir);    # final component of the job directory path
39 :     my $job    = $stage->job();
40 :     my $meta   = $stage->meta();
41 :    
42 :     my $sim_db = FortyEightMeta::SimDB->new();
43 :    
44 :     # Open the database connection described by the FIG_Config mgrast_* settings.
45 :     # An exception raised by the constructor is caught and reported via the stage.
46 :     my $mgdb = eval {
47 :         DBrtns->new($FIG_Config::mgrast_dbms, $FIG_Config::mgrast_db,
48 :                     $FIG_Config::mgrast_dbuser, $FIG_Config::mgrast_dbpass,
49 :                     $FIG_Config::mgrast_dbport, $FIG_Config::mgrast_dbhost,
50 :                     $FIG_Config::mgrast_dbsock);
51 :     };
52 :     $stage->fatal("cannot connect to database: $@") if $@;
53 :    
54 :     # A false handle without an exception is reported as a failure as well.
55 :     $stage->fatal("Cannot open connection to database") unless $mgdb;
56 :    
57 :    
58 :     #
59 :     # NR file translation table, keyed by the NR fasta path an old job recorded
60 :     # and giving the place that data has since been moved to.
61 :     my %nr_trans = (
62 :         '/vol/metagenome-48-hour/Data/greengenes.fasta'
63 :             => '/vol/mg-rast-test/Data/db/greengenes/1/greengenes.fasta',
64 :         '/vol/metagenome-48-hour/Data/lsu.fa'
65 :             => '/vol/mg-rast-test/Data/db/euro_rrna/1/lsu.fa',
66 :         '/vol/metagenome-48-hour/Data/ssu.fa'
67 :             => '/vol/mg-rast-test/Data/db/euro_rrna/1/ssu.fa',
68 :         '/scratch/metagenome-48-hour/Data/SEED_2006_07_01'
69 :             => '/vol/48-hour/Data/nr',
70 :         '/vol/metagenome-48-hour/Data/16s.fa'
71 :             => '/vol/mg-rast-test/Data/db/rdp/9.27/16s.fa',
72 :     );
73 :    
74 :     # See if the job has already been ported, and bail. db.table_name is set
75 :     # further down, right after the sims tables are created, so a non-empty
76 :     # value means a port has already been run for this job.
77 :     if ($meta->get_metadata('db.table_name') ne '')
78 :     {
79 :         warn "Job is already ported\n";
80 :         exit 0;    # nothing left to do; previously execution fell through here
81 :     }
82 :    
83 :     #
84 :     # Scan fasta to find max id length.
85 :     #
86 :     # The path recorded in the metadata may no longer exist. If it ends in
87 :     # proc/<name>.fa and the directory in front of that is gone, the same
88 :     # relative path is looked up under $jobdir and the metadata is updated;
89 :     # the previous value is kept as preprocess.fasta_file_org.
90 :     #
91 :     my $fasta = $meta->get_metadata("preprocess.fasta_file");
92 :    
93 :     unless (-f $fasta)
94 :     {
95 :         $fasta =~ m,^(.*)/(proc/[^/]+\.fa)$,
96 :             or die "Fasta $fasta not found, cannot compute relocation\n";
97 :         my($old_root, $proc_rel) = ($1, $2);
98 :    
99 :         # Only relocate when the old root directory itself is missing.
100 :         unless (-d $old_root)
101 :         {
102 :             my $moved = "$jobdir/$proc_rel";
103 :             -f $moved
104 :                 or die "Original fasta location not found, relocation to $moved not found either\n";
105 :             print "Updating fasta location from $fasta to $moved\n";
106 :             $meta->set_metadata("preprocess.fasta_file_org", $fasta);
107 :             $meta->set_metadata("preprocess.fasta_file", $moved);
108 :             $fasta = $moved;
109 :         }
110 :     }
111 :    
112 :     # Length of the longest sequence id (the non-blank text right after '>')
113 :     # in the fasta; stays at -1 when the file contains no header line.
114 :     my $max_id_len = -1;
115 :     open(my $fasta_fh, "<", $fasta) or die "Cannot open fasta file $fasta: $!";
116 :     while (my $line = <$fasta_fh>)
117 :     {
118 :         next unless $line =~ /^>(\S+)/;
119 :         my $id_len = length($1);
120 :         $max_id_len = $id_len if $id_len > $max_id_len;
121 :     }
122 :     close($fasta_fh);
123 :     print "got max $max_id_len from fasta $fasta\n";
124 :    
125 :     my ($table_name, $best_iden_name, $best_psc_name) = FortyEightMeta::MGDB::create_sims_db($mgdb, $job_id, $max_id_len);
126 :     # Record the three names returned by create_sims_db in the job metadata.
127 :     $stage->set_metadata("db.table_name", $table_name);    # also the key the already-ported check reads
128 :     $stage->set_metadata("db.best_by_iden_table_name", $best_iden_name);
129 :     $stage->set_metadata("db.best_by_psc_table_name", $best_psc_name);
130 :    
131 :     #
132 :     # Determine list of computed sims dirs in the job. These are the directories
133 :     # under jobdir/proc that have a task.list file. For each one we read the NR
134 :     # path from the first task line, resolve it to a database (trying the
135 :     # relocated path from %nr_trans if the recorded one is not recognized), and
136 :     # require the matching <dir>.raw sims file to exist.
137 :     #
138 :     my @dirs;       # one [sim_path, sim_dir, sims, nr, trf, nr_used, db_name, db_version, tax_files] per sims dir
139 :     my %db_spec;    # database name => version for every database that was resolved
140 :     for my $tl (<$jobdir/proc/*/task.list>)
141 :     {
142 :         next unless $tl =~ m,^(.*/proc/([^/]+))/task\.list$,;
143 :         my($sim_path, $sim_dir) = ($1, $2);
144 :    
145 :         #
146 :         # Open it up to find the NR used: the third tab-separated field of the
147 :         # first line. An empty task.list leaves $nr undefined.
148 :         #
149 :         open(my $tl_fh, "<", $tl) or die "Cannot open $tl: $!";
150 :         my $first = <$tl_fh>;
151 :         close($tl_fh);
152 :         chomp($first) if defined($first);    # keep "\n" out of the last field of a short line
153 :         my $nr  = defined($first) ? (split(/\t/, $first))[2] : undef;
154 :         my $trf = defined($nr) ? $nr_trans{$nr} : undef;
155 :    
156 :         #
157 :         # Look the NR up as recorded first; fall back to the relocated path only
158 :         # when there is one, so the lookup is never made with an undefined name.
159 :         #
160 :         my $nr_used = $nr;
161 :         my($db_name, $db_version, $tax_files);
162 :         ($db_name, $db_version, $tax_files) = $sim_db->db_files_for_fasta_file($nr) if defined($nr);
163 :         if (!$db_name && defined($trf))
164 :         {
165 :             ($db_name, $db_version, $tax_files) = $sim_db->db_files_for_fasta_file($trf);
166 :             $nr_used = $trf;
167 :         }
168 :         $db_spec{$db_name} = $db_version if $db_name;
169 :    
170 :         #
171 :         # Find the sims file.
172 :         #
173 :         my $sims = "$sim_path.raw";
174 :         -f $sims or die "Sims not found at $sims\n";
175 :    
176 :         push(@dirs, [$sim_path, $sim_dir, $sims, $nr, $trf, $nr_used, $db_name, $db_version, $tax_files]);
177 :     }
178 :    
179 :     # Store what $sim_db->databases returns for the name => version map built above.
180 :     my $database_list = [ $sim_db->databases(\%db_spec) ];
181 :     $meta->set_metadata('sims.database_list', $database_list);
182 :    
183 :     #
184 :     # We now have all the information that we should need to port the job.
185 :     #
186 :     # We don't actually need the db info we looked up, since mg_load_sims_file
187 :     # does the same lookup, but it verifies that we were able to find the
188 :     # data. Sims dirs for which no database was found are reported and skipped
189 :     # instead of being handed to the loader.
190 :     #
191 :     for my $ent (@dirs)
192 :     {
193 :         my($sim_path, $sim_file, $nr_used, $db_name) = @{$ent}[0, 2, 5, 6];
194 :    
195 :         if (!$db_name)
196 :         {
197 :             warn "Skipping $sim_path - no database found\n";
198 :             next;
199 :         }
200 :    
201 :         # List form of system(): the arguments reach the loader without a shell.
202 :         my @cmd = ("$FIG_Config::bin/mg_load_sims_file",
203 :                    $nr_used, $sim_file,
204 :                    $table_name, $best_iden_name, $best_psc_name);
205 :         print "Run: @cmd\n";
206 :         my $rc = system(@cmd);
207 :         if ($rc != 0)
208 :         {
209 :             die "Error loading $sim_file, rc=$rc\n";
210 :         }
211 :         print "Success loading $sim_file\n";
212 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3