[Bio] / Babel / bin / load_MD5DATA2FILE.pl Repository:
ViewVC logotype

Annotation of /Babel/bin/load_MD5DATA2FILE.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : tharriso 1.1 #!/usr/bin/env perl
2 :    
3 :     use strict;
4 :     use warnings;
5 :    
6 :     use Data::Dumper;
7 :     use XML::Simple;
8 :     use Getopt::Long;
9 :    
# Command-line option defaults; GetOptions overwrites these.
my $verbose   = 0;     # --verbose : print progress messages to STDERR
my @datafile  = ();    # --datafile : main data files (at least one is required)
my @aliasfile = ();    # --aliasfile: alias files (optional)
my $src_file  = '';    # --source  : xml file with extra source information (optional)
my $out_dir   = '';    # --outdir  : output directory prefix ('' means current dir)

# Names of the output files written below; they are listed in the help text
# as the files this script creates for loading into the ACH db.
my $data_tbl   = "ach_data";
my $org_tbl    = "ach_organisms";
my $contig_tbl = "ach_contigs";
my $func_tbl   = "ach_functions";
my $alias_tbl  = "ach_aliases";
my $source_tbl = "ach_sources";
my $id_ctg_tbl = "ach_id2contig";
my $count_tbl  = "ach_counts";

# Help text printed to STDERR on a help switch or on bad/missing options.
# The table names above are interpolated so the list of files stays in sync.
my $usage = qq(
DESCRIPTION: (load_MD5DATA2FILE)
    Create tab-separated files for loading as tables in ACH db.
    Files created:
        $data_tbl
        $org_tbl
        $contig_tbl
        $func_tbl
        $alias_tbl
        $source_tbl
        $id_ctg_tbl
        $count_tbl

USAGE:
    --datafile  source_data   Required. This may be multiple files by calling the option multiple times.
                              Main data file: md5, id, function, organism, source, beg_pos*, end_pos*, strand*, contig_id*, contig_desc*, contig_length*
    --aliasfile source_alias  Optional. This may be multiple files by calling the option multiple times.
                              Alias data file: id, alias1, [alias2, alias3, ...]
    --source    source_info   Optional. xml file with additional source information.
    --outdir    output_dir    Optional. Dir path to place data files. Default is current dir.
    --verbose                 Optional. Verbose output.

);
# Explicit help request: only an actual help switch (-h, -help, --help) as the
# first argument triggers it, not any argument that merely contains "-h".
if ( @ARGV && $ARGV[0] =~ /^--?h(?:elp)?$/ ) {
    print STDERR $usage;
    exit 0;
}

# Parse the command line into the option variables declared above.
my $options_ok = GetOptions(
    'verbose!'    => \$verbose,
    'datafile=s'  => \@datafile,
    'aliasfile=s' => \@aliasfile,
    'source:s'    => \$src_file,
    'outdir:s'    => \$out_dir,
);

# Unparsable options or no data file at all is a usage error: show the help
# text and exit non-zero so calling scripts can detect the failure.
unless ( $options_ok && @datafile ) {
    print STDERR $usage;
    exit 1;
}

# $out_dir is used as a plain string prefix for every output file name, so it
# must end in exactly one '/' when a directory was given.
if ( length($out_dir) && $out_dir !~ m{/$} ) {
    $out_dir .= '/';
}

# Optional extra source information. A missing or empty file is not fatal
# (the source table then gets \N for the extra columns), but say so.
if ( $src_file && !( -s $src_file ) ) {
    print STDERR "Warning: source file $src_file is missing or empty, ignoring it\n";
}
my $sources = ( $src_file && ( -s $src_file ) ) ? XMLin( $src_file, ContentKey => '-content' ) : {};
61 : tharriso 1.1
# Write the alias table: one row per "source:value" alias found in the alias
# files, as  alias_number <TAB> id <TAB> value <TAB> source.
my $alias_num = 1;
if ($verbose) { print STDERR "\nPrinting table $alias_tbl ... \n"; }

my $alias_path = "${out_dir}$alias_tbl";
# Three-argument open: the path is taken literally, whatever characters the
# user-supplied directory contains.
open( my $alias_fh, '>', $alias_path ) or die "Can't open file $alias_path: $!\n";

foreach my $afile (@aliasfile) {
    open( my $afile_fh, '<', $afile ) or die "Can't open file $afile: $!\n";
    if ($verbose) { print STDERR "Parsing $afile ... \n"; }

    while ( my $line = <$afile_fh> ) {
        chomp $line;
        # First column is the id, every further column is one alias.
        my ( $id, @aliases ) = split( /\t/, $line );
        foreach my $alias (@aliases) {
            # Aliases that are not of the form "source:value" are skipped.
            next unless $alias =~ /^(\S+?):(\S+)$/;
            my ( $alias_src, $alias_val ) = ( $1, $2 );

            print {$alias_fh} "$alias_num\t$id\t$alias_val\t$alias_src\n";
            $alias_num += 1;

            # Progress report every 2,000,000 aliases.
            if ( $verbose && !( $alias_num % 2000000 ) ) {
                print STDERR "$alias_num:\t$id , $alias_val , $alias_src\n";
            }
        }
    }
    close $afile_fh;
}
# Buffered write errors (e.g. disk full) only surface at close.
close($alias_fh) or die "Can't close file $alias_path: $!\n";
85 :    
# Registries filled while parsing the data files; the table writers further
# down read them.
my $id_ids   = {};    # id   => 1, every id seen
my $md5_ids  = {};    # md5  => 1, every md5 seen
my $org_ids  = {};    # organism name => organism number
my $ctg_ids  = {};    # contig id     => [contig number, description, length, organism number]
my $func_ids = {};    # function text => function number
my $src_ids  = {};    # source name   => [source number, {ids}, {md5s}, {organisms}, {contigs}, {functions}]

# Next number to hand out for each kind of row (all start at 1).
my $data_num   = 1;
my $org_num    = 1;
my $ctg_num    = 1;
my $func_num   = 1;
my $src_num    = 1;
my $id_ctg_num = 1;

my ( $orgID, $ctgID, $funcID, $srcID );

if ($verbose) { print STDERR "\nPrinting tables $data_tbl, $id_ctg_tbl ... \n"; }

my $data_path   = "${out_dir}$data_tbl";
my $id_ctg_path = "${out_dir}$id_ctg_tbl";
open( my $data_fh,   '>', $data_path )   or die "Can't open file $data_path: $!\n";
open( my $id_ctg_fh, '>', $id_ctg_path ) or die "Can't open file $id_ctg_path: $!\n";

foreach my $dfile (@datafile) {
    open( my $dfile_fh, '<', $dfile ) or die "Can't open file $dfile: $!\n";
    if ($verbose) { print STDERR "Parsing $dfile ... \n"; }

    while ( my $line = <$dfile_fh> ) {
        chomp $line;
        my ( $md5, $id, $func, $org, $source, $beg, $end, $strand, $ctg_id, $ctg_desc, $len ) = split( /\t/, $line );

        # Rows without both an md5 and an id are skipped.
        unless ( $md5 && $id ) { next; }
        $id_ids->{$id}   = 1;
        $md5_ids->{$md5} = 1;

        # Source: look up or register, then count this row's id and md5 for
        # it. The counting happens for the first row of a source as well.
        # $src_entry stays undef for rows without a source so that no
        # per-source counts are recorded under an empty source name.
        my $src_entry;
        if ($source) {
            unless ( exists $src_ids->{$source} ) {
                # source counts: id, md5, org, contig, func
                $src_ids->{$source} = [ $src_num, {}, {}, {}, {}, {} ];
                $src_num += 1;
            }
            $src_entry = $src_ids->{$source};
            $srcID     = $src_entry->[0];
            $src_entry->[1]{$id}  = 1;
            $src_entry->[2]{$md5} = 1;
        }
        else {
            $srcID = "\\N";
        }

        # Organism.
        if ($org) {
            # Strip backslashes so the text cannot form escape sequences in
            # the output file.
            # NOTE(review): earlier code escaped single quotes (' -> \') just
            # before this strip, which removed those backslashes again; net
            # output is unchanged. If escaped quotes are wanted, escape after
            # stripping - confirm against the loader.
            $org =~ s/\\//g;
            if ( exists $org_ids->{$org} ) {
                $orgID = $org_ids->{$org};
            }
            else {
                $orgID = $org_num;
                $org_ids->{$org} = $org_num;
                $org_num += 1;
            }
            $src_entry->[3]{$org} = 1 if $src_entry;
        }
        else {
            $orgID = "\\N";
        }

        # Contig position: only written when all six contig columns are present.
        if ( defined($beg) && defined($end) && $strand && $ctg_id && $ctg_desc && $len ) {
            # Same backslash strip as for the organism name (see note above).
            $ctg_id   =~ s/\\//g;
            $ctg_desc =~ s/\\//g;
            if ( exists $ctg_ids->{$ctg_id} ) {
                $ctgID = $ctg_ids->{$ctg_id}[0];
            }
            else {
                $ctgID = $ctg_num;
                $ctg_ids->{$ctg_id} = [ $ctg_num, $ctg_desc, $len, $orgID ];
                $ctg_num += 1;
            }
            print {$id_ctg_fh} "$id_ctg_num\t$data_num\t$ctgID\t$strand\t$beg\t$end\n";
            $id_ctg_num += 1;
            $src_entry->[4]{$ctg_id} = 1 if $src_entry;
        }

        # Function.
        if ($func) {
            # Same backslash strip as for the organism name (see note above).
            $func =~ s/\\//g;
            if ( exists $func_ids->{$func} ) {
                $funcID = $func_ids->{$func};
            }
            else {
                $funcID = $func_num;
                $func_ids->{$func} = $func_num;
                $func_num += 1;
            }
            $src_entry->[5]{$func} = 1 if $src_entry;
        }
        else {
            # No function on this row: the ID column must be NULL rather than
            # whatever ID the previous row left in $funcID.
            $funcID = "\\N";
        }

        print {$data_fh} "$data_num\t$md5\t$id\t$funcID\t$srcID\t$orgID\t\\N\t\\N\n";
        $data_num += 1;

        # Progress report every 1,000,000 rows; missing columns are shown as
        # \N (function) or empty so the message itself never warns.
        if ( $verbose && !( $data_num % 1000000 ) ) {
            my $func_txt = $func ? $func : "\\N";
            my $org_txt  = defined($org) ? $org : '';
            my $src_txt  = defined($source) ? $source : '';
            print STDERR "$data_num:\t$md5 , $id , $func_txt , $org_txt , $src_txt\n";
        }
    }
    close $dfile_fh;
}
# Buffered write errors only surface at close, so check both output files.
close($id_ctg_fh) or die "Can't close file $id_ctg_path: $!\n";
close($data_fh)   or die "Can't close file $data_path: $!\n";
190 :    
# Write the counts table: one "label <TAB> count" line per kind of object.
if ($verbose) { print STDERR "\nPrinting table $count_tbl ... \n"; }

# ids and md5s are counted from their registries. The *_num variables hold
# the NEXT number to hand out (they start at 1 and are incremented after each
# new entry), so the number of entries actually created is one less.
my @counts = (
    "ids\t" . scalar( keys %$id_ids ),
    "md5s\t" . scalar( keys %$md5_ids ),
    "organisms\t" . ( $org_num - 1 ),
    "contigs\t" .   ( $ctg_num - 1 ),
    "functions\t" . ( $func_num - 1 ),
    "sources\t" .   ( $src_num - 1 ),
);

my $count_path = "${out_dir}$count_tbl";
open( my $count_fh, '>', $count_path ) or die "Can't open file $count_path: $!\n";
print {$count_fh} join( "\n", @counts ) . "\n";
close($count_fh) or die "Can't close file $count_path: $!\n";
197 :    
# Write the organism table in organism-number order:
# number <TAB> name, followed by twelve \N (NULL) columns.
if ($verbose) { print STDERR "\nPrinting table $org_tbl ... \n"; }

my $org_path = "${out_dir}$org_tbl";
open( my $org_fh, '>', $org_path ) or die "Can't open file $org_path: $!\n";
foreach my $org_name ( sort { $org_ids->{$a} <=> $org_ids->{$b} } keys %$org_ids ) {
    print {$org_fh} $org_ids->{$org_name} . "\t$org_name" . ( "\t\\N" x 12 ) . "\n";
}
# Buffered write errors only surface at close.
close($org_fh) or die "Can't close file $org_path: $!\n";
204 :    
# Write the contig table in contig-number order:
# number <TAB> contig id <TAB> description <TAB> length <TAB> organism number.
if ($verbose) { print STDERR "\nPrinting table $contig_tbl ... \n"; }

my $contig_path = "${out_dir}$contig_tbl";
open( my $contig_fh, '>', $contig_path ) or die "Can't open file $contig_path: $!\n";
foreach my $contig_name ( sort { $ctg_ids->{$a}[0] <=> $ctg_ids->{$b}[0] } keys %$ctg_ids ) {
    # Each registry value is [number, description, length, organism number].
    my ( $number, $desc, $length, $organism ) = @{ $ctg_ids->{$contig_name} };
    print {$contig_fh} join( "\t", $number, $contig_name, $desc, $length, $organism ) . "\n";
}
# Buffered write errors only surface at close.
close($contig_fh) or die "Can't close file $contig_path: $!\n";
211 :    
# Write the function table in function-number order: number <TAB> function text.
if ($verbose) { print STDERR "\nPrinting table $func_tbl ... \n"; }

my $func_path = "${out_dir}$func_tbl";
open( my $func_fh, '>', $func_path ) or die "Can't open file $func_path: $!\n";
foreach my $func_name ( sort { $func_ids->{$a} <=> $func_ids->{$b} } keys %$func_ids ) {
    print {$func_fh} $func_ids->{$func_name} . "\t$func_name\n";
}
# Buffered write errors only surface at close.
close($func_fh) or die "Can't close file $func_path: $!\n";
218 :    
# Write the source table in source-number order. Columns:
#   number, name, xml source key, type, url, \N, \N,
#   then the per-source counts of distinct ids, md5s, organisms, contigs, functions.
if ($verbose) { print STDERR "\nPrinting table $source_tbl ... \n"; }

my $source_path = "${out_dir}$source_tbl";
open( my $source_fh, '>', $source_path ) or die "Can't open file $source_path: $!\n";

# Defensive: only entries that were given a source number are real sources;
# anything else cannot be sorted numerically or counted and is skipped.
my @src_names = sort { $src_ids->{$a}[0] <=> $src_ids->{$b}[0] }
                grep { defined $src_ids->{$_}[0] } keys %$src_ids;

foreach my $src_name (@src_names) {
    my $entry   = $src_ids->{$src_name};
    my @src_row = ( $entry->[0], $src_name );

    if ( exists $sources->{'names'}->{$src_name} ) {
        # The xml maps the source name to a key under 'sources', which holds
        # the type and url; whichever of the two is absent is written as \N.
        my $src  = $sources->{'names'}->{$src_name};
        my $info = $sources->{'sources'}->{$src};
        my ( $type, $url ) =
            map { ( ref($info) eq 'HASH' && defined $info->{$_} ) ? $info->{$_} : "\\N" } qw(type url);
        push @src_row, $src, $type, $url, "\\N", "\\N";
    }
    else {
        push @src_row, "\\N", "\\N", "\\N", "\\N", "\\N";
    }

    # Slots 1..5 of the entry are hashes of distinct ids, md5s, organisms,
    # contigs and functions seen for this source; write their sizes.
    my @src_counts = map { scalar( keys %$_ ) } @{$entry}[ 1 .. 5 ];
    print {$source_fh} join( "\t", @src_row, @src_counts ) . "\n";
}
# Buffered write errors only surface at close.
close($source_fh) or die "Can't close file $source_path: $!\n";

if ($verbose) { print STDERR "Done.\n"; }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3