[Bio] / Babel / bin / load_MD5DATA2FILE.pl Repository:
ViewVC logotype

Annotation of /Babel/bin/load_MD5DATA2FILE.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : tharriso 1.1 #!/usr/bin/env perl
2 :    
3 :     use strict;
4 :     use warnings;
5 :    
6 :     use Data::Dumper;
7 :     use XML::Simple;
8 :     use Getopt::Long;
9 :    
10 :     use FIG_Config;
11 :    
# Command-line settings; filled in by GetOptions below.
my $verbose   = 0;
my @datafile  = ();
my @aliasfile = ();
my $src_file  = '';
my $out_dir   = '';

# Names of the output files written by this script (also listed in the usage text).
my $data_tbl   = "ach_data";
my $org_tbl    = "ach_organisms";
my $contig_tbl = "ach_contigs";
my $func_tbl   = "ach_functions";
my $alias_tbl  = "ach_aliases";
my $source_tbl = "ach_sources";
my $id_ctg_tbl = "ach_id2contig";
my $count_tbl  = "ach_counts";

my $usage = qq(
DESCRIPTION: (load_MD5DATA2FILE)
Create tab-sperated files for loading as tables in ACH db.
Files created:
    $data_tbl
    $org_tbl
    $contig_tbl
    $func_tbl
    $alias_tbl
    $source_tbl
    $id_ctg_tbl
    $count_tbl

USAGE:
    --datafile   source_data    Required. This may be multiple files by calling the option multiple times.
                                Main data file: md5, id, function, organism, source, beg_pos*, end_pos*, strand*, contig_id*, contig_desc*, contig_length*
    --aliasfile  source_alias   Optional. This may be multiple files by calling the option multiple times.
                                Alias data file: id, alias1, [alias2, alias3, ...]
    --source     source_info    Optional. xml file with additional source information.
    --outdir     ouput_dir      Optional. Dir path to place data files. Defualt is current dir.
    --verbose                   Optional. Verbose output.

);

# An explicit help request (first argument containing "-h") prints the usage
# text and exits successfully.
if ( (@ARGV > 0) && ($ARGV[0] =~ /-h/) ) { print STDERR $usage; exit; }

# Options that cannot be parsed are an error: print usage and exit non-zero so
# calling scripts can detect the failed invocation.
GetOptions(
    'verbose!'    => \$verbose,
    'datafile=s'  => \@datafile,
    'aliasfile=s' => \@aliasfile,
    'source:s'    => \$src_file,
    'outdir:s'    => \$out_dir,
) or do { print STDERR $usage; exit 1; };

# At least one --datafile is required.
if (@datafile == 0) { print STDERR $usage; exit 1; }

# Output file names are appended directly to $out_dir, so it needs a separator.
if ($out_dir) { $out_dir .= '/'; }

# Parse the optional source-information XML. The original passed the undeclared
# variable $source_file here, which does not compile under "use strict".
# A missing or empty file yields an empty structure.
my $sources = ($src_file && (-s $src_file)) ? XMLin($src_file, ContentKey => '-content') : {};
63 :    
# Write the alias table. Every alias file line is "id<TAB>alias1<TAB>alias2...";
# each alias of the form "prefix:value" becomes one output row:
#   running alias number, id, value (after the first colon), prefix (before it).
my $alias_num = 1;
if ($verbose) { print STDERR "\nPrinting table $alias_tbl ... \n"; }

# Three-argument open with lexical handles: the file names come from the
# command line and must never be interpreted as part of the open mode.
open(my $alias_fh, '>', "${out_dir}$alias_tbl")
    or die "Can't open file ${out_dir}$alias_tbl: $!\n";

foreach my $afile (@aliasfile) {
    open(my $afile_fh, '<', $afile) or die "Can't open file $afile: $!\n";
    if ($verbose) { print STDERR "Parsing $afile ... \n"; }

    while (my $line = <$afile_fh>) {
        chomp $line;
        my ($id, @aliases) = split(/\t/, $line);
        foreach my $alias (@aliases) {
            # Aliases without a colon, or containing whitespace, are skipped.
            if ($alias =~ /^(\S+?):(\S+)$/) {
                print $alias_fh "$alias_num\t$id\t$2\t$1\n";
                $alias_num += 1;
            }
        }
    }
    close $afile_fh;
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close($alias_fh) or die "Can't close file ${out_dir}$alias_tbl: $!\n";
84 :    
# Lookup tables filled while streaming the data files.
my $id_ids   = {};    # id        => 1 once seen
my $md5_ids  = {};    # md5       => 1 once seen
my $org_ids  = {};    # organism  => organism number
my $ctg_ids  = {};    # contig id => [contig number, description, length, organism number]
my $func_ids = {};    # function  => function number
my $src_ids  = {};    # source    => [source number, ids, md5s, organisms, contigs, functions]

# Next number to hand out for each kind of record (numbering starts at 1).
my $data_num   = 1;
my $org_num    = 1;
my $ctg_num    = 1;
my $func_num   = 1;
my $src_num    = 1;
my $id_ctg_num = 1;

my ($orgID, $ctgID, $funcID, $srcID);

if ($verbose) { print STDERR "\nPrinting tables $data_tbl, $id_ctg_tbl ... \n"; }
open(my $data_fh, '>', "${out_dir}$data_tbl")
    or die "Can't open file ${out_dir}$data_tbl: $!\n";
open(my $id2ctg_fh, '>', "${out_dir}$id_ctg_tbl")
    or die "Can't open file ${out_dir}$id_ctg_tbl: $!\n";

foreach my $dfile (@datafile) {
    open(my $dfile_fh, '<', $dfile) or die "Can't open file $dfile: $!\n";
    if ($verbose) { print STDERR "Parsing $dfile ... \n"; }

    while (my $line = <$dfile_fh>) {
        chomp $line;

        # Blank lines carry no record; without this guard they would emit an
        # empty data row and "uninitialized value" warnings.
        next unless $line =~ /\S/;

        # The last six columns (position and contig information) are optional.
        my ($md5, $id, $func, $org, $source, $beg, $end, $strand, $ctg_id, $ctg_desc, $len) = split(/\t/, $line);

        # Register the source on first sight. The per-source counters below
        # are only bumped by the source that introduces a new item.
        if (exists $src_ids->{$source}) {
            $srcID = $src_ids->{$source}[0];
        } else {
            $srcID = $src_num;
            # source counts: id, md5, org, contig, func
            $src_ids->{$source} = [$src_num, 0, 0, 0, 0, 0];
            $src_num += 1;
        }

        if (! exists $id_ids->{$id}) {
            $id_ids->{$id} = 1;
            $src_ids->{$source}[1] += 1;
        }

        if (! exists $md5_ids->{$md5}) {
            $md5_ids->{$md5} = 1;
            $src_ids->{$source}[2] += 1;
        }

        if (exists $org_ids->{$org}) {
            $orgID = $org_ids->{$org};
        } else {
            $orgID = $org_num;
            $org_ids->{$org} = $org_num;
            $org_num += 1;
            $src_ids->{$source}[3] += 1;
        }

        # An id-to-contig row is written only when all contig columns are present.
        if (defined($beg) && defined($end) && $strand && $ctg_id && $ctg_desc && $len) {
            if (exists $ctg_ids->{$ctg_id}) {
                $ctgID = $ctg_ids->{$ctg_id}[0];
            } else {
                $ctgID = $ctg_num;
                $ctg_ids->{$ctg_id} = [$ctg_num, $ctg_desc, $len, $orgID];
                $ctg_num += 1;
                $src_ids->{$source}[4] += 1;
            }
            # The row number comes from $id_ctg_num. The original printed a
            # variable that was never assigned, leaving the first column empty.
            print $id2ctg_fh "$id_ctg_num\t$data_num\t$ctgID\t$strand\t$beg\t$end\n";
            $id_ctg_num += 1;
        }

        if (exists $func_ids->{$func}) {
            $funcID = $func_ids->{$func};
        } else {
            $funcID = $func_num;
            $func_ids->{$func} = $func_num;
            $func_num += 1;
            $src_ids->{$source}[5] += 1;
        }

        # NOTE(review): the data row stores the raw function/source/organism
        # text; $funcID and $srcID are computed above but not written.
        # Confirm against the ach_data schema whether numeric ids were intended.
        print $data_fh "$data_num\t$md5\t$id\t$func\t$source\t$org\n";

        $data_num += 1;
        # Progress report every 100000 rows when running verbosely.
        unless ($data_num % 100000) {
            if ($verbose) { print STDERR "$data_num:\t$md5 , $id , $func , $org , $source\n"; }
        }
    }
    close $dfile_fh;
}

# Buffered write errors only surface at close time, so check both handles.
close($id2ctg_fh) or die "Can't close file ${out_dir}$id_ctg_tbl: $!\n";
close($data_fh)   or die "Can't close file ${out_dir}$data_tbl: $!\n";
174 :    
# Write the summary table: one "label<TAB>count" line per kind of record.
if ($verbose) { print STDERR "\nPrinting table $count_tbl ... \n"; }

# Every count is taken from the size of its lookup table. The *_num counters
# hold the NEXT number to assign (they start at 1), so using them directly
# reported one organism/contig/function/source too many.
my @counts = (
    "ids\t"       . scalar(keys %$id_ids),
    "md5s\t"      . scalar(keys %$md5_ids),
    "organisms\t" . scalar(keys %$org_ids),
    "contigs\t"   . scalar(keys %$ctg_ids),
    "functions\t" . scalar(keys %$func_ids),
    "sources\t"   . scalar(keys %$src_ids),
);

open(my $count_fh, '>', "${out_dir}$count_tbl")
    or die "Can't open file ${out_dir}$count_tbl: $!\n";
print $count_fh join("\n", @counts) . "\n";
close($count_fh) or die "Can't close file ${out_dir}$count_tbl: $!\n";
181 :    
# Organism table: "organism number<TAB>organism", ordered by number.
if ($verbose) { print STDERR "\nPrinting table $org_tbl ... \n"; }
open(my $org_fh, '>', "${out_dir}$org_tbl")
    or die "Can't open file ${out_dir}$org_tbl: $!\n";
foreach my $org_name (sort { $org_ids->{$a} <=> $org_ids->{$b} } keys %$org_ids) {
    print $org_fh $org_ids->{$org_name} . "\t$org_name\n";
}
# Checking close catches buffered write errors that print did not report.
close($org_fh) or die "Can't close file ${out_dir}$org_tbl: $!\n";

# Contig table: contig number, contig id, description, length, organism number,
# ordered by contig number.
if ($verbose) { print STDERR "\nPrinting table $contig_tbl ... \n"; }
open(my $contig_fh, '>', "${out_dir}$contig_tbl")
    or die "Can't open file ${out_dir}$contig_tbl: $!\n";
foreach my $contig_id (sort { $ctg_ids->{$a}[0] <=> $ctg_ids->{$b}[0] } keys %$ctg_ids) {
    my ($number, @details) = @{ $ctg_ids->{$contig_id} }[0, 1, 2, 3];
    print $contig_fh join("\t", $number, $contig_id, @details) . "\n";
}
close($contig_fh) or die "Can't close file ${out_dir}$contig_tbl: $!\n";

# Function table: "function number<TAB>function", ordered by number.
if ($verbose) { print STDERR "\nPrinting table $func_tbl ... \n"; }
open(my $func_fh, '>', "${out_dir}$func_tbl")
    or die "Can't open file ${out_dir}$func_tbl: $!\n";
foreach my $func_name (sort { $func_ids->{$a} <=> $func_ids->{$b} } keys %$func_ids) {
    print $func_fh $func_ids->{$func_name} . "\t$func_name\n";
}
close($func_fh) or die "Can't close file ${out_dir}$func_tbl: $!\n";
202 :    
# Source table, ordered by source number. Each row holds:
#   source number, source, then the five per-source counters
#   (ids, md5s, organisms, contigs, functions).
# When the source XML has an entry for the source, its mapped name, type and
# url are appended as three extra columns.
if ($verbose) { print STDERR "\nPrinting table $source_tbl ... \n"; }
open(my $source_fh, '>', "${out_dir}$source_tbl")
    or die "Can't open file ${out_dir}$source_tbl: $!\n";
foreach my $src_name (sort { $src_ids->{$a}[0] <=> $src_ids->{$b}[0] } keys %$src_ids) {
    my @src_row = ( $src_ids->{$src_name}[0], $src_name, @{ $src_ids->{$src_name} }[1, 2, 3, 4, 5] );
    if (exists $sources->{'names'}->{$src_name}) {
        my $src = $sources->{'names'}->{$src_name};
        push @src_row, $src, $sources->{'sources'}->{$src}->{'type'}, $sources->{'sources'}->{$src}->{'url'};
    }
    # Fields of one row are tab-separated; the original joined them with
    # newlines, which spread every source over several lines. Missing XML
    # attributes become empty columns instead of triggering warnings.
    print $source_fh join("\t", map { defined $_ ? $_ : '' } @src_row) . "\n";
}
close($source_fh) or die "Can't close file ${out_dir}$source_tbl: $!\n";

if ($verbose) { print STDERR "Done.\n"; }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3