[Bio] / Babel / bin / load_MD5DATA2FILE.pl Repository:
ViewVC logotype

Diff of /Babel/bin/load_MD5DATA2FILE.pl

Parent Directory | Revision Log | View Patch

revision 1.1, Thu Sep 16 21:31:24 2010 UTC | revision 1.2, Wed Sep 22 15:03:54 2010 UTC
# Line 7  Line 7 
7  use XML::Simple;  use XML::Simple;
8  use Getopt::Long;  use Getopt::Long;
9    
 use FIG_Config;  
   
10  my $verbose   = 0;  my $verbose   = 0;
11  my @datafile  = ();  my @datafile  = ();
12  my @aliasfile = ();  my @aliasfile = ();
# Line 59  Line 57 
57  if (@datafile == 0) { print STDERR $usage; exit; }  if (@datafile == 0) { print STDERR $usage; exit; }
58  if ($out_dir)       { $out_dir .= '/'; }  if ($out_dir)       { $out_dir .= '/'; }
59    
60  my $sources = ($src_file && (-s $src_file)) ? XMLin($source_file, ContentKey => '-content') : {};  my $sources = ($src_file && (-s $src_file)) ? XMLin($src_file, ContentKey => '-content') : {};
61    
62  my $alias_num = 1;  my $alias_num = 1;
63  if ($verbose) { print STDERR "\nPrinting table $alias_tbl ... \n"; }  if ($verbose) { print STDERR "\nPrinting table $alias_tbl ... \n"; }
# Line 75  Line 73 
73        if ($_ =~ /^(\S+?):(\S+)$/) {        if ($_ =~ /^(\S+?):(\S+)$/) {
74          print ALIAS "$alias_num\t$id\t$2\t$1\n";          print ALIAS "$alias_num\t$id\t$2\t$1\n";
75          $alias_num += 1;          $alias_num += 1;
76            unless ($alias_num % 2000000) {
77              if ($verbose) { print STDERR "$alias_num:\t$id , $2 , $1\n"; }
78            }
79        }        }
80      }      }
81    }    }
# Line 88  Line 89 
89  my $ctg_ids    = {};  my $ctg_ids    = {};
90  my $func_ids   = {};  my $func_ids   = {};
91  my $src_ids    = {};  my $src_ids    = {};
 my $id_ctg_ids = {};  
92    
93  my $data_num   = 1;  my $data_num   = 1;
94  my $org_num    = 1;  my $org_num    = 1;
# Line 97  Line 97 
97  my $src_num    = 1;  my $src_num    = 1;
98  my $id_ctg_num = 1;  my $id_ctg_num = 1;
99    
100  my ($orgID, $ctgID, $funcID, $srcID, $id_ctgID);  my ($orgID, $ctgID, $funcID, $srcID);
101    
102  if ($verbose) { print STDERR "\nPrinting tables $data_tbl, $id_ctg_tbl ... \n"; }  if ($verbose) { print STDERR "\nPrinting tables $data_tbl, $id_ctg_tbl ... \n"; }
103  open(DATA, ">${out_dir}$data_tbl") || die "Can't open file ${out_dir}$data_tbl\n";  open(DATA, ">${out_dir}$data_tbl") || die "Can't open file ${out_dir}$data_tbl\n";
# Line 111  Line 111 
111      chomp $line;      chomp $line;
112      my ($md5, $id, $func, $org, $source, $beg, $end, $strand, $ctg_id, $ctg_desc, $len) = split(/\t/, $line);      my ($md5, $id, $func, $org, $source, $beg, $end, $strand, $ctg_id, $ctg_desc, $len) = split(/\t/, $line);
113    
114        unless ($md5 && $id) { next; }
115        $id_ids->{$id}   = 1;
116        $md5_ids->{$md5} = 1;
117    
118        if ($source) {
119      if (exists $src_ids->{$source}) {      if (exists $src_ids->{$source}) {
120        $srcID = $src_ids->{$source}[0];        $srcID = $src_ids->{$source}[0];
121            $src_ids->{$source}[1]{$id}  = 1;
122            $src_ids->{$source}[2]{$md5} = 1;
123      } else {      } else {
124        $srcID = $src_num;        $srcID = $src_num;
125        # source counts: id, md5, org, contig, func        # source counts: id, md5, org, contig, func
126        $src_ids->{$source} = [$src_num, 0, 0, 0, 0, 0];          $src_ids->{$source} = [$src_num, {}, {}, {}, {}, {}];
127        $src_num += 1;        $src_num += 1;
128      }      }
129        } else {
130      if (! exists $id_ids->{$id}) {        $srcID = "\\N";
       $id_ids->{$id} = 1;  
       $src_ids->{$source}[1] += 1;  
     }  
   
     if (! exists $md5_ids->{$md5}) {  
       $md5_ids->{$md5} = 1;  
       $src_ids->{$source}[2] += 1;  
131      }      }
132    
133        if ($org) {
134          $org =~ s/'/\\'/;
135      if (exists $org_ids->{$org}) {      if (exists $org_ids->{$org}) {
136        $orgID = $org_ids->{$org};        $orgID = $org_ids->{$org};
137      } else {      } else {
138        $orgID = $org_num;        $orgID = $org_num;
139        $org_ids->{$org} = $org_num;        $org_ids->{$org} = $org_num;
140        $org_num += 1;        $org_num += 1;
141        $src_ids->{$source}[3] += 1;        }
142          $src_ids->{$source}[3]{$org} = 1;
143        } else {
144          $orgID = "\\N";
145      }      }
146    
147      if (defined($beg) && defined($end) && $strand && $ctg_id && $ctg_desc && $len) {      if (defined($beg) && defined($end) && $strand && $ctg_id && $ctg_desc && $len) {
148          $ctg_id =~ s/'/\\'/;
149        if (exists $ctg_ids->{$ctg_id}) {        if (exists $ctg_ids->{$ctg_id}) {
150          $ctgID = $ctg_ids->{$ctg_id}[0];          $ctgID = $ctg_ids->{$ctg_id}[0];
151        } else {        } else {
152          $ctgID = $ctg_num;          $ctgID = $ctg_num;
153          $ctg_ids->{$ctg_id} = [$ctg_num, $ctg_desc, $len, $orgID];          $ctg_ids->{$ctg_id} = [$ctg_num, $ctg_desc, $len, $orgID];
154          $ctg_num += 1;          $ctg_num += 1;
         $src_ids->{$source}[4] += 1;  
155        }        }
156        print ID2CTG "$id_ctgID\t$data_num\t$ctgID\t$strand\t$beg\t$end\n";        print ID2CTG "$id_ctg_num\t$data_num\t$ctgID\t$strand\t$beg\t$end\n";
157          $id_ctg_num += 1;
158          $src_ids->{$source}[4]{$ctg_id} = 1;
159      }      }
160    
161        if ($func) {
162          $func =~ s/'/\\'/;
163      if (exists $func_ids->{$func}) {      if (exists $func_ids->{$func}) {
164        $funcID = $func_ids->{$func};        $funcID = $func_ids->{$func};
165      } else {      } else {
166        $funcID = $func_num;        $funcID = $func_num;
167        $func_ids->{$func} = $func_num;        $func_ids->{$func} = $func_num;
168        $func_num += 1;        $func_num += 1;
169        $src_ids->{$source}[5] += 1;        }
170          $src_ids->{$source}[5]{$func} = 1;
171        } else {
172          $func = "\\N";
173      }      }
174    
175      print DATA "$data_num\t$md5\t$id\t$func\t$source\t$org\n";      print DATA "$data_num\t$md5\t$id\t$funcID\t$srcID\t$orgID\t\\N\t\\N\n";
   
176      $data_num += 1;      $data_num += 1;
177      unless ($data_num % 100000) {      unless ($data_num % 1000000) {
178        if ($verbose) { print STDERR "$data_num:\t$md5 , $id , $func , $org , $source\n"; }        if ($verbose) { print STDERR "$data_num:\t$md5 , $id , $func , $org , $source\n"; }
179      }      }
180    }    }
# Line 182  Line 193 
193  if ($verbose) { print STDERR "\nPrinting table $org_tbl ... \n"; }  if ($verbose) { print STDERR "\nPrinting table $org_tbl ... \n"; }
194  open(ORG, ">${out_dir}$org_tbl") || die "Can't open file ${out_dir}$org_tbl\n";  open(ORG, ">${out_dir}$org_tbl") || die "Can't open file ${out_dir}$org_tbl\n";
195  foreach (sort {$org_ids->{$a} <=> $org_ids->{$b}} keys %$org_ids) {  foreach (sort {$org_ids->{$a} <=> $org_ids->{$b}} keys %$org_ids) {
196    print ORG $org_ids->{$_} . "\t$_\n";    print ORG $org_ids->{$_} . "\t$_" . ("\t\\N" x 12) . "\n";
197  }  }
198  close ORG;  close ORG;
199    
# Line 203  Line 214 
214  if ($verbose) { print STDERR "\nPrinting table $source_tbl ... \n"; }  if ($verbose) { print STDERR "\nPrinting table $source_tbl ... \n"; }
215  open(SOURCE, ">${out_dir}$source_tbl") || die "Can't open file ${out_dir}$source_tbl\n";  open(SOURCE, ">${out_dir}$source_tbl") || die "Can't open file ${out_dir}$source_tbl\n";
216  foreach (sort {$src_ids->{$a}[0] <=> $src_ids->{$b}[0]} keys %$src_ids) {  foreach (sort {$src_ids->{$a}[0] <=> $src_ids->{$b}[0]} keys %$src_ids) {
217    my @src_row = ( $src_ids->{$_}[0], $_, @{$src_ids->{$_}}[1,2,3,4,5] );    my @src_row = ( $src_ids->{$_}[0], $_ );
218    if (exists $sources->{'names'}->{$_}) {    if (exists $sources->{'names'}->{$_}) {
219      my $src = $sources->{'names'}->{$_};      my $src = $sources->{'names'}->{$_};
220      push @src_row, $src, $sources->{'sources'}->{$src}->{'type'}, $sources->{'sources'}->{$src}->{'url'};      push @src_row, $src, $sources->{'sources'}->{$src}->{'type'}, $sources->{'sources'}->{$src}->{'url'}, "\\N", "\\N";
221      } else {
222        push @src_row, "\\N", "\\N", "\\N", "\\N", "\\N";
223    }    }
224    print SOURCE join("\n", @src_row) . "\n";    print SOURCE join("\t", @src_row) . "\t" . join("\t", map {scalar(keys %$_)} @{$src_ids->{$_}}[1,2,3,4,5]) . "\n";
225  }  }
226  close SOURCE;  close SOURCE;
227    

Legend:
Removed from v.1.1  
Changed lines
  Added in v.1.2

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3