[Bio] / Babel / bin / load_MD5DATA2FILE.pl Repository:
ViewVC logotype

View of /Babel/bin/load_MD5DATA2FILE.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Thu Sep 16 21:31:24 2010 UTC (9 years, 9 months ago) by tharriso
Branch: MAIN
*** empty log message ***

#!/usr/bin/env perl

use strict;
use warnings;

use Data::Dumper;
use XML::Simple;
use Getopt::Long;

use FIG_Config;

# ---------------------------------------------------------------------------
# Command-line state.
# ---------------------------------------------------------------------------
my $verbose   = 0;     # --verbose   : print progress messages to STDERR
my @datafile  = ();    # --datafile  : main data files (option may be repeated)
my @aliasfile = ();    # --aliasfile : alias files (option may be repeated)
my $src_file  = '';    # --source    : xml file with additional source information
my $out_dir   = '';    # --outdir    : directory prefix for every output file

# ---------------------------------------------------------------------------
# Names of the output files; each one is written below ${out_dir}.
# ---------------------------------------------------------------------------
my $data_tbl   = "ach_data";
my $org_tbl    = "ach_organisms";
my $contig_tbl = "ach_contigs";
my $func_tbl   = "ach_functions";
my $alias_tbl  = "ach_aliases";
my $source_tbl = "ach_sources";
my $id_ctg_tbl = "ach_id2contig";
my $count_tbl  = "ach_counts";

# Help text; the table names above are interpolated into the file list.
my $usage     = qq(
DESCRIPTION: (load_MD5DATA2FILE)
Create tab-separated files for loading as tables in ACH db.
Files created:
   $data_tbl
   $org_tbl
   $contig_tbl
   $func_tbl
   $alias_tbl
   $source_tbl
   $id_ctg_tbl
   $count_tbl

USAGE:
  --datafile   source_data   Required. This may be multiple files by calling the option multiple times.
                             Main data file: md5, id, function, organism, source, beg_pos*, end_pos*, strand*, contig_id*, contig_desc*, contig_length*
  --aliasfile  source_alias  Optional. This may be multiple files by calling the option multiple times.
                             Alias data file: id, alias1, [alias2, alias3, ...]
  --source     source_info   Optional. xml file with additional source information.
  --outdir     output_dir    Optional. Dir path to place data files. Default is current dir.
  --verbose                  Optional. Verbose output.

);

# Explicit help request: -h, --h, -help or --help as the first argument.
# The pattern is anchored so that an ordinary argument which merely contains
# "-h" (e.g. "--datafile=my-hits.tab") is not mistaken for a help request.
if ( (@ARGV > 0) && ($ARGV[0] =~ /^--?h(?:elp)?$/) ) { print STDERR $usage; exit 0; }

# Invalid options are a usage error, so report failure to the caller.
if ( ! GetOptions( 'verbose!'    => \$verbose,
                   'datafile=s'  => \@datafile,
                   'aliasfile=s' => \@aliasfile,
                   'source:s'    => \$src_file,
                   'outdir:s'    => \$out_dir
                 ) )
  { print STDERR $usage; exit 1; }

# At least one data file is required.
if (@datafile == 0) { print STDERR $usage; exit 1; }

# Turn the output directory into a prefix that can be glued directly in front
# of a table name; do not double the separator if the user already gave one.
if ($out_dir && ($out_dir !~ m{/$})) { $out_dir .= '/'; }

# Additional source information: parsed from the --source xml file when that
# file exists and is non-empty, otherwise an empty hash so later lookups
# simply find nothing.  (The original passed the undeclared $source_file to
# XMLin, which does not compile under 'use strict'.)
my $sources = ($src_file && (-s $src_file)) ? XMLin($src_file, ContentKey => '-content') : {};

# ---------------------------------------------------------------------------
# Alias table.
# Every alias file line is "id <TAB> alias1 <TAB> alias2 ...".  Each alias of
# the form "prefix:value" becomes one output row:
#     alias_num, id, value, prefix
# Aliases that do not match that form are skipped.
# ---------------------------------------------------------------------------
my $alias_num = 1;    # running row number for the alias table
if ($verbose) { print STDERR "\nPrinting table $alias_tbl ... \n"; }

my $alias_path = "${out_dir}$alias_tbl";
# Three-argument open: the mode can never be taken from the file name.
open(my $alias_fh, '>', $alias_path) || die "Can't open file $alias_path: $!\n";

foreach my $afile (@aliasfile) {
  open(my $afile_fh, '<', $afile) || die "Can't open file $afile: $!\n";
  if ($verbose) { print STDERR "Parsing $afile ... \n"; }

  while (my $line = <$afile_fh>) {
    chomp $line;
    my ($id, @aliases) = split(/\t/, $line);
    foreach my $alias (@aliases) {
      # Non-greedy first group: split at the first colon.
      if ($alias =~ /^(\S+?):(\S+)$/) {
        print $alias_fh "$alias_num\t$id\t$2\t$1\n";
        $alias_num += 1;
      }
    }
  }
  close $afile_fh;
}

# Buffered write errors only surface at close, so check it.
close($alias_fh) || die "Can't close file $alias_path: $!\n";

# ---------------------------------------------------------------------------
# Main data pass: writes the data table and the id-to-contig table, and
# collects the lookup tables that the later sections print.
# ---------------------------------------------------------------------------

# Values seen so far, keyed by the raw column value.
my $id_ids     = {};   # id        => 1
my $md5_ids    = {};   # md5       => 1
my $org_ids    = {};   # organism  => organism number
my $ctg_ids    = {};   # contig id => [ contig number, description, length, organism number ]
my $func_ids   = {};   # function  => function number
my $src_ids    = {};   # source    => [ source number, then five counters (see below) ]
my $id_ctg_ids = {};   # declared by the original script; not used in this section

# Next free number for each table; all numbering starts at 1.
my $data_num   = 1;
my $org_num    = 1;
my $ctg_num    = 1;
my $func_num   = 1;
my $src_num    = 1;
my $id_ctg_num = 1;

my ($orgID, $ctgID, $funcID, $srcID, $id_ctgID);

if ($verbose) { print STDERR "\nPrinting tables $data_tbl, $id_ctg_tbl ... \n"; }

my $data_path   = "${out_dir}$data_tbl";
my $id2ctg_path = "${out_dir}$id_ctg_tbl";
# Lexical handles with three-argument open; the original bareword handle DATA
# also collided with Perl's special DATA handle.
open(my $data_fh,   '>', $data_path)   || die "Can't open file $data_path: $!\n";
open(my $id2ctg_fh, '>', $id2ctg_path) || die "Can't open file $id2ctg_path: $!\n";

foreach my $dfile (@datafile) {
  open(my $dfile_fh, '<', $dfile) || die "Can't open file $dfile: $!\n";
  if ($verbose) { print STDERR "Parsing $dfile ... \n"; }

  while (my $line = <$dfile_fh>) {
    chomp $line;
    # The last six columns are optional (marked * in the usage text).
    my ($md5, $id, $func, $org, $source, $beg, $end, $strand, $ctg_id, $ctg_desc, $len) = split(/\t/, $line);

    # Source: number it on first sight and start its counters at zero.
    if (exists $src_ids->{$source}) {
      $srcID = $src_ids->{$source}[0];
    } else {
      $srcID = $src_num;
      # slots: [0] source number, [1] ids, [2] md5s, [3] organisms, [4] contigs, [5] functions
      $src_ids->{$source} = [$src_num, 0, 0, 0, 0, 0];
      $src_num += 1;
    }

    # Each counter below is credited to the source of the line on which the
    # value was seen for the first time (the "seen" tables are global).
    if (! exists $id_ids->{$id}) {
      $id_ids->{$id} = 1;
      $src_ids->{$source}[1] += 1;
    }

    if (! exists $md5_ids->{$md5}) {
      $md5_ids->{$md5} = 1;
      $src_ids->{$source}[2] += 1;
    }

    if (exists $org_ids->{$org}) {
      $orgID = $org_ids->{$org};
    } else {
      $orgID = $org_num;
      $org_ids->{$org} = $org_num;
      $org_num += 1;
      $src_ids->{$source}[3] += 1;
    }

    # Contig information is only recorded when every contig column is present.
    if (defined($beg) && defined($end) && $strand && $ctg_id && $ctg_desc && $len) {
      if (exists $ctg_ids->{$ctg_id}) {
        $ctgID = $ctg_ids->{$ctg_id}[0];
      } else {
        $ctgID = $ctg_num;
        $ctg_ids->{$ctg_id} = [$ctg_num, $ctg_desc, $len, $orgID];
        $ctg_num += 1;
        $src_ids->{$source}[4] += 1;
      }
      # Number the id-to-contig row.  The original never assigned $id_ctgID
      # (and never advanced $id_ctg_num), so the first column was undefined.
      $id_ctgID    = $id_ctg_num;
      $id_ctg_num += 1;
      print $id2ctg_fh "$id_ctgID\t$data_num\t$ctgID\t$strand\t$beg\t$end\n";
    }

    if (exists $func_ids->{$func}) {
      $funcID = $func_ids->{$func};
    } else {
      $funcID = $func_num;
      $func_ids->{$func} = $func_num;
      $func_num += 1;
      $src_ids->{$source}[5] += 1;
    }

    # NOTE(review): this row carries the raw function/source/organism strings,
    # while $funcID and $srcID are computed above and never written.  It may
    # have been meant to write the numeric ids instead; left unchanged because
    # the expected table layout is not visible here -- confirm against the loader.
    print $data_fh "$data_num\t$md5\t$id\t$func\t$source\t$org\n";

    $data_num += 1;
    # Progress line every 100000 rows when verbose.
    unless ($data_num % 100000) {
      if ($verbose) { print STDERR "$data_num:\t$md5 , $id , $func , $org , $source\n"; }
    }
  }
  close $dfile_fh;
}

# Buffered write errors only surface at close, so check both output files.
close($id2ctg_fh) || die "Can't close file $id2ctg_path: $!\n";
close($data_fh)   || die "Can't close file $data_path: $!\n";

# ---------------------------------------------------------------------------
# Count table: one "name <TAB> count" line per kind of object.
# Every count is the number of distinct keys collected during the data pass.
# The original printed $org_num, $ctg_num, $func_num and $src_num, but those
# hold the NEXT free number (numbering starts at 1) and are therefore one
# larger than the number of objects actually seen.
# ---------------------------------------------------------------------------
if ($verbose) { print STDERR "\nPrinting table $count_tbl ... \n"; }
my @counts = ( "ids\t"       . scalar(keys %$id_ids),
               "md5s\t"      . scalar(keys %$md5_ids),
               "organisms\t" . scalar(keys %$org_ids),
               "contigs\t"   . scalar(keys %$ctg_ids),
               "functions\t" . scalar(keys %$func_ids),
               "sources\t"   . scalar(keys %$src_ids) );

my $count_path = "${out_dir}$count_tbl";
open(my $count_fh, '>', $count_path) || die "Can't open file $count_path: $!\n";
print $count_fh join("\n", @counts) . "\n";
close($count_fh) || die "Can't close file $count_path: $!\n";

# ---------------------------------------------------------------------------
# Organism table: "organism number <TAB> organism name", ordered by number.
# ---------------------------------------------------------------------------
if ($verbose) { print STDERR "\nPrinting table $org_tbl ... \n"; }
my $org_path = "${out_dir}$org_tbl";
# Three-argument open with a lexical handle; include the OS error on failure.
open(my $org_fh, '>', $org_path) || die "Can't open file $org_path: $!\n";
foreach my $org_name (sort { $org_ids->{$a} <=> $org_ids->{$b} } keys %$org_ids) {
  print $org_fh $org_ids->{$org_name} . "\t$org_name\n";
}
# Buffered write errors only surface at close, so check it.
close($org_fh) || die "Can't close file $org_path: $!\n";

# ---------------------------------------------------------------------------
# Contig table, ordered by contig number.  Columns:
#     contig number, contig id, description, length, organism number
# (the last three come from slots 1..3 of the per-contig record).
# ---------------------------------------------------------------------------
if ($verbose) { print STDERR "\nPrinting table $contig_tbl ... \n"; }
my $contig_path = "${out_dir}$contig_tbl";
# Three-argument open with a lexical handle; include the OS error on failure.
open(my $contig_fh, '>', $contig_path) || die "Can't open file $contig_path: $!\n";
foreach my $contig_name (sort { $ctg_ids->{$a}[0] <=> $ctg_ids->{$b}[0] } keys %$ctg_ids) {
  my $record = $ctg_ids->{$contig_name};
  print $contig_fh join( "\t", ($record->[0], $contig_name, @{$record}[1,2,3]) ) . "\n";
}
# Buffered write errors only surface at close, so check it.
close($contig_fh) || die "Can't close file $contig_path: $!\n";

# ---------------------------------------------------------------------------
# Function table: "function number <TAB> function text", ordered by number.
# ---------------------------------------------------------------------------
if ($verbose) { print STDERR "\nPrinting table $func_tbl ... \n"; }
my $func_path = "${out_dir}$func_tbl";
# Three-argument open with a lexical handle; include the OS error on failure.
open(my $func_fh, '>', $func_path) || die "Can't open file $func_path: $!\n";
foreach my $func_name (sort { $func_ids->{$a} <=> $func_ids->{$b} } keys %$func_ids) {
  print $func_fh $func_ids->{$func_name} . "\t$func_name\n";
}
# Buffered write errors only surface at close, so check it.
close($func_fh) || die "Can't close file $func_path: $!\n";

# ---------------------------------------------------------------------------
# Source table, ordered by source number.  Columns:
#     source number, source name, then the five per-source counters
#     (ids, md5s, organisms, contigs, functions).
# When the --source xml maps this name under 'names', three more columns are
# appended: the mapped source key and its 'type' and 'url' entries.
# ---------------------------------------------------------------------------
if ($verbose) { print STDERR "\nPrinting table $source_tbl ... \n"; }
my $source_path = "${out_dir}$source_tbl";
# Three-argument open with a lexical handle; include the OS error on failure.
open(my $source_fh, '>', $source_path) || die "Can't open file $source_path: $!\n";
foreach my $src_name (sort { $src_ids->{$a}[0] <=> $src_ids->{$b}[0] } keys %$src_ids) {
  my @src_row = ( $src_ids->{$src_name}[0], $src_name, @{$src_ids->{$src_name}}[1,2,3,4,5] );
  if (exists $sources->{'names'}->{$src_name}) {
    my $src = $sources->{'names'}->{$src_name};
    push @src_row, $src, $sources->{'sources'}->{$src}->{'type'}, $sources->{'sources'}->{$src}->{'url'};
  }
  # One tab-separated row per source.  The original joined the fields with
  # "\n", which put every field on its own line.  Fields missing from the
  # xml are written as empty strings rather than raising warnings.
  print $source_fh join("\t", map { defined($_) ? $_ : '' } @src_row) . "\n";
}
# Buffered write errors only surface at close, so check it.
close($source_fh) || die "Can't close file $source_path: $!\n";

if ($verbose) { print STDERR "Done.\n"; }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3