[Bio] / GBBergeyDB / GenomeParser.pm Repository:
ViewVC logotype

View of /GBBergeyDB/GenomeParser.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Thu Apr 5 15:28:33 2007 UTC (12 years, 8 months ago) by bartels
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +88 -254 lines
*** empty log message ***

#!/usr/bin/env /home/bartels/FIGdisk/env/cee/bin/perl

use strict;
use warnings;
use lib "../PPO/";
use DBMaster;
use GenomeMapper;

package GenomeParser;

1;

#####################################################################
# Parse a literature listing.
#
# The file consists of sections. A section starts with a header line
#   >Org:<species name> <n> publication...
# or
#   >TAXID:<taxonomy id> <n> publication...
# and is followed by publication lines that contain "<digits> <text>".
#
# Parameters:
#   $dbmaster - not used by this parser
#   $file     - path of the literature file
#
# Returns a hash reference of the form
#   { 'Species' => { <name>  => { 'number' => <n>,
#                                 'pubs'   => { <digits> => <line> } } },
#     'TAXID'   => { <taxid> => { 'number' => <n>,
#                                 'pubs'   => { <digits> => <line> } } } }
# Publication lines are stored unchanged, including their line ending.
# Returns undef if the file contains no header line.
#
# Dies if the file cannot be opened.
#####################################################################
sub GenomeParser::parseLiterature {

  my ( $dbmaster, $file ) = @_;

  my $Lithash;

  # three-argument open: the file name can never be read as an open mode
  open( my $lit_fh, '<', $file ) or die "cannot open literature file $file: $!";

  # entry of the section that is currently being read;
  # stays undef until the first header line has been seen
  my $current;

  while ( my $line = <$lit_fh> ) {

    if ( $line =~ /^>Org:(.+) (\d+) publication/ ) {
      my ( $species, $number ) = ( $1, $2 );
      $current = ( $Lithash->{ 'Species' }->{ $species } ||= {} );
      $current->{ 'number' } = $number;
      next;
    }
    if ( $line =~ /^>TAXID:(.+) (\d+) publication/ ) {
      my ( $taxid, $number ) = ( $1, $2 );
      $current = ( $Lithash->{ 'TAXID' }->{ $taxid } ||= {} );
      $current->{ 'number' } = $number;
      next;
    }
    if ( $line =~ /(\d+) .*/ ) {
      my $pubid = $1;

      # a publication line in front of the first header belongs to no
      # section; it used to be filed under an undefined TAXID key
      next if ( !defined( $current ) );

      $current->{ 'pubs' }->{ $pubid } = $line;
    }
  }

  close $lit_fh;

  return $Lithash;

}

#####################################################################
# Parse the CCUG list of type strains.
#
# A data line is expected to look like
#   <organism name>  <CCUG number>[ flag]  <strain>; <strain>; ...
# Blank lines and the header/footer lines of the listing are skipped,
# as is every line that does not match one of the three name patterns.
#
# Parameters:
#   $dbmaster - handed through to GenomeMapper::getGenusSpecies
#   $file     - path of the CCUG listing
#
# Returns a hash reference keyed by the organism name with every
# "subsp. " removed. Each value is { 'strains' => [ ... ] } where the
# first strain is "CCUG <number>", followed by the non-blank strains of
# the line. If a name occurs on several lines the strain lists are
# merged without duplicates (earlier lines first).
# Returns undef if no line matched.
#
# Dies if the file cannot be opened.
#####################################################################
sub GenomeParser::parseCCUG {

  my ( $dbmaster, $file ) = @_;

  my $CCUGhash;

  open( my $ccug_fh, '<', $file ) or die "cannot open CCUG file $file: $!";

  while ( my $line = <$ccug_fh> ) {

    chomp $line;

    # blank lines and page furniture of the listing
    next if ( $line =~ /^\s*$/ );
    next if ( $line =~ /^CCUG, Göteborg/ );
    next if ( $line =~ /List of type strains/ );
    next if ( $line =~ /UltraEdit/ );
    next if ( $line =~ /^Species/ );
    next if ( $line =~ /=============/ );

    # the three patterns differ only in what they accept as organism name:
    # two words, several words/dots, or a name ending in 'Bird-C'
    my ( $genomename, $ccugnum, $strainstxt );
    if (    $line =~ /(^\w+\s\w+)\s+(\d+\s[ABDC]?[\!]?[b]?)\s+(.*)/
         || $line =~ /(^[\w\s\.]+\w)\s+(\d+\s[ABDC]?[\!]?[b]?)\s+(.*)/
         || $line =~ /(^[\w\s\.]+\w \'Bird-C\')\s+(\d+\s[ABDC]?[\!]?[b]?)\s+(.*)/ ) {
      ( $genomename, $ccugnum, $strainstxt ) = ( $1, $2, $3 );
    }
    else {
      next;
    }

    $ccugnum =~ s/\s$//;
    $ccugnum = "CCUG ".$ccugnum;

    # the CCUG number is the first strain; blank list entries are dropped
    my @st = grep { $_ !~ /^\s*$/ } ( $ccugnum, split( /;/, $strainstxt ) );

    ( my $gname = $genomename ) =~ s/subsp\. //g;

    # NOTE(review): the result of this lookup has never been used by this
    # parser. The call is kept only in case getGenusSpecies has side
    # effects - confirm and remove.
    my @unused_lookup = GenomeMapper::getGenusSpecies( $dbmaster, $genomename );

    if ( defined( $CCUGhash->{ $gname } ) ) {
      # merge with the strains of earlier lines; only the filtered list is
      # used, so blank entries cannot reappear through the merge
      my %seen;
      @st = grep { !$seen{ $_ }++ } ( @{ $CCUGhash->{ $gname }->{ 'strains' } }, @st );
    }

    $CCUGhash->{ $gname } = { 'strains' => \@st };

  }

  close $ccug_fh;

  return( $CCUGhash );
}

#####################################################################
# Parse the WFCC file (the file Garrity gave us).
#
# The file is tab separated; its first line is a header and is
# discarded. The first seven columns of every other line are stored as
#   Species, nameDOI, taxonDOI, exemplarDOI, typestrain, score,
#   priorscore
#
# Parameters:
#   $dbmaster - not used by this parser
#   $file     - path of the WFCC file
#
# Returns a hash reference with the species (first column) as key and
# a hash of the seven fields as value; a later line for the same
# species replaces the earlier one. Returns undef if there is no data
# line.
#
# Dies if the file cannot be opened.
#####################################################################
sub GenomeParser::parseWFCC {

  my ( $dbmaster, $file ) = @_;

  my $specieshash;

  open( my $wfcc_fh, '<', $file ) or die "cannot open WFCC file $file: $!";

  # the first line of the file is the header, erase it
  my $header = <$wfcc_fh>;

  my @fields = qw( Species nameDOI taxonDOI exemplarDOI typestrain score priorscore );

  while ( my $line = <$wfcc_fh> ) {

    # without the chomp the last column keeps its line ending
    chomp $line;
    next if ( $line =~ /^\s*$/ );

    # the limit keeps empty trailing columns as '' and puts everything
    # behind the seventh column into an eighth element that is ignored
    my @values = split( /\t/, $line, scalar( @fields ) + 1 );

    my %genome;
    @genome{ @fields } = @values[ 0 .. $#fields ];

    $specieshash->{ $genome{ 'Species' } } = \%genome;
  }

  close $wfcc_fh;

  return $specieshash;

}


#####################################################################
# Parse GreenGenes current_prokMSA_unaligned
#
# Reads two FASTA-like files one after the other. A record starts with
# a header line ">(digits) (word)." - the digits are taken as prokMSA
# id, the word as accession number. For every accession number that
# already has a defined entry in $accnoHash the concatenated sequence
# lines are stored as 'sequence' and the digits as 'prokMSAid'.
# Records of the second file overwrite those of the first.
#
# Parameters:
#   $dbmaster  - not used by this parser
#   $file      - path of the first sequence file
#   $file2     - path of the second sequence file
#   $accnoHash - hash reference keyed by accession number
#
# The name of the first file is printed to STDERR.
#
# returns the accno hash again
#
# Dies if one of the files cannot be opened.
#####################################################################
sub GenomeParser::parseGreenGenesRNA {

  my ( $dbmaster, $file, $file2, $accnoHash ) = @_;

  print STDERR $file."\n";

  foreach my $path ( $file, $file2 ) {
    $accnoHash = GenomeParser::_addGreenGenesSequences( $path, $accnoHash );
  }

  return $accnoHash;

}

# Read one GreenGenes sequence file and copy sequence and prokMSA id of
# every record whose accession number is already known to $accnoHash.
# Returns $accnoHash.
sub GenomeParser::_addGreenGenesSequences {

  my ( $path, $accnoHash ) = @_;

  open( my $gg_fh, '<', $path ) or die "cannot open GreenGenes file $path: $!";

  my $wanttoaccess;    # accession of the current record, undef if not wanted
  my $proknr;
  my $sequence = '';

  while ( my $line = <$gg_fh> ) {

    if ( $line =~ /^>(\d+) (\w+)\./ ) {
      my ( $next_proknr, $next_accno ) = ( $1, $2 );

      # a new header finishes the previous record
      if ( defined( $wanttoaccess ) ) {
        $accnoHash->{ $wanttoaccess }->{ 'sequence' } = $sequence;
        $accnoHash->{ $wanttoaccess }->{ 'prokMSAid' } = $proknr;
      }

      if ( defined( $accnoHash->{ $next_accno } ) ) {
        $wanttoaccess = $next_accno;
        $proknr = $next_proknr;
      }
      else {
        $wanttoaccess = undef;
        $proknr = undef;
      }

      # always start the new record empty, also behind an unwanted record;
      # otherwise the unwanted sequence ends up in front of the next one
      $sequence = '';
    }
    else {
      chomp $line;
      $sequence .= $line;
    }
  }

  # the last record of the file has no following header
  if ( defined( $wanttoaccess ) ) {
    $accnoHash->{ $wanttoaccess }->{ 'sequence' } = $sequence;
    $accnoHash->{ $wanttoaccess }->{ 'prokMSAid' } = $proknr;
  }

  close $gg_fh;

  return $accnoHash;

}

#####################################################################
# Parse RDP information provided by Jim Cole
#
# Parameters:
#   $dbmaster          - handed through to parseGreenGenesRNA
#   $fileTS            - tab separated type strain list with the columns
#                        name, accno, seqid, name DOI, taxon DOI,
#                        exemplar DOI
#   $fileSN            - tab separated list with the columns name,
#                        name DOI, taxon DOI, exemplar DOI, strain,
#                        accno, seqid; reading stops at a line that
#                        contains "No sequence available"
#   $fileFS            - FASTA-like file, header lines ">(word) ..."
#   $fileGreenGenesRNA - first file for parseGreenGenesRNA
#   $fileAdded         - second file for parseGreenGenesRNA
#
# Entries of $fileTS get 'typestrain' 1, entries of $fileSN get
# 'typestrain' 0 and a 'strain'. The 'sequence' is looked up in
# $fileFS by seqid. If the third column of a $fileTS line does not
# start with "S" it is taken as accession number instead of a sequence
# id, and the sequence is taken from parseGreenGenesRNA if that finds
# one.
#
# returns a hash with species as key and a hash of the species
# information as the value
#
# Dies if one of the files cannot be opened.
#####################################################################
sub GenomeParser::parseRDPJimCole {

  my ( $dbmaster, $fileTS, $fileSN, $fileFS, $fileGreenGenesRNA, $fileAdded ) = @_;

  my $specieshash;

  my $rdpIdentHash;

  my $accnohash;

  #get the sequences with their identifier
  open( my $fs_fh, '<', $fileFS ) or die "cannot open RDP sequence file $fileFS: $!";

  my $rdpIdent;
  my $rdpSeq = '';
  while ( my $line = <$fs_fh> ) {
    if ( $line =~ /^>(\w+) .*/ ) {
      my $next_ident = $1;
      if ( defined( $rdpIdent ) ) {
        $rdpIdentHash->{ $rdpIdent } = $rdpSeq;
      }
      $rdpIdent = $next_ident;
      $rdpSeq = '';
    }
    else {
      chomp $line;
      $rdpSeq .= $line;
    }
  }

  # the last record has no following header line; without this it is lost
  if ( defined( $rdpIdent ) ) {
    $rdpIdentHash->{ $rdpIdent } = $rdpSeq;
  }

  close $fs_fh;

  # type strains
  open( my $ts_fh, '<', $fileTS ) or die "cannot open RDP type strain file $fileTS: $!";

  while ( my $line = <$ts_fh> ) {
    chomp $line;
    next if ( $line eq '' or $line eq ' ' );
    next if ( $line =~ /name\_doi/ );

    my ( $name, $accno, $seqid, $namedoi, $txdoi, $exdoi ) = split( /\t/, $line );

    # no RDP sequence id in the third column: the column holds the
    # accession number, the sequence has to come from GreenGenes
    if ( !defined( $seqid ) || $seqid !~ /^S/ ) {
      $accno = $seqid;
      $seqid = undef;
      if ( defined( $accno ) && $accno ne '' ) {
        $accnohash->{ $accno }->{ 'species' } = $name;
      }
    }

    my $seq;

    if ( defined( $seqid ) ) {
      $seq = $rdpIdentHash->{ $seqid };
    }

    my $entry = ( $specieshash->{ $name } ||= {} );
    $entry->{ 'species' }    = $name;
    $entry->{ 'accno' }      = $accno;
    $entry->{ 'seqid' }      = $seqid;
    $entry->{ 'sequence' }   = $seq;
    $entry->{ 'nameDOI' }    = $namedoi;
    $entry->{ 'taxDOI' }     = $txdoi;
    $entry->{ 'exDOI' }      = $exdoi;
    $entry->{ 'typestrain' } = 1;
  }

  close $ts_fh;

  # all other strains
  open( my $sn_fh, '<', $fileSN ) or die "cannot open RDP strain file $fileSN: $!";

  while ( my $line = <$sn_fh> ) {
    chomp $line;
    next if ( $line eq '' or $line eq ' ' );
    next if ( $line =~ /name\_doi/ );
    next if ( $line =~ /Surrogates/ );

    last if ( $line =~ /No sequence available/ );

    my ( $name, $namedoi, $txdoi, $exdoi, $strain, $accno, $seqid ) = split( /\t/, $line );

    my $seq;
    if ( defined( $seqid ) ) {
      $seq = $rdpIdentHash->{ $seqid };
    }

    my $entry = ( $specieshash->{ $name } ||= {} );
    $entry->{ 'species' }    = $name;
    $entry->{ 'accno' }      = $accno;
    $entry->{ 'seqid' }      = $seqid;
    $entry->{ 'sequence' }   = $seq;
    $entry->{ 'nameDOI' }    = $namedoi;
    $entry->{ 'taxDOI' }     = $txdoi;
    $entry->{ 'exDOI' }      = $exdoi;
    $entry->{ 'strain' }     = $strain;
    $entry->{ 'typestrain' } = 0;
  }

  close $sn_fh;

  $accnohash = GenomeParser::parseGreenGenesRNA( $dbmaster, $fileGreenGenesRNA, $fileAdded, $accnohash );

  # copy the sequences found by accession number to their species
  foreach my $accno ( keys %{ $accnohash || {} } ) {

    next if ( !defined( $accnohash->{ $accno }->{ 'sequence' } ) );

    my $name = $accnohash->{ $accno }->{ 'species' };
    next if ( !defined( $name ) );

    $specieshash->{ $name }->{ 'sequence' } = $accnohash->{ $accno }->{ 'sequence' };
  }

  return ( $specieshash );

}


#####################################################################
# Parse the tab separated DMSZ list.
#
# Lines containing "New Genus" are ignored completely. The first of
# the remaining lines supplies the column names, unless it is blank or
# contains "bactnam" - in that case no column names are read at all.
# Blank lines, lines containing "bactnam" and lines whose first column
# consists of capital letters only are skipped.
#
# Parameters:
#   $dbmaster - not used by this parser
#   $file     - path of the DMSZ file
#
# Returns a hash reference. Each value maps the column names to the
# columns of the line. If the ninth column is filled it is split at
# ", " (double quotes removed) and stored as array reference under
# 'strains'.
# The key is "<column 1> <column 2>"; for lines with a filled ninth
# column a non-empty third column is appended as well.
# NOTE(review): lines without strains do not get the third column in
# their key - this is how the parser has always behaved; confirm that
# it is intended.
#
# Dies if the file cannot be opened.
#####################################################################
sub GenomeParser::parseDMSZ {

  my ( $dbmaster, $file ) = @_;

  my $dmszhash;

  open( my $dmsz_fh, '<', $file ) or die "cannot open DMSZ file $file: $!";

  my @sections;

  my $counter = 0;

  while ( my $line = <$dmsz_fh> ) {

    # filter out Genus information
    next if ( $line =~ /New Genus/ );

    # blank and "bactnam" lines are counted, too
    $counter++;
    chomp $line;
    next if ( $line eq "" || $line eq " " );
    next if ( $line =~ /bactnam/ );

    if ( $counter == 1 ) {
      @sections = split( /\t/, $line );
      next;
    }

    my @stuff = split( /\t/, $line );

    # a line that consists of tabs only has no columns at all
    next if ( !defined( $stuff[0] ) );

    next if ( $stuff[0] =~ /^[A-Z]+$/ );

    my $has_strains = ( defined( $stuff[8] ) && $stuff[8] ne "" && $stuff[8] ne " " );

    my $gname = defined( $stuff[1] ) ? $stuff[0]." ".$stuff[1] : $stuff[0];
    if ( $has_strains && defined( $stuff[2] ) && $stuff[2] ne '' && $stuff[2] ne ' ' ) {
      $gname = $gname . " " .$stuff[2];
    }

    # one entry per column name; columns missing in the line are undef
    my $genome;
    foreach my $i ( 0 .. $#sections ) {
      $genome->{ $sections[ $i ] } = $stuff[ $i ];
    }

    if ( $has_strains ) {
      my $strainsstring = $stuff[8];
      $strainsstring =~ s/\"//g;
      $genome->{ 'strains' } = [ split( /, /, $strainsstring ) ];
    }

    $dmszhash->{ $gname } = $genome;
  }

  close $dmsz_fh;

  return ( $dmszhash );

}
  
#####################################################################
# Parse the file Qiong gave me
#
# The file is tab separated with the columns name, rank, taxon DOI and
# parent DOI. The header line ("name<TAB>rank...") and the root line
# ("Root<TAB>no rank...") are skipped, as are blank lines and lines
# without a rank column.
#
# Parameters:
#   $dbmaster - not used by this parser
#   $file     - path of the taxonomy file
#
# Returns a hash reference of the complete Bergeys taxonomy:
#   { <rank> => { <name> => { 'taxDOI' => ..., 'parentDOI' => ... } } }
# Returns undef if the file contains no taxon.
#
# Dies if the file cannot be opened.
#####################################################################
sub GenomeParser::getBergeysTaxonomy {

  my ( $dbmaster, $file ) = @_;

  my $taxhash;

  open( my $tax_fh, '<', $file ) or die "cannot open taxonomy file $file: $!";

  while ( my $line = <$tax_fh> ) {

    next if ( $line =~ /^name\trank/ );
    next if ( $line =~ /^Root\tno rank/ );

    chomp $line;

    my ( $name, $kind, $taxDOI, $parentDOI ) = split( /\t/, $line );

    # blank or truncated line: there is nothing to file the taxon under
    next if ( !defined( $name ) || !defined( $kind ) );

    $taxhash->{ $kind }->{ $name } = { 'taxDOI'    => $taxDOI,
                                       'parentDOI' => $parentDOI };

  }

  close $tax_fh;

  return $taxhash;

}

#####################################################################
# Parse the files coming from Ross' clustering
#
# All three files contain lines "<cluster><TAB><value>"; lines without
# a tab are ignored. If a line contains several tabs the last one
# separates cluster and value.
#
# Parameters:
#   $dbmaster    - not used by this parser
#   $clusterfile - cluster -> strain name, one line per member
#   $repfile     - cluster -> name of the representative
#   $scorefile   - cluster -> score
#
# returns a hash containing all the clusters, their scores and
# representatives:
#   { <cluster> => { 'strains'        => { <strain name> => 1, ... },
#                    'representative' => <strain name>,
#                    'score'          => <score> } }
#
# Dies if one of the files cannot be opened.
#####################################################################
sub GenomeParser::getClustersNew {

  my ( $dbmaster, $clusterfile, $repfile, $scorefile ) = @_;

  my $clusterhash;

  GenomeParser::_forEachTabbedPair( $clusterfile, sub {
    my ( $cnum, $sname ) = @_;
    $clusterhash->{ $cnum }->{ 'strains' }->{ $sname } = 1;
  } );

  GenomeParser::_forEachTabbedPair( $repfile, sub {
    my ( $cnum, $sname ) = @_;
    $clusterhash->{ $cnum }->{ 'representative' } = $sname;
  } );

  GenomeParser::_forEachTabbedPair( $scorefile, sub {
    my ( $cnum, $score ) = @_;
    $clusterhash->{ $cnum }->{ 'score' } = $score;
  } );

  return $clusterhash;
}

# Call $handler->( $left, $right ) for every line of $path that contains
# a tab. Dies with the system error if the file cannot be opened.
sub GenomeParser::_forEachTabbedPair {

  my ( $path, $handler ) = @_;

  open( my $pair_fh, '<', $path ) or die "cannot open $path: $!";

  while ( my $line = <$pair_fh> ) {
    # "." does not match the line ending, so the value comes without it
    if ( $line =~ /(.*)\t(.*)/ ) {
      $handler->( $1, $2 );
    }
  }

  close $pair_fh;

  return;
}

#####################################################################
# Parser for the NCBI data
#
# Every line is a tab separated lineage: taxid, domain, phylum and then
# a varying number of further ranks. Lines containing "environmental
# sample", "unclassified", "unidentified" or "uncultured" are skipped.
# The position of class, order, family, genus, species and strain text
# is guessed from the name endings found in the columns (see the
# decision tree below).
#
# If a strain text was found it is resolved with
# GenomeMapper::getGenusSpecies; lines for which no genus comes back
# are skipped.
#
# Parameters:
#   $dbmaster - handed through to GenomeMapper::getGenusSpecies
#   $file     - path of the NCBI file
#
# Returns a list of two hash references ( $NCBIhash, $NCBIstrainhash ).
# Both have the genome hash of a line as value; the first is keyed by
# the species name, the second by the strain name, as returned by
# gethashname.
#
# Dies if the file cannot be opened.
#####################################################################
sub GenomeParser::parseNCBI {

  my ( $dbmaster, $file ) = @_;

  my $NCBIhash;
  my $NCBIstrainhash;

  open( my $ncbi_fh, '<', $file ) or die "cannot open file $file: $!\n";

  while ( my $line = <$ncbi_fh> ) {
    chomp $line;

    next if ( $line =~ /environmental sample/ );
    next if ( $line =~ /unclassified/ );
    next if ( $line =~ /unidentified/ );
    next if ( $line =~ /uncultured/ );

    my @dat = split ( /\t/, $line );

    my $genome;
    my $textstrain;
    $genome->{ 'taxid' } = $dat[0];
    $genome->{ 'domain' } = $dat[1];
    $genome->{ 'phylum' } = $dat[2];

    # The tree is kept exactly as it has always been; the column numbers
    # depend on which intermediate ranks a lineage contains.
    if ( $line =~ /Rickettsieae/ ) {
      $genome->{ 'class' } = $dat[3];
      $genome->{ 'gorder' } = $dat[4];
      $genome->{ 'family' } = $dat[5];
      $genome->{ 'genus' } = $dat[7];
      $textstrain = $dat[9];
    }
    elsif ( $dat[3] =~ /\(class\)/ ) {
      $genome->{ 'class' } = $dat[3];
      if ( ( $dat[5] =~ /ales/ ) && !( $dat[5] =~ /incertae/ ) ) {
        $genome->{ 'gorder' } = $dat[5];
        if ( $dat[7] =~ /eae/ ) {
          $genome->{ 'family' } = $dat[7];
          $genome->{ 'genus' } = $dat[8];
          if ( defined( $dat[10] ) ) {
            $genome->{ 'species' } = $dat[9];
            $textstrain = $dat[10];
          }
          elsif ( defined( $dat[9] ) && $dat[9] =~/Mycobacterium/ ) {
            $genome->{ 'species' } = $dat[9];
          }
          else {
            $textstrain = $dat[8];
          }
        }
        elsif ( $dat[6] =~ /eae/ ) {
          # NOTE(review): the test looks at column 6 but column 5 (which
          # matched /ales/ above) is stored as family - confirm.
          $genome->{ 'family' } = $dat[5];
          $genome->{ 'genus' } = $dat[7];
          if ( defined( $dat[9] ) ) {
            $genome->{ 'species' } = $dat[8];
            $textstrain = $dat[9];
          }
          else {
            $textstrain = $dat[8];
          }
        }
      }
      elsif ( $dat[4] =~ /ales/ ) {
        $genome->{ 'gorder' } = $dat[4];
        if ( $dat[5] =~ /ceae/ ) {
          $genome->{ 'family' } = $dat[5];
          $genome->{ 'genus' } = $dat[6];
          if ( defined( $dat[8] ) ) {
            $genome->{ 'species' } = $dat[7];
            $textstrain = $dat[8];
          }
          else {
            $textstrain = $dat[7];
          }
        }
        elsif ( $dat[5] =~ /incertae/ ) {
          $genome->{ 'family' } = $dat[5];
          $genome->{ 'genus' } = $dat[6];
          if ( defined( $dat[8] ) ) {
            $genome->{ 'species' } = $dat[7];
            $textstrain = $dat[8];
          }
          else {
            $textstrain = $dat[7];
          }
        }
      }
    }
    elsif ( $dat[4] =~ /ales/ || $dat[4] =~ /aeles/ ) {
      $genome->{ 'gorder' } = $dat[4];
      $genome->{ 'class' } = $dat[3];
      if ( $dat[5] =~ /ceae/ ) {
        $genome->{ 'family' } = $dat[5];

        if ( $dat[6] =~ /group/ ) {
          $genome->{ 'genus' } = $dat[7];
          if ( defined( $dat[9] ) ) {
            $genome->{ 'species' } = $dat[8];
            $textstrain = $dat[9];
          }
          else {
            $textstrain = $dat[8];
          }
        }
        else {
          $genome->{ 'genus' } = $dat[6];
          if ( defined( $dat[8] ) ) {
            $genome->{ 'species' } = $dat[7];
            $textstrain = $dat[8];
          }
          else {
            $textstrain = $dat[7];
          }
        }
      }
      else {
        $genome->{ 'genus' } = $dat[5];
        if ( defined( $dat[7] ) ) {
          $genome->{ 'species' } = $dat[6];
          $textstrain = $dat[7];
        }
        else {
          $textstrain = $dat[6];
        }
      }
    }
    elsif ( $dat[3] =~ /ales/ ) {
      $genome->{ 'gorder' } = $dat[3];
      if ( $dat[4] =~ /ceae/ ) {
        $genome->{ 'family' } = $dat[4];
        $genome->{ 'genus' } = $dat[5];
        if ( defined( $dat[7] ) ) {
          $genome->{ 'species' } = $dat[6];
          $textstrain = $dat[7];
        }
        else {
          $textstrain = $dat[6];
        }
      }
      else {
        $genome->{ 'genus' } = $dat[4];
        if ( defined( $dat[6] ) ) {
          $genome->{ 'species' } = $dat[5];
          $textstrain = $dat[6];
        }
        else {
          $textstrain = $dat[5];
        }
      }
    }
    elsif ( $dat[5] =~ /ales/ ) {
      # NOTE(review): column 5 matched /ales/ but column 4 is stored as
      # order, and the else branch below reads genus/species from the
      # columns 4/5 as well - looks copied from the branch above; confirm.
      $genome->{ 'gorder' } = $dat[4];
      if ( $dat[6] =~ /ceae/ ) {
        $genome->{ 'family' } = $dat[6];
        $genome->{ 'genus' } = $dat[7];
        if ( defined( $dat[9] ) ) {
          $genome->{ 'species' } = $dat[8];
          $textstrain = $dat[9];
        }
        else {
          $textstrain = $dat[8];
        }
      }
      else {
        $genome->{ 'genus' } = $dat[4];
        if ( defined( $dat[6] ) ) {
          $genome->{ 'species' } = $dat[5];
          $textstrain = $dat[6];
        }
        else {
          $textstrain = $dat[5];
        }
      }
    }
    elsif ( $dat[3] =~ /etes/ ) {
      $genome->{ 'class' } = $dat[3];
      $genome->{ 'genus' } = $dat[4];
      if ( defined( $dat[6] ) ) {
        $genome->{ 'species' } = $dat[5];
        $textstrain = $dat[6];
      }
      else {
        $textstrain = $dat[5];
      }
    }
    else {
      #Specials
      if ( $line =~ /Verrucomicrobia/ ) {
        $genome->{ 'domain' } = $dat[1];
        # NOTE(review): stored under 'phyla', everywhere else the key is
        # 'phylum' - confirm which one the readers of this hash expect.
        $genome->{ 'phyla' } = $dat[3];
        $genome->{ 'class' } = $dat[4];
        $genome->{ 'genus' } = $dat[5];
        $textstrain = $dat[6];
      }
    }

    my $gname;
    my $sname;

    if ( defined( $textstrain ) ) {
      $textstrain =~ s/\'//g;

      my ( $speci, $gens, $strain, $specaff ) = GenomeMapper::getGenusSpecies( $dbmaster, $textstrain );

      # without a genus there is nothing to file the genome under
      next if ( !defined( $gens ) );

      $genome->{ 'specaff' } = $specaff;
      my $rspeci;
      if ( defined( $speci ) ) {
        $rspeci = $gens." ".$speci;
      }
      else {
        $rspeci = $gens;
      }
      # the species of the lineage is replaced by the resolved name
      $genome->{ 'species' } = $rspeci;

      ( $gname, $sname ) = gethashname( $gens, $rspeci, $strain );

      if ( defined( $strain ) && $strain ne "" && $strain ne " " ) {
        $genome->{ 'strain' } = $strain;
      }
    }
    else {
      ( $gname, $sname ) = gethashname( $genome->{ 'genus' }, $genome->{ 'species' } );
      if ( !defined( $genome->{ 'species' } ) ) {
        $genome->{ 'species' } = $genome->{ 'genus' };
      }
    }

    if ( defined( $gname ) ) {
      $NCBIhash->{ $gname } = $genome;
    }
    if ( defined( $sname ) ) {
      $NCBIstrainhash->{ $sname } = $genome;
    }
  }

  close $ncbi_fh;

  return ( $NCBIhash, $NCBIstrainhash );

}


#####################################################################
# Parser for the SEEDLinkData data
#
# The file is tab separated with the columns figid, taxid, seedname
# and ncbiname. Blank lines and lines without an ncbiname column are
# skipped.
#
# Parameters:
#   $dbmaster - not used by this parser
#   $file     - path of the SEED link file
#
# Returns a hash reference keyed by the ncbiname with all blanks
# removed; each value is a hash with the keys 'figid', 'taxid',
# 'seedname' and 'ncbiname'. Returns undef if no line was stored.
#
# Dies if the file cannot be opened.
#####################################################################

sub GenomeParser::parseSEEDLinkData {

  my( $dbmaster, $file ) = @_;

  my $SEEDstrainhash;

  open( my $seed_fh, '<', $file ) or die "cannot open file $file: $!\n";

  while ( my $line = <$seed_fh> ) {

    # without the chomp the line ending stays part of the last column
    # and therefore of the hash key
    chomp $line;
    next if ( $line =~ /^\s*$/ );

    my @data = split( /\t/, $line );

    # no name, no key
    next if ( !defined( $data[3] ) );

    my $genome = { 'figid'    => $data[0],
                   'taxid'    => $data[1],
                   'seedname' => $data[2],
                   'ncbiname' => $data[3] };

    ( my $nom = $data[3] ) =~ s/ //g;
    $SEEDstrainhash->{ $nom } = $genome;
  }

  close $seed_fh;

  return $SEEDstrainhash;

}


#####################################################################
# Parser for the GOLD data
#
# The file is tab separated. Lines naming one of the eukaryotic groups
# listed below are dropped; the first remaining line supplies the
# column names. Only records whose TYPE column is "Genome" are used.
#
# For each record the Organism column (with a parenthesised part
# behind the first word removed) is resolved with
# GenomeMapper::getGenusSpecies. Records for which no genus comes back
# are skipped. The record is extended by the keys
#   'species', 'genus', 'specaff' - as returned by getGenusSpecies
#   'strain'                      - the CULTURE column, else the STRAIN
#                                   column, else the strain returned by
#                                   getGenusSpecies (undef if empty)
#
# Parameters:
#   $dbmaster - handed through to GenomeMapper::getGenusSpecies
#   $file     - path of the GOLD file
#
# Returns a hash reference
#   { "<genus> <species>" => { 'strains' => [ <record>, ... ] } }
# (the key is the genus alone if no species was returned).
# Returns undef if no record was stored.
#
# Dies if the file cannot be opened.
#####################################################################
sub GenomeParser::parseGold {

  my( $dbmaster, $file ) = @_;

  my $Goldhash;

  open( my $gold_fh, '<', $file ) or die "cannot open file $file: $!\n";

  # drop all EUKS
  my $eukaryotes = qr/ARTHROPODA|FUNGI|PROTOZOA|PLANTS|CNIDARIA|CHORDATA|ANNELIDA|NEMATODES|PLATYHELMINTHES|MOLLUSKA|PLACOZOA|ECHINODERMATA/;

  my @headers;
  my $seen_header = 0;

  while ( my $line = <$gold_fh> ) {

    next if ( $line =~ $eukaryotes );

    # without the chomp the last column name and the last value of every
    # record keep their line ending
    chomp $line;

    if ( !$seen_header ) {
      @headers = split( /\t/, $line );
      $seen_header = 1;
      next;
    }

    # map the columns to their names, as far as both exist
    my @dat = split( /\t/, $line );
    my $last = $#dat < $#headers ? $#dat : $#headers;
    my $g;
    foreach my $i ( 0 .. $last ) {
      $g->{ $headers[ $i ] } = $dat[ $i ];
    }

    next if ( !defined( $g->{ 'TYPE' } ) || $g->{ 'TYPE' } ne 'Genome' );

    my $Organism = $g->{ 'Organism' };
    if ( defined( $Organism ) && $Organism =~ /(\w*) \(.+\)( .*)/ ) {
      $Organism = $1 . $2;
    }

    my ( $speci, $genus, $strain, $specaff ) = GenomeMapper::getGenusSpecies( $dbmaster, $Organism );

    # nothing to file the record under (parseNCBI skips these, too)
    next if ( !defined( $genus ) );

    $g->{ 'species' } = $speci;
    $g->{ 'genus' } = $genus;
    $g->{ 'specaff' } = $specaff;

    if ( defined( $g->{ 'CULTURE' } ) && ( $g->{ 'CULTURE' } ne '' ) ) {
      $strain = $g->{ 'CULTURE' };
    }
    elsif ( defined( $g->{ 'STRAIN' } ) && ( $g->{ 'STRAIN' } ne '' ) ) {
      $strain = $g->{ 'STRAIN' };
    }
    elsif ( defined( $strain ) && ( $strain eq '' ) ) {
      $strain = undef;
    }
    $g->{ 'strain' } = $strain;

    my $gname = defined( $speci ) ? $genus. " " . $speci : $genus;

    push @{ $Goldhash->{ $gname }->{ 'strains' } }, $g;

  }

  close $gold_fh;

  return ( $Goldhash );

}

#####################################################################
# Build the hash keys for a genome.
#
# Parameters:
#   $genus   - genus name, only used if $species is undefined
#   $species - species name (the callers pass "<genus> <species>")
#   $strain  - strain name, optional
#
# Returns the list ( $gname, $sname ):
#   $gname - $species (or $genus) with all blanks removed
#   $sname - the same followed by $strain, all blanks removed; equal to
#            $gname if there is no strain
# Names starting with "Leifsoniaxylixyli" are returned as
# "Leifsoniaxylisubsp.xyli" (plus strain).
# Returns ( undef, undef ) if neither $species nor $genus is defined.
#####################################################################
sub gethashname {

  my ( $genus, $species, $strain ) = @_;

  my $base = defined( $species ) ? $species : $genus;

  # nothing to build a name from
  return ( undef, undef ) if ( !defined( $base ) );

  my $has_strain = ( defined( $strain ) && $strain ne "" && $strain ne " " );

  my $gname = $base;
  my $sname = $has_strain ? $base ." ". $strain : $base;

  $gname =~ s/ //g;
  $sname =~ s/ //g;

  if ( $gname =~ /^Leifsoniaxylixyli/ ) {
    $gname = "Leifsoniaxylisubsp.xyli";

    # same treatment as for every other strain name: no strain means
    # no suffix, and blanks are removed
    my $suffix = defined( $strain ) ? $strain : '';
    $suffix =~ s/ //g;
    $sname = $gname . $suffix;
  }

  return ( $gname, $sname );

}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3