[Bio] / FigKernelScripts / update_lineage.pl Repository:
ViewVC logotype

View of /FigKernelScripts/update_lineage.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Fri Dec 21 21:23:09 2007 UTC (12 years, 5 months ago) by wilke
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.2: +187 -21 lines
draft version to update FIG taxonomy entries

use FIG;
use strict;
use warnings;

use vars qw( $opt_d $opt_g $opt_t $opt_f $opt_s $opt_u $opt_v);
use Getopt::Std;


getopts('d:go:f:s:uvt:');

my $success;
my $usage = "get_lineage (-g | -f FILE ) [-s skip_n_first_lines]  -d DIR\n";

unless ($opt_d and -d $opt_d) {
  print $usage;
  exit;
}

my $org_dir = $opt_d;

my @list;  # organism list

my $fig = new FIG;

my $counter = 0;

if ( $opt_g ){
  
  my @genomes;

  if ($opt_t){
    push @genomes , $opt_t ;
  }
  else{
    @genomes = $fig->genomes;
  }
  
  print scalar @genomes , " genomes are currently in SEED\n";
  
  foreach my $genome (@genomes) {
    
    my $overview=request_ncbi( $org_dir , $genome);
   
  }
}

if ($opt_f and -f $opt_f ){
  
  open(FILE , $opt_f) or die "Can't open $opt_f for reading!\n";
  
  if ( $opt_s ){
    my $skip = $opt_s;
    while ( $skip ){
      my $line = <FILE>;
      $skip-- ;
    }
  }
  
  
  while (my $line = <FILE>){
    chomp $line;
    my @fields = split "\t" , $line;

    my $overview; 
    if (scalar @fields == 3){
      $overview = { 
		   taxonomy_id => $fields[1],
		   seed_id     => $fields[0],
		   lineage     => $fields[2],
		  }; 
    }
    elsif (scalar @fields == 5) {
      $overview = { 
		   taxonomy_id => $fields[1],
		   seed_id     => $fields[0],
		   lineage     => $fields[2],
		   seed_org    => $fields[3],
		   ncbi_org    => $fields[4],
		  }; 
    }
    else{
     #  print STDERR "Wrong table format,\n";
#       print STDERR scalar @fields , " fields in file,\n";
#       print STDERR "either 3 or 5 columns required\n";
#       print STDERR $line , "\n";
      my $tax_id = get_taxonomy_id($org_dir , $fields[0]);
   
      $overview  = $fig->get_organism_info_from_ncbi($tax_id);
      #$overview->{ taxonomy_id } = $fields[1] unless $overview->{ taxonomy_id };
      $overview->{ seed_org } = $fig->genus_species($fields[0] ) ;
      $overview->{ ncbi_org } = $overview->{ scientific_name };
      unless ( $overview->{ lineage } ){
	print STDERR "Still a problem with\t" , $line , "\n";
	next;
      }
    }
    push @list , $overview;
  }

  foreach my $entry ( @list ){
    #  my $overview = request_ncbi( $org_dir , $entry->{ seed_id } );
  }
  
}

if ( $opt_v ){
  _statistic( $fig , $org_dir , \@list )
}


if ($opt_u){

  foreach my $entry ( @list ){
    update_organism_dir( $fig , $org_dir , $entry->{ seed_id } , $entry);
  }

}

sub _statistic{
  my ( $fig , $dir , $org , $list) = @_;
  my $statistic;
  
  my $missing;
  foreach my $entry (@$list){

   

    # get seed words

    my @seed_words = split " " , $entry->{ seed_org };

    foreach my $word ( split " " , $entry->{ ncbi_org } ) {
      print join " " , @seed_words , "\n";
      for ( my $i=0 ; $i< scalar @seed_words;  $i++ ) {
	delete $seed_words[$i] if ( $word eq $seed_words[$i])
      }
      print join " " , @seed_words , "\n";
    }
    
    
    
  }
  
}

sub update_organism_dir{
  my ( $fig , $dir , $org , $overview) = @_;
  
  return 0 unless (-d "$dir/$org");
  
  return 0 unless ( $org );
  # update TAXONOMY file
  #  1) write to log
  open(LOG ,">>$dir/$org/.change_log" ) || die "could not open $dir/$org/.change_log";
  open(TMP,"$dir/$org/TAXONOMY") || die "could not open $dir/$org/TAXONOMY";
 
  my $line = <TMP>;
  chomp $line;
  print LOG  $line , "\t" , time , "\n";
  
  close(TMP);
  close(LOG);

  # 2) update entries
  open(TMP,">$dir/$org/TAXONOMY") || die "could not open $dir/$org/TAXONOMY";
  print TMP $overview->{lineage}."; ".$fig->genus_species($org)."\n";
  close(TMP);

  open(TMP,">$dir/$org/TAXONOMY_ID") || die "could not open $dir/$org/TAXONOMY";
  print TMP $overview->{taxonomy_id}."\n";
  close(TMP);

  chmod(0777,"$dir/$org/TAXONOMY_ID");

  #update database entry

  my $old_lineage = $fig->taxonomy_of($org);
  
  return 1;
}


sub get_taxonomy_id{
   my ($dir , $org) = @_;

   my $file = "$dir/$org/TAXONOMY_ID";

   my $tax_id;
   
   if (-f $file){
     open(TAX, $file) || die "could not open $file";
     $tax_id = <TAX>;
     chomp $tax_id;
     close(TAX);
   }
   else{
     ( $tax_id ) = $org =~ /(\d+)\.\d+/ ;
     unless ( $tax_id ){
       print STDERR "No taxonomy id for $org\n";
     }
   }

   return $tax_id;
}


sub request_ncbi{
  my ($org_dir , $genome) = @_;
  
  my $tax_id = get_taxonomy_id($org_dir , $genome);
  #print STDERR "Tax id is $tax_id\n";
  my ( $overview ) = $fig->get_organism_info_from_ncbi($tax_id);
  my $seed_lineage = $overview->{lineage} . "; ".$fig->genus_species($genome);
  
  print $genome , "\t";
  print "$tax_id\t$seed_lineage" ;
  if ($fig->genus_species($genome) eq $overview->{scientific_name} ){
    print "\t\t\n" ;
  }
  else{
    print "\t".$fig->genus_species($genome)."\t".$overview->{scientific_name}."\n" ;
    print STDERR $genome , "\t$tax_id\t$seed_lineage" ;
    print STDERR "\t".$fig->genus_species($genome)."\t".$overview->{scientific_name}."\n" ;
  }
  
  #$success= update_organism_dir( $fig , $dir, $genome , $overview);
  #{ exit } unless $success;
  
  $counter ++;
  #exit if ($counter > 10);
  sleep 5 ;
  
  return $overview;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3