[Bio] / FigKernelScripts / get_genomes_from_ncbi.pl Repository:
ViewVC logotype

View of /FigKernelScripts/get_genomes_from_ncbi.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Tue Oct 20 18:42:27 2009 UTC (10 years, 1 month ago) by wilke
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
*** empty log message ***

#! /usr/bin/perl
use strict;
use warnings;
use LWP::Simple;
use Getopt::Long;
use Bio::DB::RefSeq;
use Bio::SeqIO;

# read in parameters
my $key         = '';
my $contig_id          = '';
my $destination = "/tmp/";
my $file_type   = "genbank";
my $tax_id      = "";
my $project     = "",
my $org_name    = "";
my $plasmid     = 0;
my $file        = '';

GetOptions ( 'key=s'         => \$key ,
	     'contigID=s'    => \$contig_id,
	     'destination=s' => \$destination,
	     'type=s'        => \$file_type,
	     'taxonomyID=s'  => \$tax_id,
	     'organism=s'    => \$org_name,
	     'plasmid'       => \$plasmid,
	     'file=s'        => \$file,
	     'project=s'     => \$project,
	   );



if ($tax_id){

  print "Output file = " , get_entries_for_tax_id($tax_id);

}
if ($project) {
  print "Output file = " , get_entries_for_project_id($project);
}
if($contig_id){

  $contig_id =~ s /\s*,\s*/,/g;
  my @ids = split "," , $contig_id ;

  # connection to NCBI
  my $db = new Bio::DB::RefSeq;

  # define output
  my $seqout = Bio::SeqIO->new(
			       -file => ">/tmp/out.gbff",
			       -format => 'Genbank',
			      );

  # get data for ids
  my $seqio = $db->get_Stream_by_id( \@ids );
  while( my $seq  =  $seqio->next_seq ) {
    print "seqid is ", $seq->id, "\n";
    print $seq->get_dates , "\t" , $seq->division , "\n";
    # write data to output file
    $seqout->write_seq($seq);
  }

  # most of the time RefSeq_ID eq RefSeq acc
  # my $seq = $db->get_Seq_by_id($contig_id); # RefSeq ID
  # print "accession is ", $seq->accession_number, "\n";



}





#
# Functions
#


sub get_entries_for_project_id{
  my ($project) = @_;
  

  my $url = "http://www.ncbi.nlm.nih.gov/sites/entrez?Db=genomeprj&Cmd=Retrieve&list_uids=";
  my $search_result = get($url.$project);
  
  my $url_seq_overview = "http://www.ncbi.nlm.nih.gov/";
  
  
  # print $search_result;
  
  my @lines = split ( "\n" , $search_result);
  
  my $nr_seq = 0;
  my $nr_proj = 0;
  my $url_seq = "";
  my $url_proj = "";
  my $genome_name = "";
  
  my $found_genome_information_table = 0;
  
  my $next = "";
  my $id_list  = "";
  foreach my $line ( @lines ){
    
    if ($line =~/Genome information:/){
      $found_genome_information_table = 1;
      next; # skip table line
    };

    $found_genome_information_table = 0 if ( $found_genome_information_table and $line =~ /<\/table>/);
    $id_list .= $line if ( $found_genome_information_table ); # collect id entries
    
    print $line , "\n"  if (  $found_genome_information_table );
 
    
  }
 
  my @ids;
  my @blocks = split "<\/tr>" , $id_list ; 
  foreach my $block (@blocks){
    my @local_ids = $block =~/([^>]+)<\/a><\/td>/gc ; 
    print join "\t" , @local_ids  , "\n";
    push @ids , $local_ids[0];
  }
   

  my $query = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=".join ("," , @ids) ."&rettype=gb" ;
  # print $query , "\n";
  my $file = get($query);
  
  # print $file;

  open(TMP , ">/tmp/$project.gbff") or die "Can't open /tmp/$project.gbff";
  print TMP $file ;
  close(TMP);
  
  my $result = check_contigs("/tmp/$project.gbff");

  return " /tmp/$project.gbff" , "\n";
}


sub get_entries_for_tax_id{
  my ($tax_id) = @_;
  
  my $url = "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=";
  my $search_result = get($url.$tax_id);
  
  my $url_seq_overview = "http://www.ncbi.nlm.nih.gov/";
  
  
  # print $search_result;
  
  my @lines = split ( "\n" , $search_result);
  
  my $nr_seq = 0;
  my $nr_proj = 0;
  my $url_seq = "";
  my $url_proj = "";
  my $genome_name = "";
  
  my $next = "";
  foreach my $line ( @lines ){
    
    if ( $next eq "Sequences"){
      ($url_seq)   = $line =~ m/href=\"([^\"]*)\"/;
      ($nr_seq) = $line =~ m/>(\d*)<\/font/;
      
      print STDERR "Genome Sequences: $nr_seq\n";
      $url_seq =~s/&amp;/&/g;
      # print "URL:\t$url_seq\n";
      
      $next = "";
    }
    elsif ( $next eq "Projects"){
      ($url_proj) = $line =~ m/href=\"([^\"]*)\"/;
      ($nr_proj) = $line =~ m/>(\d*)<\/font/;
      
      
      print STDERR "Genome Projects: $nr_proj \n";
	
      
      $next = "";
    }
    
    if ( $line =~ /<title>Taxonomy browser/ ){
      # print STDERR $line,"\n";
    }
    if ( $line =~ m/<title>Taxonomy browser/){
	# print STDERR $line,"\n";
      
    }
    if ( $line =~ /<title>Taxonomy browser\s*\(([^()]+)\)\<\/title\>/ ) {
      $genome_name = $1;
      print STDERR "Genome Name = $1\n";
    }
    
    if ($line =~ m/(Genome[\w;&]+Sequence)/){
      $next = "Sequences";
    } 
    elsif ($line =~ m/(Genome[\w;&]+Projects)/){
      $next = "Projects";
    }
    
  }
  
  
  my $page =  get($url_seq_overview.$url_seq);
  
  #print "\n $url_seq_overview$url_seq\n";
  
  my (@ids) = $page =~m/www.ncbi.nlm.nih.gov\/sites\/entrez\?Db=genome\&amp\;Cmd=ShowDetailView\&amp\;TermToSearch=\d+\">(\w+)<\/a>/gc;
  #<a href="http://www.ncbi.nlm.nih.gov/sites/entrez?Db=genome&amp;Cmd=ShowDetailView&amp;TermToSearch=19221">AC_000091</a>
  # print @ids , "\n";
  

  my $query = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=".join ("," , @ids) ."&rettype=gb" ;
  # print $query , "\n";
  my $file = get($query);
  
  # print $file;

  open(TMP , ">/tmp/$tax_id.gbff") or die "Can't open /tmp/$tax_id.gbff";
  print TMP $file ;
  close(TMP);
  
  my $result = check_contigs("/tmp/$tax_id.gbff");

  return " /tmp/$tax_id.gbff";
}


sub check_contigs{

  my ($file) = @_;

  my $seqio_object = Bio::SeqIO->new(
				     -file => $file ,
				    -format => "genbank",
				    );
  
  while ( my $seq = $seqio_object->next_seq ) {
  
    print    $seq->accession_number , "\n";
    print $seq->get_dates , "\t" , $seq->division , "\n";
    #    print $seq->seq , "\n";
  }

}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3