[Bio] / FigKernelScripts / jgi_data_merge.pl Repository:
ViewVC logotype

View of /FigKernelScripts/jgi_data_merge.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Mon Feb 7 23:03:27 2011 UTC (9 years ago) by golsen
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, mgrast_release_3_0, mgrast_dev_03252011, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.1: +6 -5 lines
Fix the taxonomy to use NCBI full taxonomy if TAXONOMY does not exist.

#!/usr/bin/env perl -w
#
#   jgi_data_merge  directory  > GenBank
#
#   Build a GenBank file using gff, etc. in a partially built SEED
#   genome directory.
#
use strict;
use gjogff;
use gjoseqlib;
use gjogenbank;
use NCBI_genetic_code;
use NCBI_taxonomy;
use Data::Dumper;

my $usage = <<'End_of_Usage';

jgi_data_merge  directory  > GenBank

End_of_Usage


my $dir = @ARGV ? shift : '.';
-d $dir
    or print STDERR "Could not locate rectified JGI directory '$dir'.\n", $usage
        and exit;

chdir $dir;

#  Possible GenBank files for adding at end:

my %gbk = map { chomp; /\.gbk$/ ? ( $_ => -s $_ ) : () } `ls`;

#  Get contig ids to analyze gff file contigs:

( -f 'contigs' && open( CONTIGS, "<contigs" ) )
    or print STDERR "Could find or open 'contigs'.\n"
        and exit;

my %contig_ids;
while ( defined( $_ = gjoseqlib::read_next_fasta_seq( \*CONTIGS ) ) )
{
    $contig_ids{ $_->[0] } = length( $_->[2] );
}
# print STDERR scalar keys %contig_ids, " contig ids read.\n"; exit;

#  Rewind for reading the sequences later
seek( CONTIGS, 0, 0 );

keys %contig_ids
    or print STDERR "Failed to read any contig ids.\n"
        and exit;

my $gff_opts = { contig_ids => \%contig_ids };

my $gff_file = -f 'gff' ? 'gff' : -f 'gtf' ? 'gtf' : '';
$gff_file
    or print STDERR "Could not find a gff or gtf file.\n"
        and exit;

my $gff_ftrs = gjogff::read_gff( $gff_file, $gff_opts );

open( GENOME, "<GENOME" )
    or print STDERR "Could not open 'GENOME'.\n"
        and exit;
my $genome = <GENOME>;
close( GENOME );
chomp $genome;

my $organism = $genome;

#  Taxonomy id is optional:

my $taxid;
if ( -f 'TAXONOMY_ID' && open( TAXID, "<TAXONOMY_ID" ) )
{
    $taxid = <TAXID>;
    close( TAXID );
    chomp $taxid;
    $taxid = int( $taxid );
}

#  Taxonomy can come from a file, or from NCBI:

my $taxonomy;
if ( open( TAXONOMY, "<TAXONOMY" ) )
{
    $taxonomy = <TAXONOMY>;
    close( TAXONOMY );
    chomp $taxonomy;
    $taxonomy =~ s/\.$//;
}
elsif ( $taxid )
{
    my $data = NCBI_taxonomy::taxonomy( $taxid, { hash => 1 } );
    ( $taxonomy ) = @{ $data->{ Lineage } || [] };
    $taxonomy .= "; $genome"  if $taxonomy;
}

if ( ! $taxonomy )
{
    print STDERR "Could not open 'TAXONOMY' or get if from NCBI.\n";
    exit;
}

$taxonomy =~ s/^cellular organisms; //i;

#  Get assigned roles:

open( KOG, "<KOG.tab" )
    or print STDERR "Could not open 'KOG.tab'.\n"
        and exit;
my @KOG = map { chomp; [ split /\t/ ] } <KOG>;
close( KOG );
my $kogdef = { map { $_->[1] => $_->[3] } @KOG };

my %gen_code;
my %replicon;
if ( -f 'contig.properties' && open( PROP, "<contig.properties" ) )
{
    while ( <PROP> )
    {
        chomp;
        my ( $c, $type, $val ) = split /\t/;
        next unless $type && $val;
        if ( $type eq 'genetic_code' )
        {
            $gen_code{ $c } = $val;
        }
        elsif ( $type eq 'replicon' )
        {
            $val = 'chloroplast'    if $val =~ m/^chloroplast/i;
            $val = 'mitochondrion'  if $val =~ m/^mitochondri/i;
            $replicon{ $c } = $val;
        }
    }
}

#  Read the DNA and write each LOCUS:

my $contig;
while ( defined( $contig = gjoseqlib::read_next_fasta_seq( \*CONTIGS ) ) )
{
    my $contig_id = $contig->[0];
    my $len = length( $contig->[2] );

    my $code_num = $gen_code{ $contig_id } || 11;
    my $code;
    $code = $NCBI_genetic_code::genetic_code{ $code_num } if $code_num != 1 && $code_num != 11;

    my $src_qual = { organism     => [ $organism ],
                     genetic_code => [ $code_num ],
                   };
    $src_qual->{ mol_type }  = [ 'genomic DNA' ];
    $src_qual->{ organelle } = [ $replicon{ $contig_id } ] if $replicon{ $contig_id };
    $src_qual->{ db_xref }   = [ "taxon:$taxid" ]       if $taxid;

    my @gb_ftrs = ( [ 'source', "1..$len", $src_qual ] );

    my $dnadex = { $contig_id => $contig };    #  This is stubbed in to deal with translations
    push @gb_ftrs, map { gff_to_gb_ftr( $_, $dnadex, $kogdef, $code ) }
                   $gff_ftrs->{ $contig_id } ? @{ $gff_ftrs->{ $contig_id } } : ();

    #  Build GenBank entries for each contig.  Start with generic template.

    my %entry = ( DEFINITION => $contig_id,
                  ACCESSION  => $contig_id,
                  VERSION    => $contig_id,
                  SOURCE     => $organism,
                  ORGANISM   => $organism,
                  TAXONOMY   => [ split /; +/, $taxonomy ],
                  SEQUENCE   => $contig->[2],
                  locus_id   => $contig_id,
                  length     => $len,
                  mol_type   => 'dsDNA',
                  ftr_list   => \@gb_ftrs
                );

    write_genbank( \%entry );
}
close( CONTIGS );

#  Concatenate any exiting GenBank files (e.g., organelles).
#  We compare the size at the start of the run to the current size to
#  make sure that we are not looking at our output file.

foreach ( sort keys %gbk )
{
    if ( ( $gbk{$_} == -s $_ ) && open( GENBANK, "<$_" ) )
    {
        print STDERR "Appending existing GenBank file '$_'\n";
        while ( <GENBANK> ) { print $_ }
        close( GENBANK );
    }
}

exit;

#===============================================================================
#  Subroutines below:
#===============================================================================
#  Features are:
#
#     [ $type, $loc, $id, $name, \%keys_values ]
#
#     Location is Sapling style contig_begĀ±length
#
sub gff_to_gb_ftr
{
    my ( $gff_ftr, $dnadex, $product, $code ) = @_;
    my ( $type, $loc, $id, $name, $attrib ) = @$gff_ftr;
    return () if $type eq 'exon';

    my @cbdl = map { [ /^(\S+)_(\d+)([+-])(\d+)$/ ] } split /,/, $loc;
    my $gb_loc = gjogenbank::cbdl_2_genbank( \@cbdl );

    delete $attrib->{ exonNumber } if $attrib->{ exonNumber };
    delete $attrib->{ exon_id }    if $attrib->{ exon_id };

    $attrib->{ translation } ||= [ CDS_translation( $loc, $dnadex, $code ) ] if $type eq 'CDS';

    $attrib->{ locus_tag } ||= [ $id ] if $id;

    my $pid = $attrib->{ protein_id    } ? $attrib->{ protein_id    }->[0] : '';
    my $tid = $attrib->{ transcript_id } ? $attrib->{ transcript_id }->[0] : '';
    my $prod = $product->{ $id }  || $product->{ $name }
            || $product->{ $pid } || $product->{ $tid }
            || '';

    $attrib->{ product } ||= [ $prod ] if $prod;

    [ $type, $gb_loc, $attrib ]
}


sub CDS_translation
{
    my ( $loc, $dnadex, $code ) = @_;
    my $okay;
    my @loc1 = map { my ( $c, $b, $dir, $len ) = /^(.+)_(\d+)([-+])(\d+)$/;
                     $okay = $b && $dir && $len && $c && $dnadex->{ $c };
                     $okay or print STDERR "Bad location description in '$_'.\n";
                     $okay ? [ $c, $b, $dir, $len ] : ();
                   }
               split /,/, $loc;

    return undef if ! @loc1;

    my @seqs = map { my ( $c, $b, $dir, $len ) = @$_;
                     my $e = $b + ($dir eq '+' ? $len-1 : -($len-1) );
                     my $subseq = gjoseqlib::DNA_subseq( \$dnadex->{$c}->[2], $b, $e );
                     ( $len > 1 || $dir eq '+' ) ? $subseq : gjoseqlib::complement_DNA_seq( $subseq );
                   }
               @loc1;

    my $seq = join( '', @seqs );

    #  make sure it is multiple of 3 nucleotides:
    $seq = substr( $seq, 0, 3*(int(length($seq)/3)) ) if length( $seq ) % 3;

    my $pep = $code ? gjoseqlib::translate_seq_with_user_code( $seq, $code, 1 )
                    : gjoseqlib::translate_seq( $seq, 1 );

    $pep =~ s/\*$//;

    $pep;
}




MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3