[Bio] / FigKernelScripts / find_SEED_SSU_rRNA_genes.pl Repository:
ViewVC logotype

View of /FigKernelScripts/find_SEED_SSU_rRNA_genes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (download) (as text) (annotate)
Thu Sep 19 19:09:49 2013 UTC (6 years, 1 month ago) by golsen
Branch: MAIN
CVS Tags: rast_rel_2014_0912, rast_rel_2014_0729, HEAD
Changes since 1.6: +6 -5 lines
Update the reference RNA module name.

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2013 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

my $usage = <<'End_of_Usage';
Identify the SSU rRNA genes in one or more SEED genomes.

Usage: find_SEED_SSU_rRNA_genes [options] [ genome_id ... ] > found

Output:

    genome_id \t type \t location \t sequence \t tag \t proposed_assignment

    where type is always rna.

Options:

    -a 'role'  #  Proposed assignment; empty string supresses the output field
    -b         #  Add a blank line between genomes (D = no blank)
    -c nt      #  Condense the sequence to first and last nt nucleotides (D = full seq)
    -e dist    #  Maximum nucleotides at end to extrapolate a match (D = 10)
    -f         #  Fasta output format
    -s         #  Do not show sequence
    -t 'tag'   #  A short tage to identify the nature of the feature; allows
               #    mixing of different types; empty string supresses the field

End_of_Usage

use strict;
use find_homologs;
use gjoseqlib;
use Data::Dumper;
use FIG;
my $fig = new FIG;

my $extrapolate;
my $assignment;
my $tag;
my @rna;

#  This is the module name. The package name in the module is RNA_reps. Other
#  modules use the same internal package structure.

use RNA_reps_SSU_rRNA;
@rna        = @RNA_reps::RNA_reps;
$assignment = $RNA_reps::assignment if $RNA_reps::assignment;
$tag        = $RNA_reps::tag        if $RNA_reps::tag;

ensure_defined( $extrapolate, 10 );
ensure_defined( $assignment, 'SSU rRNA ## 16S rRNA, small subunit ribosomal RNA' );
ensure_defined( $tag,        'SSU_rRNA' ); 

my $blank    = 0;
my $condense = 0;
my $fasta    = 0;

while ( @ARGV && $ARGV[0] =~ s/^-// )
{
    $_ = shift;
    if ( s/^a// ) { $assignment  = /\S/ ? $_ : shift; next }
    if ( s/^c// ) { $condense    = /\S/ ? $_ : shift; next }
    if ( s/^e// ) { $extrapolate = /\S/ ? $_ : shift; next }
    if ( s/^t// ) { $tag         = /\S/ ? $_ : shift; next }

    if ( s/b//  ) { $blank = ! $blank }
    if ( s/f//  ) { $fasta = ! $fasta }
    if ( m/\S/ )
    {
        print STDERR "Bad flag '$_'\n", $usage;
        exit;
    }
}

my @genomes = @ARGV;
if ( ! @genomes )
{
    @genomes = grep { $fig->is_prokaryotic($_) } $fig->genomes('complete');
}

my $pat;
$pat = qr/^(.{$condense})....+(.{$condense})$/o if $condense;

foreach my $genome ( @genomes )
{
    my $contig_file = "$FIG_Config::organisms/$genome/contigs";
    -f $contig_file or print STDERR "Could not find contig file $contig_file\n"
                    and exit;

    my $options = { description => $assignment,
                    extrapolate =>  10,
                    refseq      => \@rna
                  };

    my $instances = find_homologs::find_nucleotide_homologs( $contig_file, $options );

    my @instances = map  { $_->[2] }
                    sort { $a->[0] cmp $b->[0] || $a->[1] <=> $b->[1] }
                    map  { my $loc = $_->{ location };
                           my $seq = $_->{ sequence };
                           my ( $c, $b, $e ) = $loc =~ /^(.*)_(\d+)_(\d+)$/;
                           $seq =~ s/$pat/$1...$2/ if $condense;
                           [ $c, $b+$e, [ $genome, 'rna', $loc, lc $seq ] ]
                         }
                    @$instances;

    foreach ( @instances )
    {
        if ( ! $fasta )
        {
            print join( "\t", @$_,
                              ( $tag        ? $tag        : () ),
                              ( $assignment ? $assignment : () )
                      ), "\n"
        }
        else
        {
            # >genome:location assignment
            gjoseqlib::print_seq_as_fasta( "$_->[0]:$_->[2]", $assignment, $_->[3] );
        }
    }

    print "\n" if @instances && $blank;  #  Blank line between genomes
}

exit;

sub ensure_defined { $_[0] = $_[1] if ! defined $_[0] }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3