[Bio] / FigKernelPackages / AlignsAndTrees.pm Repository:
ViewVC logotype

View of /FigKernelPackages/AlignsAndTrees.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Tue Mar 22 23:17:33 2011 UTC (8 years, 10 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_04082011, mgrast_dev_03252011, mgrast_release_3_0_4, mgrast_release_3_0_3, mgrast_dev_03312011, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, mgrast_dev_04052011
Changes since 1.3: +5 -5 lines
debugged

#
# Copyright (c) 2003-2010 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

package AlignsAndTrees;

#===============================================================================
#  perl functions for loading and accessing Alignments and Trees based on md5
#
#  Usage:  use AlignsAndTrees;
#
#    @alignIDs = all_alignIDs();
#   \@alignIDs = all_alignIDs();
#
#    @alignIDs = aligns_with_md5ID( $md5 );
#   \@alignIDs = aligns_with_md5ID( $md5 );
#
#    @md5IDs   = md5IDs_in_align( $alignID );
#   \@md5IDs   = md5IDs_in_align( $alignID );
#
#   \@seqs               = md5_alignment_by_ID( $alignID );
# ( \@seqs, \%metadata ) = md5_alignment_by_ID( $alignID );
#   \%metadata           = md5_alignment_metadata( $alignID );
#
#       $metadata{ $md5 } = [ $peg_length, $trim_beg, $trim_end, $location_string ]
#
#    @treeIDs  = all_treeIDs( );
#   \@treeIDs  = all_treeIDs( );
#
#    @treeIDs  = trees_with_md5ID( $md5 );
#   \@treeIDs  = trees_with_md5ID( $md5 );
#
#    @md5IDs   = md5IDs_in_tree( $treeID );
#   \@md5IDs   = md5IDs_in_tree( $treeID );
#
#    $tree     = md5_tree_by_ID( $treeID );
#
#===============================================================================

use strict;
use gjoseqlib    qw( read_fasta );
use gjonewicklib qw( read_newick_tree );

my $data_dir = $ENV{ ATNG } && -d $ENV{ ATNG } ? $ENV{ ATNG } :
               -d '/home/fangfang/ATNG'        ? '/home/fangfang/ATNG' :
                                                 '.';

my $align_tree_data = "$data_dir/md5IDs_in_align_and_tree.tab";

#-------------------------------------------------------------------------------
#
#    @alignIDs = all_alignIDs();
#   \@alignIDs = all_alignIDs();
#
#-------------------------------------------------------------------------------
sub all_alignIDs
{
    -f $align_tree_data
        or die "all_alignIDs() could not locate alignment and tree data file '$align_tree_data'.\n";
    my @ids = map { chomp; $_ } `cut -f1 < '$align_tree_data' | sort -u`;
    wantarray ? @ids : \@ids;
}

#-------------------------------------------------------------------------------
#
#    @alignIDs = aligns_with_md5ID( $md5 );
#   \@alignIDs = aligns_with_md5ID( $md5 );
#
#-------------------------------------------------------------------------------
sub aligns_with_md5ID
{
    my ( $md5 ) = @_;
    $md5
        or print STDERR "aligns_with_md5ID() called with invalid md5 '$md5'\n"
            and return wantarray ? () : [];
    -f $align_tree_data
        or die "aligns_with_md5ID() could not locate alignment and tree data file '$align_tree_data'.\n";
    my @ids = map { chomp; $_ } `grep -w '$md5' < '$align_tree_data' | cut -f1 | sort -u`;
    wantarray ? @ids : \@ids;
}

#-------------------------------------------------------------------------------
#
#    @md5IDs = md5IDs_in_align( $alignID );
#   \@md5IDs = md5IDs_in_align( $alignID );
#
#-------------------------------------------------------------------------------
sub md5IDs_in_align
{
    my ( $alignID ) = @_;
    $alignID
        or print STDERR "md5IDs_in_align() called with invalid alignment ID '$alignID'\n"
            and return wantarray ? () : [];
    -f $align_tree_data
        or die "md5IDs_in_align() could not locate alignment and tree data file '$align_tree_data'.\n";
    my @md5IDs = map { chomp; $_ } `grep -w '^$alignID' < '$align_tree_data' | cut -f2 | sort -u`;
    wantarray ? @md5IDs : \@md5IDs;
}

#-------------------------------------------------------------------------------
#
#   \@seqs               = md5_alignment_by_ID( $alignID );
# ( \@seqs, \%metadata ) = md5_alignment_by_ID( $alignID );
#           \%metadata   = md5_alignment_metadata( $alignID );
#
#       $metadata{ $md5 } = [ $peg_length, $trim_beg, $trim_end, $location_string ]
#
#-------------------------------------------------------------------------------
sub md5_alignment_by_ID
{
    my ( $alignID ) = @_;
    my $file = "$data_dir/ali$alignID.fa";
    -f $file
        or print STDERR "Could not locate data for alignment '$alignID':\n$file\n\n"
            and return wantarray ? () : undef;

    my @align = map { $_->[1] = ''; $_ } gjoseqlib::read_fasta( $file );
    
    wantarray ? ( \@align, md5_alignment_metadata( $alignID ) ) : \@align;
}

sub md5_alignment_metadata
{
    my ( $alignID ) = @_;
    $alignID
        or print STDERR "md5_alignment_metadata() called with invalid alignment ID '$alignID'\n"
            and return {};
    -f $align_tree_data
        or die "md5IDs_in_align() could not locate alignment and tree data file '$align_tree_data'.\n";
    my %metadata = map { chomp;
                         my ( undef, $md5, @data ) = split /\t/;
                         ( $md5 => \@data );
                       }
                  `grep -w '^$alignID' < '$align_tree_data'`;
    \%metadata;
}


#-------------------------------------------------------------------------------
#
#    @treeIDs = all_treeIDs( );
#   \@treeIDs = all_treeIDs( );
#
#-------------------------------------------------------------------------------
sub all_treeIDs
{
    -f $align_tree_data
        or die "all_treeIDs() could not locate alignment and tree data file '$align_tree_data'.\n";
    my @ids = map { chomp; $_ } `cut -f1 < '$align_tree_data' | sort -u`;
    wantarray ? @ids : \@ids;
}

#-------------------------------------------------------------------------------
#
#    @treeIDs = trees_with_md5ID( $md5 );
#   \@treeIDs = trees_with_md5ID( $md5 );
#
#-------------------------------------------------------------------------------
sub trees_with_md5ID
{
    my ( $md5 ) = @_;
    return wantarray ? () : [] if ! $md5;
    -f $align_tree_data
        or die "trees_with_md5ID() could not locate alignment and tree data file '$align_tree_data'.\n";
    my @ids = map { chomp; $_ } `grep -w '$md5' < '$align_tree_data' | cut -f1 | sort -u`;
    wantarray ? @ids : \@ids;
}

#-------------------------------------------------------------------------------
#
#    @md5IDs = md5IDs_in_tree( $treeID );
#   \@md5IDs = md5IDs_in_tree( $treeID );
#
#-------------------------------------------------------------------------------
sub md5IDs_in_tree
{
    my ( $treeID ) = @_;
    $treeID
        or print STDERR "md5IDs_in_tree() called with invalid tree ID '$treeID'\n"
            and return wantarray ? () : [];
    -f $align_tree_data
        or die "md5IDs_in_tree() could not locate alignment and tree data file '$align_tree_data'.\n";
    my @md5IDs = map { chomp; $_ } `grep -w '^$treeID' < '$align_tree_data' | cut -f2 | sort -u`;
    wantarray ? @md5IDs : \@md5IDs;
}

#-------------------------------------------------------------------------------
#
#    $tree = md5_tree_by_ID( $treeID );
#
#-------------------------------------------------------------------------------
sub md5_tree_by_ID
{
    my ( $treeID ) = @_;
    my $file = "$data_dir/tree$treeID.nwk";
    -f $file
        or print STDERR "Could not locate data for alignment '$treeID':\n$file\n\n"
            and return wantarray ? () : [];
    gjonewicklib::read_newick_tree( $file );
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3