[Bio] / FigKernelScripts / sparse_genome_sample.pl Repository:
ViewVC logotype

View of /FigKernelScripts/sparse_genome_sample.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Fri Jan 11 23:58:39 2013 UTC (6 years, 10 months ago) by golsen
Branch: MAIN
CVS Tags: rast_rel_2014_0729, rast_rel_2014_0912, HEAD
A script to provide a set of diverse genomes (< 90% identical) that are
similar (>50% identical) to a aminoacyl tRNA synthetases in a reference
genome.

#!/usr/bin/env perl -w
#
#  sparse_genome_sample [options] genome_id > sparse_genome_ids_and_genome_names
#
use strict;
use gjoseqlib;
use SeedUtils;   # genome_of()
use SAPserver;
use ALITREserver;
use AlignsAndTreesServer;
use Data::Dumper;

my $sapO = SAPserver->new()    or die 'Could not get a SAPserver.';
my $alnO = ALITREserver->new() or die 'Could not get an ALITREserver.';

my $usage = <<'End_of_Usage';

Usage:  sparse_genome_sample  [Options]  genome_id > sparse_genome_ids

Output is one per line: genome_id tab genome_name

Options:

    -min identity  # minimum identity to reference genome sequence (D = 0.50)
    -max identity  # maximum identity between diverse sequences (D = 0.90)

End_of_Usage


my $max_id  = 0.90;
my $min_id  = 0.50;
while ( @ARGV && $ARGV[0] =~ s/^--?// )
{
    local $_ = shift;
    if ( s/^max// ) { $max_id = /./ ? $_ : shift; next }
    if ( s/^min// ) { $min_id = /./ ? $_ : shift; next }

    if ( m/./ )
    {
        print STDERR "Bad flag(s) '$_'\n", $usage;
        exit;
    }
}


my $gid0;

( @ARGV && $ARGV[0]
    && ( ( $gid0 ) = $ARGV[0] =~ /^(?:fig\|)?(\d+\.\d+)$/ )
    && $gid0
    )
    or print STDERR "Missing reference genome.\n", $usage
        and exit;


my %AARS_alns = ( Ala   => [ qw( 00004658 00003916 00002337 00010970 ) ],
                  Arg   => [ qw( 00003884 ) ],
                  Asp   => [ qw( 00004264 00001622 00023632 00023460 ) ],
                  Cys   => [ qw( 00015530 00000473 00003882 ) ],
                  Glu   => [ qw( 00004397 00000129 00004993 00004327 ) ],
                  His   => [ qw( 00015178 00002192 ) ],
                  Ile   => [ qw( 00000021 00003881 ) ],
                  Leu   => [ qw( 00000582 00003900 ) ],
                  Met   => [ qw( 00023409 00014224 00015624 00002670 ) ],
                  PheA  => [ qw( 00001490 ) ],
                  PheB  => [ qw( 00003901 00006585 00021118 ) ],
                  ProAE => [ qw( 00016196 00019605 00023512 ) ],
                  ProB  => [ qw( 00000177 00007027 ) ],
                  Ser   => [ qw( 00000789 00003904 ) ],
                  SerA  => [ qw( 00006634 ) ],
                  Thr   => [ qw( 00019605 00023512 00001494 00014693 ) ],
                  Trp   => [ qw( 00002945 00018907 ) ],
                  Tyr   => [ qw( 00018443 00003874 00001422 00018907 ) ],
                  Val   => [ qw( 00003719 ) ]
                );

my %role = ( Ala   => 'Alanyl-tRNA synthetase (EC 6.1.1.7)',
             Arg   => 'Arginyl-tRNA synthetase (EC 6.1.1.19)',
             Asp   => 'Aspartyl-tRNA synthetase (EC 6.1.1.12)',
             Cys   => 'Cysteinyl-tRNA synthetase (EC 6.1.1.16)',
             Glu   => 'Glutamyl-tRNA synthetase (EC 6.1.1.17)',
             His   => 'Histidyl-tRNA synthetase (EC 6.1.1.21)',
             Ile   => 'Isoleucyl-tRNA synthetase (EC 6.1.1.5)',
             Leu   => 'Leucyl-tRNA synthetase (EC 6.1.1.4)',
             Met   => 'Methionyl-tRNA synthetase (EC 6.1.1.10)',
             PheA  => 'Phenylalanyl-tRNA synthetase alpha chain (EC 6.1.1.20)',
             PheB  => 'Phenylalanyl-tRNA synthetase beta chain (EC 6.1.1.20)',
             ProAE => 'Prolyl-tRNA synthetase (EC 6.1.1.15), archaeal/eukaryal type',
             ProB  => 'Prolyl-tRNA synthetase (EC 6.1.1.15), bacterial type',
             Ser   => 'Seryl-tRNA synthetase (EC 6.1.1.11)',
             SerA  => 'Seryl-tRNA synthetase (EC 6.1.1.11), archaeal',
             Thr   => 'Threonyl-tRNA synthetase (EC 6.1.1.3)',
             Trp   => 'Tryptophanyl-tRNA synthetase (EC 6.1.1.2)',
             Tyr   => 'Tyrosyl-tRNA synthetase (EC 6.1.1.1)',
             Val   => 'Valyl-tRNA synthetase (EC 6.1.1.9)'
           );

my @aas   = qw( Ala Arg Asp Glu His Ile Leu Met Thr Trp Tyr Val );

#  These are the roles that we will look for in the reference genome

my @roles = map { $role{ $_ } } @aas;

#  These are the alignments with the role

my %aln_for_role;
foreach my $aa ( @aas )
{
    my $role = $role{$aa};
    foreach my $aln ( @{ $AARS_alns{ $aa } } ) { $aln_for_role{ $role }->{ $aln } = 1 }
}

#
#  Given genome gid0:
#
#  Get fids with roles
#  Remove roles with more than one fid
#  Get alignments with fid
#  Filter alignment by min_id
#  Remove genomes with duplicates
#  Index by genome
#  Make representative set
#
#
#  Stage 1: Pick the genes to define the related genomes
#
#  Get fids in reference genome with roles
#

my $fids_w_role = $sapO->occ_of_role( { -roles => \@roles, -genomes => [$gid0] } ) || {};

#  Remove roles with more than one fid in the reference genome

my @fid_role = map { $fids_w_role->{$_} && @{$fids_w_role->{$_}} == 1 ? [ $fids_w_role->{$_}->[0], $_ ] : () }
               @roles;

my @best_aligns = ();
foreach my $fid_role ( @fid_role )
{
    my ( $fid, $role ) = @$fid_role;

    my @alignIDs = grep { $aln_for_role{ $role }->{ $_ } }
                   AlignsAndTreesServer::aligns_with_pegID( $sapO, $alnO, $fid );
    next unless @alignIDs;

    my $best_align = [];
    foreach my $alignID ( @alignIDs )
    {
        my ( $align, $meta ) = AlignsAndTreesServer::peg_alignment_by_ID( $sapO, $alnO, $alignID );

        # Relabel the sequences by genome id:

        my %gen_cnt;
        my $ref_seq;
        foreach ( @$align )
        {
            $_->[0] =~ s/_\d+$//;
            my $gen = SeedUtils::genome_of($_->[0]);
            @$_[0,1] = ( $gen, $->[0] );
            $gen_cnt{ $gen }++;
            $ref_seq = [ @$_ ] if $gen eq $gid0;
        }
        next unless $gen_cnt{ $gid0 } == 1;

        # Remove genomes that occur more than once in the alignment
        # Also, filter out low identity sequences

        my $seq1 = $ref_seq->[2];
        my ( $nmat, $nid );
        @$align = grep { ( $gen_cnt{ $_->[0] } == 1 )
                      && ( protein_identity( $ref_seq, $_ ) >= $min_id )
                       }
                  @$align;

        # Looks like a good alignment to me

        $best_align = $align if @$align > @$best_align;
    }

    next unless @$best_align;
    push @best_aligns, scalar gjoseqlib::pack_alignment( $best_align );
}

#  Discard really small alignments:

my @sizes = sort { $a <=> $b }
            map  { scalar @$_ }
            @best_aligns;
my $min_size = 0.8 * $sizes[ int( @sizes / 2 ) ];

@best_aligns = grep { @$_ >= $min_size } @best_aligns;

#  Find the gid's that are in all of the alignments

my %gen_cnt;
foreach ( @best_aligns )
{
    foreach ( @$_ ) { $gen_cnt{ $_->[0] }++ }
}

my $n_aligns = @best_aligns;
my %keep_gid = map  { $_ => 1 }
               grep { $gen_cnt{ $_ } == $n_aligns }
               keys %gen_cnt;

my %merged_seq;
foreach ( @best_aligns )
{
    foreach ( @$_ )
    {
        next unless $keep_gid{ $_->[0] };
        $merged_seq{ $_->[0] } = ( $merged_seq{ $_->[0] } || '' ) . $_->[2];
    }
}

my @merged_align = gjoseqlib::pack_alignment( map { [ $_, '', $merged_seq{ $_ } ] } keys %keep_gid );

my @diverse_align = ( [ $gid0, '', $merged_seq{ $gid0 } ] );
foreach my $test_seq ( @merged_align )
{
    my $too_sim = 0;
    foreach ( @diverse_align )
    {
        next if protein_identity( $test_seq, $_ ) < $max_id;
        $too_sim = 1;
        last;
    }
    next if $too_sim;

    push @diverse_align, $test_seq;
}

my @gids = map { $_->[0] } @diverse_align;
my $gs = $sapO->genome_names( -ids => \@gids );
my @id_gs = sort { lc $a->[1] cmp lc $b->[1] }
            map  { [ $_, $gs->{ $_ } ] }
            @gids;

foreach ( @id_gs )
{
    print "$_->[0]\t$_->[1]\n";
}


#
#     ( $nmat, $nid, ... ) = gjoseqlib::interpret_aa_align( $seq1, $seq2 )
#
sub protein_identity
{
    my ( $nmat, $nid ) = gjoseqlib::interpret_aa_align( $_[0]->[2], $_[1]->[2] );
    $nid / ( $nmat || 1 );
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3