[Bio] / FigKernelScripts / export_subsystem.pl Repository:
ViewVC logotype

View of /FigKernelScripts/export_subsystem.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Thu Jun 2 18:00:25 2016 UTC (3 years, 5 months ago) by olson
Branch: MAIN
CVS Tags: HEAD
Subsystem exporter for NCBI.

use Data::Dumper;
use strict;
use FIG;
use File::Path 'make_path';
use Getopt::Long::Descriptive;
use gjoseqlib;
use Number::Range;

=head1 NAME

export_subsystem

=head1 DESCRIPTION    

This script writes the following files into the output directory:

spreadsheet.txt contains the subsystem data. Rows are genomes, columns are variant code and peg lists.
The header line contains the role abbreviations.

proteins.fa contains the sequences of the proteins appearing in the subsystem.
The defline for each is of the form
>SEED-identifier (alias,alias...) contig-id_start_stop function [genus species]

annotations.txt contains the protein functional annotations. It is a two-column
table of feature-id, annotation.

roles.txt contains the subsystem roles. It is a two-column table of 
role-abbreviation, role-name pairs

=cut

my($opt, $usage) = describe_options("%c %o subsystem output-dir",
				    ["variant-code|v=s@" => "Write genomes with this variant code only (may be repeated)"],
				    ["role|r=s@" =>
				     "Only write this functional role (may be repeated; may be a role name, abbreviation, or index)"],
				    ["help|h" => "Show this help message."]);

print($usage->text), exit 0 if $opt->help;
die($usage->text) if @ARGV != 2;

my $ss_name = shift;
my $base_dir = shift;

my $fig = FIG->new();
my $ss = $fig->get_subsystem($ss_name);

my $dir = "$base_dir/$ss->{name}";
print "Writing to $dir\n";
make_path($dir);

#
# Determine roles to include.
#

my @roles;

if ($opt->role)
{
    for my $r (@{$opt->role})
    {
	if ($r =~ /^[\.-\d]+$/)
	{
	    $r =~ s/-/../g;
	    my $range = Number::Range->new($r);
	    for my $r2 ($range->range)
	    {
		my $role = $ss->get_role($r2);
		defined($role) or die "No role found for $r2\n";
		push(@roles, $role)
	    }
	}
	elsif (defined(my $i = $ss->get_role_index($r)))
	{
	    print "$r has index $i\n";
	    push(@roles, $r);
	}
	elsif (defined(my $a = $ss->get_role_from_abbr($r)))
	{
	    print "abbr $a from $r\n";
	    my $i = $ss->get_role_index($a);
	    print "idx $i\n";
	    push(@roles, $r);
	}
    }
}
else
{
    @roles = $ss->get_roles();
}

@roles = map { my $ri = $ss->get_role_index($_); my $a = $ss->get_role_abbr($ri); [$ri, $a, $_] }
	grep { ! $ss->is_aux_role($_) } @roles;

# print Dumper(\@roles);

#
# Determine genomes to include.
#

my %variants_to_include;
if ($opt->variant_code)
{
    $variants_to_include{$_} = 1 foreach @{$opt->variant_code};
}

my @genomes;
for my $genome ($ss->get_genomes())
{
    my $gidx = $ss->get_genome_index($genome);
    my $variant = $ss->get_variant_code($gidx);
    if (!$opt->variant_code || $variants_to_include{$variant})
    {
	push(@genomes, [$genome, $gidx, $variant]);
    }
}
# print Dumper(\@genomes);

#
# Accumulate list of proteins to include. We can write the spreadsheet now too.
#

open(SS, ">", "$dir/spreadsheet.txt") or die "Cannot open $dir/spreadsheet.txt: $!";

print SS join("\t", "Genome", "Variant", map { $_->[1] } @roles), "\n";

my %features;
for my $gent (@genomes)
{
    my($genome, $gidx, $variant) = @$gent;
    print SS "$genome\t$variant";
    for my $rent (@roles)
    {
	my($ridx, $abbr, $role) = @$rent;
	my $cell = $ss->get_cell($gidx, $ridx);
	print SS "\t", join(",", @$cell);
	$features{$_} = 1 foreach @$cell;
    }
    print SS "\n";
}

close(SS);

open(R, ">", "$dir/roles.txt") or die "Cannot write $dir/roles.txt: $!";
for my $rent (@roles)
{
    my($ridx, $abbr, $role) = @$rent;
    print R "$abbr\t$role\n";
}
close(R);

open(ANNO, ">", "$dir/annotations.txt") or die "Cannot write $dir/annotations.txt: $!";

open(FA, ">", "$dir/proteins.fa") or die "Cannot write $dir/proteins.fa: $!";

my @features = sort { &FIG::by_fig_id($a, $b) } keys %features;
#$#features = 10;
my $funcs = $fig->function_of_bulk(\@features);
my @locs = $fig->feature_location_bulk(\@features);
my $locs;
$locs->{$_->[0]} = $_->[1] foreach @locs;
my $aliases = $fig->feature_aliases_bulk(\@features);

for my $fid (@features)
{
    my $a = $aliases->{$fid} || [];
    my $alist = join(",", @$a);
    my $gs = $fig->genus_species(&FIG::genome_of($fid));
    my $trans = $fig->get_translation($fid);
    print_alignment_as_fasta(\*FA, [$fid, "($alist) $locs->{$fid} $funcs->{$fid} [$gs]", $trans]);
    print ANNO "$fid\t$funcs->{$fid}\n";
}
close(FA);
close(ANNO);


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3