[Bio] / SubsystemExtension / evaluate_gecko.pl Repository:
ViewVC logotype

View of /SubsystemExtension/evaluate_gecko.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Thu Jan 5 19:41:53 2006 UTC (12 years, 7 months ago) by olson
Branch: MAIN
CVS Tags: HEAD
Rearrangement for easier installation.

#!/Users/fig/FIGDisk/env/mac/bin/perl 

use strict;
use warnings;

use Getopt::Std;
use Term::ReadKey;
use IO::Handle;
use Data::Dumper;

use SubsystemExtension::ClusterList;
use SubsystemExtension::Cluster;
use SubsystemExtension::JoinedCluster;
use FIG;


sub usage {
    print "-= evaluate_gecko =-\n\n";
    print "usage:\n evaluate_gecko -c <cluster file> -l <clusterlist files> -s <subsystem name> -d <validation_directory> -n <minimal number of genomes> -m <minimal number of genes> \n";
    
}

my $fig = FIG->new();

our ($opt_c, $opt_l, $opt_s, $opt_d, $opt_n, $opt_m);

getopts('f:d:l:c:n:m:');

unless ($opt_l || $opt_c) {
    print usage;
    print "ERROR: cant start evaluation no cluster input defined\n";
    exit 1;
}


$opt_d = $opt_d ? $opt_d : '~/';

$opt_n = $opt_n ? $opt_n : 2;
$opt_m = $opt_m ? $opt_m : 2;


my %subsystems;
my %subsystem_counts;

my %clusters_in_subsystem;


my %genomes_in_clusters;

foreach my $subsys ($fig->all_subsystems()) {
    my $subs = $fig->get_subsystem($subsys);
    if (ref $subs && $subs->isa('Subsystem')) {
	$subsystems{$subsys} = $subs;

	# count the complete prokaryotic genomes in one subsystem
	$subsystem_counts{$subsys} = scalar grep {($fig->is_complete($_) && $fig->is_prokaryotic($_))} $subs->get_genomes();
    } else {

	print STDERR "Could not initialize $subsys\n"; 

    }
}



# subsystems in one cluster richtung
if ($opt_l && -e $opt_l) {
    
    open TEX, ">$opt_l.tex" or die "could not open table.tex\n";
    print TEX "\\begin{tabular}{llrrrr}\n";
    print TEX '\textbf{Cluster} & \textbf{Subsystem} & \textbf{Genes in Subsystem} & \textbf{Genes in Cluster} & \textbf{Coverage %} & \textbf{Subsystems in Cluster} & \textbf{Genomes in Cluster} & \textbf{Genomes in Subsystem}\\';
    print TEX "\n\\hline\n";
    my $clusterlist = SubsystemExtension::ClusterList->new();
    $clusterlist = SubsystemExtension::ClusterList->fromFile($opt_l);
    foreach my $cluster ($clusterlist->clusters()) {
	my %cluster_subsystems; # list of occuring subsystems
	my %cluster_subsystem_counts;
	my %cluster_subsystem_genomes;
	my %cluster_genome_count;
	my @genomes = $cluster->genomes();

	foreach (@genomes) {
	    $genomes_in_clusters{$_} = 1;
	}
	# skip those clusters with less genomes than threshold
	next if (scalar @genomes < $opt_n);



	my @cluster_genes = $cluster->all_genes();
	my $cluster_gene_count = scalar @cluster_genes;
	foreach my $gene (@cluster_genes) {
	    # print STDERR $gene->{name};
	    my $taxon_id;
	    if ($gene->{name} =~ /fig\|(\d+\.\d+)\.peg/) {
		$taxon_id = $1;
	    }
	    my @subsystems = $fig->subsystems_for_peg($gene->{name});
	    if (scalar @subsystems == 0 ) {
		$cluster_subsystems{'none'} = 1;
		$cluster_subsystem_counts{'none'}++;
	    } else {
		my %seen;
		foreach my $subsystem (@subsystems) {
		    next if $seen{$subsystem->[0]};
		    $seen{$subsystem->[0]} = 1;
                    # print STDERR $subsystem->[0];
		    unless (ref $clusters_in_subsystem{$subsystem->[0]} eq 'HASH') {
			$clusters_in_subsystem{$subsystem->[0]} = {};
		    }
		    $clusters_in_subsystem{$subsystem->[0]}->{$cluster->id()} = $cluster;
		    $cluster_subsystems{$subsystem->[0]} = 1;
		    $cluster_subsystem_counts{$subsystem->[0]}++;
		    $cluster_subsystem_genomes{$taxon_id}++;
		    $cluster_genome_count{$taxon_id}++;
		}
	    }
	}
	foreach my $subsystem (keys %cluster_subsystems) {
	    
	    # printf  "\t%s\t%d : %d %.2f %%\t %d : %d \n", $subsystem, $cluster_subsystem_counts{$subsystem}, $cluster_gene_count, ($cluster_subsystem_counts{$subsystem} /  $cluster_gene_count) * 100, scalar keys %cluster_subsystem_genomes, $genomes;
	    printf TEX "%s & %s & %d & %d & %.2f & %d & %d & %d\n", 
	    $cluster->id(), 
	    $subsystem, 
	    $cluster_subsystem_counts{$subsystem}, 
	    $cluster_gene_count, 
	    ($cluster_subsystem_counts{$subsystem} / $cluster_gene_count) * 100, 
	    scalar keys %cluster_subsystem_genomes, 
	    $subsystem_counts{$subsystem},
	    scalar keys %cluster_genome_count;
	}


    }
    print TEX '\end{tabular}';
    close TEX;

}





my $total_genes_in_subsystems = 0;

# clusters in one subsystem
if ($opt_l && -e $opt_l) {
    
    open TEX, ">$opt_l.reverse.tex" or die "could not open table.tex\n";
    print TEX "\\begin{tabular}{llrrrr}\n";
    print TEX '\textbf{Subsystem} & \textbf{Cluster} & \textbf{Genomes in Subsystem} & \textbf{Genomes in Cluster} & \textbf{Genes in Subsystem} & \textbf{Genes in subset of Subsystem} & \textbf{Genes in Cluster} & \textbf{ClusterGenes in Subsystem} & \textbf{Fraction} & \textbf{Fraction subset} \\';
    print TEX "\n\\hline\n";
    my $clusterlist = SubsystemExtension::ClusterList->new();
    $clusterlist = SubsystemExtension::ClusterList->fromFile($opt_l);
  
    print STDERR &Dumper(%genomes_in_clusters);

    foreach my $subsystem_name (keys %subsystems) {

	my $subsystem = $subsystems{$subsystem_name};

	my $genomes_in_subsystem = $subsystem_counts{$subsystem_name};
	# get the number of genomes in the subsystem
	
	my %genes_in_subsystem;
	my $count = -1;
	foreach my $row (@{$subsystem->{spreadsheet}}) {
	    $count++;
	    my $genome = $subsystem->get_genome($count);
	    next unless ($fig->is_complete($genome) && $fig->is_prokaryotic($genome) && ($genomes_in_clusters{$genome})); 
	    foreach my $cell (@$row) {
		foreach my $gene (@$cell) {
		    $genes_in_subsystem{$gene} = 1;
		}
	    }
	    
	}
	
	my $genes_in_subsystem = scalar keys %genes_in_subsystem;

	$total_genes_in_subsystems += $genes_in_subsystem;

	

	next if ($genes_in_subsystem == 0);

	my $genes_covered_by_clusters = 0;
	
	foreach my $cluster (values %{$clusters_in_subsystem{$subsystem_name}}) {
	    
	    my @genomes = $cluster->genomes();

	    # skip those clusters with less genomes than threshold
	    next if (scalar @genomes < $opt_n);

	    my %genes_in_subset;

	    foreach my $genome (@genomes) {
		
		my $genome_index = $subsystem->get_genome_index($genome);
		next unless ($fig->is_complete($genome) && $fig->is_prokaryotic($genome)); 
		foreach my $cell (@{$subsystem->get_row($genome_index)}) {
		    foreach (@$cell) {
			$genes_in_subset{$_} = 1;
		    }
		}
	    }

	    my $genes_in_subset = scalar keys %genes_in_subset;

	    next if ($genes_in_subset == 0);

	    my @cluster_genes = $cluster->all_genes();
	    my $cluster_gene_count = scalar @cluster_genes;
	    my $cluster_genomes_count = scalar @genomes, # genomes in cluster
	    my $cluster_genes_in_subsystem;
	    foreach my $gene (@cluster_genes) {
		# print STDERR $gene->{name};
		my $taxon_id;
		if ($gene->{name} && ($gene->{name} =~ /fig\|(\d+\.\d+)\.peg/)) {
		    $taxon_id = $1;
		}
		my @subsystems = $fig->subsystems_for_peg($gene->{name});
		next if (scalar @subsystems == 0 );
		my %seen;
		foreach my $tuple (@subsystems) {
		    next if ($seen{$tuple->[0]});
		    $seen{$tuple->[0]} = 1;
		    $cluster_genes_in_subsystem++ if ($tuple->[0] eq $subsystem_name);
		    $genes_covered_by_clusters++ if ($tuple->[0] eq $subsystem_name);
		}
	    }
	    
	    
	    
	    
	    printf TEX "%s & %s & %d & %d  & %d & %d & %d & %d & %.2f & %.2f\n", 
	    $subsystem_name,
	    $cluster->id(), 
	    $genomes_in_subsystem, 
	    $cluster_genomes_count,
	    $genes_in_subsystem, 
	    $genes_in_subset,
	    $cluster_gene_count,
	    $cluster_genes_in_subsystem,
	    ($cluster_genes_in_subsystem / $genes_in_subsystem) * 100, 
	    ($cluster_genes_in_subsystem / $genes_in_subset) * 100;
	}
	
	print STDERR "$subsystem_name $count $genes_in_subsystem $genes_covered_by_clusters\n";
	
    }
    print TEX '\end{tabular}';
    close TEX;

    print STDERR "Total genes in subsystems: ".$total_genes_in_subsystems."\n";

}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3