[Bio] / FigWebServices / heat_map.cgi Repository:
ViewVC logotype

View of /FigWebServices/heat_map.cgi

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.20 - (download) (annotate)
Thu Jun 22 23:20:31 2006 UTC (13 years, 4 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.19: +1 -0 lines
RAE: minor changes to heat map interface

#__perl__

=pod

=head1 heat_map.cgi

A simple "microarray" like program that I wanted. Just display a table with no borders, where the rows are the ss and the cols are the samples, and the cells are the intensity

A gui representation of data. I want to represent the connections to subsystems between different genomes. This allows us to compare the relative amounts of each metabolism occuring in each genome.

The connections to subsystems are stored as attributes, and are generated by the script:

get_ss_connections

=cut

use strict;
use FIG;
use HTML;
use CGI;
use FIG_CGI;
use CGI::Carp qw(fatalsToBrowser);
my $html=["<TITLE>Heat Map NQ</title>"];
use raelib;
use raedraw;
my $raedraw=new raedraw;
my $raelib=new raelib;

my ($fig, $cgi, $user);
eval {
    ($fig, $cgi, $user) = FIG_CGI::init(debug_save => 0,
                                        debug_load => 0,
                                        print_params => 0);
};                                      

if ($@ ne ""){
    my $err = $@;

    my(@html);

    push(@html, $cgi->p("Error connecting to SEED database."));
    if ($err =~ /Could not connect to DBI:.*could not connect to server/)
    {
        push(@html, $cgi->p("Could not connect to relational database of type $FIG_Config::dbms named $FIG_Config::db on port $FIG_Config::dbport."));
    }   
    else
    {
        push(@html, $cgi->pre($err));
    }   
    &HTML::show_page($cgi, \@html, 1, undef, {"default"=>"Html/css/heatmap.css"});
    exit;
}



unless ($cgi->param("korgs"))
{
    my %options;
    map {$options{$_} = $fig->genus_species($_) . " ($_)"} &genomes_with_cnx();

    my %limit=(""=>1, "unclassified"=>1);
    foreach my $ssc ($fig->all_subsystem_classifications()) {$limit{$ssc->[0]}=1}
    
    unless ($cgi->param('complete')) {$cgi->param('complete', 'All')}

    # logo
    #  $cgi->p({style=>"text-align: center;"}, $cgi->a({href=>$cgi->url}, $cgi->img({alt=>"Heat Map NR", src=>"/heatmapnq.png"}))),

    push @$html, (
      $cgi->start_form(), 
      $cgi->h2("Heat Map NQ"),
      $cgi->p("Heat Map NQ is designed to show relationships between subsystems in different environmental samples. Each subsystem that is present in a sample gets a score. The score is calculated by counting the number of sequences that are similar to a protein in each subsystem. This number is divided by the total number of sequences from the sample that are similar to any protein in a subsystem, so it is the fraction of sequences in subsystems. Therefore the size of the sample should not necessarily affect the number that you see. Please note that these numbers are only approximate and \"for entertainment purposes only\"."),
      $cgi->p("We have also integrated our statistical comparison package <a href='http://sourceforge.net/projects/xipe-totec' target='_new'>xipe-totec</a> into this analysis so that you can identify those subsystems that are present at unlikely levels. These comparisons are precomputed since they take about an hour per comparison, and so for a 4x4 table you'd have to wait 6 hours for the page to render. The standard protocol is to provide differences at 90%, 95%, and 99% confidence."),
      $cgi->p("The raw numbers mean that if there are 10 sequences that hit all subsystems in total, then a subsystem that has two sequences that hit it will get a score of 0.2 (2/10). However, these numbers tend to be 2 and 100000, so the number is very small in most cases. Therefore, the multiplier allows you to multiply all scores by a number to make them 2 instead of 0.0000002. The non-quantitative analysis gets biased by one or two outliers, so you can also overcome the outlier effect by trimming off the maximums -- anything above your chosen value is set as the maximum. Note that the maximum value is from the unmodified raw score."),
     $cgi->p("The raw scores may not mean that 2 is twice as much as 1, just that 2 is more than one. Because of that, and because it is easier to visualize groups of data, you can aggregate all the data into chunks. This will take all scores and split them into however many groups you tell it to. That is the non-quantitative analysis."),
      $cgi->p("My reccommendation is that you display different areas of metabolism, with non-quantitative differences grouped in either 5 or 10 groups. You can also see the raw data by using the quantitative analysis checkbox, but I am not certain how much you can infer from these numbers - does 2 mean twice as much as 1?"),
      $cgi->h2("Dataset"),
      $cgi->p(
       $cgi->br("Please choose some genomes: &nbsp; ",
      $raelib->scrolling_org_list($cgi, 1, 0, [&genomes_with_cnx()]),
      $cgi->br("Please choose a subset to show: &nbsp; ", $cgi->popup_menu(-name=>"limit", -values=>[sort {uc($a) cmp uc($b)}keys %limit], -default=>""), " &nbsp; (leave blank to see all of metabolism\n"),
      )),
      $cgi->h2("Non-quantitative Analysis"),
      $cgi->p(
        $cgi->p("Non quantitive analysis groups the data into a set of groups and colors the boxes accordingly. This is the default that you should probably use.\n"),
        $cgi->br("Number of groups: &nbsp; ", $cgi->textfield(-name=>"ng", -default=>5, -size=>3)),
        $cgi->br("Effective raw score maximum: &nbsp; ", $cgi->textfield(-name=>"fmax", -size=>5), " (a good value for this is about .01)\n"),
      ),
      $cgi->h2("Quantitative Analysis"),
      $cgi->p(
        $cgi->p("Quantitive analysis will show you the number of subsystems in each sample. This is the ratio of the number of times that subsystem is hit to the total number of subsystems that are found in the sample.\n The ratio is multiplied by a fiddle factor to normalize the data. Set the multiplier here, or use the default\n"),
        $cgi->br($cgi->checkbox(-name=>"quant", -label=>"Use quantitative analysis")," &nbsp; Multiplier: &nbsp; ", $cgi->textfield(-name=>"fiddle", -default=>5000, -size=>5)),
    ),
      $cgi->h2("Xipe-Totec"),
      $cgi->p(
         $cgi->p("<a href='http://sourceforge.net/projects/xipe-totec' target='_new'>Xipe-totec</a> is our statistical comparison of the presence of subsystems in different samples. This will show the results of those analyses."),
        $cgi->br($cgi->checkbox(-name=>"xipe", -label=>"", -checked=>1), "Include a table of the xipe statistical comparison in the output", " &nbsp; Confidence: &nbsp; ", $cgi->popup_menu(-name=>"xipe_conf", -values=>[0.90, 0.95, 0.99], -default=>"0.95")),
        ),
      $cgi->h2("Display"),
      $cgi->p(
        "The default is to use a blue color as the extreme, but you can change that to red or green\n",
        $cgi->br($cgi->popup_menu(-name=>"color", -label=>"Default color scheme", -values=>['blue', 'red', 'green'], -default=>'blue')),
      ),
      $cgi->submit, $cgi->reset, $cgi->end_form());
      
      &HTML::show_page($cgi, $html, 1, undef, {"default"=>"Html/css/heatmap.css"});
      exit(0);
}




my  @genomes=sort {lc($fig->genus_species($a)) cmp lc($fig->genus_species($b))} $cgi->param('korgs');
my $scores; my $max;
for (my $i=0; $i<=$#genomes; $i++)
{
    next unless ($fig->is_genome($genomes[$i]));
    foreach my $attr ($fig->get_attributes($genomes[$i], "ss_connections"))
    {
        $attr->[2] =~ /^(.*):(([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?)/;
        my ($ss, $score)=($1, $2);
        unless ($ss && defined $score) {die "Can't parse a ss and a score from ".(join("\n", @$attr))}
        unless (defined $scores->{$ss}) {$#{$scores->{$ss}}=$#genomes}
        $scores->{$ss}->[$i]=$score;
    }
}

my @data;
foreach my $arr (&right_classification([keys %$scores]))
{
    my $ss=$arr->[2];
    foreach my $sc (@{$scores->{$ss}}) {($sc > $max) ? ($max=$sc) : 1}
    push @data, [@$arr, @{$scores->{$ss}}];
}

#fix the effective maximum if we have set it
($cgi->param('fmax')) ? ($max=$cgi->param('fmax')) : 1;

unless ($data[0] && $max)
{
    push @$html, 
        (
        $cgi->p({style=>'color: red; background-color: yellow; font-size: 1.2em; font-weight: bolder;'}, "Sorry, no subsystems matched your query. <br>Please use your back button to try again"),
        );
        &HTML::show_page($cgi, $html, 1, undef, {"default"=>"Html/css/heatmap.css"});
        exit(0);
}


# now we have the max, we need to figure out what the groups are.
# we want $ng groups, and so mapping will have the range from 0 to 100
# @mapping has all the data, in order
my $ng=$cgi->param('ng');
my @mapping=(0);
my %mapped = (0=>0);
my $count=int(100/$ng);
for (my $i=$max/$ng; $i<= ($max-($max/$ng)); $i+=$max/$ng)
{
   push @mapping, $i;
   $mapped{$i}=$count;
   $count+=int(100/$ng);
}
# define $max as 100
push @mapping, 100;
$mapped{100}=100;

my $tab;
foreach my $a (@data)
{
    # delete this part to remove the links to the ss
    my $ssn=$a->[2];
    $ssn =~ s/ /_/g;
    $a->[2] = &HTML::sub_link($cgi, $a->[2]);

    my @row;
    foreach my $cell (@$a)
    {
        if ($raelib->is_number($cell)) {
            if ($cgi->param('quant'))
            {
                $cell *= $cgi->param('fiddle');
            }
            else
            {
                my $changed;
                for (my $i=0; $i<=$#mapping; $i++)
                {
                    last if ($changed);
                    if ($cell < $mapping[$i]) {$cell=$mapped{$mapping[$i]}; $changed=1}
                }
                unless ($changed) {$cell=100}
            }
            my @color=$raedraw->heat_map_color($cell, $cgi->param('color'));
            my $bgcolor;
            map {$_=int($_*255); $bgcolor.=sprintf("%x", $_)} @color;
            $cell =~ s/(\.\d\d)\d+/$1/;
            push @row, [" $cell ", "td bgcolor='#$bgcolor' align='center'"]
#push @row, [" $cell ($hue) ", "td"]
        }
        elsif ($cell) {push @row, [$cell, "td"]}
        else {push @row, [" &nbsp; ", "td"]}
    }
    push @$tab, \@row;
}

# sort the table by column 1 then col 2 then col 3
@$tab=sort {$a->[0]->[0] cmp $b->[0]->[0] || $a->[1]->[0] cmp $b->[1]->[0] || $a->[2]->[0] cmp $b->[2]->[0]} @$tab;

# merge the table
# skip the data columns
my $skip;
map {$skip->{$_}=1} (2..10);
$tab=&HTML::merge_table_rows($tab, $skip) unless ($cgi->param('create_excel'));

# generate the table of significant differences;
my $sigtab;
if ($cgi->param("xipe")) 
{
    $sigtab=$cgi->hr . "\n" .&significant_difference() . $cgi->submit("create_excel", "Create excel file of this table");
}

# finally make the HTML


my $border=0;
if ($cgi->param('border')) {$border=1}
my @headers=("Class 1", "Class 2", "Subsystem");
push @headers, map {$fig->genus_species($_) . "<br />$_"} @genomes;

my %options=("border"=>0);
if ($cgi->param('create_excel')) {$options{'excelfile'}="SubsystemConnections"}

push @$html, 
    (
        $cgi->start_form,
        $cgi->hidden('korgs'), 
        $cgi->hidden('border'),
        $cgi->hidden('fiddle'),
        $cgi->hidden('quant'),
        $cgi->hidden('xipe'), $cgi->hidden('xipe_conf'),
        $cgi->hidden('ng'),
        $cgi->hidden('fmax'),
        $cgi->hidden('limit'),
        $cgi->hidden('color'),
        &HTML::make_table([], &control_color_table(), ""),
        &HTML::make_table(\@headers, $tab, "", %options),
        $cgi->submit("create_excel", "Create excel file of this table"),
        $sigtab,
    );

     &HTML::show_page($cgi, $html, 1, undef, {"default"=>"Html/css/heatmap.css"});

exit(0);


sub genomes_with_cnx {
    my %gcx;
    foreach my $attr ($fig->get_attributes(undef, "ss_connections"))
    {
        $gcx{$attr->[0]}=1;
    }
    return keys %gcx;
}

sub control_color_table {
# controltab is the table at the top that shows what the colors are.
    my $controltab;
    {
        my $row;
        for (my $i=0; $i<=100; $i+=2)
        {
            my @color=$raedraw->heat_map_color($i, $cgi->param('color'));
            my $bgcolor;
            map {$_=int($_*255); $bgcolor.=sprintf("%x", $_)} @color;
            push @$row, [" $i ", "td bgcolor='#$bgcolor' align='center'"];
#if ($i && !($i % 20)) {push @$controltab, $row; undef $row}
        }
        push @$controltab, $row;
    }
    return $controltab;
}


sub significant_difference {
    # identify those things with a significant difference and make a cool table of them

    my $idx; # the index of each genome in the table. just the column number. note that col 0 == header column
    for (my $i=0; $i<=$#genomes; $i++)
    {
        $idx->{$genomes[$i]}=$i+1;
    }



    # read the xipe attribute for significant differences
    my $xipe; my $seen; my @allss;
    foreach my $i (0 .. $#genomes)
    {
        foreach my $attr (sort {lc($a->[2]) cmp lc($b->[2])} $fig->get_attributes($genomes[$i], "xipe"))
        {
            $attr->[2] =~ /c=(\d+\.\d+)/;
            next unless ($1 == $cgi->param("xipe_conf"));
            my @pieces=split /\:/, $attr->[2];

            next unless ($idx->{$pieces[0]});
            # this is a hack to ignore things with > 1 entry. We should clean this up and show based on confidence
            next if ($seen->{$genomes[$i]}->{$pieces[0]}->{$pieces[1]});
            $seen->{$genomes[$i]}->{$pieces[0]}->{$pieces[1]}=1;
            push @allss, $pieces[1];
            push @{$xipe->{$genomes[$i]}->[$idx->{$pieces[0]}]}, $pieces[1];
        }
    }

    my %wantedss; # limit by classification choice
    map {$wantedss{$_->[2]}=1} &right_classification(\@allss);

    # we want to make a table where the first column in the table is the genome name and the other columns are the ss, one per line
    # the problem is that we don't know which column has the most ss and we don't know which ones we'll ignore yet.
    # so we use added to determine whether any column in that row has data, and we use more to see whether any genome has potential data
    # left in it. This allows us to trim based on selected ss.
    my $tab; 
    foreach my $genome (@genomes)
    {
        my $added; my $more=1;
        while ($more)
        {
            undef $more;
            my $row=[[$fig->genus_species($genome) . " <br />\n ($genome)", "td class=\"bordered\""]];
            for (my $i=1; $i<=$#genomes+1; $i++)
            {
                my $cell=shift @{$xipe->{$genome}->[$i]};
                if ($cell) {$more=1}
                undef ($cell) unless ($wantedss{$cell});
                if ($cell) 
                {
                    $row->[$i]=[&HTML::sub_link($cgi, $cell), "td class=\"bordered\""];
                    $added=1;
                }
                else {$row->[$i]=" &nbsp; "}
            }
            if ($added) {push @$tab, $row; undef $added}
        }
    }

    # merge the tables based on identical rows
    my $skip;
    map {$skip->{$_}=1} (1 .. scalar(@genomes)); 
    $tab=&HTML::merge_table_rows($tab, $skip);
    
    
    my $hdrs=["UP IN", map {$fig->genus_species($_)."<br />($_)\n"} @genomes];
    map {$_=[$_, "th class=\"bordered\""]} @$hdrs;
    
    my %options=("border"=>1, "excelfile"=>"SubsystemConnections");
    return &HTML::make_table($hdrs, $tab, "Subsystems with significant difference", %options);

    # use the text table not the html table
    #$options{"excelfile"}="SubsystemConnections";
    #$raelib->tab2excel($hdrs, $texttab, "SigDiff", \%options, $options{"excelfile"});
}

# take a reference to an array of subsystems and only return those ones that we are interested in

sub right_classification {
    my $arr=shift;
    my @data;
    foreach my $ss (@$arr)
    {
        my @class=@{$fig->subsystem_classification($ss)};
        if (
                (
                 $cgi->param('limit') && 
                 ($cgi->param('limit') eq "unclassified" && !$class[0]) || 
                 ($cgi->param('limit') eq $class[0])
                ) || 
                !$cgi->param('limit')
           )
        {
            push @data, [@class, $ss];
        }
    }
    return @data;
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3