[Bio] / FigKernelScripts / recompute_comparison.pl Repository:
ViewVC logotype

View of /FigKernelScripts/recompute_comparison.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Fri Mar 7 21:08:31 2008 UTC (11 years, 11 months ago) by bartels
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
*** empty log message ***

#!/usr/bin/env /home/bartels/FIGdisk/env/cee/bin/perl


use FIG;
use Getopt::Std;
use Consistency_Families;

# Option variables filled in by Getopt::Std::getopts (one $opt_X per switch).
our ( $opt_d, $opt_h, $opt_f, $opt_e, $opt_s, $opt_y, $opt_z, $opt_n, $opt_m, $opt_u );

# Parse the command line. Switches followed by ':' take a value.
# getopts returns false when it meets an unknown switch; stop right away in
# that case instead of continuing with a half-understood command line.
if ( !getopts( 'd:he:f:s:yzmnu' ) ) {
    usage();
    exit( 1 );
}

# -h: print the help text and leave successfully.
if ( defined( $opt_h ) ) {
    usage();
    exit( 0 );
}

#my $seedurl = 'http://theseed.uchicago.edu/FIG/protein.cgi?prot=';
# Base URL handed to the comparison generator together with the protein data:
# with -u the anno-3 protein page is used, otherwise the clearinghouse search
# page.
my $seedurl = defined( $opt_u )
    ? 'http://anno-3.nmpdr.org/anno/FIG/protein.cgi?user=VeronikaV&prot='
    : 'http://clearinghouse.nmpdr.org/aclh.cgi?page=SearchResults&query=';

# Names of the three input files expected inside every BRC sub-directory.
# NOTE: file-scoped lexicals called $a and $b hide the package variables that
# sort() comparison blocks use, so no such sort can be written in this file.
# The names are kept because the directory loop further down refers to them.
my $a = "F2-func";
my $b = "F2-id";
my $c = "id-funcs";

# Names of the tab-separated comparison files of a BRC sub-directory.
my $all = "Comparison_all.tsv";
my $inc = "Comparison_Inconsistent.tsv";
my $col = "Comparison_Inconsistent_Collapsed.tsv";

# The directory given with -d has the form <parent>/<family>/ . It is split
# into the parent directory ($uberdir, where comparison.html is written) and
# the family name ($family). The trailing slash is optional.
my $dir = $opt_d;
defined( $dir )
    or die "No directory given: -d <parent>/<family>/ is required (see -h)\n";

# A failed match must not go unnoticed: the parent directory and the family
# name are needed for every path built later on.
my ( $uberdir, $family ) = $dir =~ m{^(.*)/(\w+)/?$}
    or die "Cannot split '$dir' into <parent>/<family>/ (see -h)\n";

#my $programm = $opt_e.'/compute_comparison_table.pl';

# The HTML index lives next to the family directory.
my $htmlindex = $uberdir.'/comparison.html';
print STDERR "writing to $htmlindex\n";

##############################################
# Print the beginning of the html index file #
##############################################
# With -n the index is appended to and no page header is written; otherwise
# the file is started from scratch. The bareword handle INDEX is kept because
# the rest of the script prints to it. The open is checked so that a missing
# or unwritable parent directory is reported instead of silently producing no
# output.
my $index_mode = $opt_n ? '>>' : '>';
open( INDEX, $index_mode, $htmlindex )
    or die "Could not open $htmlindex for writing: $!\n";

if ( !$opt_n ) {
    print INDEX "<HTML>\n<BODY>\n\n";
    print INDEX "<STYLE>td {border: 1px solid black; padding: 3px; font-size: 10pt} table {border-spacing: 0px; } body {font-family: Helvetica; font-size: 10pt;}</STYLE>\n";
    print INDEX "<P style=\"margin-bottom:1%\"><H2>Evaluation using $family as a basis</H2></P>";
    print INDEX "<TABLE border=1>";
    print INDEX "<TR><TD><B>BRC</B></TD><TD><B>Number of proteins</B></TD><TD><B>Number of Families</B></TD><TD><B>InconsistentFamilies</B></TD>";
    print INDEX "<TD><B>Inconsistent using SameFunc</B></TD><TD><B>Consistency</B></TD><TD><B>Consistency using SameFunc</B></TD></TR>\n";
}

############################################
# Now work through the directory structure #
############################################
# Walk the entries of the family directory. Every sub-directory is handled as
# one BRC: its three comparison pages are generated and, if it contributed any
# proteins, one row is added to the HTML index.
opendir( my $brc_dh, $dir ) or die "Could not open directory $dir: $!\n";

while ( defined( my $thisdir = readdir( $brc_dh ) ) ) {
    next if ( $thisdir eq '.' );
    next if ( $thisdir eq '..' );

    # Plain files cannot hold the per-BRC input files, skip them.
    next if ( !-d $dir.'/'.$thisdir );

#   next if ( $thisdir ne 'APIDB' );

    print $thisdir."\n";

    my $input_base  = $dir.'/'.$thisdir;                   # where the input files are read
    my $link_base   = $family.'/'.$thisdir;                # relative, used for links in the index
    my $output_base = $uberdir.'/'.$family.'/'.$thisdir;   # where the html pages are written

    # Make sure the output directory exists. The builtin is used so that the
    # path never passes through a shell.
    if ( !-d $output_base ) {
        mkdir( $output_base )
            or warn "Could not create directory $output_base: $!\n";
    }

    my $temp_a = $input_base.'/'.$a;
    my $temp_b = $input_base.'/'.$b;
    my $temp_c = $input_base.'/'.$c;
    my $temp_all_html = $link_base.'/Comparison_all.html';
    my $temp_inc_html = $link_base.'/Comparison_Inconsistent.html';
    my $temp_col_html = $link_base.'/Comparison_Inconsistent_Collapsed.html';
    my $temp_all_html_full = $output_base.'/Comparison_all.html';
    my $temp_inc_html_full = $output_base.'/Comparison_Inconsistent.html';
    my $temp_col_html_full = $output_base.'/Comparison_Inconsistent_Collapsed.html';
    my $temp_excel_link = $link_base.'/Comparison_Download.xls';
    my $temp_excel_link2 = $output_base.'/Comparison_Download.xls';

    # Debug trace of the two download paths.
    print STDERR $temp_excel_link."\n";
    print STDERR $temp_excel_link2."\n";

    # Hash that includes foreach FAMILY_ID the abbreviation and the function: #
    # $FamFunction->{ $fam }->{ 'short' }    = Abbreviation                   #
    # $FamFunction->{ $fam }->{ 'function' } = Function                       #

    my $FamFunction = Consistency_Families::read_famfunction( $temp_a, $opt_s );

    # Hash that includes the function for each CDS: #
    # $PegFunction->{ $cds } = Function             #

    my ( $PegFunction, $abr ) = Consistency_Families::read_pegfunction( $temp_c );

    # At some point we need the correspondences between the fig ids and #
    # the ids of the proteins we analyse here. Therefore, we use the    #
    # id-correspondence file Ross created.                              #

    my $idhash = Consistency_Families::build_hash( $abr );
    my $pathemahash = Consistency_Families::build_pathema_hash(  );

    # Hash that includes foreach FAMILY_ID an array of CDSs that belong to it: #
    # $FamsTOPEGs->{ $fam } = \@cdss                                           #
    print STDERR "READING $temp_b\n";
    my $FamsTOPEGs = Consistency_Families::read_familyids( $opt_s, $temp_b );

    # Hash that includes all the families that should be excluded from now on   #
    # $opt_f is the name of the file which consists of the Families that should #
    # be excluded. If given, hash $FILTEROUT will be filled with those          #

    my $FILTEROUT = undef;

    if ( defined( $opt_f ) ) {
        # NOTE(review): read_filterout is not defined in this file. It is
        # looked up in main first (in case a module exports it) and then in
        # Consistency_Families -- confirm where it really lives.
        my $read_filterout = main->can( 'read_filterout' )
            || Consistency_Families->can( 'read_filterout' )
            or die "-f was given but no read_filterout function is available\n";
        $FILTEROUT = $read_filterout->( $opt_s, $opt_f );
    }

    # The same generator is called three times, once per output page. Only the
    # output file and the two flags after $opt_z differ between the calls; the
    # first call additionally receives the two download paths.
    my ( $all_counter, $def_counter, $consistency_counter ) = Consistency_Families::compute_single_comparison_file( $temp_all_html_full, $seedurl, $pathemahash, $FamFunction, $FamsTOPEGs, $PegFunction, $idhash, $opt_s, $FILTEROUT, $opt_d, $opt_y, $opt_z, 0, 0, undef, $temp_excel_link, $temp_excel_link2 );

    my ( $inc_counter ) = Consistency_Families::compute_single_comparison_file( $temp_inc_html_full, $seedurl, $pathemahash, $FamFunction, $FamsTOPEGs, $PegFunction, $idhash, $opt_s, $FILTEROUT, $opt_d, $opt_y, $opt_z, 1, 0 );

    my ( $col_counter, undef, $consistency2_counter ) = Consistency_Families::compute_single_comparison_file( $temp_col_html_full, $seedurl, $pathemahash, $FamFunction, $FamsTOPEGs, $PegFunction, $idhash, $opt_s, $FILTEROUT, $opt_d, $opt_y, $opt_z, 1, 1 );

    # Round the consistency values to two decimals for display. #
    $consistency_counter = sprintf ("%.2f", $consistency_counter);
    $consistency2_counter = sprintf ("%.2f", $consistency2_counter);

    ########################################################
    # Print the part of the table that belongs to that BRC #
    ########################################################
    print STDERR $def_counter." GRML\n";
    if ( $def_counter > 0 ) {
        print STDERR "INHEREmann\n";
        print INDEX "<TR><TD>$thisdir</TD>";
        print INDEX "<TD><A HREF=\"$temp_excel_link\">$def_counter</A></TD>\n";

        print INDEX "<TD><A HREF=\"".$temp_all_html."\">$all_counter families</A></TD>\n";
        print INDEX "<TD><A HREF=\"$temp_inc_html\">$inc_counter families</A></TD>\n";
        print INDEX "<TD><A HREF=\"$temp_col_html\">$col_counter families</A></TD>";

        print INDEX "<TD>$consistency_counter</TD>";
        print INDEX "<TD>$consistency2_counter</TD>";
        print INDEX "</TR>\n\n";
    }
#    last;
}

closedir( $brc_dh );

#############################################
# Now print the end of that html index file #
#############################################
# Unless -m was given, close the result table and append a second table that
# explains the column headings, then finish the HTML page.
if ( !$opt_m ) {
    print INDEX "</TABLE>\n";
    print INDEX "<P>&nbsp;</P>\n";

    # Documentation shown on the page: one [ term, explanation ] pair per row.
    my @glossary = (
        [ 'Families',
          'This evaluation is based on 3 different kinds of classifications of proteins: TIGRfams, TIGRfamEquivalogs and FIGfams. These are referenced as <B>Families</B> in the following' ],
        [ 'Number of Proteins',
          'The number of proteins curated by the respective BRC that belong to a Family' ],
        [ 'Number of Families',
          'The number of different Families that are hit by the BRC proteins' ],
        [ 'Inconsistent Families',
          'The number of Families that include BRC proteins with more than one BRC annotation' ],
        [ 'Inconsistent Families using SameFunc',
          'The number of Families that include BRC proteins with more than one BRC annotation, but here we tried to collapse the different annotations using our SameFunc' ],
        [ 'Consistency',
          'Consistency is measured for each Family. It is the probability that taken two different proteins of a given Family, the annotations are the same' ],
        [ 'Consistency using SameFunc',
          'Consistency is measured for each Family. It is the probability that taken two different proteins of a given Family, the annotations are the same measured via SameFunc' ],
    );

    print INDEX "<TABLE border=1>\n";
    for my $entry ( @glossary ) {
        my ( $term, $explanation ) = @$entry;
        print INDEX "<TR><TD>$term</TD><TD>$explanation</TD></TR>";
    }
    print INDEX "</TABLE>\n";
# we removed the accuracy stuff

    print INDEX "</BODY>\n</HTML>";
}

# Buffered write errors (full disk, ...) only show up when the handle is
# closed, so the result of close is checked.
close( INDEX ) or die "Could not close $htmlindex: $!\n";

# Print the command line help to STDOUT.
# Takes no arguments. Every switch accepted by the getopts call of this
# script is listed.
sub usage {

    print "recompute_comparison.pl\n";
    print " -d <OUTPUT DIRECTORY> (of the form <parent>/<family>/)\n";
    print " -e directory where the scripts can be found (currently unused)\n";
    print " -s 'f' for FIGfam, 'p' for PIRfam, 't' for TIGRfam\n";
    print " -f file with families to be filtered out\n";
    print " -y for printing also the consistency value...\n";
    print " -z for printing also the accuracy value...\n";
    print " -n append to an existing comparison.html, do not write the page header\n";
    print " -m do not write the page footer\n";
    print " -u link proteins to the anno-3 protein page instead of the clearinghouse\n";
    print " -h print this help\n";

    return;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3