[Bio] / FigKernelPackages / FIG.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/FIG.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.35, Tue Mar 9 23:19:23 2004 UTC revision 1.36, Sun Mar 14 03:46:16 2004 UTC
# Line 4  Line 4 
4  use Sim;  use Sim;
5  use Blast;  use Blast;
6  use FIG_Config;  use FIG_Config;
7    use tree_utilities;
8    
9  use IO::Socket;  use IO::Socket;
10    
# Line 1052  Line 1053 
1053    
1054  =pod  =pod
1055    
 =head1 taxonomy_of  
   
 usage: $gs = $fig->taxonomy_of($genome_id)  
   
 Returns the taxonomy of the specified genome.  Gives the taxonomy down to  
 genus and species.  
   
 =cut  
   
 sub taxonomy_of {  
     my($self,$genome) = @_;  
     my($ans);  
     my $taxonomy = $self->cached('_taxonomy');  
   
     if (! ($ans = $taxonomy->{$genome}))  
     {  
         my $rdbH = $self->db_handle;  
         my $relational_db_response = $rdbH->SQL("SELECT genome,taxonomy  FROM genome");  
         my $pair;  
         foreach $pair (@$relational_db_response)  
         {  
             $taxonomy->{$pair->[0]} = $pair->[1];  
         }  
         $ans = $taxonomy->{$genome};  
     }  
     return $ans;  
 }  
   
 =pod  
   
 =head1 is_bacterial  
   
 usage: $fig->is_bacterial($genome)  
   
 Returns true iff the genome is bacterial.  
   
 =cut  
   
 sub is_bacterial {  
     my($self,$genome) = @_;  
   
     return ($self->taxonomy_of($genome) =~ /^Bacteria/);  
 }  
   
   
 =pod  
   
 =head1 is_archaeal  
   
 usage: $fig->is_archaeal($genome)  
   
 Returns true iff the genome is archaeal.  
   
 =cut  
   
 sub is_archaeal {  
     my($self,$genome) = @_;  
   
     return ($self->taxonomy_of($genome) =~ /^Archaea/);  
 }  
   
   
 =pod  
   
 =head1 is_prokaryotic  
   
 usage: $fig->is_prokaryotic($genome)  
   
 Returns true iff the genome is prokaryotic  
   
 =cut  
   
 sub is_prokaryotic {  
     my($self,$genome) = @_;  
   
     return ($self->taxonomy_of($genome) =~ /^(Archaea|Bacteria)/);  
 }  
   
   
 =pod  
   
 =head1 is_eukaryotic  
   
 usage: $fig->is_eukaryotic($genome)  
   
 Returns true iff the genome is eukaryotic  
   
 =cut  
   
 sub is_eukaryotic {  
     my($self,$genome) = @_;  
   
     return ($self->taxonomy_of($genome) =~ /^Eukaryota/);  
 }  
   
 =pod  
   
 =head1 sort_genomes_by_taxonomy  
   
 usage: @genomes = $fig->sort_genomes_by_taxonomy(@list_of_genomes)  
   
 This routine is used to sort a list of genome IDs to put them  
 into taxonomic order.  
   
 =cut  
   
 sub sort_genomes_by_taxonomy {  
     my($self,@fids) = @_;  
   
     return map     { $_->[0] }  
            sort    { $a->[1] cmp $b->[1] }  
            map     { [$_,$self->taxonomy_of($_)] }  
            @fids;  
 }  
   
 =pod  
   
 =head1 crude_estimate_of_distance  
   
 usage: $dist = $fig->crude_estimate_of_distance($genome1,$genome2)  
   
 There are a number of places where we need estimates of the distance between  
 two genomes.  This routine will return a value between 0 and 1, where a value of 0  
 means "the genomes are essentially identical" and a value of 1 means  
 "the genomes are in different major groupings" (the groupings are archaea, bacteria,  
 euks, and viruses).  The measure is extremely crude.  
   
 =cut  
   
 sub crude_estimate_of_distance {  
     my($self,$genome1,$genome2) = @_;  
     my($i,$v,$d,$dist);  
   
     if ($genome1 > $genome2) { ($genome1,$genome2) = ($genome2,$genome1) }  
   
     my $relational_db_response;  
     my $rdbH = $self->db_handle;  
   
     if (($relational_db_response = $rdbH->SQL("SELECT dist FROM distances WHERE ( genome1 = \'$genome1\' ) AND ( genome2 = \'$genome2\' ) ")) &&  
         (@$relational_db_response == 1))  
     {  
         return $relational_db_response->[0]->[0];  
     }  
     return $self->crude_estimate_of_distance1($genome1,$genome2);  
 }  
   
 sub crude_estimate_of_distance1 {  
     my($self,$genome1,$genome2) = @_;  
     my($i,$v,$d,$dist);  
   
     if ($genome1 > $genome2) { ($genome1,$genome2) = ($genome2,$genome1) }  
     $dist = $self->cached('_dist');  
     if (! $dist->{"$genome1,$genome2"})  
     {  
         my @tax1 = split(/\s*;\s*/,$self->taxonomy_of($genome1));  
         my @tax2 = split(/\s*;\s*/,$self->taxonomy_of($genome2));  
   
         $d = 1;  
         for ($i=0, $v=0.5; ($i < @tax1) && ($i < @tax2) && ($tax1[$i] eq $tax2[$i]); $i++, $v = $v/2)  
         {  
             $d -= $v;  
         }  
         $dist->{"$genome1,$genome2"} = $d;  
     }  
     return $dist->{"$genome1,$genome2"};  
 }  
   
 =pod  
   
1056  =head1 org_of  =head1 org_of
1057    
1058  usage: $org = $fig->org_of($prot_id)  usage: $org = $fig->org_of($prot_id)
# Line 1347  Line 1179 
1179    
1180  =pod  =pod
1181    
 =head1 sort_fids_by_taxonomy  
   
 usage: @sorted_by_taxonomy = $fig->sort_fids_by_taxonomy(@list_of_fids)  
   
 Sorts a list of feature IDs based on the taxonomies of the genomes that contain the features.  
   
 =cut  
   
 sub sort_fids_by_taxonomy {  
     my($self,@fids) = @_;  
   
     return map     { $_->[0] }  
            sort    { $a->[1] cmp $b->[1] }  
            map     { [$_,$self->taxonomy_of(&genome_of($_))] }  
            @fids;  
 }  
   
 =pod  
   
1182  =head1 genes_in_region  =head1 genes_in_region
1183    
1184  usage: ($features_in_region,$beg1,$end1) = $fig->genes_in_region($genome,$contig,$beg,$end)  usage: ($features_in_region,$beg1,$end1) = $fig->genes_in_region($genome,$contig,$beg,$end)
# Line 4063  Line 3876 
3876      return undef;      return undef;
3877  }  }
3878    
3879    #################################   Taxonomy  ####################################
3880    
3881    =pod
3882    
3883    =head1 taxonomy_of
3884    
3885    usage: $gs = $fig->taxonomy_of($genome_id)
3886    
3887    Returns the taxonomy of the specified genome.  Gives the taxonomy down to
3888    genus and species.
3889    
3890    =cut
3891    
3892    sub taxonomy_of {
3893        my($self,$genome) = @_;
3894        my($ans);
3895        my $taxonomy = $self->cached('_taxonomy');
3896    
3897        if (! ($ans = $taxonomy->{$genome}))
3898        {
3899            my $rdbH = $self->db_handle;
3900            my $relational_db_response = $rdbH->SQL("SELECT genome,taxonomy  FROM genome");
3901            my $pair;
3902            foreach $pair (@$relational_db_response)
3903            {
3904                $taxonomy->{$pair->[0]} = $pair->[1];
3905            }
3906            $ans = $taxonomy->{$genome};
3907        }
3908        return $ans;
3909    }
3910    
3911    =pod
3912    
3913    =head1 is_bacterial
3914    
3915    usage: $fig->is_bacterial($genome)
3916    
3917    Returns true iff the genome is bacterial.
3918    
3919    =cut
3920    
3921    sub is_bacterial {
3922        my($self,$genome) = @_;
3923    
3924        return ($self->taxonomy_of($genome) =~ /^Bacteria/);
3925    }
3926    
3927    
3928    =pod
3929    
3930    =head1 is_archaeal
3931    
3932    usage: $fig->is_archaeal($genome)
3933    
3934    Returns true iff the genome is archaeal.
3935    
3936    =cut
3937    
3938    sub is_archaeal {
3939        my($self,$genome) = @_;
3940    
3941        return ($self->taxonomy_of($genome) =~ /^Archaea/);
3942    }
3943    
3944    
3945    =pod
3946    
3947    =head1 is_prokaryotic
3948    
3949    usage: $fig->is_prokaryotic($genome)
3950    
3951    Returns true iff the genome is prokaryotic
3952    
3953    =cut
3954    
3955    sub is_prokaryotic {
3956        my($self,$genome) = @_;
3957    
3958        return ($self->taxonomy_of($genome) =~ /^(Archaea|Bacteria)/);
3959    }
3960    
3961    
3962    =pod
3963    
3964    =head1 is_eukaryotic
3965    
3966    usage: $fig->is_eukaryotic($genome)
3967    
3968    Returns true iff the genome is eukaryotic
3969    
3970    =cut
3971    
3972    sub is_eukaryotic {
3973        my($self,$genome) = @_;
3974    
3975        return ($self->taxonomy_of($genome) =~ /^Eukaryota/);
3976    }
3977    
3978    =pod
3979    
3980    =head1 sort_genomes_by_taxonomy
3981    
3982    usage: @genomes = $fig->sort_genomes_by_taxonomy(@list_of_genomes)
3983    
3984    This routine is used to sort a list of genome IDs to put them
3985    into taxonomic order.
3986    
3987    =cut
3988    
3989    sub sort_genomes_by_taxonomy {
3990        my($self,@fids) = @_;
3991    
3992        return map     { $_->[0] }
3993               sort    { $a->[1] cmp $b->[1] }
3994               map     { [$_,$self->taxonomy_of($_)] }
3995               @fids;
3996    }
3997    
3998    =pod
3999    
4000    =head1 crude_estimate_of_distance
4001    
4002    usage: $dist = $fig->crude_estimate_of_distance($genome1,$genome2)
4003    
4004    There are a number of places where we need estimates of the distance between
4005    two genomes.  This routine will return a value between 0 and 1, where a value of 0
4006    means "the genomes are essentially identical" and a value of 1 means
4007    "the genomes are in different major groupings" (the groupings are archaea, bacteria,
4008    euks, and viruses).  The measure is extremely crude.
4009    
4010    =cut
4011    
4012    sub crude_estimate_of_distance {
4013        my($self,$genome1,$genome2) = @_;
4014        my($i,$v,$d,$dist);
4015    
4016        if ($genome1 > $genome2) { ($genome1,$genome2) = ($genome2,$genome1) }
4017    
4018        my $relational_db_response;
4019        my $rdbH = $self->db_handle;
4020    
4021        if (($relational_db_response = $rdbH->SQL("SELECT dist FROM distances WHERE ( genome1 = \'$genome1\' ) AND ( genome2 = \'$genome2\' ) ")) &&
4022            (@$relational_db_response == 1))
4023        {
4024            return $relational_db_response->[0]->[0];
4025        }
4026        return $self->crude_estimate_of_distance1($genome1,$genome2);
4027    }
4028    
4029    sub crude_estimate_of_distance1 {
4030        my($self,$genome1,$genome2) = @_;
4031        my($i,$v,$d,$dist);
4032    
4033        if ($genome1 > $genome2) { ($genome1,$genome2) = ($genome2,$genome1) }
4034        $dist = $self->cached('_dist');
4035        if (! $dist->{"$genome1,$genome2"})
4036        {
4037            my @tax1 = split(/\s*;\s*/,$self->taxonomy_of($genome1));
4038            my @tax2 = split(/\s*;\s*/,$self->taxonomy_of($genome2));
4039    
4040            $d = 1;
4041            for ($i=0, $v=0.5; ($i < @tax1) && ($i < @tax2) && ($tax1[$i] eq $tax2[$i]); $i++, $v = $v/2)
4042            {
4043                $d -= $v;
4044            }
4045            $dist->{"$genome1,$genome2"} = $d;
4046        }
4047        return $dist->{"$genome1,$genome2"};
4048    }
4049    
4050    =pod
4051    
4052    =head1 sort_fids_by_taxonomy
4053    
4054    usage: @sorted_by_taxonomy = $fig->sort_fids_by_taxonomy(@list_of_fids)
4055    
4056    Sorts a list of feature IDs based on the taxonomies of the genomes that contain the features.
4057    
4058    =cut
4059    
4060    sub sort_fids_by_taxonomy {
4061        my($self,@fids) = @_;
4062    
4063        return map     { $_->[0] }
4064               sort    { $a->[1] cmp $b->[1] }
4065               map     { [$_,$self->taxonomy_of(&genome_of($_))] }
4066               @fids;
4067    }
4068    
4069    sub build_tree_of_complete {
4070        my($self,$min_for_label) = @_;
4071        my(@last,@tax,$i,$prefix,$lev,$genome,$tax);
4072    
4073        $min_for_label = $min_for_label ? $min_for_label : 10;
4074        open(TMP,">/tmp/tree$$") || die "could not open /tmp/tree$$";
4075        print TMP "1. root\n";
4076    
4077        @last = ();
4078    
4079    
4080        foreach $genome (grep { $_ !~ /^99999/ } $self->sort_genomes_by_taxonomy($self->genomes("complete")))
4081        {
4082            $tax = $self->taxonomy_of($genome);
4083            @tax = split(/\s*;\s*/,$tax);
4084            push(@tax,$genome);
4085            for ($i=0; ((@last > $i) && (@tax > $i) && ($last[$i] eq $tax[$i])); $i++) {}
4086            while ($i < @tax)
4087            {
4088                $lev = $i+2;
4089                $prefix = " " x (4 * ($lev-1));
4090                print TMP "$prefix$lev\. $tax[$i]\n";
4091                $i++;
4092            }
4093            @last = @tax;
4094        }
4095        close(TMP);
4096        my $tree = &tree_utilities::build_tree_from_outline("/tmp/tree$$");
4097        $tree->[0] = 'All';
4098        &limit_labels($tree,$min_for_label);
4099        unlink("/tmp/tree$$");
4100        return ($tree,&tips_of_tree($tree));
4101    }
4102    
4103    sub limit_labels {
4104        my($tree,$min_for_label) = @_;
4105    
4106        my($children) = &tree_utilities::node_pointers($tree);
4107        if (@$children == 1)
4108        {
4109            return 1;
4110        }
4111        else
4112        {
4113            my $n = 0;
4114            my $i;
4115            for ($i=1; ($i < @$children); $i++)
4116            {
4117                $n += &limit_labels($children->[$i],$min_for_label);
4118            }
4119            if ($n < $min_for_label)
4120            {
4121                $tree->[0] = "";
4122            }
4123            return $n;
4124        }
4125    }
4126    
4127    sub taxonomic_groups_of_complete {
4128        my($self,$min_for_labels) = @_;
4129    
4130        my($tree,undef) = $self->build_tree_of_complete($min_for_labels);
4131        return &taxonomic_groups($tree);
4132    }
4133    
4134    sub taxonomic_groups {
4135        my($tree) = @_;
4136    
4137        my($groups,undef) = &taxonomic_groups_and_children($tree);
4138        return $groups;
4139    }
4140    
4141    sub taxonomic_groups_and_children {
4142        my($tree) = @_;
4143        my($ids1,$i,$groupsC,$idsC);
4144    
4145        my $ptrs   = &tree_utilities::node_pointers($tree);
4146        my $ids    = [];
4147        my $groups = [];
4148    
4149        if (@$ptrs > 1)
4150        {
4151            $ids1 = [];
4152            for ($i=1; ($i < @$ptrs); $i++)
4153            {
4154                ($groupsC,$idsC) = &taxonomic_groups_and_children($ptrs->[$i]);
4155                if (@$groupsC > 0)
4156                {
4157                    push(@$groups,@$groupsC);
4158                }
4159                push(@$ids1,@$idsC);
4160            }
4161    
4162            if ($tree->[0])
4163            {
4164                push(@$groups,[$tree->[0],$ids1]);
4165            }
4166            push(@$ids,@$ids1);
4167        }
4168        elsif ($tree->[0])
4169        {
4170            push(@$ids,$tree->[0]);
4171        }
4172    
4173        return ($groups,$ids);
4174    }
4175    
4176  1  1

Legend:
Removed from v.1.35  
changed lines
  Added in v.1.36

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3