package FIG; use DBrtns; use Sim; use Blast; use FIG_Config; use tree_utilities; use IO::Socket; use FileHandle; use Carp; use Data::Dumper; use Time::Local; use strict; use Fcntl qw/:flock/; # import LOCK_* constants sub new { my($class) = @_; my $rdbH = new DBrtns; bless { _dbf => $rdbH, }, $class; } sub DESTROY { my($self) = @_; my($rdbH); if ($rdbH = $self->db_handle) { $rdbH->DESTROY; } } sub delete_genomes { my($self,$genomes) = @_; my $tmpD = "$FIG_Config::temp/tmp.deleted.$$"; my $tmp_Data = "$FIG_Config::temp/Data.$$"; my %to_del = map { $_ => 1 } @$genomes; open(TMP,">$tmpD") || die "could not open $tmpD"; my $genome; foreach $genome ($self->genomes) { if (! $to_del{$genome}) { print TMP "$genome\n"; } } close(TMP); &run("extract_genomes $tmpD $FIG_Config::data $tmp_Data"); &run("mv $FIG_Config::data $FIG_Config::data.deleted; mv $tmp_Data $FIG_Config::data; fig load_all; rm -rf $FIG_Config::data.deleted"); } sub add_genome { my($self,$genomeF) = @_; my $rc = 0; if (($genomeF =~ /((.*\/)?(\d+\.\d+))$/) && (! -d "$FIG_Config::organisms/$3")) { my $genome = $3; my @errors = `$FIG_Config::bin/verify_genome_directory $genomeF`; if (@errors == 0) { &run("cp -r $genomeF $FIG_Config::organisms; chmod -R 777 $FIG_Config::organisms/$genome"); chmod 0777, "$FIG_Config::organisms/$genomeF"; &run("index_contigs $genome"); &run("compute_genome_counts $genome"); &run("load_features $genome"); $rc = 1; if (-s "$FIG_Config::organisms/$genome/Features/peg/fasta") { &run("index_translations $genome"); my @tmp = `cut -f1 $FIG_Config::organisms/$genome/Features/peg/tbl`; chop @tmp; &run("cat $FIG_Config::organisms/$genome/Features/peg/fasta >> $FIG_Config::data/Global/nr"); &make_similarities(\@tmp); } if ((-s "$FIG_Config::organisms/$genome/assigned_functions") || (-d "$FIG_Config::organisms/$genome/UserModels")) { &run("add_assertions_of_function $genome"); } } } return $rc; } sub make_similarities { my($fids) = @_; my $fid; open(TMP,">>$FIG_Config::global/queued_similarities") || die "could not open $FIG_Config::global/queued_similarities"; foreach $fid (@$fids) { print TMP "$fid\n"; } close(TMP); } sub get_local_hostname { # # First check to see if we our hostname is correct. # # Map it to an IP address, and try to bind to that ip. # my $tcp = getprotobyname('tcp'); my $hostname = `hostname`; chop($hostname); my @hostent = gethostbyname($hostname); if (@hostent > 0) { my $sock; my $ip = $hostent[4]; socket($sock, PF_INET, SOCK_STREAM, $tcp); if (bind($sock, sockaddr_in(0, $ip))) { # # It worked. Reverse-map back to a hopefully fqdn. # my @rev = gethostbyaddr($ip, AF_INET); if (@rev > 0) { my $host = $rev[0]; # # Check to see if we have a FQDN. # if ($host =~ /\./) { # # Good. # return $host; } else { # # We didn't get a fqdn; bail and return the IP address. # return get_hostname_by_adapter() } } else { return inet_ntoa($ip); } } else { # # Our hostname must be wrong; we can't bind to the IP # address it maps to. # Return the name associated with the adapter. # return get_hostname_by_adapter() } } else { # # Our hostname isn't known to DNS. This isn't good. # Return the name associated with the adapter. # return get_hostname_by_adapter() } } sub get_hostname_by_adapter { # # Attempt to determine our local hostname based on the # network environment. # # This implementation reads the routing table for the default route. # We then look at the interface config for the interface that holds the default. # # # Linux routing table: # [olson@yips 0.0.0]$ netstat -rn # Kernel IP routing table # Destination Gateway Genmask Flags MSS Window irtt Iface # 140.221.34.32 0.0.0.0 255.255.255.224 U 0 0 0 eth0 # 169.254.0.0 0.0.0.0 255.255.0.0 U 0 0 0 eth0 # 127.0.0.0 0.0.0.0 255.0.0.0 U 0 0 0 lo # 0.0.0.0 140.221.34.61 0.0.0.0 UG 0 0 0 eth0 # # Mac routing table: # # bash-2.05a$ netstat -rn # Routing tables # # Internet: # Destination Gateway Flags Refs Use Netif Expire # default 140.221.11.253 UGSc 12 120 en0 # 127.0.0.1 127.0.0.1 UH 16 8415486 lo0 # 140.221.8/22 link#4 UCS 12 0 en0 # 140.221.8.78 0:6:5b:f:51:c4 UHLW 0 183 en0 408 # 140.221.8.191 0:3:93:84:ab:e8 UHLW 0 92 en0 622 # 140.221.8.198 0:e0:98:8e:36:e2 UHLW 0 5 en0 691 # 140.221.9.6 0:6:5b:f:51:d6 UHLW 1 63 en0 1197 # 140.221.10.135 0:d0:59:34:26:34 UHLW 2 2134 en0 1199 # 140.221.10.152 0:30:1b:b0:ec:dd UHLW 1 137 en0 1122 # 140.221.10.153 127.0.0.1 UHS 0 0 lo0 # 140.221.11.37 0:9:6b:53:4e:4b UHLW 1 624 en0 1136 # 140.221.11.103 0:30:48:22:59:e6 UHLW 3 973 en0 1016 # 140.221.11.224 0:a:95:6f:7:10 UHLW 1 1 en0 605 # 140.221.11.237 0:1:30:b8:80:c0 UHLW 0 0 en0 1158 # 140.221.11.250 0:1:30:3:1:0 UHLW 0 0 en0 1141 # 140.221.11.253 0:d0:3:e:70:a UHLW 13 0 en0 1199 # 169.254 link#4 UCS 0 0 en0 # # Internet6: # Destination Gateway Flags Netif Expire # UH lo0 # fe80::%lo0/64 Uc lo0 # link#1 UHL lo0 # fe80::%en0/64 link#4 UC en0 # 0:a:95:a8:26:68 UHL lo0 # ff01::/32 U lo0 # ff02::%lo0/32 UC lo0 # ff02::%en0/32 link#4 UC en0 my($fh); if (!open($fh, "netstat -rn |")) { warn "Cannot run netstat to determine local IP address\n"; return "localhost"; } my $interface_name; while (<$fh>) { my @cols = split(); if ($cols[0] eq "default" || $cols[0] eq "0.0.0.0") { $interface_name = $cols[$#cols]; } } close($fh); # print "Default route on $interface_name\n"; # # Find ifconfig. # my $ifconfig; for my $dir ((split(":", $ENV{PATH}), "/sbin", "/usr/sbin")) { if (-x "$dir/ifconfig") { $ifconfig = "$dir/ifconfig"; last; } } if ($ifconfig eq "") { warn "Ifconfig not found\n"; return "localhost"; } # print "Foudn $ifconfig\n"; if (!open($fh, "$ifconfig $interface_name |")) { warn "Could not run $ifconfig: $!\n"; return "localhost"; } my $ip; while (<$fh>) { # # Mac: # inet 140.221.10.153 netmask 0xfffffc00 broadcast 140.221.11.255 # Linux: # inet addr:140.221.34.37 Bcast:140.221.34.63 Mask:255.255.255.224 # chomp; s/^\s*//; # print "Have '$_'\n"; if (/inet\s+addr:(\d+\.\d+\.\d+\.\d+)\s+/) { # # Linux hit. # $ip = $1; # print "Got linux $ip\n"; last; } elsif (/inet\s+(\d+\.\d+\.\d+\.\d+)\s+/) { # # Mac hit. # $ip = $1; # print "Got mac $ip\n"; last; } } close($fh); if ($ip eq "") { warn "Didn't find an IP\n"; return "localhost"; } return $ip; } sub cgi_url { return &plug_url($FIG_Config::cgi_url); } sub temp_url { return &plug_url($FIG_Config::temp_url); } sub plug_url { my($url) = @_; my $name = &get_local_hostname; if ($name && ($url =~ /^http:\/\/[^\/]+(.*)/)) { $url = "http://$name$1"; } return $url; } =pod =head1 hiding/caching in a FIG object We save the DB handle, cache taxonomies, and put a few other odds and ends in the FIG object. We expect users to invoke these services using the object $fig constructed using: use FIG; my $fig = new FIG; $fig is then used as the basic mechanism for accessing FIG services. It is, of course, just a hash that is used to retain/cache data. The most commonly accessed item is the DB filehandle, which is accessed via $self->db_handle. We cache genus/species expansions, taxonomies, distances (very crudely estimated) estimated between genomes, and a variety of other things. I am not sure that using cached/2 was a good idea, but I did it. =cut sub db_handle { my($self) = @_; return $self->{_dbf}; } sub cached { my($self,$what) = @_; my $x = $self->{$what}; if (! $x) { $x = $self->{$what} = {}; } return $x; } ################ Basic Routines [ existed since WIT ] ########################## =pod =head1 min usage: $n = &FIG::min(@x) Assumes @x contains numeric values. Returns the minimum of the values. =cut sub min { my(@x) = @_; my($min,$i); (@x > 0) || return undef; $min = $x[0]; for ($i=1; ($i < @x); $i++) { $min = ($min > $x[$i]) ? $x[$i] : $min; } return $min; } =pod =head1 max usage: $n = &FIG::max(@x) Assumes @x contains numeric values. Returns the maximum of the values. =cut sub max { my(@x) = @_; my($max,$i); (@x > 0) || return undef; $max = $x[0]; for ($i=1; ($i < @x); $i++) { $max = ($max < $x[$i]) ? $x[$i] : $max; } return $max; } =pod =head1 between usage: &FIG::between($x,$y,$z) Returns true iff $y is between $x and $z. =cut sub between { my($x,$y,$z) = @_; if ($x < $z) { return (($x <= $y) && ($y <= $z)); } else { return (($x >= $y) && ($y >= $z)); } } =pod =head1 standard_genetic_code usage: $code = &FIG::standard_genetic_code() Routines like "translate" can take a "genetic code" as an argument. I implemented such codes using hashes that assumed uppercase DNA triplets as keys. =cut sub standard_genetic_code { my $code = {}; $code->{"AAA"} = "K"; $code->{"AAC"} = "N"; $code->{"AAG"} = "K"; $code->{"AAT"} = "N"; $code->{"ACA"} = "T"; $code->{"ACC"} = "T"; $code->{"ACG"} = "T"; $code->{"ACT"} = "T"; $code->{"AGA"} = "R"; $code->{"AGC"} = "S"; $code->{"AGG"} = "R"; $code->{"AGT"} = "S"; $code->{"ATA"} = "I"; $code->{"ATC"} = "I"; $code->{"ATG"} = "M"; $code->{"ATT"} = "I"; $code->{"CAA"} = "Q"; $code->{"CAC"} = "H"; $code->{"CAG"} = "Q"; $code->{"CAT"} = "H"; $code->{"CCA"} = "P"; $code->{"CCC"} = "P"; $code->{"CCG"} = "P"; $code->{"CCT"} = "P"; $code->{"CGA"} = "R"; $code->{"CGC"} = "R"; $code->{"CGG"} = "R"; $code->{"CGT"} = "R"; $code->{"CTA"} = "L"; $code->{"CTC"} = "L"; $code->{"CTG"} = "L"; $code->{"CTT"} = "L"; $code->{"GAA"} = "E"; $code->{"GAC"} = "D"; $code->{"GAG"} = "E"; $code->{"GAT"} = "D"; $code->{"GCA"} = "A"; $code->{"GCC"} = "A"; $code->{"GCG"} = "A"; $code->{"GCT"} = "A"; $code->{"GGA"} = "G"; $code->{"GGC"} = "G"; $code->{"GGG"} = "G"; $code->{"GGT"} = "G"; $code->{"GTA"} = "V"; $code->{"GTC"} = "V"; $code->{"GTG"} = "V"; $code->{"GTT"} = "V"; $code->{"TAA"} = "*"; $code->{"TAC"} = "Y"; $code->{"TAG"} = "*"; $code->{"TAT"} = "Y"; $code->{"TCA"} = "S"; $code->{"TCC"} = "S"; $code->{"TCG"} = "S"; $code->{"TCT"} = "S"; $code->{"TGA"} = "*"; $code->{"TGC"} = "C"; $code->{"TGG"} = "W"; $code->{"TGT"} = "C"; $code->{"TTA"} = "L"; $code->{"TTC"} = "F"; $code->{"TTG"} = "L"; $code->{"TTT"} = "F"; return $code; } =pod =head1 translate usage: $aa_seq = &FIG::translate($dna_seq,$code,$fix_start); If $code is undefined, I use the standard genetic code. If $fix_start is true, I will translate initial TTG or GTG to 'M'. =cut sub translate { my( $dna,$code,$start) = @_; my( $i,$j,$ln ); my( $x,$y ); my( $prot ); if (! defined($code)) { $code = &FIG::standard_genetic_code; } $ln = length($dna); $prot = "X" x ($ln/3); $dna =~ tr/a-z/A-Z/; for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) { $x = substr($dna,$i,3); if ($y = $code->{$x}) { substr($prot,$j,1) = $y; } } if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) { substr($prot,0,1) = 'M'; } return $prot; } =pod =head1 reverse_comp and rev_comp usage: $dnaR = &FIG::reverse_comp($dna) or $dnaRP = &FIG::rev_comp($seqP) In WIT, we implemented reverse complement passing a pointer to a sequence and returning a pointer to a sequence. In most cases the pointers are a pain (although in a few they are just what is needed). Hence, I kept both versions of the function to allow you to use whichever you like. Use rev_comp only for long strings where passing pointers is a reasonable effeciency issue. =cut sub reverse_comp { my($seq) = @_; return ${&rev_comp(\$seq)}; } sub rev_comp { my( $seqP ) = @_; my( $rev ); $rev = reverse( $$seqP ); $rev =~ tr/a-z/A-Z/; $rev =~ tr/ACGTUMRWSYKBDHV/TGCAAKYWSRMVHDB/; return \$rev; } =pod =head1 verify_dir usage: &FIG::verify_dir($dir) Makes sure that $dir exists. If it has to create it, it sets permissions to 0777. =cut sub verify_dir { my($dir) = @_; if (-d $dir) { return } if ($dir =~ /^(.*)\/[^\/]+$/) { &verify_dir($1); } mkdir($dir,0777) || die "could not make $dir"; chmod 0777,$dir; } =pod =head1 run usage: &FIG::run($cmd) Runs $cmd and fails (with trace) if the command fails. =cut sub run { my($cmd) = @_; # my @tmp = `date`; chop @tmp; print STDERR "$tmp[0]: running $cmd\n"; (system($cmd) == 0) || confess "FAILED: $cmd"; } =pod =head1 display_id_and_seq usage: &FIG::display_id_and_seq($id_and_comment,$seqP,$fh) This command has always been used to put out fasta sequences. Note that it takes a pointer to the sequence. $fh is optional and defalts to STDOUT. =cut sub display_id_and_seq { my( $id, $seq, $fh ) = @_; if (! defined($fh) ) { $fh = \*STDOUT; } print $fh ">$id\n"; &display_seq($seq, $fh); } sub display_seq { my ( $seq, $fh ) = @_; my ( $i, $n, $ln ); if (! defined($fh) ) { $fh = \*STDOUT; } $n = length($$seq); # confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) ); for ($i=0; ($i < $n); $i += 60) { if (($i + 60) <= $n) { $ln = substr($$seq,$i,60); } else { $ln = substr($$seq,$i,($n-$i)); } print $fh "$ln\n"; } } ########## I commented the pods on the following routines out, since they should not ########## be part of the SOAP/WSTL interface #=pod # #=head1 file2N # #usage: $n = $fig->file2N($file) # #In some of the databases I need to store filenames, which can waste a lot of #space. Hence, I maintain a database for converting filenames to/from integers. # #=cut # sub file2N { my($self,$file) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT fileno FROM file_table WHERE ( file = \'$file\')")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } elsif (($relational_db_response = $rdbH->SQL("SELECT MAX(fileno) FROM file_table ")) && (@$relational_db_response == 1) && ($relational_db_response->[0]->[0])) { my $fileno = $relational_db_response->[0]->[0] + 1; if ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', $fileno )")) { return $fileno; } } elsif ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', 1 )")) { return 1; } return undef; } #=pod # #=head1 N2file # #usage: $filename = $fig->N2file($n) # #In some of the databases I need to store filenames, which can waste a lot of #space. Hence, I maintain a database for converting filenames to/from integers. # #=cut # sub N2file { my($self,$fileno) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT file FROM file_table WHERE ( fileno = $fileno )")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } return undef; } #=pod # #=head1 openF # #usage: $fig->openF($filename) # #Parts of the system rely on accessing numerous different files. The most obvious case is #the situation with similarities. It is important that the system be able to run in cases in #which an arbitrary number of files cannot be open simultaneously. This routine (with closeF) is #a hack to handle this. I should probably just pitch them and insist that the OS handle several #hundred open filehandles. # #=cut # sub openF { my($self,$file) = @_; my($fxs,$x,@fxs,$fh); $fxs = $self->cached('_openF'); if ($x = $fxs->{$file}) { $x->[1] = time(); return $x->[0]; } @fxs = keys(%$fxs); if (defined($fh = new FileHandle "<$file")) { if (@fxs >= 200) { @fxs = sort { $fxs->{$a}->[1] <=> $fxs->{$b}->[1] } @fxs; $x = $fxs->{$fxs[0]}; undef $x->[0]; delete $fxs->{$fxs[0]}; } $fxs->{$file} = [$fh,time()]; return $fh; } return undef; } #=pod # #=head1 closeF # #usage: $fig->closeF($filename) # #Parts of the system rely on accessing numerous different files. The most obvious case is #the situation with similarities. It is important that the system be able to run in cases in #which an arbitrary number of files cannot be open simultaneously. This routine (with openF) is #a hack to handle this. I should probably just pitch them and insist that the OS handle several #hundred open filehandles. # #=cut # sub closeF { my($self,$file) = @_; my($fxs,$x); if (($fxs = $self->{_openF}) && ($x = $fxs->{$file})) { undef $x->[0]; delete $fxs->{$file}; } } =pod =head1 ec_name usage: $enzymatic_function = $fig->ec_name($ec) Returns enzymatic name for EC. =cut sub ec_name { my($self,$ec) = @_; ($ec =~ /^\d+\.\d+\.\d+\.\d+$/) || return ""; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT name FROM ec_names WHERE ( ec = \'$ec\' )"); return (@$relational_db_response == 1) ? $relational_db_response->[0]->[0] : ""; return ""; } =pod =head1 all_roles usage: @roles = $fig->all_roles Supposed to return all known roles. For now, we ghet all ECs with "names". =cut sub all_roles { my($self) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT ec,name FROM ec_names"); return @$relational_db_response; } =pod =head1 expand_ec usage: $expanded_ec = $fig->expand_ec($ec) Expands "1.1.1.1" to "1.1.1.1 - alcohol dehydrogenase" or something like that. =cut sub expand_ec { my($self,$ec) = @_; my($name); return ($name = $self->ec_name($ec)) ? "$ec - $name" : $ec; } =pod =head1 clean_tmp usage: &FIG::clean_tmp We store temporary files in $FIG_Config::temp. There are specific classes of files that are created and should be saved for at least a few days. This routine can be invoked to clean out those that are over two days old. =cut sub clean_tmp { my($file); if (opendir(TMP,"$FIG_Config::temp")) { # change the pattern to pick up other files that need to be cleaned up my @temp = grep { $_ =~ /^(Geno|tmp)/ } readdir(TMP); foreach $file (@temp) { if (-M "$FIG_Config::temp/$file" > 2) { unlink("$FIG_Config::temp/$file"); } } } } ################ Routines to process genomes and genome IDs ########################## =pod =head1 genomes usage: @genome_ids = $fig->genomes; Genomes are assigned ids of the form X.Y where X is the taxonomic id maintained by NCBI for the species (not the specific strain), and Y is a sequence digit assigned to this particular genome (as one of a set with the same genus/species). Genomes also have versions, but that is a separate issue. =cut sub genomes { my($self,$complete,$restrictions) = @_; my $rdbH = $self->db_handle; my @where = (); if ($complete) { push(@where,"( complete = \'1\' )") } if ($restrictions) { push(@where,"( restrictions = \'1\' )") } my $relational_db_response; if (@where > 0) { my $where = join(" AND ",@where); $relational_db_response = $rdbH->SQL("SELECT genome FROM genome where $where"); } else { $relational_db_response = $rdbH->SQL("SELECT genome FROM genome"); } my @genomes = sort { $a <=> $b } map { $_->[0] } @$relational_db_response; return @genomes; } sub genome_counts { my($self,$complete) = @_; my($x,$relational_db_response); my $rdbH = $self->db_handle; if ($complete) { $relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome where complete = '1'"); } else { $relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome"); } my ($a,$b,$e,$v) = (0,0,0,0); if (@$relational_db_response > 0) { foreach $x (@$relational_db_response) { if ($x->[1] =~ /^a/i) { $a++ } elsif ($x->[1] =~ /^b/i) { $b++ } elsif ($x->[1] =~ /^e/i) { $e++ } elsif ($x->[1] =~ /^v/i) { $v++ } } } return ($a,$b,$e,$v); } =pod =head1 genome_version usage: $version = $fig->genome_version($genome_id); Versions are incremented for major updates. They are put in as major updates of the form 1.0, 2.0, ... Users may do local "editing" of the DNA for a genome, but when they do, they increment the digits to the right of the decimal. Two genomes remain comparable only if the versions match identically. Hence, minor updating should be committed only by the person/group responsible for updating that genome. We can, of course, identify which genes are identical between any two genomes (by matching the DNA or amino acid sequences). However, the basic intent of the system is to support editing by the main group issuing periodic major updates. =cut sub genome_version { my($self,$genome) = @_; my(@tmp); if ((-s "$FIG_Config::organisms/$genome/VERSION") && (@tmp = `cat $FIG_Config::organisms/$genome/VERSION`) && ($tmp[0] =~ /^(\d+(\.\d+)?)$/)) { return $1; } return undef; } =pod =head1 genus_species usage: $gs = $fig->genus_species($genome_id) Returns the genus and species (and strain if that has been properly recorded) in a printable form. =cut sub genus_species { my ($self,$genome) = @_; my $ans; my $genus_species = $self->cached('_genus_species'); if (! ($ans = $genus_species->{$genome})) { my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT genome,gname FROM genome"); my $pair; foreach $pair (@$relational_db_response) { $genus_species->{$pair->[0]} = $pair->[1]; } $ans = $genus_species->{$genome}; } return $ans; } =pod =head1 org_of usage: $org = $fig->org_of($prot_id) In the case of external proteins, we can usually determine an organism, but not anything more precise than genus/species (and often not that). This routine takes a protein ID (which may be a feature ID) and returns "the organism". =cut sub org_of { my($self,$prot_id) = @_; my $relational_db_response; my $rdbH = $self->db_handle; if ($prot_id =~ /^fig\|/) { return $self->genus_species($self->genome_of($prot_id)); } if (($relational_db_response = $rdbH->SQL("SELECT org FROM external_orgs WHERE ( prot = \'$prot_id\' )")) && (@$relational_db_response >= 1)) { return $relational_db_response->[0]->[0]; } return ""; } =pod =head1 abbrev usage: $abbreviated_name = $fig->abbrev($genome_name) For alignments and such, it is very useful to be able to produce an abbreviation of genus/species. That's what this does. Note that multiple genus/species might reduce to the same abbreviation, so be careful (disambiguate them, if you must). =cut sub abbrev { my($genome_name) = @_; $genome_name =~ s/^(\S{3})\S+/$1./; $genome_name =~ s/^(\S+\s+\S{4})\S+/$1./; if (length($genome_name) > 13) { $genome_name = substr($genome_name,0,13); } return $genome_name; } ################ Routines to process Features and Feature IDs ########################## =pod =head1 ftype usage: $type = &FIG::ftype($fid) Returns the type of a feature, given the feature ID. This just amounts to lifting it out of the feature ID, since features have IDs of tghe form fig|x.y.f.n where x.y is the genome ID f is the type pf feature n is an integer that is unique within the genome/type =cut sub ftype { my($feature_id) = @_; if ($feature_id =~ /^fig\|\d+\.\d+\.([^\.]+)/) { return $1; } return undef; } =pod =head1 genome_of usage: $genome_id = $fig->genome_of($fid) This just extracts the genome ID from a feature ID. =cut sub genome_of { my $prot_id = (@_ == 1) ? $_[0] : $_[1]; if ($prot_id =~ /^fig\|(\d+\.\d+)/) { return $1; } return undef; } =pod =head1 by_fig_id usage: @sorted_by_fig_id = sort { &FIG::by_fig_id($a,$b) } @fig_ids This is a bit of a clutzy way to sort a list of FIG feature IDs, but it works. =cut sub by_fig_id { my($a,$b) = @_; my($g1,$g2,$t1,$t2,$n1,$n2); if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) && ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) { ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2); } else { $a cmp $b; } } =pod =head1 genes_in_region usage: ($features_in_region,$beg1,$end1) = $fig->genes_in_region($genome,$contig,$beg,$end) It is often important to be able to find the genes that occur in a specific region on a chromosome. This routine is designed to provide this information. It returns all genes that overlap the region ($genome,$contig,$beg,$end). $beg1 is set to the minimum coordinate of the returned genes (which may be before the given region), and $end1 the maximum coordinate. The routine assumes that genes are not more than 10000 bases long, which is certainly not true in eukaryotes. Hence, in euks you may well miss genes that overlap the boundaries of the specified region (sorry). =cut sub genes_in_region { my($self,$genome,$contig,$beg,$end) = @_; my($x,$relational_db_response,$feature_id,$b1,$e1,@feat,@tmp,$l,$u); my $pad = 10000; my $rdbH = $self->db_handle; my $minV = $beg - $pad; my $maxV = $end + $pad; if (($relational_db_response = $rdbH->SQL("SELECT id FROM features WHERE ( minloc > $minV ) AND ( minloc < $maxV ) AND (maxloc < $maxV) AND ( genome = \'$genome\' ) AND ( contig = \'$contig\' );")) && (@$relational_db_response >= 1)) { @tmp = sort { ($a->[1] cmp $b->[1]) or ($a->[2] <=> $b->[2]) or ($a->[3] <=> $b->[3]) } map { $feature_id = $_->[0]; $x = $self->feature_location($feature_id); $x ? [$feature_id,&boundaries_of($x)] : () } @$relational_db_response; ($l,$u) = (10000000000,0); foreach $x (@tmp) { ($feature_id,undef,$b1,$e1) = @$x; if (&between($beg,&min($b1,$e1),$end) || &between(&min($b1,$e1),$beg,&max($b1,$e1))) { push(@feat,$feature_id); $l = &min($l,&min($b1,$e1)); $u = &max($u,&max($b1,$e1)); } } (@feat <= 0) || return ([@feat],$l,$u); } return ([],$l,$u); } sub close_genes { my($self,$fid,$dist) = @_; my $loc = $self->feature_location($fid); if ($loc) { my($contig,$beg,$end) = &FIG::boundaries_of($loc); if ($contig && $beg && $end) { my $min = &min($beg,$end) - $dist; my $max = &max($beg,$end) + $dist; my $feat; ($feat,undef,undef) = $self->genes_in_region(&FIG::genome_of($fid),$contig,$min,$max); return @$feat; } } return (); } =pod =head1 feature_location usage: $loc = $fig->feature_location($fid) OR @loc = $fig->feature_location($fid) The location of a feature in a scalar context is contig_b1_e1,contig_b2_e2,... [one contig_b_e for each exon] In a list context it is (contig_b1_e1,contig_b2_e2,...) =cut sub feature_location { my($self,$feature_id) = @_; my($relational_db_response,$locations,$location); $locations = $self->cached('_location'); if (! ($location = $locations->{$feature_id})) { my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT location FROM features WHERE ( id = \'$feature_id\' )")) && (@$relational_db_response == 1)) { $locations->{$feature_id} = $location = $relational_db_response->[0]->[0]; } } if ($location) { return wantarray() ? split(/,/,$location) : $location; } return undef; } =pod =head1 boundaries_of usage: ($contig,$beg,$end) = $fig->boundaries_of($loc) The location of a feature in a scalar context is contig_b1_e1,contig_b2_e2,... [one contig_b_e for each exon] This routine takes as input such a location and reduces it to a single description of the entire region containing the gene. =cut sub boundaries_of { my($location) = (@_ == 1) ? $_[0] : $_[1]; my($contigQ); if (defined($location)) { my @exons = split(/,/,$location); my($contig,$beg,$end); if (($exons[0] =~ /^(\S+)_(\d+)_\d+$/) && (($contig,$beg) = ($1,$2)) && ($contigQ = quotemeta $contig) && ($exons[$#exons] =~ /^$contigQ\_\d+_(\d+)$/) && ($end = $1)) { return ($contig,$beg,$end); } } return undef; } =pod =head1 all_features usage: $fig->all_features($genome,$type) Returns a list of all feature IDs of a specified type in the designated genome. You would usually use just $fig->pegs_of($genome) or $fig->rnas_of($genome) which simply invoke this routine. =cut sub all_features { my($self,$genome,$type) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT id FROM features WHERE (genome = \'$genome\' AND (type = \'$type\'))"); if (@$relational_db_response > 0) { return map { $_->[0] } @$relational_db_response; } return (); } =pod =head1 all_pegs_of usage: $fig->all_pegs_of($genome) Returns a list of all PEGs in the specified genome. Note that order is not specified. =cut sub pegs_of { my($self,$genome) = @_; return $self->all_features($genome,"peg"); } =pod =head1 all_rnas_of usage: $fig->all_rnas($genome) Returns a list of all RNAs for the given genome. =cut sub rnas_of { my($self,$genome) = @_; return $self->all_features($genome,"rna"); } =pod =head1 feature_aliases usage: @aliases = $fig->feature_aliases($fid) OR $aliases = $fig->feature_aliases($fid) Returns a list of aliases (gene IDs, arbitrary numbers assigned by authors, etc.) for the feature. These must come from the tbl files, so add them there if you want to see them here. In a scalar context, the aliases come back with commas separating them. =cut sub feature_aliases { my($self,$feature_id) = @_; my($rdbH,$relational_db_response,$aliases); $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT aliases FROM features WHERE ( id = \'$feature_id\' )")) && (@$relational_db_response == 1)) { $aliases = $relational_db_response->[0]->[0]; } return $aliases ? (wantarray ? split(/,/,$aliases) : $aliases) : undef; } =pod =head1 by_alias usage: $peg = $fig->by_alias($alias) Returns a FIG id if the alias can be converted. Right now we convert aliases of the form NP_* (RefSeq IDs) or gi|* (GenBank IDs) =cut sub by_alias { my($self,$alias) = @_; my($rdbH,$relational_db_response,$peg); $peg = ""; $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT id FROM ext_alias WHERE ( alias = \'$alias\' )")) && (@$relational_db_response == 1)) { $peg = $relational_db_response->[0]->[0]; } return $peg; } =pod =head1 possibly_truncated usage: $fig->possibly_truncated($fid) Returns true iff the feature occurs near the end of a contig. =cut sub possibly_truncated { my($self,$feature_id) = @_; my($loc); if ($loc = $self->feature_location($feature_id)) { my $genome = &genome_of($feature_id); my ($contig,$beg,$end) = &boundaries_of($loc); if ((! $self->near_end($genome,$contig,$beg)) && (! $self->near_end($genome,$contig,$end))) { return 0; } } return 1; } sub near_end { my($self,$genome,$contig,$x) = @_; return (($x < 300) || ($x > ($self->contig_ln($genome,$contig) - 300))); } sub is_real_feature { my($self,$fid) = @_; my($relational_db_response); my $rdbH = $self->db_handle; return (($relational_db_response = $rdbH->SQL("SELECT id FROM features WHERE ( id = \'$fid\' )")) && (@$relational_db_response == 1)); } ################ Routines to process functional coupling for PEGs ########################## =pod =head1 coupling_and_evidence usage: @coupling_data = $fig->coupling_and_evidence($fid,$bound,$sim_cutoff,$coupling_cutoff,$keep_record) A computation of couplings and evidence starts with a given peg and produces a list of 3-tuples. Each 3-tuple is of the form [Score,CoupledToFID,Evidence] Evidence is a list of 2-tuples of FIDs that are close in other genomes (producing a "pair of close homologs" of [$peg,CoupledToFID]). The maximum score for a single PCH is 1, but "Score" is the sum of the scores for the entire set of PCHs. If $keep_record is true, the system records the information, asserting coupling for each of the pairs in the set of evidence, and asserting a pin from the given $fd through all of the PCH entries used in forming the score. =cut sub coupling_and_evidence { my($self,$feature_id,$bound,$sim_cutoff,$coupling_cutoff,$keep_record) = @_; my($neighbors,$neigh,$similar1,$similar2,@hits,$sc,$ev,$genome1); if ($feature_id =~ /^fig\|(\d+\.\d+)/) { $genome1 = $1; } my($contig,$beg,$end) = &FIG::boundaries_of($self->feature_location($feature_id)); if (! $contig) { return () } ($neighbors,undef,undef) = $self->genes_in_region(&genome_of($feature_id), $contig, &min($beg,$end) - $bound, &max($beg,$end) + $bound); if (@$neighbors == 0) { return () } $similar1 = $self->acceptably_close($feature_id,$sim_cutoff); @hits = (); foreach $neigh (grep { $_ =~ /peg/ } @$neighbors) { next if ($neigh eq $feature_id); $similar2 = $self->acceptably_close($neigh,$sim_cutoff); ($sc,$ev) = $self->coupling_ev($genome1,$similar1,$similar2,$bound); if ($sc >= $coupling_cutoff) { push(@hits,[$sc,$neigh,$ev]); } } if ($keep_record) { $self->add_chr_clusters_and_pins($feature_id,\@hits); } return sort { $b->[0] <=> $a->[0] } @hits; } sub fast_coupling { my($self,$peg,$bound,$coupling_cutoff) = @_; my($genome,$genome1,$genome2,$peg1,$peg2,$peg3,%maps,$loc,$loc1,$loc2,$loc3); my($pairs,$sc,%ev); my @ans = (); $genome = &genome_of($peg); foreach $peg1 ($self->in_pch_pin_with($peg)) { $peg1 =~ s/,.*$//; if ($peg ne $peg1) { $genome1 = &genome_of($peg1); $maps{$peg}->{$genome1} = $peg1; } } $loc = [&boundaries_of(scalar $self->feature_location($peg))]; foreach $peg1 ($self->in_cluster_with($peg)) { if ($peg ne $peg1) { # print STDERR "peg1=$peg1\n"; $loc1 = [&boundaries_of(scalar $self->feature_location($peg1))]; if (&close_enough($loc,$loc1,$bound)) { foreach $peg2 ($self->in_pch_pin_with($peg1)) { $genome2 = &genome_of($peg2); if (($peg3 = $maps{$peg}->{$genome2}) && ($peg2 ne $peg3)) { $loc2 = [&boundaries_of(scalar $self->feature_location($peg2))]; $loc3 = [&boundaries_of(scalar $self->feature_location($peg3))]; if (&close_enough($loc2,$loc3,$bound)) { push(@{$ev{$peg1}},[$peg3,$peg2]); } } } } } } foreach $peg1 (keys(%ev)) { $pairs = $ev{$peg1}; $sc = $self->score([$genome,map { $self->genome_of($_->[0]) } @$pairs]); if ($sc >= $coupling_cutoff) { push(@ans,[$sc,$peg1]); } } return sort { $b->[0] <=> $a->[0] } @ans; } sub score { my($self,$genomes) = @_; my($min,$i,$j,$d,%seen,@reduced,$genome); foreach $genome (@$genomes) { $genome =~ /^(\d+)/; if (! $seen{$1}) { push(@reduced,$genome); $seen{$1} = 1; } } $i=1; $d = 0; for ($j=1; ($j < @reduced); $j++) { $d += $self->crude_estimate_of_distance($reduced[$i],$reduced[$j]); } return $d; } =pod =head1 add_chr_clusters_and_pins usage: $fig->add_chr_clusters_and_pins($peg,$hits) The system supports retaining data relating to functional coupling. If a user computes evidence once and then saves it with this routine, data relating to both "the pin" and the "clusters" (in all of the organisms supporting the functional coupling) will be saved. $hits must be a pointer to a list of 3-tuples of the sort returned by $fig->coupling_and_evidence. =cut sub add_chr_clusters_and_pins { my($self,$peg,$hits) = @_; my(@clusters,@pins,$x,$sc,$neigh,$pairs,$y,@corr,@orgs,%projection); my($genome,$cluster,$pin,$peg2); if (@$hits > 0) { @clusters = (); @pins = (); push(@clusters,[$peg,map { $_->[1] } @$hits]); foreach $x (@$hits) { ($sc,$neigh,$pairs) = @$x; push(@pins,[$neigh,map { $_->[1] } @$pairs]); foreach $y (@$pairs) { $peg2 = $y->[0]; if ($peg2 =~ /^fig\|(\d+\.\d+)/) { $projection{$1}->{$peg2} = 1; } } } @corr = (); @orgs = keys(%projection); if (@orgs > 0) { foreach $genome (sort { $a <=> $b } @orgs) { push(@corr,sort { &FIG::by_fig_id($a,$b) } keys(%{$projection{$genome}})); } push(@pins,[$peg,@corr]); } foreach $cluster (@clusters) { $self->add_chromosomal_cluster($cluster); } foreach $pin (@pins) { $self->add_pch_pin($pin); } } } sub coupling_ev { my($self,$genome1,$sim1,$sim2,$bound) = @_; my($ev,$sc,$i,$j); $ev = []; $sc = 0; $i = 0; $j = 0; while (($i < @$sim1) && ($j < @$sim2)) { if ($sim1->[$i]->[0] < $sim2->[$j]->[0]) { $i++; } elsif ($sim1->[$i]->[0] > $sim2->[$j]->[0]) { $j++; } else { $sc += $self->accumulate_ev($genome1,$sim1->[$i]->[1],$sim2->[$j]->[1],$bound,$ev); $i++; $j++; } } return ($sc,$ev); } sub accumulate_ev { my($self,$genome1,$feature_ids1,$feature_ids2,$bound,$ev) = @_; my($genome2,@locs1,@locs2,$i,$j,$sc,$x); if ((@$feature_ids1 == 0) || (@$feature_ids2 == 0)) { return 0 } $feature_ids1->[0] =~ /^fig\|(\d+\.\d+)/; $genome2 = $1; $sc = 0; @locs1 = map { $x = $self->feature_location($_); $x ? [&boundaries_of($x)] : () } @$feature_ids1; @locs2 = map { $x = $self->feature_location($_); $x ? [&boundaries_of($x)] : () } @$feature_ids2; for ($i=0; ($i < @$feature_ids1); $i++) { for ($j=0; ($j < @$feature_ids2); $j++) { if (($feature_ids1->[$i] ne $feature_ids2->[$j]) && &close_enough($locs1[$i],$locs2[$j],$bound)) { $sc += $self->crude_estimate_of_distance($genome1,$genome2); push(@$ev,[$feature_ids1->[$i],$feature_ids2->[$j]]); } } } return $sc; } sub close_enough { my($locs1,$locs2,$bound) = @_; # print STDERR &Dumper(["close enough",$locs1,$locs2]); return (($locs1->[0] eq $locs2->[0]) && (abs((($locs1->[1]+$locs1->[2])/2) - (($locs2->[1]+$locs2->[2])/2)) <= $bound)); } sub acceptably_close { my($self,$feature_id,$sim_cutoff) = @_; my(%by_org,$id2,$genome,$sim); my($ans) = []; foreach $sim ($self->sims($feature_id,1000,$sim_cutoff,"fig")) { $id2 = $sim->id2; if ($id2 =~ /^fig\|(\d+\.\d+)/) { my $genome = $1; if ($self->taxonomy_of($genome) !~ /^Euk/) { push(@{$by_org{$genome}},$id2); } } } foreach $genome (sort { $a <=> $b } keys(%by_org)) { push(@$ans,[$genome,$by_org{$genome}]); } return $ans; } ################ Translations of PEGsand External Protein Sequences ########################## =pod =head1 translatable usage: $fig->translatable($prot_id) The system takes any number of sources of protein sequences as input (and builds an nr for the purpose of computing similarities). For each of these input fasta files, it saves (in the DB) a filename, seek address and length so that it can go get the translation if needed. This routine simply returns true iff info on the translation exists. =cut sub translatable { my($self,$prot) = @_; return &translation_length($self,$prot) ? 1 : 0; } =pod =head1 translation_length usage: $len = $fig->translation_length($prot_id) The system takes any number of sources of protein sequences as input (and builds an nr for the purpose of computing similarities). For each of these input fasta files, it saves (in the DB) a filename, seek address and length so that it can go get the translation if needed. This routine returns the length of a translation. This does not require actually retrieving the translation. =cut sub translation_length { my($self,$prot) = @_; $prot =~ s/^([^\|]+\|[^\|]+)\|.*$/$1/; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT slen FROM protein_sequence_seeks WHERE id = \'$prot\' "); return (@$relational_db_response == 1) ? $relational_db_response->[0]->[0] : undef; } =pod =head1 get_translation usage: $translation = $fig->get_translation($prot_id) The system takes any number of sources of protein sequences as input (and builds an nr for the purpose of computing similarities). For each of these input fasta files, it saves (in the DB) a filename, seek address and length so that it can go get the translation if needed. This routine returns a protein sequence. =cut sub get_translation { my($self,$id) = @_; my($rdbH,$relational_db_response,$fileN,$file,$fh,$seek,$ln,$tran); $rdbH = $self->db_handle; $id =~ s/^([^\|]+\|[^\|]+)\|.*$/$1/; $relational_db_response = $rdbH->SQL("SELECT fileno, seek, len FROM protein_sequence_seeks WHERE id = \'$id\' "); if ($relational_db_response && @$relational_db_response == 1) { ($fileN,$seek,$ln) = @{$relational_db_response->[0]}; if (($fh = $self->openF($self->N2file($fileN))) && ($ln > 10)) { seek($fh,$seek,0); read($fh,$tran,$ln-1); $tran =~ s/\s//g; return $tran; } } return ''; } =pod =head1 mapped_prot_ids usage: @mapped = $fig->mapped_prot_ids($prot) This routine is at the heart of maintaining synonyms for protein sequences. The system determines which protein sequences are "essentially the same". These may differ in length (presumably due to miscalled starts), but the tails are identical (and the heads are not "too" extended). Anyway, the set of synonyms is returned as a list of 2-tuples [Id,length] sorted by length. =cut sub mapped_prot_ids { my($self,$id) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT maps_to FROM peg_synonyms WHERE syn_id = \'$id\' "); if ($relational_db_response && (@$relational_db_response == 1)) { $id = $relational_db_response->[0]->[0]; } $relational_db_response = $rdbH->SQL("SELECT syn_id,syn_ln,maps_to_ln FROM peg_synonyms WHERE maps_to = \'$id\' "); if ($relational_db_response && (@$relational_db_response > 0)) { return ([$id,$relational_db_response->[0]->[2]],map { [$_->[0],$_->[1]] } @$relational_db_response); } else { return ([$id,$self->translation_length($id)]); } } sub maps_to_id { my($self,$id) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT maps_to FROM peg_synonyms WHERE syn_id = \'$id\' "); return ($relational_db_response && (@$relational_db_response == 1)) ? $relational_db_response->[0]->[0] : $id; } ################ Assignments of Function to PEGs ########################## =pod =head1 function_of usage: @functions = $fig->function_of($peg) OR $function = $fig->function_of($peg,$user) In a list context, you get back a list of 2-tuples. Each 2-tuple is of the form [MadeBy,Function]. In a scalar context, 1. user is "master" if not specified 2. function returned is the user's, if one exists; otherwise, master's, if one exists In a scalar context, you get just the function. =cut # Note that we do not return confidence. I propose a separate function to get both # function and confidence # sub function_of { my($self,$id,$user) = @_; my($relational_db_response,@tmp,$entry,$i); my $wantarray = wantarray(); my $rdbH = $self->db_handle; if (($id =~ /^fig\|(\d+\.\d+\.peg\.\d+)/) && ($wantarray || $user)) { if (($relational_db_response = $rdbH->SQL("SELECT made_by,assigned_function FROM assigned_functions WHERE ( prot = \'$id\' )")) && (@$relational_db_response >= 1)) { @tmp = sort { $a->[0] cmp $b->[0] } map { [$_->[0],$_->[1]] } @$relational_db_response; for ($i=0; ($i < @tmp) && ($tmp[$i]->[0] ne "master"); $i++) {} if ($i < @tmp) { $entry = splice(@tmp,$i,1); unshift @tmp, ($entry); } my $val; if ($wantarray) { return @tmp } elsif ($user && ($val = &extract_by_who(\@tmp,$user))) { return $val } elsif ($user && ($val = &extract_by_who(\@tmp,"master"))) { return $val } else { return "" } } } else { if (($relational_db_response = $rdbH->SQL("SELECT assigned_function FROM assigned_functions WHERE ( prot = \'$id\' AND made_by = \'master\' )")) && (@$relational_db_response >= 1)) { return $wantarray ? (["master",$relational_db_response->[0]->[0]]) : $relational_db_response->[0]->[0]; } } return $wantarray ? () : ""; } =pod =head1 translated_function_of usage: $function = $fig->translated_function_of($peg,$user) You get just the translated function. =cut sub translated_function_of { my($self,$id,$user) = @_; my $func = $self->function_of($id,$user); if ($func) { $func = $self->translate_function($func); } return $func; } sub extract_by_who { my($xL,$who) = @_; my($i); for ($i=0; ($i < @$xL) && ($xL->[$i]->[0] ne $who); $i++) {} return ($i < @$xL) ? $xL->[$i]->[1] : ""; } =pod =head1 translate_function usage: $translated_func = $fig->translate_function($func) Translates a function based on the function.synonyms table. =cut sub translate_function { my($self,$function) = @_; my ($tran,$from,$to,$line); if (! ($tran = $self->{_function_translation})) { $tran = {}; if (open(TMP,"<$FIG_Config::global/function.synonyms")) { while (defined($line = )) { chop $line; ($from,$to) = split(/\t/,$line); $tran->{$from} = $to; } close(TMP); } foreach $from (keys(%$tran)) { $to = $tran->{$from}; if ($tran->{$to}) { delete $tran->{$from}; } } $self->{_function_translation} = $tran; } while ($to = $tran->{$function}) { $function = $to; } return $function; } =pod =head1 assign_function usage: $fig->assign_function($peg,$user,$function,$confidence) Assigns a function. Note that confidence can (and should be if unusual) included. Note that no annotation is written. This should normally be done in a separate call of the form =cut sub assign_function { my($self,$peg,$user,$function,$confidence) = @_; my($role,$roleQ); my $rdbH = $self->db_handle; $confidence = $confidence ? $confidence : ""; my $genome = $self->genome_of($peg); $rdbH->SQL("DELETE FROM assigned_functions WHERE ( prot = \'$peg\' AND made_by = \'$user\' )"); my $funcQ = quotemeta $function; $rdbH->SQL("INSERT INTO assigned_functions ( prot, made_by, assigned_function, quality, org ) VALUES ( \'$peg\', \'$user\', \'$funcQ\', \'$confidence\', \'$genome\' )"); $rdbH->SQL("DELETE FROM roles WHERE ( prot = \'$peg\' AND made_by = \'$user\' )"); foreach $role (&roles_of_function($function)) { $roleQ = quotemeta $role; $rdbH->SQL("INSERT INTO roles ( prot, role, made_by, org ) VALUES ( \'$peg\', '$roleQ\', \'$user\', \'$genome\' )"); } &verify_dir("$FIG_Config::organisms/$genome/UserModels"); if ($user ne "master") { &verify_dir("$FIG_Config::organisms/$genome/UserModels/$user"); } if ((($user eq "master") && open(TMP,">>$FIG_Config::organisms/$genome/assigned_functions")) || (($user ne "master") && open(TMP,">>$FIG_Config::organisms/$genome/UserModels/$user/assigned_functions"))) { flock(TMP,LOCK_EX) || confess "cannot lock assigned_functions"; seek(TMP,0,2) || confess "failed to seek to the end of the file"; print TMP "$peg\t$function\t$confidence\n"; close(TMP); return 1; } return 0; } sub hypo { my $x = (@_ == 1) ? $_[0] : $_[1]; if (! $x) { return 1 } if ($x =~ /hypoth/i) { return 1 } if ($x =~ /conserved protein/i) { return 1 } if ($x =~ /unknown/i) { return 1 } return 0; } ############################ Similarities ############################### =pod =head1 sims usage: @sims = $fig->sims($peg,$maxN,$maxP,$select) Returns a list of similarities for $peg such that there will be at most $maxN similarities, each similarity will have a P-score <= $maxP, and $select gives processing instructions: "raw" means that the similarities will not be expanded (by far fastest option) "fig" means return only similarities to fig genes "all" means that you want all the expanded similarities. By "expanded", we refer to taking a "raw similarity" against an entry in the non-redundant protein collection, and converting it to a set of similarities (one for each of the proteins that are essentially identical to the representative in the nr). =cut sub sims { my ($self,$id,$maxN,$maxP,$select,$max_expand) = @_; my($sim); $max_expand = defined($max_expand) ? $max_expand : $maxN; my @sims = (); my @maps_to = $self->mapped_prot_ids($id); if (@maps_to > 0) { my $rep_id = $maps_to[0]->[0]; my @entry = grep { $_->[0] eq $id } @maps_to; if ((@entry == 1) && defined($entry[0]->[1])) { if ((! defined($maps_to[0]->[1])) || (! defined($entry[0]->[1]))) { print STDERR &Dumper(\@maps_to,\@entry); confess "bad"; } my $delta = $maps_to[0]->[1] - $entry[0]->[1]; my @raw_sims = &get_raw_sims($self,$rep_id,$maxN,$maxP); if ($id ne $rep_id) { foreach $sim (@raw_sims) { $sim->[0] = $id; $sim->[6] -= $delta; $sim->[7] -= $delta; } } unshift(@raw_sims,bless([$id,$rep_id,100.00,undef,undef,undef,1,$entry[0]->[1],$delta+1,$maps_to[0]->[1],0.0,,undef,$entry[0]->[1],$maps_to[0]->[1],"blastp",0,0],'Sim')); @sims = grep { $_->id1 ne $_->id2 } &expand_raw_sims($self,\@raw_sims,$maxP,$select,0,$max_expand); } } return @sims; } sub expand_raw_sims { my($self,$raw_sims,$maxP,$select,$dups,$max_expand) = @_; my($sim,$id2,%others,$x); my @sims = (); foreach $sim (@$raw_sims) { next if ($sim->psc > $maxP); $id2 = $sim->id2; next if ($others{$id2} && (! $dups)); $others{$id2} = 1; if ($select && ($select eq "raw") || ($max_expand <= 0)) { push(@sims,$sim); } else { my @relevant; $max_expand--; my @maps_to = $self->mapped_prot_ids($id2); if ((! $select) || ($select eq "fig")) { @relevant = grep { $_->[0] =~ /^fig/ } @maps_to; } elsif ($select && ($select =~ /^ext/i)) { @relevant = grep { $_->[0] !~ /^fig/ } @maps_to; } else { @relevant = @maps_to; } foreach $x (@relevant) { my $sim1 = [@$sim]; my($x_id,$x_ln) = @$x; defined($x_ln) || confess "x_ln id2=$id2 x_id=$x_id"; defined($maps_to[0]->[1]) || confess "maps_to"; my $delta2 = $maps_to[0]->[1] - $x_ln; $sim1->[1] = $x_id; $sim1->[8] -= $delta2; $sim1->[9] -= $delta2; bless($sim1,"Sim"); push(@sims,$sim1); } } } return @sims; } sub get_raw_sims { my($self,$rep_id,$maxN,$maxP) = @_; my(@sims,$seek,$fileN,$ln,$fh,$file,$readN,$readC,@lines,$i,$sim); my($sim_chunk,$psc,$id2); $maxN = $maxN ? $maxN : 500; @sims = (); my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT seek, fileN, len FROM sim_seeks WHERE id = \'$rep_id\' "); foreach $sim_chunk (@$relational_db_response) { ($seek,$fileN,$ln) = @$sim_chunk; $file = $self->N2file($fileN); $fh = $self->openF($file); if (! $fh) { confess "could not open sims for $file"; } seek($fh,$seek,0); $readN = read($fh,$readC,$ln-1); ($readN == ($ln-1)) || confess "could not read the block of sims at $seek for $ln - 1 characters; $readN actually read from $file\n$readC"; @lines = grep { (@$_ == 15) && ($_->[12] =~ /^\d+$/) && ($_->[13] =~ /^\d+$/) && ($_->[6] =~ /^\d+$/) && ($_->[7] =~ /^\d+$/) && ($_->[8] =~ /^\d+$/) && ($_->[9] =~ /^\d+$/) && ($_->[2] =~ /^[0-9.]+$/) && ($_->[10] =~ /^[0-9.e-]+$/) } map { [split(/\t/,$_),"blastp"] } split(/\n/,$readC); @lines = sort { $a->[10] <=> $b->[10] } @lines; for ($i=0; ($i < @lines); $i++) { $psc = $lines[$i]->[10]; $id2 = $lines[$i]->[1]; if ($maxP >= $psc) { $sim = $lines[$i]; bless($sim,"Sim"); push(@sims,$sim); if (@sims == $maxN) { return @sims } } } } return @sims; } =pod =head1 dsims usage: @sims = $fig->dsims($peg,$maxN,$maxP,$select) Returns a list of similarities for $peg such that there will be at most $maxN similarities, each similarity will have a P-score <= $maxP, and $select gives processing instructions: "raw" means that the similarities will not be expanded (by far fastest option) "fig" means return only similarities to fig genes "all" means that you want all the expanded similarities. By "expanded", we refer to taking a "raw similarity" against an entry in the non-redundant protein collection, and converting it to a set of similarities (one for each of the proteins that are essentially identical to the representative in the nr). The "dsims" or "dynamic sims" are not precomputed. They are computed using a heuristic which is much faster than blast, but misses some similarities. Essentially, you have an "index" or representative sequences, a quick blast is done against it, and if there are any hits these are used to indicate which sub-databases to blast against. =cut sub dsims { my($self,$id,$seq,$maxN,$maxP,$select) = @_; my($sim,$sub_dir,$db,$hit,@hits,%in); my @index = &blastit($id,$seq,"$FIG_Config::global/SimGen/exemplar.fasta",1.0e-3); foreach $sim (@index) { if ($sim->id2 =~ /_(\d+)$/) { $in{$1}++; } } @hits = (); foreach $db (keys(%in)) { $sub_dir = $db % 1000; push(@hits,&blastit($id,$seq,"$FIG_Config::global/SimGen/AccessSets/$sub_dir/$db",$maxP)); } if (@hits == 0) { push(@hits,&blastit($id,$seq,"$FIG_Config::global/SimGen/nohit.fasta",$maxP)); } @hits = sort { ($a->psc <=> $b->psc) or ($a->iden cmp $b->iden) } grep { $_->id2 ne $id } @hits; if ($maxN && ($maxN < @hits)) { $#hits = $maxN - 1 } return &expand_raw_sims($self,\@hits,$maxP,$select,0); } sub blastit { my($id,$seq,$db,$maxP) = @_; if (! $maxP) { $maxP = 1.0e-5 } my $tmp = &Blast::blastp([[$id,$seq]],$db,"-e $maxP"); my $tmp1 = $tmp->{$id}; if ($tmp1) { return @$tmp1; } return (); } sub related_by_func_sim { my($self,$peg,$user) = @_; my($func,$sim,$id2,%related); if (($func = $self->function_of($peg,$user)) && (! &FIG::hypo($func))) { foreach $sim ($self->sims($peg,500,1,"fig",500)) { $id2 = $sim->id2; if ($func eq $self->function_of($id2,$user)) { $related{$id2} = 1; } } } return keys(%related); } ################################# chromosomal clusters #################################### =pod =head1 in_cluster_with usage: @pegs = $fig->in_cluster_with($peg) Returns the set of pegs that are thought to be clustered with $peg (on the chromosome). =cut sub in_cluster_with { my($self,$peg) = @_; my($set,$id,%in); return $self->in_set_with($peg,"chromosomal_clusters","cluster_id"); } =pod =head1 add_chromosomal_clusters usage: $fig->add_chromosomal_clusters($file) The given file is supposed to contain one predicted chromosomal cluster per line (either comma or tab separated pegs). These will be added (to the extent they are new) to those already in $FIG_Config::global/chromosomal_clusters. =cut sub add_chromosomal_clusters { my($self,$file) = @_; my($set,$added); open(TMPCLUST,"<$file") || die "aborted"; while (defined($set = )) { print STDERR "."; chop $set; $added += $self->add_chromosomal_cluster([split(/[\t,]+/,$set)]); } close(TMPCLUST); if ($added) { my $rdbH = $self->db_handle; $self->export_set("chromosomal_clusters","cluster_id","$FIG_Config::global/chromosomal_clusters"); return 1; } return 0; } #=pod # #=head1 export_chromosomal_clusters # #usage: $fig->export_chromosomal_clusters # #Invoking this routine writes the set of chromosomal clusters as known in the #relational DB back to $FIG_Config::global/chromosomal_clusters. # #=cut # sub export_chromosomal_clusters { my($self) = @_; $self->export_set("chromosomal_clusters","cluster_id","$FIG_Config::global/chromosomal_clusters"); } sub add_chromosomal_cluster { my($self,$ids) = @_; my($id,$set,%existing,%in,$new,$existing,$new_id); # print STDERR "adding cluster ",join(",",@$ids),"\n"; foreach $id (@$ids) { foreach $set ($self->in_sets($id,"chromosomal_clusters","cluster_id")) { $existing{$set} = 1; foreach $id ($self->ids_in_set($set,"chromosomal_clusters","cluster_id")) { $in{$id} = 1; } } } # print &Dumper(\%existing,\%in); $new = 0; foreach $id (@$ids) { if (! $in{$id}) { $in{$id} = 1; $new++; } } # print STDERR "$new new ids\n"; if ($new) { foreach $existing (keys(%existing)) { $self->delete_set($existing,"chromosomal_clusters","cluster_id"); } $new_id = $self->next_set("chromosomal_clusters","cluster_id"); # print STDERR "adding new cluster $new_id\n"; $self->insert_set($new_id,[keys(%in)],"chromosomal_clusters","cluster_id"); return 1; } return 0; } ################################# PCH pins #################################### =pod =head1 in_pch_pin_with usage: $fig->in_pch_pin_with($peg) Returns the set of pegs that are believed to be "pinned" to $peg (in the sense that PCHs occur containing these pegs over significant phylogenetic distances). =cut sub in_pch_pin_with { my($self,$peg) = @_; my($set,$id,%in); return $self->in_set_with($peg,"pch_pins","pin"); } =pod =head1 add_pch_pins usage: $fig->add_pch_pins($file) The given file is supposed to contain one set of pinned pegs per line (either comma or tab seprated pegs). These will be added (to the extent they are new) to those already in $FIG_Config::global/pch_pins. =cut sub add_pch_pins { my($self,$file) = @_; my($set,$added); open(TMPCLUST,"<$file") || die "aborted"; while (defined($set = )) { print STDERR "."; chop $set; my @tmp = split(/[\t,]+/,$set); if (@tmp < 200) { $added += $self->add_pch_pin([@tmp]); } } close(TMPCLUST); if ($added) { my $rdbH = $self->db_handle; $self->export_set("pch_pins","pin","$FIG_Config::global/pch_pins"); return 1; } return 0; } sub export_pch_pins { my($self) = @_; $self->export_set("pch_pins","pin","$FIG_Config::global/pch_pins"); } sub add_pch_pin { my($self,$ids) = @_; my($id,$set,%existing,%in,$new,$existing,$new_id); # print STDERR "adding cluster ",join(",",@$ids),"\n"; foreach $id (@$ids) { foreach $set ($self->in_sets($id,"pch_pins","pin")) { $existing{$set} = 1; foreach $id ($self->ids_in_set($set,"pch_pins","pin")) { $in{$id} = 1; } } } # print &Dumper(\%existing,\%in); $new = 0; foreach $id (@$ids) { if (! $in{$id}) { $in{$id} = 1; $new++; } } if ($new) { if (keys(%in) < 300) { foreach $existing (keys(%existing)) { $self->delete_set($existing,"pch_pins","pin"); } $new_id = $self->next_set("pch_pins","pin"); # print STDERR "adding new pin $new_id\n"; $self->insert_set($new_id,[keys(%in)],"pch_pins","pin"); } else { $new_id = $self->next_set("pch_pins","pin"); # print STDERR "adding new pin $new_id\n"; $self->insert_set($new_id,$ids,"pch_pins","pin"); } return 1; } return 0; } ################################# Annotations #################################### =pod =head1 add_annotation usage: $fig->add_annotation($fid,$user,$annotation) $annotation is added as a time-stamped annotation to $peg showing $user as the individual who added the annotation. =cut sub add_annotation { my($self,$feature_id,$user,$annotation) = @_; my($genome); # print STDERR "add: fid=$feature_id user=$user annotation=$annotation\n"; if ($genome = $self->genome_of($feature_id)) { my $file = "$FIG_Config::organisms/$genome/annotations"; my $fileno = $self->file2N($file); my $time_made = time; my $ma = ($annotation =~ /^Set master function to/); if (open(TMP,">>$file")) { flock(TMP,LOCK_EX) || confess "cannot lock assigned_functions"; seek(TMP,0,2) || confess "failed to seek to the end of the file"; my $seek1 = tell TMP; print TMP "$feature_id\n$time_made\n$user\n$annotation", (substr($annotation,-1) eq "\n") ? "" : "\n","//\n"; my $seek2 = tell TMP; close(TMP); chmod 0777, $file; my $ln = $seek2 - $seek1; my $rdbH = $self->db_handle; if ($rdbH->SQL("INSERT INTO annotation_seeks ( fid, dateof, who, ma, fileno, seek, len ) VALUES ( \'$feature_id\', $time_made, \'$user\', \'$ma\', $fileno, $seek1, $ln )")) { return 1; } } } return 0; } =pod =head1 merged_related_annotations usage: @annotations = $fig->merged_related_annotations($fids) The set of annotations of a set of PEGs ($fids) is returned as a list of 4-tuples. Each entry in the list is of the form [$fid,$timestamp,$user,$annotation]. =cut sub merged_related_annotations { my($self,$fids) = @_; my($fid); my(@ann) = (); foreach $fid (@$fids) { push(@ann,$self->feature_annotations1($fid)); } return map { $_->[1] = localtime($_->[1]); $_ } sort { $a->[1] <=> $b->[1] } @ann; } =pod =head1 feature_annotations usage: @annotations = $fig->feature_annotations($fid) The set of annotations of $fid is returned as a list of 4-tuples. Each entry in the list is of the form [$fid,$timestamp,$user,$annotation]. =cut sub feature_annotations { my($self,$feature_id) = @_; return map { $_->[1] = localtime($_->[1]); $_ } $self->feature_annotations1($feature_id); } sub feature_annotations1 { my($self,$feature_id) = @_; my($tuple,$fileN,$seek,$ln,$annotation,$feature_idQ); my($file,$fh); my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT fileno, seek, len FROM annotation_seeks WHERE fid = \'$feature_id\' "); my @annotations = (); foreach $tuple (@$relational_db_response) { ($fileN,$seek,$ln) = @$tuple; $annotation = $self->read_annotation($fileN,$seek,$ln); $feature_idQ = quotemeta $feature_id; if ($annotation =~ /^$feature_idQ\n(\d+)\n([^\n]+)\n(.*)/s) { push(@annotations,[$feature_id,$1,$2,$3]); } else { print STDERR "malformed annotation\n$annotation\n"; } } return sort { $a->[1] <=> $b->[1] } @annotations; } sub read_annotation { my($self,$fileN,$seek,$ln) = @_; my($readN,$readC); my $file = $self->N2file($fileN); my $fh = $self->openF($file); if (! $fh) { confess "could not open annotations for $file"; } seek($fh,$seek,0); $readN = read($fh,$readC,$ln-3); ($readN == ($ln-3)) || confess "could not read the block of annotations at $seek for $ln characters; $readN actually read from $file\n$readC"; return $readC; } sub epoch_to_readable { my($epoch) = @_; my($sec,$min,$hr,$dd,$mm,$yr) = localtime($epoch); $mm++; $yr += 1900; return "$mm-$dd-$yr:$hr:$min:$sec"; } sub assignments_made { my($self,$genomes,$who,$date) = @_; my($relational_db_response,$entry,$fid,$fileno,$seek,$len,$ann); my($epoch_date,$when,%sofar,$x); my %genomes = map { $_ => 1 } @$genomes; if ($date =~ /^(\d{1,2})\/(\d{1,2})\/(\d{4})$/) { my($mm,$dd,$yyyy) = ($1,$2,$3); $epoch_date = &Time::Local::timelocal(0,0,0,$dd,$mm-1,$yyyy-1900,0,0,0); } else { $epoch_date = 0; } $epoch_date = defined($epoch_date) ? $epoch_date-1 : 0; my @assignments = (); my $rdbH = $self->db_handle; if ($who eq "master") { $relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len FROM annotation_seeks WHERE ((ma = \'1\') AND (dateof > $epoch_date))"); } else { $relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len FROM annotation_seeks WHERE (( who = \'$who\' ) AND (dateof > $epoch_date))"); } if ($relational_db_response && (@$relational_db_response > 0)) { foreach $entry (@$relational_db_response) { ($fid,$when,$fileno,$seek,$len) = @$entry; if (($fid =~ /^fig\|(\d+\.\d+)/) && $genomes{$1}) { $ann = $self->read_annotation($fileno,$seek,$len); if (($ann =~ /^(fig\|\d+\.\d+\.peg\.\d+)\n(\d+)\n(\S+)\nSet ([^\n]*)function[^\n]*\n(\S[^\n]+\S)/s) && (($who eq $3) || (($4 eq "master ") && ($who eq "master"))) && ($2 >= $epoch_date)) { if ((! $sofar{$1}) || (($x = $sofar{$1}) && ($when > $x->[0]))) { $sofar{$1} = [$when,$5]; } } } } } @assignments = map { $x = $sofar{$_}; [$_,$x->[1]] } keys(%sofar); return @assignments; } ################################# Indexing Features and Functional Roles #################################### =pod =head1 search_index usage: ($pegs,$roles) = $fig->search_pattern($pattern) All pegs that "match" $pattern are put into a list, and $pegs will be a pointer to that list. All roles that "match" $pattern are put into a list, and $roles will be a pointer to that list. The notion of "match $pattern" is intentionally left undefined. For now, you will probably get only entries in which each word id $pattern occurs exactly, but that is not a long term commitment. =cut sub search_index { my($self,$pattern) = @_; my($patternQ,@raw,@pegs,@roles); &clean_tmp; $patternQ = $pattern; $patternQ =~ s/\s+/;/g; $patternQ =~ s/\./\\./g; # print STDERR "pattern=$pattern patternQ=$patternQ\n"; @raw = `$FIG_Config::ext_bin/glimpse -y -H $FIG_Config::data/Indexes -i -w \'$patternQ\'`; @pegs = sort { &FIG::by_fig_id($a->[0],$b->[0]) } map { $_ =~ s/^\S+:\s+//; [split(/\t/,$_)] } grep { $_ =~ /^\S+peg.index/ } @raw; my %roles = map { $_ =~ s/^\S+:\s+//; $_ => 1} grep { $_ =~ /^\S+role.index/ } @raw; @roles = sort keys(%roles); return ([@pegs],[@roles]); } ################################# Loading Databases #################################### #=pod # #=head1 load_all # #usage: load_all # #This function is supposed to reload all entries into the database and do #whatever is required to properly support indexing of pegs and roles. # #=cut sub load_all { &run("index_contigs"); &run("compute_genome_counts"); &run("load_features"); &run("index_sims"); &run("load_peg_mapping"); &run("index_translations"); &run("add_assertions_of_function"); &run("load_protein_families"); &run("load_external_orgs"); &run("load_chromosomal_clusters"); &run("load_pch_pins"); &run("index_neighborhoods"); &run("index_annotations"); &run("load_ec_names"); &run("load_kegg"); &run("load_distances"); &run("make_indexes"); } ################################# Automated Assignments #################################### =pod =head1 auto_assign usage: $assignment = &FIG::auto_assign($peg,$seq) This returns an automated assignment for $peg. $seq is optional; if it is not present, then it is assumed that similarities already exist for $peg. $assignment is set to either Function or Function\tW if it is felt that the assertion is pretty weak. =cut sub auto_assign { my($peg,$seq) = @_; my $cmd = $seq ? "echo \"$peg\t$seq\" | auto_assign | make_calls" : "echo \"$peg\" | auto_assign | make_calls"; # print STDERR $cmd; my(@tmp) = `$cmd`; if ((@tmp == 1) && ($tmp[0] =~ /^\S+\t(\S.*\S)/)) { return $1; } else { return "hypothetical protein"; } } ################################# Protein Families #################################### =pod =head1 all_protein_families usage: @all = $fig->all_protein_families Returns a list of the ids of all of the protein families currently defined. =cut sub all_protein_families { my($self) = @_; return $self->all_sets("protein_families","family"); } =pod =head1 ids_in_family usage: @pegs = $fig->ids_in_family($family) Returns a list of the pegs in $family. =cut sub ids_in_family { my($self,$family) = @_; return $self->ids_in_set($family,"protein_families","family"); } =pod =head1 family_function usage: $func = $fig->family_function($family) Returns the putative function of all of the pegs in $family. Remember, we are defining "protein family" as a set of homologous proteins that have the same function. =cut sub family_function { my($self,$family) = @_; my($relational_db_response); my $rdbH = $self->db_handle; defined($family) || confess "family is missing"; if (($relational_db_response = $rdbH->SQL("SELECT function FROM family_function WHERE ( family = $family)")) && (@$relational_db_response >= 1)) { return $relational_db_response->[0]->[0]; } return ""; } =pod =head1 sz_family usage: $n = $fig->sz_family($family) Returns the number of pegs in $family. =cut sub sz_family { my($self,$family) = @_; return $self->sz_set($family,"protein_families","family"); } =pod =head1 in_family usage: @pegs = $fig->in_family($family) Returns the pegs in $family. =cut sub in_family { my($self,$id) = @_; my @in = $self->in_sets($id,"protein_families","family"); return (@in > 0) ? $in[0] : ""; } ################################# Abstract Set Routines #################################### sub all_sets { my($self,$relation,$set_name) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT DISTINCT $set_name FROM $relation")) && (@$relational_db_response >= 1)) { return map { $_->[0] } @$relational_db_response; } return (); } sub next_set { my($self,$relation,$set_name) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT MAX($set_name) FROM $relation")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0] + 1; } } sub ids_in_set { my($self,$which,$relation,$set_name) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (defined($which) && ($which =~ /^\d+$/)) { if (($relational_db_response = $rdbH->SQL("SELECT id FROM $relation WHERE ( $set_name = $which)")) && (@$relational_db_response >= 1)) { return sort { by_fig_id($a,$b) } map { $_->[0] } @$relational_db_response; } } return (); } sub in_sets { my($self,$id,$relation,$set_name) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT $set_name FROM $relation WHERE ( id = \'$id\' )")) && (@$relational_db_response >= 1)) { return map { $_->[0] } @$relational_db_response; } return (); } sub sz_set { my($self,$which,$relation,$set_name) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT COUNT(*) FROM $relation WHERE ( $set_name = $which)")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } return 0; } sub delete_set { my($self,$set,$relation,$set_name) = @_; # print STDERR "deleting set $set\n"; my $rdbH = $self->db_handle; return $rdbH->SQL("DELETE FROM $relation WHERE ( $set_name = $set )"); } sub insert_set { my($self,$set,$ids,$relation,$set_name) = @_; my($id); # print STDERR "inserting set $set containing ",join(",",@$ids),"\n"; my $rdbH = $self->db_handle; my @ids = grep { length($_) < 255 } @$ids; if (@ids < 2) { return 0 } my $rc = 1; foreach $id (@ids) { if (! $rdbH->SQL("INSERT INTO $relation ( $set_name,id ) VALUES ( $set,\'$id\' )")) { $rc = 0; } } # print STDERR " rc=$rc\n"; return $rc; } sub in_set_with { my($self,$peg,$relation,$set_name) = @_; my($set,$id,%in); foreach $set ($self->in_sets($peg,$relation,$set_name)) { foreach $id ($self->ids_in_set($set,$relation,$set_name)) { $in{$id} = 1; } } return sort { &by_fig_id($a,$b) } keys(%in); } sub export_set { my($self,$relation,$set_name,$file) = @_; my($pair); my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT $set_name, id FROM $relation"); open(TMP,">$file") || die "could not open $file"; flock(TMP,LOCK_EX) || confess "cannot lock $file"; seek(TMP,0,2) || confess "failed to seek to the end of the file"; foreach $pair (sort { ($a->[0] <=> $b->[0]) or &by_fig_id($a->[1],$b->[1]) } @$relational_db_response) { print TMP join("\t",@$pair),"\n"; } close(TMP); return 1; } ################################# KEGG Stuff #################################### =pod =head1 all_compounds usage: @compounds = $fig->all_compounds Returns a list containing all of the KEGG compounds. =cut sub all_compounds { my($self) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT DISTINCT cid FROM comp_name"); if (@$relational_db_response > 0) { return sort map { $_->[0] } @$relational_db_response; } return (); } =pod =head1 names_of_compound usage: @names = $fig->names_of_compound Returns a list containing all of the names assigned to the KEGG compounds. The list will be ordered as given by KEGG. =cut sub names_of_compound { my($self,$cid) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT pos,name FROM comp_name where cid = \'$cid\'"); if (@$relational_db_response > 0) { return map { $_->[1] } sort { $a->[0] <=> $b->[0] } @$relational_db_response; } return (); } =pod =head1 comp2react usage: @rids = $fig->comp2react($cid) Returns a list containing all of the reaction IDs for reactions that take $cid as either a substrate or a product. =cut sub comp2react { my($self,$cid) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT rid FROM reaction_to_compound where cid = \'$cid\'"); if (@$relational_db_response > 0) { return sort map { $_->[0] } @$relational_db_response; } return (); } =pod =head1 cas usage: $cas = $fig->cas($cid) Returns the CAS ID for the compound, if known. =cut sub cas { my($self,$cid) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT cas FROM comp_cas where cid = \'$cid\'"); if (@$relational_db_response == 1) { return $relational_db_response->[0]->[0]; } return ""; } =pod =head1 cas_to_cid usage: $cid = $fig->cas_to_cid($cas) Returns the compound id (cid), given the CAS ID. =cut sub cas_to_cid { my($self,$cas) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT cid FROM comp_cas where cas = \'$cas\'"); if (@$relational_db_response == 1) { return $relational_db_response->[0]->[0]; } return ""; } =pod =head1 all_reactions usage: @rids = $fig->all_reactions Returns a list containing all of the KEGG reaction IDs. =cut sub all_reactions { my($self) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT DISTINCT rid FROM reaction_to_compound"); if (@$relational_db_response > 0) { return sort map { $_->[0] } @$relational_db_response; } return (); } =pod =head1 reversible usage: $rev = $fig->reversible($rid) Returns true iff the reactions had a "main direction" designated as "<=>"; =cut sub reversible { my($self,$rid) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT reversible FROM reversible where rid = \'$rid\'"); if (@$relational_db_response == 1) { return $relational_db_response->[0]->[0]; } return 1; } =pod =head1 reaction2comp usage: @tuples = $fig->reaction2comp($rid,$which) Returns the "substrates" iff $which == 0. In any event (i.e., whether you ask for substrates or products), you get back a list of 3-tuples. Each 3-tuple will contain [$cid,$stoich,$main] Stoichiometry is normally numeric, but can be things like "n" or "(n+1)". $main is 1 iff the compound is considered "main" or "connectable". =cut sub reaction2comp { my($self,$rid,$which) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT cid,stoich,main FROM reaction_to_compound where rid = \'$rid\' and setn = \'$which\'"); if (@$relational_db_response > 0) { return sort { $a->[0] cmp $b->[0] } map { $_->[1] =~ s/\s+//g; $_ } @$relational_db_response; } return (); } =pod =head1 catalyzed_by usage: @ecs = $fig->catalyzed_by($rid) Returns the ECs that are reputed to catalyze the reaction. Note that we are currently just returning the ECs that KEGG gives. We need to handle the incompletely specified forms (e.g., 1.1.1.-), but we do not do it yet. =cut sub catalyzed_by { my($self,$rid) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT role FROM reaction_to_enzyme where rid = \'$rid\'"); if (@$relational_db_response > 0) { return sort map { $_->[0] } @$relational_db_response; } return (); } =pod =head1 catalyzes usage: @ecs = $fig->catalyzes($role) Returns the rids of the reactions catalyzed by the "role" (normally an EC). =cut sub catalyzes { my($self,$role) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT rid FROM reaction_to_enzyme where role = \'$role\'"); if (@$relational_db_response > 0) { return sort map { $_->[0] } @$relational_db_response; } return (); } =pod =head1 displayable_reaction usage: $display_format = $fig->displayable_reaction($rid) Returns a string giving the displayable version of a reaction. =cut sub displayable_reaction { my($self,$rid) = @_; my @tmp = `grep $rid $FIG_Config::data/KEGG/reaction_name.lst`; if (@tmp > 0) { chop $tmp[0]; return $tmp[0]; } return $rid; } =pod =head1 all_maps usage: @maps = $fig->all_maps Returns a list containing all of the KEGG maps that the system knows about (the maps need to be periodically updated). =cut sub all_maps { my($self,$ec) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT DISTINCT map FROM ec_map "); if (@$relational_db_response > 0) { return map { $_->[0] } @$relational_db_response; } return (); } =pod =head1 ec_to_maps usage: @maps = $fig->ec_to_maps($ec) Returns the set of maps that contain $ec as a functional role. $ec is usually an EC number, but in the more general case, it can be a functional role. =cut sub ec_to_maps { my($self,$ec) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT map FROM ec_map WHERE ( ec = \'$ec\' )"); if (@$relational_db_response > 0) { return map { $_->[0] } @$relational_db_response; } return (); } =pod =head1 map_to_ecs usage: @ecs = $fig->map_to_ecs($map) Returns the set of functional roles (usually ECs) that are contained in the functionality depicted by $map. =cut sub map_to_ecs { my($self,$map) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT ec FROM ec_map WHERE ( map = \'$map\' )"); if (@$relational_db_response > 0) { return map { $_->[0] } @$relational_db_response; } return (); } =pod =head1 map_name usage: $name = $fig->map_name($map) Returns the descriptive name covering the functionality depicted by $map. =cut sub map_name { my($self,$map) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT mapname FROM map_name WHERE ( map = \'$map\' )"); if (@$relational_db_response == 1) { return $relational_db_response->[0]->[0]; } return ""; } ################################# Functional Roles #################################### =pod =head1 neighborhood_of_role usage: @roles = $fig->neighborhood_of_role($role) Returns a list of functional roles that we consider to be "the neighborhood" of $role. =cut sub neighborhood_of_role { my($self,$role) = @_; my($readC); my $file = "$FIG_Config::global/role.neighborhoods"; my $rdbH = $self->db_handle; my $roleQ = quotemeta $role; my $relational_db_response = $rdbH->SQL("SELECT seek, len FROM neigh_seeks WHERE role = \'$roleQ\' "); if (@$relational_db_response == 1) { my($seek,$ln) = @{$relational_db_response->[0]}; my $fh = $self->openF($file); seek($fh,$seek,0); my $readN = read($fh,$readC,$ln-1); ($readN == ($ln-1)) || confess "could not read the block of sims at $seek for $ln - 1 characters; $readN actually read from $file\n$readC"; return grep { $_ && ($_ !~ /^\/\//) } split(/\n/,$readC); } return (); } =pod =head1 roles_of_function usage: @roles = $fig->roles_of_function($func) Returns a list of the functional roles implemented by $func. =cut sub roles_of_function { my($func) = @_; return (split(/\s*[\/;]\s+/,$func),($func =~ /\d+\.\d+\.\d+\.\d+/g)); } =pod =head1 seqs_with_role usage: @pegs = $fig->seqs_with_role($role,$who) Returns a list of the pegs that implement $role. If $who is not given, it defaults to "master". The system returns all pegs with an assignment made by either "master" or $who (if it is different than the master) that implement $role. Note that this includes pegs for which the "master" annotation disagrees with that of $who, the master's implements $role, and $who's does not. =cut sub seqs_with_role { my($self,$role,$who,$genome) = @_; my($relational_db_response,$query); my $roleQ = quotemeta $role; $who = $who ? $who : "master"; my $rdbH = $self->db_handle; my $who_cond; if ($who eq "master") { $who_cond = "( made_by = \'master\' OR made_by = \'unknown\' )"; } else { $who_cond = "( made_by = \'master\' OR made_by = \'$who\' OR made_by = \'unknown\')"; } if (! $genome) { $query = "SELECT distinct prot FROM roles WHERE (( role = \'$roleQ\' ) AND $who_cond )"; } else { $query = "SELECT distinct prot FROM roles WHERE (( role = \'$roleQ\' ) AND $who_cond AND (org = \'$genome\'))"; } return (($relational_db_response = $rdbH->SQL($query)) && (@$relational_db_response >= 1)) ? map { $_->[0] } @$relational_db_response : (); } =pod =head1 seqs_with_roles_in_genomes usage: $result = $fig->seqs_with_roles_in_genomes($genomes,$roles,$made_by) This routine takes a pointer to a list of genomes ($genomes) and a pointer to a list of roles ($roles) and looks up all of the sequences that connect to those roles according to either the master assignments or those made by $made_by. Again, you will get assignments for which the "master" assignment connects, but the $made_by does not. A hash is returned. The keys to the hash are genome IDs for which at least one sequence was found. $result->{$genome} will itself be a hash, assuming that at least one sequence was found for $genome. $result->{$genome}->{$role} will be set to a pointer to a list of 2-tuples. Each 2-tuple will contain [$peg,$function], where $function is the one for $made_by (which may not be the one that connected). =cut sub seqs_with_roles_in_genomes { my($self,$genomes,$roles,$made_by) = @_; my($genome,$role,$roleQ,$role_cond,$made_by_cond,$query,$relational_db_response,$peg,$genome_cond,$hit); my $rdbH = $self->db_handle; my $result = {}; # foreach $genome ($self->genomes) { $result->{$genome} = {} } if (! $made_by) { $made_by = 'master' } if ((@$genomes > 0) && (@$roles > 0)) { $genome_cond = "(" . join(" OR ",map { "( org = \'$_\' )" } @$genomes) . ")"; $role_cond = "(" . join(" OR ",map { $roleQ = quotemeta $_; "( role = \'$roleQ\' )" } @$roles) . ")"; $made_by_cond = ($made_by eq 'master') ? "(made_by = 'master')" : "(made_by = 'master' OR made_by = '$made_by')"; $query = "SELECT distinct prot, role FROM roles WHERE ( $made_by_cond AND $genome_cond AND $role_cond )"; if (($relational_db_response = $rdbH->SQL($query)) && (@$relational_db_response >= 1)) { foreach $hit (@$relational_db_response) { ($peg,$role) = @$hit; $genome = $self->genome_of($peg); push(@{ $result->{$genome}->{$role} },[$peg,scalar $self->function_of($peg,$made_by)]); } } } return $result; } =pod =head1 largest_clusters usage: @clusters = $fig->largest_clusters($roles,$user) This routine can be used to find the largest clusters containing the some of the designated set of roles. A list of clusters is returned. Each cluster is a pointer to a list of pegs. =cut sub largest_clusters { my($self,$roles,$user,$sort_by_unique_functions) = @_; my($genome,$x,$role,$y,$peg,$loc,$contig,$beg,$end,%pegs,@pegs,$i,$j); my $ss = $self->seqs_with_roles_in_genomes([$self->genomes],$roles,$user); my @clusters = (); foreach $genome (keys(%$ss)) { my %pegs; $x = $ss->{$genome}; foreach $role (keys(%$x)) { $y = $x->{$role}; foreach $peg (map { $_->[0] } @$y) { if ($loc = $self->feature_location($peg)) { ($contig,$beg,$end) = &FIG::boundaries_of($loc); $pegs{$peg} = [$peg,$contig,int(($beg + $end) / 2)]; } } } @pegs = sort { ($pegs{$a}->[1] cmp $pegs{$b}->[1]) or ($pegs{$a}->[2] <=> $pegs{$b}->[2]) } keys(%pegs); $i = 0; while ($i < $#pegs) { for ($j=$i+1; ($j < @pegs) && &close_enough_locs($pegs{$pegs[$j-1]},$pegs{$pegs[$j]}); $j++) {} if ($j > ($i+1)) { push(@clusters,[@pegs[$i..$j-1]]); } $i = $j; } } if ($sort_by_unique_functions) { @clusters = sort { $self->unique_functions($b,$user) <=> $self->unique_functions($a,$user) } @clusters; } else { @clusters = sort { @$b <=> @$a } @clusters; } return @clusters; } sub unique_functions { my($self,$pegs,$user) = @_; my($peg,$func,%seen); foreach $peg (@$pegs) { if ($func = $self->function_of($peg,$user)) { $seen{$func} = 1; } } return scalar keys(%seen); } sub close_enough_locs { my($x,$y) = @_; return (($x->[1] eq $y->[1]) && (abs($x->[2] - $y->[2]) < 5000)); } ################################# DNA sequence Stuff #################################### =pod =head1 extract_seq usage: $seq = &FIG::extract_seq($contigs,$loc) This is just a little utility routine that I have found convenient. It assumes that $contigs is a hash that contains IDs as keys and sequences as values. $loc must be of the form Contig_Beg_End where Contig is the ID of one of the sequences; Beg and End give the coordinates of the sought subsequence. If Beg > End, it is assumed that you want the reverse complement of the subsequence. This routine plucks out the subsequence for you. =cut sub extract_seq { my($contigs,$loc) = @_; my($contig,$beg,$end,$contig_seq); my($plus,$minus); $plus = $minus = 0; my $strand = ""; my @loc = split(/,/,$loc); my @seq = (); foreach $loc (@loc) { if ($loc =~ /^\S+_(\d+)_(\d+)$/) { if ($1 < $2) { $plus++; } elsif ($2 < $1) { $minus++; } } } if ($plus > $minus) { $strand = "+"; } elsif ($plus < $minus) { $strand = "-"; } foreach $loc (@loc) { if ($loc =~ /^(\S+)_(\d+)_(\d+)$/) { ($contig,$beg,$end) = ($1,$2,$3); if (($beg < $end) || (($beg == $end) && ($strand eq "+"))) { $strand = "+"; push(@seq,substr($contigs->{$contig},$beg-1,($end+1-$beg))); } else { $strand = "-"; push(@seq,&reverse_comp(substr($contigs->{$contig},$end-1,($beg+1-$end)))); } } } return join("",@seq); } =pod =head1 contig_ln usage: $n = $fig->contig_ln($genome,$contig) Returns the length of $contig from $genome. =cut sub contig_ln { my($self,$genome,$contig) = @_; my($rdbH,$relational_db_response); $rdbH = $self->db_handle; if (defined($genome) && defined($contig)) { if (($relational_db_response = $rdbH->SQL("SELECT len FROM contig_lengths WHERE ( genome = \'$genome\' ) and ( contig = \'$contig\' )")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } } return undef; } =pod =head1 dna_seq usage: $seq = dna_seq($genome,@locations) Returns the concatenated subsequences described by the list of locations. Each location must be of the form Contig_Beg_End where Contig must be the ID of a contig for genome $genome. If Beg > End the location describes a stretch of the complementary strand. =cut sub dna_seq { my($self,$genome,@locations) = @_; my(@pieces,$loc,$contig,$beg,$end,$ln,$rdbH); @pieces = (); foreach $loc (@locations) { if ($loc =~ /^(\S+)_(\d+)_(\d+)$/) { ($contig,$beg,$end) = ($1,$2,$3); $ln = $self->contig_ln($genome,$contig); if (! $ln) { print STDERR "$genome/$contig: could not get length\n"; return ""; } if (&between(1,$beg,$ln) && &between(1,$end,$ln)) { if ($beg < $end) { push(@pieces, $self->get_dna($genome,$contig,$beg,$end)); } else { push(@pieces, &reverse_comp($self->get_dna($genome,$contig,$end,$beg))); } } } } return join("",@pieces); } sub get_dna { my($self,$genome,$contig,$beg,$end) = @_; my $relational_db_response; my $rdbH = $self->db_handle; my $indexpt = int(($beg-1)/10000) * 10000; if (($relational_db_response = $rdbH->SQL("SELECT startN,fileno,seek FROM contig_seeks WHERE ( genome = \'$genome\' ) AND ( contig = \'$contig\' ) AND ( indexpt = $indexpt )")) && (@$relational_db_response == 1)) { my($startN,$fileN,$seek) = @{$relational_db_response->[0]}; my $fh = $self->openF($self->N2file($fileN)); if (seek($fh,$seek,0)) { my $chunk = ""; read($fh,$chunk,int(($end + 1 - $startN) * 1.03)); $chunk =~ s/\s//g; my $ln = ($end - $beg) + 1; if (length($chunk) >= $ln) { return substr($chunk,(($beg-1)-$startN),$ln); } } } return undef; } ################################# Taxonomy #################################### =pod =head1 taxonomy_of usage: $gs = $fig->taxonomy_of($genome_id) Returns the taxonomy of the specified genome. Gives the taxonomy down to genus and species. =cut sub taxonomy_of { my($self,$genome) = @_; my($ans); my $taxonomy = $self->cached('_taxonomy'); if (! ($ans = $taxonomy->{$genome})) { my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT genome,taxonomy FROM genome"); my $pair; foreach $pair (@$relational_db_response) { $taxonomy->{$pair->[0]} = $pair->[1]; } $ans = $taxonomy->{$genome}; } return $ans; } =pod =head1 is_bacterial usage: $fig->is_bacterial($genome) Returns true iff the genome is bacterial. =cut sub is_bacterial { my($self,$genome) = @_; return ($self->taxonomy_of($genome) =~ /^Bacteria/); } =pod =head1 is_archaeal usage: $fig->is_archaeal($genome) Returns true iff the genome is archaeal. =cut sub is_archaeal { my($self,$genome) = @_; return ($self->taxonomy_of($genome) =~ /^Archaea/); } =pod =head1 is_prokaryotic usage: $fig->is_prokaryotic($genome) Returns true iff the genome is prokaryotic =cut sub is_prokaryotic { my($self,$genome) = @_; return ($self->taxonomy_of($genome) =~ /^(Archaea|Bacteria)/); } =pod =head1 is_eukaryotic usage: $fig->is_eukaryotic($genome) Returns true iff the genome is eukaryotic =cut sub is_eukaryotic { my($self,$genome) = @_; return ($self->taxonomy_of($genome) =~ /^Eukaryota/); } =pod =head1 sort_genomes_by_taxonomy usage: @genomes = $fig->sort_genomes_by_taxonomy(@list_of_genomes) This routine is used to sort a list of genome IDs to put them into taxonomic order. =cut sub sort_genomes_by_taxonomy { my($self,@fids) = @_; return map { $_->[0] } sort { $a->[1] cmp $b->[1] } map { [$_,$self->taxonomy_of($_)] } @fids; } =pod =head1 crude_estimate_of_distance usage: $dist = $fig->crude_estimate_of_distance($genome1,$genome2) There are a number of places where we need estimates of the distance between two genomes. This routine will return a value between 0 and 1, where a value of 0 means "the genomes are essentially identical" and a value of 1 means "the genomes are in different major groupings" (the groupings are archaea, bacteria, euks, and viruses). The measure is extremely crude. =cut sub crude_estimate_of_distance { my($self,$genome1,$genome2) = @_; my($i,$v,$d,$dist); if ($genome1 > $genome2) { ($genome1,$genome2) = ($genome2,$genome1) } my $relational_db_response; my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT dist FROM distances WHERE ( genome1 = \'$genome1\' ) AND ( genome2 = \'$genome2\' ) ")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } return $self->crude_estimate_of_distance1($genome1,$genome2); } sub crude_estimate_of_distance1 { my($self,$genome1,$genome2) = @_; my($i,$v,$d,$dist); if ($genome1 > $genome2) { ($genome1,$genome2) = ($genome2,$genome1) } $dist = $self->cached('_dist'); if (! $dist->{"$genome1,$genome2"}) { my @tax1 = split(/\s*;\s*/,$self->taxonomy_of($genome1)); my @tax2 = split(/\s*;\s*/,$self->taxonomy_of($genome2)); $d = 1; for ($i=0, $v=0.5; ($i < @tax1) && ($i < @tax2) && ($tax1[$i] eq $tax2[$i]); $i++, $v = $v/2) { $d -= $v; } $dist->{"$genome1,$genome2"} = $d; } return $dist->{"$genome1,$genome2"}; } =pod =head1 sort_fids_by_taxonomy usage: @sorted_by_taxonomy = $fig->sort_fids_by_taxonomy(@list_of_fids) Sorts a list of feature IDs based on the taxonomies of the genomes that contain the features. =cut sub sort_fids_by_taxonomy { my($self,@fids) = @_; return map { $_->[0] } sort { $a->[1] cmp $b->[1] } map { [$_,$self->taxonomy_of(&genome_of($_))] } @fids; } sub build_tree_of_complete { my($self,$min_for_label) = @_; my(@last,@tax,$i,$prefix,$lev,$genome,$tax); $min_for_label = $min_for_label ? $min_for_label : 10; open(TMP,">/tmp/tree$$") || die "could not open /tmp/tree$$"; print TMP "1. root\n"; @last = (); foreach $genome (grep { $_ !~ /^99999/ } $self->sort_genomes_by_taxonomy($self->genomes("complete"))) { $tax = $self->taxonomy_of($genome); @tax = split(/\s*;\s*/,$tax); push(@tax,$genome); for ($i=0; ((@last > $i) && (@tax > $i) && ($last[$i] eq $tax[$i])); $i++) {} while ($i < @tax) { $lev = $i+2; $prefix = " " x (4 * ($lev-1)); print TMP "$prefix$lev\. $tax[$i]\n"; $i++; } @last = @tax; } close(TMP); my $tree = &tree_utilities::build_tree_from_outline("/tmp/tree$$"); $tree->[0] = 'All'; &limit_labels($tree,$min_for_label); unlink("/tmp/tree$$"); return ($tree,&tips_of_tree($tree)); } sub limit_labels { my($tree,$min_for_label) = @_; my($children) = &tree_utilities::node_pointers($tree); if (@$children == 1) { return 1; } else { my $n = 0; my $i; for ($i=1; ($i < @$children); $i++) { $n += &limit_labels($children->[$i],$min_for_label); } if ($n < $min_for_label) { $tree->[0] = ""; } return $n; } } sub taxonomic_groups_of_complete { my($self,$min_for_labels) = @_; my($tree,undef) = $self->build_tree_of_complete($min_for_labels); return &taxonomic_groups($tree); } sub taxonomic_groups { my($tree) = @_; my($groups,undef) = &taxonomic_groups_and_children($tree); return $groups; } sub taxonomic_groups_and_children { my($tree) = @_; my($ids1,$i,$groupsC,$idsC); my $ptrs = &tree_utilities::node_pointers($tree); my $ids = []; my $groups = []; if (@$ptrs > 1) { $ids1 = []; for ($i=1; ($i < @$ptrs); $i++) { ($groupsC,$idsC) = &taxonomic_groups_and_children($ptrs->[$i]); if (@$groupsC > 0) { push(@$groups,@$groupsC); } push(@$ids1,@$idsC); } if ($tree->[0]) { push(@$groups,[$tree->[0],$ids1]); } push(@$ids,@$ids1); } elsif ($tree->[0]) { push(@$ids,$tree->[0]); } return ($groups,$ids); } 1