Parent Directory
|
Revision Log
broaden set of acceptable aliases as dbxrefs
# -*- perl -*-
#
# load_features -- (re)build the "features" and "ext_alias" database tables
# from the per-genome Features/<type>/tbl files.
#
# usage: load_features [G1 G2 G3 ... ]
#
# With no arguments both tables are dropped and recreated, every genome
# returned by $fig->genomes is loaded, external aliases are additionally
# inferred from peg.synonyms, and the indexes are (re)created.
#
# With genome IDs as arguments only the rows belonging to those genomes are
# deleted and reloaded; peg.synonyms is not consulted and no index is built.
#
# Two kinds of alias lines are collected in a temporary file:
#     id <TAB> alias <TAB> genome                  (inferred from peg.synonyms)
#     id <TAB> alias <TAB> genome <TAB> OVERRIDE   (taken from a tbl file)
# An OVERRIDE alias suppresses every non-OVERRIDE alias of the same feature
# that belongs to the same class (see classA below).

use strict;
use FIG;
use DBrtns;

my $fig = FIG->new;

my $temp_dir      = "$FIG_Config::temp";
my $organisms_dir = "$FIG_Config::organisms";

my $feat_file      = "$temp_dir/tmpfeat$$";      # rows for "features"
my $alias_raw_file = "$temp_dir/tmpalias$$";     # sorted, unfiltered alias lines
my $alias_file     = "$temp_dir/tmpalias$$.1";   # rows for "ext_alias"

# Aliases in a tbl file that are accepted as database cross references:
# RefSeq-style accessions (NP_, XP_, YP_, ZP_, AP_) and the gi|, sp|, tr|,
# kegg| and uni| prefixes.
my $dbxref_alias_re = qr/^(?:[NXYZA]P_|gi\||sp\||tr\||kegg\||uni\|)/;

open(my $rel_fh, '>', $feat_file)
    || die "could not open $temp_dir/tmpfeat$$";

# Alias lines are piped through "sort -u" so that duplicates collapse and all
# lines of one feature id come out together for the filtering pass.
open(my $alias_fh, '|-', "sort -T $temp_dir -u > $alias_raw_file")
    || die "could not open $temp_dir/tmpalias$$";

my $dbf          = $fig->{_dbf};
my $full_rebuild = (@ARGV == 0);
my @genomes;

if ($full_rebuild) {
    $dbf->drop_table( tbl => "features" );
    $dbf->drop_table( tbl => "ext_alias" );

    $dbf->create_table( tbl  => 'ext_alias',
                        flds => "id varchar(32), alias varchar(32), genome varchar(16)"
                      );

    # Only the column type of "location" differs between the two supported
    # back ends.  For any other $FIG_Config::dbms no "features" table is
    # created here (same as before).
    my %location_type_for = ( Pg => "varchar(5000)", mysql => "TEXT" );
    if (my $location_type = $location_type_for{$FIG_Config::dbms}) {
        $dbf->create_table( tbl  => "features",
                            flds => "id varchar(32), idN INTEGER, type varchar(16),genome varchar(16),"
                                  . "location $location_type,"
                                  . "contig varchar(96), minloc INTEGER, maxloc INTEGER,"
                                  . "aliases TEXT"
                          );
    }

    @genomes = $fig->genomes;

    # Here we extract external aliases from the peg.synonyms table, when they
    # can be inferred accurately.
    _emit_synonym_aliases("$FIG_Config::global/peg.synonyms", $alias_fh);
}
else {
    @genomes = @ARGV;
    # NOTE(review): the genome IDs from the command line are interpolated
    # directly into the SQL text; they are presumably trusted -- confirm.
    foreach my $genome (@genomes) {
        $dbf->SQL("DELETE FROM features WHERE ( genome = '$genome' )");
        $dbf->SQL("DELETE FROM ext_alias WHERE ( genome = '$genome' )");
    }
}

my $changes = _read_location_changes("$FIG_Config::global/changed.location.features");

foreach my $genome (@genomes) {
    _emit_genome_features($genome, $changes, $rel_fh, $alias_fh);
}

# Buffered write errors (and a failing sort) only surface at close time; the
# files are about to be loaded into the database, so do not continue with
# truncated data.
close($rel_fh)   || die "could not write $feat_file";
close($alias_fh) || die "sort into $alias_raw_file failed";

_filter_aliases($alias_raw_file, $alias_file);
unlink($alias_raw_file);

$dbf->load_table( tbl => "features",  file => $feat_file );
$dbf->load_table( tbl => "ext_alias", file => $alias_file );

if ($full_rebuild) {
    my @indexes = (
        # index name             table        fields
        [ "ext_alias_alias_ix",  "ext_alias", "alias"                ],
        [ "ext_alias_genome_ix", "ext_alias", "genome"               ],
        [ "ext_alias_id_ix",     "ext_alias", "id"                   ],
        [ "features_id_ix",      "features",  "id"                   ],
        [ "features_org_ix",     "features",  "genome"               ],
        [ "features_type_ix",    "features",  "type"                 ],
        [ "features_beg_ix",     "features",  "genome,contig,minloc" ],
    );
    foreach my $index (@indexes) {
        my($idx,$tbl,$flds) = @$index;
        $dbf->create_index( idx => $idx, tbl => $tbl, type => "btree", flds => $flds );
    }
    $dbf->vacuum_it("features")
}

unlink($feat_file);
unlink($alias_file);

# Write "peg <TAB> alias <TAB> genome" lines to $out_fh for every external id
# that peg.synonyms ties to a fig| id.
#
# Each input line is "id,len <TAB> id,len;id,len;...".  A non-fig id is
# attributed to a fig id when that fig id is the only one on the line, or when
# both have the same length.  Entries that do not parse as "id,len" are
# ignored (previously they silently reused the captures of the preceding
# entry).  Dies if $syn_file cannot be opened.
sub _emit_synonym_aliases {
    my($syn_file,$out_fh) = @_;

    open(my $syn_fh, '<', $syn_file) || die "could not open $syn_file";
    while (defined(my $line = <$syn_fh>)) {
        chomp $line;
        my($head,$rest) = split(/\t/,$line);
        next unless defined($head);

        my @entries = ($head, defined($rest) ? split(/;/,$rest) : ());
        my @ids     = map { ($_ =~ /^([^,]+),(\d+)/) ? [$1,$2] : () } @entries;
        my @fig     = grep { $_->[0] =~ /^fig\|/ } @ids;
        my @nonfig  = grep { $_->[0] !~ /^fig\|/ } @ids;

        foreach my $fig_entry (@fig) {
            my($peg,$peg_ln) = @$fig_entry;
            my $genome = &FIG::genome_of($peg);
            foreach my $other (@nonfig) {
                if ((@fig == 1) || ($peg_ln == $other->[1])) {
                    print {$out_fh} "$peg\t$other->[0]\t$genome\n";
                }
            }
        }
    }
    close($syn_fh);
}

# Return a hash ref mapping feature id => number of leading tbl entries to
# skip, read from the changed.location.features file.  A missing or
# unreadable file simply yields an empty hash.
sub _read_location_changes {
    my($file) = @_;

    my %pending;
    if (open(my $fh, '<', $file)) {
        while (defined(my $line = <$fh>)) {
            if ($line =~ /^(fig\|\d+\.\d+\.[a-zA-Z]+\.\d+)/) {
                $pending{$1}++;
            }
        }
        close($fh);
    }
    return \%pending;
}

# Read every non-empty Features/<type>/tbl file of $genome and write
#   - one "features" row per usable entry to $rel_fh, and
#   - one OVERRIDE alias line per dbxref-like alias to $alias_fh.
#
# $changes is the hash ref from _read_location_changes; it is decremented as
# obsolete entries are skipped.  Dies if the Features directory of the genome
# cannot be opened; a tbl file that cannot be opened is skipped.
sub _emit_genome_features {
    my($genome,$changes,$rel_fh,$alias_fh) = @_;

    my $feature_dir = "$organisms_dir/$genome/Features";
    opendir(my $dh, $feature_dir) || die "could not open $genome/Features";
    my @types = grep { $_ =~ /^[a-zA-Z]+$/ } readdir($dh);
    closedir($dh);

    foreach my $type (@types) {
        my $tbl = "$feature_dir/$type/tbl";
        next unless (-s $tbl);
        open(my $tbl_fh, '<', $tbl) or next;

        while (defined(my $line = <$tbl_fh>)) {
            chomp $line;
            my($id,$loc,@aliases) = split(/\t/,$line);
            next unless $id;

            # check for obsolete entries due to location changes
            if ($changes->{$id}) {
                $changes->{$id}--;
                next;
            }

            # These are per-row on purpose: a row without a location must not
            # inherit the contig of the row before it.
            my($contig,$minloc,$maxloc);
            if ($loc) {
                $loc =~ s/\s+$//;
                ($contig,$minloc,$maxloc) = &FIG::boundaries_of($loc);
                if ($minloc && $maxloc && ($minloc > $maxloc)) {
                    ($minloc,$maxloc) = ($maxloc,$minloc);
                }
            }
            if (! $contig) {
                $loc = $contig = $minloc = $maxloc = "";
            }

            my $aliases = "";
            if (@aliases > 0) {
                $aliases = join(",",grep(/\S/,@aliases));
                foreach my $alias (@aliases) {
                    if ($alias =~ $dbxref_alias_re) {
                        print {$alias_fh} "$id\t$alias\t$genome\tOVERRIDE\n";
                    }
                }
            }

            $minloc ||= 0;
            $maxloc ||= 0;

            # Rows that would not fit the column widths, or whose id does not
            # end in a number (needed for idN), are dropped.
            if ((length($loc) < 5000) && (length($contig) < 96) && (length($id) < 32) && ($id =~ /(\d+)$/)) {
                print {$rel_fh} "$id\t$1\t$type\t$genome\t$loc\t$contig\t$minloc\t$maxloc\t$aliases\n";
            }
        }
        close($tbl_fh);
    }
}

# Turn the sorted alias lines in $in_file into "ext_alias" rows in $out_file.
#
# Consecutive lines with the same feature id form a group; each group is
# written by _write_alias_group.  The OVERRIDE marker is the optional FOURTH
# column (the third is the genome and is present on every line).  Lines that
# do not start with "id <TAB> alias" are skipped.  Dies if either file cannot
# be opened or the output cannot be written.
sub _filter_aliases {
    my($in_file,$out_file) = @_;

    open(my $in_fh,  '<', $in_file)  || die "could not open $in_file";
    open(my $out_fh, '>', $out_file) || die "could not open $out_file";

    my $curr;
    my @group;
    while (defined(my $line = <$in_fh>)) {
        next unless ($line =~ /^(\S+)\t(\S+)(?:\t(\S*)(?:\t(\S+))?)?/);
        my($id,$alias,$override) = ($1,$2,$4);

        if (defined($curr) && ($id ne $curr)) {
            _write_alias_group($out_fh,$curr,\@group);
            @group = ();
        }
        $curr = $id;
        push(@group,[$alias,$override ? 1 : 0]);
    }
    _write_alias_group($out_fh,$curr,\@group) if defined($curr);

    close($in_fh);
    close($out_fh) || die "could not write $out_file";
}

# Write the "id <TAB> alias <TAB> genome" rows for one feature.
#
# $group is a ref to a list of [alias, is_override] pairs.  Every OVERRIDE
# alias is written; any other alias is written only if no OVERRIDE alias of
# the same class exists in the group.  The genome is derived from the id.
sub _write_alias_group {
    my($out_fh,$id,$group) = @_;

    my $genome    = &FIG::genome_of($id);
    my @overrides = map { $_->[0] } grep { $_->[1] } @$group;

    foreach my $entry (@$group) {
        my($alias,$is_override) = @$entry;
        if (! $is_override) {
            next if scalar(grep { same_class($alias,$_) } @overrides);
        }
        print {$out_fh} "$id\t$alias\t$genome\n";
    }
}

# True when both aliases have the same, non-empty class (see classA).
sub same_class {
    my($x,$y) = @_;

    my $class1 = classA($x);
    my $class2 = classA($y);
    return ($class1 && ($class1 eq $class2));
}

# Return the class of an alias: the text before the first "|" if there is
# one, "refseq" for accessions of the form [NXYZA]P_<digits and dots>, and
# the empty string for anything else.
sub classA {
    my($alias) = @_;

    if ($alias =~ /^([^\|]+)\|/) {
        return $1;
    }
    elsif ($alias =~ /^[NXYZA]P_[0-9\.]+$/) {
        return "refseq";
    }
    else {
        return "";
    }
}
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |