[Bio] / FigKernelScripts / make_uniprot_corr.pl Repository:
ViewVC logotype

View of /FigKernelScripts/make_uniprot_corr.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Tue Jan 29 19:04:04 2008 UTC (12 years ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.1: +1 -0 lines
build uniprot correspondence

########################################################################
use strict;

my $usage = "usage: make_uniprot_corr UniProt:idmapping.tb ID-corr OrganismCorr ";

# This program takes as an argument 
# 
#      idmapping.tb
# 
# which is a file that can be downloaded from the PIR web site using
# 
#      ftp://ftp.pir.georgetown.edu/databases/idmapping
# 
# The program outputs two correspondences:
# 
#     ID-corr is a 2-column table [UniProt_ac,PEG]
# 
# and
# 
#     OrganismCorr which is of the form [FIG-organism,FIG-genomeID,Uni-Organism,Uni-taxid]
# 

my($idmapF,$id_corrF,$org_corrF);

(
 ($idmapF       = shift @ARGV) &&
 ($id_corrF     = shift @ARGV) &&
 ($org_corrF    = shift @ARGV)
)
    || die $usage;

use FIG;
my $fig = new FIG;

my %complete = map { $_ => 1 } $fig->genomes('complete');

my($fig_org_2_uni_org,$uni_org_2_fig_org) = &pass1($fig,\%complete);
#open(TMP,">tmp.org.corr") || die "bad";
#foreach $_ (keys(%$fig_org_2_uni_org)) { print TMP "$_\t$fig_org_2_uni_org->{$_}\n"; }
#close(TMP);

my($peg2uni,$uni2peg) = &pass2($fig,\%complete,$fig_org_2_uni_org,$uni_org_2_fig_org);
#open(TMP,">tmp.peg.corr") || die "bad";
#foreach $_ (keys(%$peg2uni)) { print TMP "$_\t$peg2uni->{$_}\n"; }
#close(TMP);

open(ORG,">$org_corrF") || die "could not open $org_corrF";
foreach my $orgF (keys(%$fig_org_2_uni_org))
{
    print ORG join("\t",($orgF,$fig_org_2_uni_org->{$orgF})),"\n";
}
close(ORG);

&pass3($fig,\%complete,$peg2uni,$uni2peg,$idmapF);

open(IDCORR,">$id_corrF") || die "could not open $id_corrF";
foreach my $uni (sort keys(%$uni2peg))
{
    print IDCORR join("\t",($uni,$uni2peg->{$uni})),"\n";
}
close(IDCORR);

sub pass1 {
    my($fig,$completeG) = @_;

    my $fig_org_2_uni_org = {};
    my $uni_org_2_fig_org = {};

    my(%counts);
    open(SYN,"<$FIG_Config::global/peg.synonyms")
	|| die "could not open $FIG_Config::global/peg.synonyms";
    while (defined($_ = <SYN>))
    {
	chomp;
	my ($head,$rest) = split(/\t/,$_);
	my @ids = map { $_ =~ /^([^,]+),/; $1 } ($head,split(/;/,$rest));
	my @fig = grep { $_ =~ /fig/ } @ids;
	my @uni = map { $_ =~ /^uni\|(\S+)/; $1 } grep { $_ =~ /^uni/ } @ids;
	if ((@fig == 1) && (@uni == 1))
	{
	    next if (! $completeG->{&FIG::genome_of($fig[0])});
	    my $org1 = $fig->org_of($fig[0]);
	    my $org2 = $fig->org_of("uni|$uni[0]");
	    if ($org1 && $org2)
	    {
		my $key = "$org1\t$org2";
		$counts{$key}++;
	    }
	}
    }
    close(SYN);

    foreach my $key (sort {$counts{$b} <=> $counts{$a} } keys(%counts))
    {
	my($org1,$org2) = split(/\t/,$key);
	if ((! $fig_org_2_uni_org->{$org1}) && (! $uni_org_2_fig_org->{$org2}))
	{
	    $fig_org_2_uni_org->{$org1} = $org2;
	    $uni_org_2_fig_org->{$org2} = $org1;
#	    print STDERR join("\t",($key,$counts{$key})),"\n";
	}
    }
    return ($fig_org_2_uni_org,$uni_org_2_fig_org);
}

sub pass2 {
    my($fig,$complete,$fig_org_2_uni_org,$uni_org_2_fig_org) = @_;

    my(%badF,%badU);
    my $peg2uni = {};
    my $uni2peg = {};

    open(SYN,"<$FIG_Config::global/peg.synonyms")
	|| die "could not open $FIG_Config::global/peg.synonyms";
    while (defined($_ = <SYN>))
    {
	chomp;
	my($head,$rest) = split(/\t/,$_);
	my @ids = map { $_ =~ /^([^,]+),/; $1 } ($head,split(/;/,$rest));
	my @fig = grep { $complete->{&FIG::genome_of($_)} } grep { $_ =~ /fig/ } @ids;
	my @uni = map { $_ =~ /^uni\|(\S+)/; $1 } grep { $_ =~ /^uni/ } @ids;

	if ((@fig == 1) && (@uni == 1))
	{
	    $peg2uni->{$fig[0]} = $uni[0];
	    $uni2peg->{$uni[0]} = $fig[0];
	}
	elsif ((@fig > 0) && (@uni > 0))
	{
	    foreach my $peg (@fig)
	    {
		if (my $org1 = $fig->org_of($peg))
		{
		    my($i,$org2);
		    for ($i=0; ($i < @uni) && ($fig_org_2_uni_org->{$org1} ne 
					       ($org2 = $fig->org_of("uni|$uni[$i]"))); $i++) {}
		    if ($i < @uni)
		    {
			if ($uni2peg->{$uni[$i]}) { $badF{$peg} = $badU{$uni[$i]} = 1 }
			$uni2peg->{$uni[$i]} = $peg;
			$peg2uni->{$peg} = $uni[$i];
		    }
		}
	    }
	}
    }
    close(SYN);

    foreach my $uni (keys(%badU))
    {
	delete $uni2peg->{$uni};
    }
    foreach my $peg (keys(%badF))
    {
	delete $peg2uni->{$peg};
    }
    return ($peg2uni,$uni2peg);
}


sub pass3 {
    my($fig,$completeG,$peg2uni,$uni2peg,$idmapF) = @_;

    my(%badF,%badU,$genome);
    my(%gi2fig);

    foreach $genome (keys(%$completeG))
    {
	if (open(TBL,"<$FIG_Config::organisms/$genome/Features/peg/tbl"))
	{
	    while (defined($_ = <TBL>))
	    {
		chomp;
		my($peg,undef,@aliases) = split(/\t/,$_);
		if (! $fig->is_deleted_fid($peg))
		{
		    if (@aliases > 0)
		    {
			my @gi = map { ($_ =~ /^gi\|(\S+)/) ? $1 : () } @aliases;
			if (@gi == 1)
			{
			    $gi2fig{$gi[0]} = $peg;
			}
		    }
		}
	    }
	    close(TBL);
	}
    }

    my($peg,$gi,$uni,$gIs,$x);
    open(IDMAP,"<$idmapF") || die "could not open $idmapF";
    while (defined($_ = <IDMAP>))
    {
	chomp;
	($uni,undef,undef,undef,$gIs) = split(/\t/,$_);
	my @gi = split(/; /,$gIs);
	foreach $gi (@gi)
	{
	    if ($peg = $gi2fig{$gi})
	    {
		if ($x = $peg2uni->{$peg})
		{ 
		    if ($x ne $uni)
		    {
			$badF{$peg} = $badU{$x} = $badU{$uni}  = 1;
			if ($x = $uni2peg->{$uni}) { $badF{$x} = 1 }
		    }
		}
		else
		{
		    $peg2uni->{$peg} = $uni;
		    $uni2peg->{$uni} = $peg;
		}
	    }
	}
    }
    close(IDMAP);

    foreach $peg (keys(%badF))
    {
	delete $peg2uni->{$peg};
    }

    foreach $uni (keys(%badU))
    {
	if ($uni2peg->{$uni}) { delete $uni2peg->{$uni} }
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3