[Bio] / FigKernelScripts / iedb.pl Repository:
ViewVC logotype

View of /FigKernelScripts/iedb.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (download) (as text) (annotate)
Sat Aug 19 21:39:18 2006 UTC (13 years, 2 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.9: +5 -2 lines
RAE: bugger, correcting the array dereferencing

#__perl__


=pod

=head1 iedb.pl

The IEDB is the Immune Epitope Database, and we should play nice with them. This parser will grab all the data from their XML files, and enter it into the SEED. You can download the IEDB data from here: http://www.immuneepitope.org/downloadDocuments.do

The basis of the IEDB is a manually curated database that relies on people reading the literature, just like PIR. 

Currently we grab the following data:
    Literature		: They have manually curated these so we add them
    Epitopes  		: We keep information on both the raw sequence of the epitope and the epitope name. These are added as attributes
    Links to IEDB	: At the moment some links are set to IEDB

Some of this data was originally stored as attributes but has since been migrated to links.


=cut

use strict;
use FIG;
use Tracer;
# Handle used for every read and write against the SEED in this script.
my $fig = FIG->new;

# There is probably an XML parser available somewhere, but for now this
# script does its own simple line-by-line parsing.
#

# Parse the command line. The only option declared is -file, the IEDB XML
# dump to load. Whatever else StandardSetup returns is collected in @genomes,
# which this script never reads.
my ($options, @genomes) = StandardSetup([],
	{ 
	    file => ['', "XML file from http://www.immuneepitope.org/downloadDocuments.do"],
	}, "",
	@ARGV);

die "You must specify an XML file.\n\tFor example: iedb -file=IEDB_2006_6_13_0_30_1.xml\n Use $0 -h for more information\n" unless ($options->{'file'});

# Three-argument open: the file name is always treated as a plain path to read,
# never as a mode or a pipe. The handle stays the bareword IN because the
# parsing loop below reads from it. $! reports why the open failed.
open(IN, '<', $options->{'file'}) || die "Can't open ".$options->{'file'}.": $!";

# Parser state for the reference currently being read:
#   $reference - the Reference_Id
#   $article   - hash ref of attributes from the <Article> element
#   $genbank   - hash ref whose keys are GenBank GI numbers
#   $swiss     - hash ref whose keys are Swiss-Prot accession numbers
#   $epitope   - array ref of hash refs, one per <Epitope> element
my ($reference, $article, $genbank, $swiss, $epitope);


print STDERR "Deleting old IEDB links and attributes\nDeleting links ...\n";
# Look up the existing links under both spellings of the tag before removing
# anything, then delete every link that was found.
my @old_link_sets = (
    $fig->fids_with_link_to("iedb"),
    $fig->fids_with_link_to("IEDB"),
);
for my $link_set (@old_link_sets)
{
    for my $old_link (@$link_set)
    {
        # each entry is expanded into the argument list of delete_fid_link
        $fig->delete_fid_link(@$old_link);
    }
}

# Wipe the attributes this script writes, so the load below starts clean.
print STDERR "Deleting attributes ...\n";
my @iedb_attribute_keys = (
    "iedb_epitopename",
    "iedb_epitopelinearsequence",
    "iedb_smiles_structure",
);
for my $attribute_key (@iedb_attribute_keys)
{
    $fig->erase_attribute_entirely($attribute_key);
}


# Read the XML one line at a time, accumulating the article, source ids and
# epitopes that belong to the current <Reference>. The accumulated data is
# handed to distribute_data when the next <Reference> starts, and once more
# after the loop for the final reference in the file.
while (<IN>)
{
    if (/<Reference.*Reference_Id="(\d+)">/) {
        my $newreference=$1;
        # a new reference begins: store what was gathered for the previous one
        distribute_data($reference, $article, $genbank, $swiss, $epitope) if ($reference);
        $reference=$newreference;
        ($article, $genbank, $swiss, $epitope)=(undef, undef, undef, undef);
    }
    if (/<Article/)
    {
        # parse the article record. Each name="value" pair is removed from $_
        # as it is stored, so the loop ends when none are left.
        # NOTE(review): the pattern needs at least one character between the
        # quotes, so an empty attribute (name="") is not parsed as empty.
        while (s/(\S+)=\"(.*?[^\\"])\"//) {$article->{$1}=$2}
    }
    if (/<Source>/)
    {
        # the source identifier is on the line after the <Source> tag
        $_=<IN>;
        # a file truncated right after <Source> has nothing more to read
        last unless (defined $_);
        if (/<Genbank Genbank_GI_Number="(\d+)"/) {$genbank->{$1}=1}
        elsif (/<Swiss_Prot Primary_Accession_Number="(\S+)"/) {$swiss->{$1}=1}
        elsif (/IEDB_Source_Number/) { } # ignore these
        else {print STDERR "Not sure of source $_"}
    }


    if (/<Epitope /)
    {
        # collect all attributes of this epitope element into one hash
        my $hash;
        while (s/(\S+)=\"(.*?[^\\"])\"//) {$hash->{$1}=$2}
        push @$epitope, $hash;
    }

}

# The loop only flushes when it meets the NEXT <Reference>, so the last
# reference in the file has to be stored here.
distribute_data($reference, $article, $genbank, $swiss, $epitope) if ($reference);

close(IN);
    


# distribute_data($reference, $article, $genbank, $swiss, $epitope)
#
# Store the data gathered for one IEDB reference against every feature that
# one of its source identifiers maps to.
#
# Parameters:
#   $reference - the IEDB Reference_Id (used in messages only)
#   $article   - hash ref of <Article> attributes; Pubmed_Id is read from it
#   $genbank   - hash ref whose keys are GenBank GI numbers
#   $swiss     - hash ref whose keys are Swiss-Prot accession numbers
#   $epitope   - array ref of hash refs of <Epitope> attributes
# Any of the four collections may be undef when the reference had none.
#
# For each identifier that resolves to a feature, the first epitope whose
# Linear_Sequence occurs in the feature's translation is used to add the
# iedb_* attributes; a PUBMED attribute is added unless already present; and
# a link is added for every epitope that has an IEDB_Id. Features onto which
# no epitope can be mapped are reported on STDERR and skipped.
#
# Uses the file-level $fig handle. Returns nothing meaningful.
sub distribute_data {
    my ($reference, $article, $genbank, $swiss, $epitope)=@_;

    # make the "nothing collected" case explicit instead of relying on
    # autovivification of undef references
    $article ||= {};
    $genbank ||= {};
    $swiss   ||= {};
    $epitope ||= [];

    # Normalize the ids to the alias form used in the db. The prefix is taken
    # from the hash each key came from, so an accession that happens to occur
    # in both hashes is looked up under both prefixes. More dbs can be added
    # to this list.
    my @extids = (
        (map { "gi|".$_ } keys %$genbank),
        (map { "sp|".$_ } keys %$swiss),
    );

    foreach my $extid (@extids)
    {
        my $fid=$fig->by_alias($extid);
        unless ($fid) {$fid=$fig->by_raw_alias($extid)} # use Ed's alternate search, just in case we missed something cool.
        next unless ($fid);

        # fetch the translation once per feature rather than once per epitope
        my $translation=$fig->get_translation($fid);
        $translation='' unless (defined $translation);

        # we need to look through the linear sequence for the epitope information
        my $mappedepi;
        foreach my $epi (@$epitope)
        {
            my $ls=$epi->{"Linear_Sequence"};
            # An empty pattern would not mean "match nothing" to perl (it
            # re-uses the last successful pattern), so skip epitopes without
            # a sequence.
            next unless (defined $ls && length $ls);
            # \Q..\E: the sequence is literal text, not a regular expression
            if ($translation =~ /\Q$ls\E/i) {$mappedepi=$epi; last}
        }

        unless ($mappedepi)
        {
            print STDERR "Unable to map any epitope information onto $fid, From reference $reference we looked for: \n",
            join("\n\t", map {$_->{"Linear_Sequence"}} @$epitope),
            "\nfrom ", $translation, "\n";

            next;
        }

        if ($mappedepi->{Epitope_Name})
        {
            $fig->add_attribute($fid, "iedb_epitopename", $mappedepi->{Epitope_Name}, undef);
            #my $url='http://www.immuneepitope.org/intermediateQuery.do?&tcellActive=on&mhcCat=on&natProcCat=on&tcellCat=on&bcellActive=on&antibodyMonoType=on&antibodyPolyType=on&literatureType=on&patentType=on&submissionType=on&dataType=+&epitopeLinearSequence='.$mappedepi->{"Linear_Sequence"};
            my $url='http://www.immuneepitope.org/httpQuery.do?dispatch=runquery&elinseq='.$mappedepi->{"Linear_Sequence"};
            $fig->add_attribute($fid, "iedb_epitopelinearsequence", $mappedepi->{"Linear_Sequence"}, $url);
        }


        if ($mappedepi->{Smiles_Structure} && ($mappedepi->{Smiles_Structure} ne "Not Applicable"))
        {
            $fig->add_attribute($fid, "iedb_smiles_structure", $mappedepi->{Smiles_Structure}, undef);
        }

        if ($article->{Pubmed_Id})
        {
            # now add literature links
            # first, lets see what we have
            my @attributes=$fig->get_attributes($fid, "PUBMED");
            my $got;
            foreach my $attr (@attributes)
            {
                # element 2 of each returned attribute is compared with the PubMed id
                if ($attr->[2] eq $article->{Pubmed_Id})
                {
                    print STDERR "Already had article ", $attr->[2], " for $fid\n";
                    $got=1;
                }
            }
            unless ($got)
            {
                # we want to add it
                $fig->add_attribute($fid, "PUBMED", $article->{Pubmed_Id}, 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=retrieve&db=pubmed&dopt=abstract&list_uids='.$article->{Pubmed_Id});
            }
        }

        # now add an attribute for the IEDB
        #$fig->add_attribute($fid, "iedb", $reference, 'http://www.immuneepitope.org/reportingDetails.do?dispatch=showReportingReferenceDetails&rid='.$reference.'&rtype=1&edate=86558054400000');

        # add the link to the IEDB database
        #$fig->add_attribute($fid, "iedb", $reference, 'http://www.immuneepitope.org/httpQuery.do?dispatch=runquery&eid='.$reference);

        # now add a link to the database using the IEDB_Id
        # (one link per epitope of this reference, not only the mapped one)
        foreach my $epi (@$epitope)
        {
            if ($epi->{'IEDB_Id'})
            {
                # NOTE(review): the \" sequences sit inside single quotes, so the
                # backslashes are kept literally in the link text -- TODO confirm
                # that this is what add_fid_link expects.
                my $link='<a href=\"http://www.immuneepitope.org/httpQuery.do?dispatch=runquery&eid='.$epi->{'IEDB_Id'}.'\" target=\"_blank\">IEDB record:'.$epi->{'IEDB_Id'}."</a>";
                $fig->add_fid_link($fid, $link);
            }
        }

        Trace("Added details for $fid in $reference") if T(2);
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3