[Bio] / FigKernelScripts / parse_mart.py Repository:
ViewVC logotype

View of /FigKernelScripts/parse_mart.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (download) (as text) (annotate)
Mon Dec 5 18:56:37 2005 UTC (13 years, 11 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.5: +17 -0 lines
Add license words.

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info or the Fellowship for Interpretation of
# Genomes at veronika or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

#
# This supports embl2gff by reading data from ensembl mart files to
# pull out things we want to load into the gff alias and attributes but
# which are not in the EMBL file.  For example, the cytogenetic
# location is not in the EMBL file.
#
# We just write a file of lines where each line is a tab separated
# 3-tuple (name, Alias|Attribute, text).  Name is a key that will
# be rendezvoused on in embl2gff.  For example, we collate the
# cytogenetic location via the ENSG gene id in human.
#
# Write output to standard out so that user can concat
# onto a growing file of extra-info to be fed into
# embl2gff
#
# At the moment, we handle the cytogenetic location
# (chromosome name and band), transcript descriptions,
# and UniProt accessions.
#
#
# Read ensembl mart file with gene information, e.g.,
#       hsapiens_gene_ensembl__gene__main.txt.table
# and pull out the chromosome name, map information.
#

import sys
import os

# Set to a true value to dump every named column of each gene row
# while the gene table is being processed.
__debug = 0

# Exactly two arguments are required: the directory holding the mart
# table files and the prefix shared by the file set.
if len(sys.argv) != 3:
    # The usage text goes to stderr: stdout carries the generated records
    # and is meant to be appended to a data file, so it must stay clean.
    sys.stderr.write("usage:  parse_mart dirbase fileSetPrefix\n"
                     "For example, parse_mart /sandbox/embl hsapiens\n")
    sys.exit(1)

dirName = sys.argv[1]  # directory containing the *.txt.table files
orgName = sys.argv[2]  # file-set prefix, e.g. "hsapiens"

##
# Handle <org>_gene_ensembl__gene__main.txt.table files from
# ensembl mart.
#
# The various orgs differ in their schema for this table *after*
# the columns we care about, but the first 14 columns are same
# (semantically even if a little different in # chars allowed, etc.)
#
# The first 14 columns are as follows (need regression test):
##

# Column positions of the leading fields of
# <org>_gene_ensembl__gene__main.txt.table.
gene_id_key = 0
gene_stable_id = 1
gene_stable_id_v = 2
biotype = 3
source = 4
confidence = 5
display_xref_id = 6
gene_chrom_start = 7
gene_chrom_end = 8
chrom_strand = 9
chromosome_id = 10
chr_name = 11
description = 12
band = 13

filename = os.path.join(dirName,
                        orgName + "_gene_ensembl__gene__main.txt.table")

# For every gene row, write a ChromName alias keyed on the stable gene id,
# plus a Band alias when the band column holds a value.
#
# open() is used instead of the Python-2-only file() builtin, and
# try/finally (rather than `with`) guarantees the handle is closed without
# requiring a newer interpreter than the rest of the script does.
fd = open(filename, 'r')
try:
    # Iterating the file reads to real end-of-file.  The previous
    # readline()[:-1] loop stopped at the first blank line and chopped the
    # last character off a final line that had no trailing newline.
    for line in fd:
        line = line.rstrip('\n')
        if not line:
            # A blank line carries no record; skip it and keep reading.
            continue
        cols = line.split('\t')

        if __debug:
            # Dump every named column of the row, one "name = value" line
            # each, in schema order.
            for label, position in (
                    ("gene_id_key", gene_id_key),
                    ("gene_stable_id", gene_stable_id),
                    ("gene_stable_id_v", gene_stable_id_v),
                    ("biotype", biotype),
                    ("source", source),
                    ("confidence", confidence),
                    ("display_xref_id", display_xref_id),
                    ("gene_chrom_start", gene_chrom_start),
                    ("gene_chrom_end", gene_chrom_end),
                    ("chrom_strand", chrom_strand),
                    ("chromosome_id", chromosome_id),
                    ("chr_name", chr_name),
                    ("description", description),
                    ("band", band)):
                print("%s = %s" % (label, cols[position]))

        print("%s\tAlias\tChromName:%s" % (cols[gene_stable_id],
                                           cols[chr_name]))
        # A backslash followed by N marks an empty field (presumably the
        # database-dump NULL convention -- confirm against the mart docs).
        # It is written "\\N" so the literal is a plain two-character
        # string on every Python version.
        if cols[band] != "\\N":
            print("%s\tAlias\tBand:%s" % (cols[gene_stable_id], cols[band]))
finally:
    fd.close()

##
# Handle <org>_gene_ensembl__transcript__main.txt.table
# ensembl mart files.  These hold transcript level info.
#
##

filename = os.path.join(dirName,
                        orgName + "_gene_ensembl__transcript__main.txt.table")

# Column positions of the leading fields of
# <org>_gene_ensembl__transcript__main.txt.table (more columns follow).
transcript_id_key = 0
description = 1
transcript_stable_id = 2
transcript_stable_id_v = 3
translation_id = 4
translation_stable_id = 5
translation_stable_id_v = 6
gene_id_key = 7
gene_stable_id = 8
gene_stable_id_v = 9

# For every transcript row that has a description, write it as an alias
# keyed on "<transcript stable id>_function".
#
# open() is used instead of the Python-2-only file() builtin, and
# try/finally (rather than `with`) guarantees the handle is closed without
# requiring a newer interpreter than the rest of the script does.
fd = open(filename, 'r')
try:
    # Iterating the file reads to real end-of-file.  The previous
    # readline()[:-1] loop stopped at the first blank line and chopped the
    # last character off a final line that had no trailing newline.
    for line in fd:
        line = line.rstrip('\n')
        if not line:
            # A blank line carries no record; skip it and keep reading.
            continue
        cols = line.split('\t')
        # A backslash followed by N marks an empty field (presumably the
        # database-dump NULL convention -- confirm against the mart docs).
        # It is written "\\N" so the literal is a plain two-character
        # string on every Python version.
        if cols[description] != "\\N":
            print("%s_function\tAlias\t%s" % (cols[transcript_stable_id],
                                              cols[description]))
finally:
    fd.close()




##
# Handle <org>_gene_ensembl__xref_uniprot_accession__dm.txt.table
# ensembl mart files.  These hold uniprotKB primary accession info.
#
##

filename = os.path.join(dirName,
                        orgName + "_gene_ensembl__xref_uniprot_accession__dm.txt.table")

# Column positions within
# <org>_gene_ensembl__xref_uniprot_accession__dm.txt.table.
gene_id_key = 0
gene_stable_id = 1
transcript_id_key = 2
transcript_stable_id = 3
translation_id = 4
translation_stable_id = 5
display_id_list = 6

# For every row that has a display id list, write it as a UniProt alias
# keyed on the transcript stable id.
#
# open() is used instead of the Python-2-only file() builtin, and
# try/finally (rather than `with`) guarantees the handle is closed without
# requiring a newer interpreter than the rest of the script does.
fd = open(filename, 'r')
try:
    # Iterating the file reads to real end-of-file.  The previous
    # readline()[:-1] loop stopped at the first blank line and chopped the
    # last character off a final line that had no trailing newline.
    for line in fd:
        line = line.rstrip('\n')
        if not line:
            # A blank line carries no record; skip it and keep reading.
            continue
        cols = line.split('\t')
        # A backslash followed by N marks an empty field (presumably the
        # database-dump NULL convention -- confirm against the mart docs).
        # It is written "\\N" so the literal is a plain two-character
        # string on every Python version.
        if cols[display_id_list] != "\\N":
            print("%s\tAlias\tUniProt:%s" % (cols[transcript_stable_id],
                                             cols[display_id_list]))
finally:
    fd.close()

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3