Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Source: parse_drugbank.py
- # -*- coding: utf-8 -*-
- import os
- import xml.sax
- DATA_DIR = "./data/"
class DrugXmlContentHandler(xml.sax.ContentHandler):
    """SAX handler that collects the generic name of every drug.

    The handler keeps a stack of the currently open element names and
    records the text found at the path ``drugbank/drug/name``.

    Attributes:
        tags: stack of the element names that are currently open.
        generic_names: text of every ``drugbank/drug/name`` element, in
            document order.
        brand_names: always left empty; brand names (found under
            ``drugbank/drug/products/product/name``) are not collected.
    """

    # Element path whose text content is recorded as a generic name.
    GENERIC_NAME_PATH = "drugbank/drug/name"

    def __init__(self):
        # Explicit base-class call (rather than super()) so this also works
        # where xml.sax.ContentHandler is an old-style class.
        xml.sax.ContentHandler.__init__(self)
        self.tags = []
        self.generic_names = []
        self.brand_names = []
        # Pieces of the generic name currently being read.
        self._name_chunks = []

    def _flush_name(self):
        """Store the buffered text pieces, if any, as one generic name."""
        if self._name_chunks:
            self.generic_names.append("".join(self._name_chunks))
            self._name_chunks = []

    def startElement(self, name, attrs):
        """Push ``name`` on the tag stack; ``attrs`` is ignored."""
        # An opening tag ends the current run of character data.
        self._flush_name()
        self.tags.append(name)

    def endElement(self, name):
        """Pop the innermost open element from the tag stack."""
        # A closing tag ends the current run of character data.
        self._flush_name()
        self.tags.pop()

    def characters(self, content):
        """Buffer ``content`` when it belongs to a generic drug name.

        A SAX parser may deliver one run of text in several calls, so the
        pieces are only joined into a name at the next element boundary
        instead of being appended one by one.
        """
        if "/".join(self.tags) == self.GENERIC_NAME_PATH:
            self._name_chunks.append(content)
def write_list_to_file(lst, filename):
    """Write the elements of ``lst`` to a file, one per line, as UTF-8.

    Args:
        lst: iterable of text strings; each one is encoded with UTF-8.
        filename: name of the output file, created inside ``DATA_DIR``
            (an existing file is overwritten).
    """
    path = os.path.join(DATA_DIR, filename)
    # The ``with`` block closes the file even when a write fails.
    with open(path, 'wb') as fout:
        for e in lst:
            # Join bytes with bytes: formatting the encoded value through a
            # text "%s" template produces a str on Python 3, which a binary
            # file handle rejects.
            fout.write(e.encode("utf-8") + b"\n")
# Parse DATA_DIR/drugbank.xml and dump the collected generic names.
# The ``with`` block closes the input file even if parsing raises.
with open(os.path.join(DATA_DIR, "drugbank.xml"), 'rb') as source:
    handler = DrugXmlContentHandler()
    xml.sax.parse(source, handler)
write_list_to_file(handler.generic_names, "generic_names.txt")
# Brand-name export is disabled:
#write_list_to_file(handler.brand_names, "brand_names.txt")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement