# wrapper.py (2.43 KiB)
# Authored by Dan Suciu (commit 72bcaa8b).
import xml.sax
import re
class DBLPContentHandler(xml.sax.ContentHandler):
    """SAX handler that flattens the dblp.xml file into two output files.

    Both files are tab-separated, UTF-8 encoded, and created in the current
    working directory when the ``<dblp>`` root element is opened:

    * ``pubFile.txt``   = (key, pubtype) tuples, one per publication element
    * ``fieldFile.txt`` = (key, fieldCnt, field, value) tuples, one per field
      element; ``fieldCnt`` is the zero-based position of the field within
      its publication

    Backslashes in field values are doubled; no other escaping is applied.

    Both files are closed when the ``</dblp>`` end tag (or the end of the
    document) is reached, so every row is flushed to disk before parsing
    returns.  All parse state lives on the handler instance, so independent
    handlers do not interfere with each other.

    Once the program finishes, load these two files in a relational
    database; run createSchema.sql.
    """

    # Element names that introduce a publication record.
    pubList = ["article", "inproceedings", "proceedings", "book",
               "incollection", "phdthesis", "mastersthesis", "www"]

    # Element names that are emitted as fields of the current publication.
    fieldList = ["author", "editor", "title", "booktitle", "pages", "year",
                 "address", "journal", "volume", "number", "month", "url",
                 "ee", "cdrom", "cite", "publisher", "note", "crossref",
                 "isbn", "series", "school", "chapter"]

    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        self.pubFile = None      # output handle for pubFile.txt
        self.fieldFile = None    # output handle for fieldFile.txt
        self.key = None          # "key" attribute of the current publication
        self.pub = None          # element name of the current publication
        self.field = None        # name of the most recently opened field
        self.fieldCount = 0      # fields written so far for this publication
        self.content = ""        # text accumulated since the last reset

    def _close_outputs(self):
        """Close both output files if they are open; safe to call twice."""
        for handle in (self.pubFile, self.fieldFile):
            if handle is not None and not handle.closed:
                handle.close()

    def startElement(self, name, attrs):
        """Open the outputs on ``<dblp>``; track publication/field starts."""
        if name == "dblp":
            # Explicit UTF-8 so non-ASCII text never depends on the locale.
            self.pubFile = open("pubFile.txt", "w", encoding="utf-8")
            self.fieldFile = open("fieldFile.txt", "w", encoding="utf-8")
            self.content = ""
        if name in self.pubList:
            self.key = attrs.getValue("key")
            self.pub = name
            self.fieldCount = 0
            self.content = ""
        if name in self.fieldList:
            self.field = name
            self.content = ""

    def endElement(self, name):
        """Emit one row per closed field/publication; close files at the end."""
        if name in self.fieldList:
            row = (self.key, str(self.fieldCount), self.field, self.content)
            self.fieldFile.write("\t".join(row) + "\n")
            self.fieldCount += 1
        if name in self.pubList:
            self.pubFile.write(self.key + "\t" + self.pub + "\n")
        if name == "dblp":
            self._close_outputs()

    def endDocument(self):
        """Safety net: make sure the outputs are closed when parsing ends."""
        self._close_outputs()

    def characters(self, content):
        """Accumulate element text, doubling every backslash."""
        self.content += content.replace('\\', '\\\\')
def main(sourceFileName):
    """Parse the XML file at *sourceFileName* with a DBLPContentHandler.

    The handler writes its results to pubFile.txt and fieldFile.txt in the
    current working directory.

    Args:
        sourceFileName: path of the XML file to read.

    Raises:
        OSError: if the source file cannot be opened.
        xml.sax.SAXParseException: if the document is not well-formed XML.
    """
    # Binary mode hands raw bytes to the parser, so the document is decoded
    # with the encoding declared in its XML prolog rather than the platform's
    # default text encoding.  The with-block guarantees the file is closed
    # even when parsing fails.
    with open(sourceFileName, "rb") as source:
        xml.sax.parse(source, DBLPContentHandler())
if __name__ == "__main__":
    # Script entry point: convert the dblp.xml found in the working directory.
    default_source = "dblp.xml"
    main(default_source)