import xml.sax
import re


class DBLPContentHandler(xml.sax.ContentHandler):
  """
  Reads the dblp.xml file and produces two output files.
        pubFile.txt = (key, pubtype) tuples
        fieldFile.txt = (key, fieldCnt, field, value) tuples
  Each file is tab-separated

  Once the program finishes,  load these two files in a relational database; run createSchema.sql
  """

  def __init__(self):
    xml.sax.ContentHandler.__init__(self)


  def startElement(self, name, attrs):
    if name == "dblp":
      DBLPContentHandler.pubFile = open('pubFile.txt', 'w')
      DBLPContentHandler.fieldFile = open('fieldFile.txt', 'w')
      DBLPContentHandler.pubList = ["article", "inproceedings", "proceedings", "book", "incollection", "phdthesis", "mastersthesis", "www"]
      DBLPContentHandler.fieldList = ["author", "editor", "title", "booktitle", "pages", "year", "address", "journal", "volume", "number", "month", "url", "ee", "cdrom", "cite", "publisher", "note", "crossref", "isbn", "series", "school", "chapter"]
      DBLPContentHandler.content = ""
    if name in DBLPContentHandler.pubList:
      DBLPContentHandler.key = attrs.getValue("key")
      DBLPContentHandler.pub = name
      DBLPContentHandler.fieldCount = 0
      DBLPContentHandler.content = ""
    if name in DBLPContentHandler.fieldList:
      DBLPContentHandler.field = name
      DBLPContentHandler.content = ""
 
  def endElement(self, name):
    if name in DBLPContentHandler.fieldList:
      DBLPContentHandler.fieldFile.write(DBLPContentHandler.key)
      DBLPContentHandler.fieldFile.write("\t")
      DBLPContentHandler.fieldFile.write(str(DBLPContentHandler.fieldCount))
      DBLPContentHandler.fieldFile.write( "\t")
      DBLPContentHandler.fieldFile.write(DBLPContentHandler.field)
      DBLPContentHandler.fieldFile.write("\t")
      DBLPContentHandler.fieldFile.write(DBLPContentHandler.content)
      DBLPContentHandler.fieldFile.write("\n")
      DBLPContentHandler.fieldCount += 1
    if name in DBLPContentHandler.pubList:
      DBLPContentHandler.pubFile.write(DBLPContentHandler.key)
      DBLPContentHandler.pubFile.write("\t")
      DBLPContentHandler.pubFile.write(DBLPContentHandler.pub)
      DBLPContentHandler.pubFile.write("\n")

  def characters(self, content):
    DBLPContentHandler.content += content.replace('\\','\\\\')
      
def main(sourceFileName):
  source = open(sourceFileName)
  xml.sax.parse(source, DBLPContentHandler())
 
if __name__ == "__main__":
  main("dblp.xml")