Skip to content
Snippets Groups Projects
Commit 2f957172 authored by Dan Suciu
Browse files

working files

parent 9c39bf36
No related branches found
No related tags found
No related merge requests found
This diff is collapsed.
-- Staging tables for the two tab-separated files written by the DBLP
-- SAX wrapper. Every column is plain text.

-- One row per publication: (key, publication type).
CREATE TABLE Pub (
    k text,
    p text
);

-- One row per field of a publication: (key, field counter, field name, value).
CREATE TABLE Field (
    k text,
    i text,
    p text,
    v text
);

-- Bulk-load the generated files into the staging tables.
COPY Pub FROM 'pubFile.txt';
COPY Field FROM 'fieldFile.txt';
#!/usr/bin/python
import psycopg2
def main():
    """Print the first two columns of ten rows from the ``author`` table.

    Connects to the ``dblp`` database on localhost. If the connection cannot
    be established, a message is printed and the function returns without
    running the query. The cursor and the connection are always closed
    before the results are printed.
    """
    try:
        conn = psycopg2.connect("dbname='dblp' user='<YOUR USER NAME>' host='localhost' password=''")
    except psycopg2.Error:
        print("I am unable to connect to the database")
        # Without a connection there is nothing to query; falling through
        # would only fail later on the unbound ``conn``.
        return

    try:
        cur = conn.cursor()
        try:
            cur.execute("SELECT * FROM author LIMIT 10")
            rows = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("Showing first 10 results:\n")
    for row in rows:
        print("%s %s" % (row[0], row[1]))


if __name__ == "__main__":
    main()
import io
import re
import xml.sax
class DBLPContentHandler(xml.sax.ContentHandler):
    """
    Reads the dblp.xml file and produces two output files.
    pubFile.txt = (key, pubtype) tuples
    fieldFile.txt = (key, fieldCnt, field, value) tuples
    Each file is tab-separated and written as UTF-8. Inside a value,
    backslash, tab, newline and carriage return are written as the
    two-character sequences backslash-backslash, backslash-t, backslash-n
    and backslash-r, so one record always occupies exactly one line.
    The output files are opened when the <dblp> root element starts and
    closed when it ends (or at the end of the document).
    Once the program finishes, load these two files in a relational database; run createSchema.sql
    """

    # Element names that start a publication record.
    pubList = ["article", "inproceedings", "proceedings", "book", "incollection", "phdthesis", "mastersthesis", "www"]
    # Element names that are emitted as fields of the current publication.
    fieldList = ["author", "editor", "title", "booktitle", "pages", "year", "address", "journal", "volume", "number", "month", "url", "ee", "cdrom", "cite", "publisher", "note", "crossref", "isbn", "series", "school", "chapter"]

    # Set views of the lists above: membership is tested for every element.
    _pubTags = frozenset(pubList)
    _fieldTags = frozenset(fieldList)

    # Applied in order; the backslash must be doubled first so that the
    # backslashes introduced by the later replacements are not doubled again.
    _ESCAPES = (
        (u"\\", u"\\\\"),
        (u"\t", u"\\t"),
        (u"\n", u"\\n"),
        (u"\r", u"\\r"),
    )

    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        # Parse state is kept per instance so that handlers do not share it.
        self.pubFile = None
        self.fieldFile = None
        self.key = None
        self.pub = None
        self.field = None
        self.fieldCount = 0
        self.content = u""

    def _close_outputs(self):
        """Close whichever output files are currently open."""
        for attr in ("pubFile", "fieldFile"):
            handle = getattr(self, attr)
            if handle is not None:
                handle.close()
                setattr(self, attr, None)

    def startElement(self, name, attrs):
        """Open the outputs at <dblp>; start a publication or a field."""
        if name == "dblp":
            # Release files left over from an earlier document, if any.
            self._close_outputs()
            self.pubFile = io.open('pubFile.txt', 'w', encoding='utf-8')
            self.fieldFile = io.open('fieldFile.txt', 'w', encoding='utf-8')
            self.content = u""
        if name in self._pubTags:
            self.key = attrs.getValue("key")
            self.pub = name
            self.fieldCount = 0
            self.content = u""
        if name in self._fieldTags:
            self.field = name
            self.content = u""

    def endElement(self, name):
        """Write one row per finished field / publication; close at </dblp>."""
        if name in self._fieldTags:
            row = (self.key, u"%d" % self.fieldCount, self.field, self.content)
            self.fieldFile.write(u"\t".join(row) + u"\n")
            self.fieldCount += 1
        if name in self._pubTags:
            self.pubFile.write(u"\t".join((self.key, self.pub)) + u"\n")
        if name == "dblp":
            self._close_outputs()

    def endDocument(self):
        """Make sure the output files are closed once parsing is complete."""
        self._close_outputs()

    def characters(self, content):
        """Append escaped character data to the value being collected."""
        for raw, escaped in self._ESCAPES:
            content = content.replace(raw, escaped)
        self.content += content
def main(sourceFileName):
    """Run the DBLP SAX handler over the XML file at *sourceFileName*.

    The file is opened in binary mode so that the XML parser, not the
    platform's default text encoding, decides how to decode it. The file
    is closed when parsing finishes or fails.
    """
    with open(sourceFileName, "rb") as source:
        xml.sax.parse(source, DBLPContentHandler())


if __name__ == "__main__":
    main("dblp.xml")
Put your .sql files in this directory, one file per question.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment