Changes

Jump to navigation Jump to search
== dbxml ==
Oracle Berkeley DB XML [http://www.oracle.com/database/berkeley-db/xml/index.html (see here)] is an open source, embeddable XML database with XQuery-based access to documents stored in containers and indexed based on their content. Oracle Berkeley DB XML is built on top of Oracle Berkeley DB and inherits its rich features and attributes. Like Oracle Berkeley DB, it runs in process with the application with no need for human administration. Oracle Berkeley DB XML adds a document parser, XML indexer and XQuery engine on top of Oracle Berkeley DB to enable the fastest, most efficient retrieval of data.
 
As a test, we can use a dump from Wikiversity [http://en.wikiversity.org/wiki/Getting_stats_out_of_Wikiversity_XML_dumps (see here)].
 
First we build the container 'Data.dbxml' in the directory "wikienv" (which must already exist):
<pre>
"""
---
"""
from bsddb3.db import *
from dbxml import *
import sys
import re
import time
 
def createEnvironment(home):
    """Create a DBEnv rooted at *home* and return an XmlManager bound to it.

    The environment is configured with a 512 * 1024 * 1024 cache size value,
    raised lock-table limits (10000 lockers / locks / objects) and is opened
    with the logging, locking, memory-pool and transaction subsystems plus
    recovery.  The manager's default page size is set to 4096.

    home -- directory holding the environment files (the surrounding page
            states that it must already exist).

    Returns the XmlManager.  On a DBError or XmlException the error is
    printed and the process exits via sys.exit().
    """
    try:
        environment = DBEnv()
        # NOTE(review): arguments presumably are (gbytes, bytes, ncache),
        # i.e. one 512 MB cache region -- confirm against the Berkeley DB docs.
        environment.set_cachesize(0, 512 * 1024 * 1024, 1)
        environment.set_lk_max_lockers(10000)
        environment.set_lk_max_locks(10000)
        environment.set_lk_max_objects(10000)
        # initialize DBEnv for transactions
        environment.open(home,
                         DB_RECOVER | DB_CREATE | DB_INIT_LOCK | DB_DSYNC_LOG |
                         DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN,
                         0)
    except DBError as exc:
        print(exc)
        sys.exit()
    try:
        mgr = XmlManager(environment, 0)
        mgr.setDefaultPageSize(4096)
    except XmlException as xe:
        # The original bound the exception to one name and printed another,
        # which raised NameError instead of reporting the real failure.
        print(xe)
        sys.exit()
    return mgr
 
def createContainer(mgr, containerName, flags):
    """Open (creating if needed) a whole-document container and index it.

    mgr           -- XmlManager used to open the container.
    containerName -- name of the container to open.
    flags         -- open flags; DB_CREATE is always OR-ed in.

    An "edge-element-substring-string" index is added for the elements
    "title", "username" and "text" (empty namespace URI).

    Returns the opened container.  On XmlException the error is printed and
    the process exits via sys.exit().
    """
    indexed_elements = ("title", "username", "text")
    index_spec = "edge-element-substring-string"
    try:
        update_context = mgr.createUpdateContext()
        opened = mgr.openContainer(containerName,
                                   flags | DB_CREATE,
                                   XmlContainer.WholedocContainer)
        for element in indexed_elements:
            opened.addIndex("", element, index_spec, update_context)
        return opened
    except XmlException as ex:
        print(ex)
        sys.exit()
 
 
def loadcontent(mgr, container, content, printmsg, k):
    """Store one <page> fragment in *container* inside its own transaction.

    The document name is built from the page's <title> text (whitespace runs
    replaced by "_") followed by "_" and the text of the first <id> element.

    mgr       -- XmlManager used to create update contexts and transactions.
    container -- container receiving the document.
    content   -- text of one <page>...</page> fragment; it must contain a
                 <title> and an <id> element (otherwise AttributeError is
                 raised, as no match is found).
    printmsg  -- when true, read the stored document back in a second
                 transaction and print its name.
    k         -- caller's running counter, printed with a putDocument error
                 for diagnostics only.

    XmlException is caught and printed; nothing is returned.
    """
    id_pattern = re.compile(r"<id>(.*)</id>")
    title_pattern = re.compile(r"<title>(.*)</title>",
                               re.MULTILINE | re.DOTALL)

    # A compiled pattern's search() takes (string, pos, endpos).  The original
    # passed regex flags as the second argument, which silently started the
    # search at offset 24 instead of applying any flag.
    id_text = id_pattern.search(content).group(1)
    title_text = title_pattern.search(content).group(1)
    docName = '_'.join(title_text.split()) + '_' + id_text

    # txn is non-None only while a transaction is still open, so the outer
    # handler never aborts one that was already committed or aborted.
    txn = None
    try:
        # all Container modification operations need XmlUpdateContext
        uc = mgr.createUpdateContext()
        # create XmlTransaction for the operation
        txn = mgr.createTransaction()
        # use the DBXML_GEN_NAME flag to make sure this
        # succeeds by creating a new, unique name
        try:
            docName = container.putDocument(txn, docName,
                                            content, uc,
                                            DBXML_GEN_NAME)
            txn.commit()
            txn = None
        except XmlException as ex:
            print("%s %s" % (k, ex))
            failed, txn = txn, None
            failed.abort()
            # Nothing was stored, so there is no document to read back.
            return
        if printmsg:
            # now, get the document in a new transaction
            txn = mgr.createTransaction()
            doc = container.getDocument(txn, docName)
            name = doc.getName()
            txn.commit()  # done with data
            txn = None
            print(name)
    except XmlException as inst:
        print(inst)
        if txn is not None:
            txn.abort()
 
# "main"
def main(home="wikienv", containerName="Data.dbxml",
         dumpPath="enwikiversity-20090627-pages-articles.xml"):
    """Load every <page> of a MediaWiki XML dump into a DB XML container.

    home          -- environment directory passed to createEnvironment().
    containerName -- container opened (transactionally) via createContainer().
    dumpPath      -- path of the XML dump to read; any dump with <page> and
                     </page> on lines of their own can be given here.

    The dump is read line by line.  Lines from a "<page>" line through the
    matching "</page>" line are joined into one fragment; fragments that
    contain both a <title> and an <id> element are handed to loadcontent().
    A progress line is printed every 10000 input lines and the container is
    synced before every 100th stored page.

    NOTE(review): the file is opened in 'rb' mode and matched against str
    patterns, which presumably relies on Python 2 byte-string lines.
    """
    # initialize...
    mgr = createEnvironment(home)
    # create/open a transactional container
    container = createContainer(mgr, containerName,
                                DBXML_TRANSACTIONAL)

    startpage = re.compile(r"^\s*<page>\s*$")
    endpage = re.compile(r"^\s*</page>\s*$")
    id_pattern = re.compile(r"<id>(.*)</id>")
    title_pattern = re.compile(r"<title>(.*)</title>",
                               re.MULTILINE | re.DOTALL)

    line_count = 0
    page_count = 0
    # Lines of the page being collected; None while outside <page>...</page>.
    # A list joined once per page avoids quadratic string concatenation.
    page_lines = None

    src = open(dumpPath, 'rb')
    try:
        for line in src:
            line_count += 1
            if line_count % 10000 == 0:
                print("k1=%012d,k=%012d ,sleep 1 sec." % (line_count, page_count))

            if page_lines is None:
                # Outside a page: only a <page> line is of interest.  A stray
                # </page> is ignored instead of reusing stale page text.
                if startpage.match(line) is not None:
                    page_lines = [line]
                continue

            page_lines.append(line)
            if endpage.match(line) is None:
                continue

            content = ''.join(page_lines)
            page_lines = None
            if (title_pattern.search(content) is None
                    or id_pattern.search(content) is None):
                continue

            if page_count > 0 and page_count % 100 == 0:
                print("%012d sync" % page_count)
                container.sync()
            loadcontent(mgr, container, content, False, page_count)
            page_count += 1
    finally:
        # Close the dump even when loading fails part-way through.
        src.close()
 
# Run the import only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
</pre>

Navigation menu