Changes

User:Luigi.scarso/luatex lunatic (view source)

Revision as of 00:46, 28 July 2009

6,003 bytes added , 00:46, 28 July 2009

== dbxml ==

Oracle Berkeley DB XML [http://www.oracle.com/database/berkeley-db/xml/index.html (see here)] is an open source, embeddable XML database with XQuery-based access to documents stored in containers and indexed based on their content. Oracle Berkeley DB XML is built on top of Oracle Berkeley DB and inherits its rich features and attributes. Like Oracle Berkeley DB, it runs in process with the application with no need for human administration. Oracle Berkeley DB XML adds a document parser, XML indexer and XQuery engine on top of Oracle Berkeley DB to enable the fastest, most efficient retrieval of data.

As a test, we can use a dump from Wikiversity [http://en.wikiversity.org/wiki/Getting_stats_out_of_Wikiversity_XML_dumps (see here)].

First we build the container 'Data.dbxml' in the directory "wikienv" (which must already exist):

<pre>

"""

---

"""

from bsddb3.db import *

from dbxml import *

import sys

import re

import time

def createEnvironment(home):
    """Create a transactional DBEnv in *home* and return an XmlManager on it.

    Parameters:
        home: path of the environment directory handed to ``DBEnv.open``.

    Returns:
        The ``XmlManager`` built on top of the opened environment, with its
        default page size set to 4096.

    On ``DBError`` (environment setup) or ``XmlException`` (manager setup)
    the error is printed and the process is terminated with ``sys.exit()``.
    """
    try:
        environment = DBEnv()
        # cache size argument: 512 * 1024 * 1024 bytes
        environment.set_cachesize(0, 512 * 1024 * 1024, 1)
        environment.set_lk_max_lockers(10000)
        environment.set_lk_max_locks(10000)
        environment.set_lk_max_objects(10000)
        # initialize DBEnv for transactions
        environment.open(home,
                         DB_RECOVER | DB_CREATE | DB_INIT_LOCK | DB_DSYNC_LOG |
                         DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN,
                         0)
    except DBError as exc:
        print(exc)
        sys.exit()

    try:
        mgr = XmlManager(environment, 0)
        mgr.setDefaultPageSize(4096)
    except XmlException as xe:
        # The handler previously bound the exception to a different name than
        # the one it printed, which raised NameError instead of reporting it.
        print(xe)
        sys.exit()
    return mgr

def createContainer(mgr, containerName, flags):
    """Open *containerName* through *mgr*, creating it if needed, and index it.

    The container is opened with ``flags | DB_CREATE`` and the
    ``XmlContainer.WholedocContainer`` type, then an
    "edge-element-substring-string" index is added for each of the
    ``title``, ``username`` and ``text`` elements (empty namespace URI).

    Returns the opened container.  On ``XmlException`` the error is printed
    and the process is terminated with ``sys.exit()``.
    """
    index_spec = "edge-element-substring-string"
    try:
        update_ctx = mgr.createUpdateContext()
        opened = mgr.openContainer(containerName,
                                   flags | DB_CREATE,
                                   XmlContainer.WholedocContainer)
        for element in ("title", "username", "text"):
            opened.addIndex("", element, index_spec, update_ctx)
        return opened
    except XmlException as ex:
        print(ex)
        sys.exit()

# Patterns used to derive a document name from a <page> fragment.
_PAGE_ID_RE = re.compile(r"<id>(.*)</id>")
_PAGE_TITLE_RE = re.compile(r"<title>(.*)</title>", re.MULTILINE | re.DOTALL)


def loadcontent(mgr, container, content, printmsg, k):
    """Store one ``<page>`` fragment in *container* inside its own transaction.

    The requested document name is the page title with whitespace runs
    replaced by ``_``, followed by ``_`` and the first ``<id>`` value found in
    *content*.  The document is stored with ``DBXML_GEN_NAME`` and the name
    returned by ``putDocument`` is used from then on.

    Parameters:
        mgr: manager used to create the update context and transactions.
        container: container receiving the document.
        content: text of the page, from ``<page>`` to ``</page>``.
        printmsg: when true, read the document back in a new transaction and
            print its name.
        k: caller-supplied counter, only used as a prefix in the error
            message printed when the insert fails.

    Raises:
        AttributeError: if *content* has no ``<id>`` or ``<title>`` element
            (the regular-expression search returns ``None``).

    ``XmlException`` is not propagated: it is printed and the transaction in
    progress, if any, is aborted.
    """
    # A compiled pattern's search() takes (string, pos, endpos).  Flags belong
    # to re.compile(); passing them here used to be read as a start offset and
    # silently skipped the beginning of the content.
    id_text = _PAGE_ID_RE.search(content).group(1)
    title_text = _PAGE_TITLE_RE.search(content).group(1)
    docName = '_'.join(title_text.split()) + '_' + id_text

    txn = None
    try:
        # all Container modification operations need XmlUpdateContext
        uc = mgr.createUpdateContext()
        # create XmlTransaction for the operation
        txn = mgr.createTransaction()
        # DBXML_GEN_NAME is passed so the insert succeeds under a new, unique
        # name.  The inner try/except lets the transaction be aborted in the
        # proper scope upon error.
        try:
            docName = container.putDocument(txn, docName, content, uc,
                                            DBXML_GEN_NAME)
            txn.commit()
        except XmlException as ex:
            print("%s %s" % (k, ex))
            txn.abort()
        # The insert transaction is finished either way; forget it so the
        # outer handler does not abort an already-finished transaction.
        txn = None

        if printmsg:
            # now, get the document in a new transaction
            txn = mgr.createTransaction()
            doc = container.getDocument(txn, docName)
            name = doc.getName()
            txn.commit()  # done with data
            txn = None
            print(name)
    except XmlException as inst:
        print(inst)
        if txn:
            txn.abort()

# "main"

def main():
    """Load every ``<page>`` of the Wikiversity dump into ``Data.dbxml``.

    Opens the environment in the ``wikienv`` directory, opens/creates the
    transactional container, then reads the dump line by line.  Lines from a
    ``<page>`` line up to the matching ``</page>`` line are gathered into one
    string; pages containing both a ``<title>`` and an ``<id>`` element are
    handed to ``loadcontent``.  The container is synced every 100 stored
    pages and a progress line is printed every 10000 input lines.
    """
    home = "wikienv"
    containerName = "Data.dbxml"

    mgr = createEnvironment(home)
    # create/open a transactional container
    container = createContainer(mgr, containerName, DBXML_TRANSACTIONAL)

    startpage = re.compile(r"^\s*<page>\s*$")
    endpage = re.compile(r"^\s*</page>\s*$")
    page_id = re.compile(r"<id>(.*)</id>")
    title = re.compile(r"<title>(.*)</title>", re.MULTILINE | re.DOTALL)

    k = 0   # pages handed to loadcontent so far
    k1 = 0  # input lines read so far
    # Lines of the page being collected; None while outside a <page> element.
    page_lines = None

    # The with-block closes the dump even if loading raises.
    with open("enwikiversity-20090627-pages-articles.xml", 'rb') as src:
        for line in src:
            k1 += 1
            if k1 % 10000 == 0:
                # The text still mentions a sleep; no sleep is performed.
                print("k1=%012d,k=%012d ,sleep 1 sec." % (k1, k))

            if page_lines is None:
                # Outside a page: only a <page> line is of interest.  A stray
                # </page> is ignored instead of reusing stale page text.
                if startpage.match(line) is not None:
                    page_lines = [line]
                continue

            page_lines.append(line)
            if endpage.match(line) is None:
                continue

            # Reached </page>: assemble the page and reset the collector.
            content = ''.join(page_lines)
            page_lines = None

            if title.search(content) is None or page_id.search(content) is None:
                continue

            if k > 0 and k % 100 == 0:
                print("%012d sync" % k)
                container.sync()

            loadcontent(mgr, container, content, False, k)
            k += 1

# Run the loader only when this file is executed as a script.
if __name__ == "__main__":
    main()

</pre>

Luigi.scarso

Administrators

685

edits

Changes

User:Luigi.scarso/luatex lunatic (view source)

Revision as of 00:46, 28 July 2009

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Main

Navigation

Indexes

Interaction

Tools