Changes

Jump to navigation Jump to search
== dbxml ==
Oracle Berkeley DB XML From site [http://www.oracle.com/database/berkeley-db/xml/index.html (see here)] : ''Oracle Berkeley DB XML is an open source, embeddable XML database with XQuery-based access to documents stored in containers and indexed based on their content. Oracle Berkeley DB XML is built on top of Oracle Berkeley DB and inherits its rich features and attributes. Like Oracle Berkeley DB, it runs in process with the application with no need for human administration. Oracle Berkeley DB XML adds a document parser, XML indexer and XQuery engine on top of Oracle Berkeley DB to enable the fastest, most efficient retrieval of data.''
As a test, we can use a dump from Wikiversity [http://en.wikiversity.org/wiki/Getting_stats_out_of_Wikiversity_XML_dumps (see here)].
if __name__ == "__main__":
main()
</pre>
 
Then we use this to retrieve a page
<pre>
from lxml import etree
from bsddb3.db import *
from dbxml import *
import StringIO
import fcntl
import os
import pprint
import sys
import time
import mwlib.docbookwriter
from mwlib.dummydb import DummyDB
from mwlib.uparser import parseString
 
 
def getXML(title,res):
    """Return the DocBook serialization of a wiki page.

    ``title`` and ``res`` (the raw wiki markup) are passed to mwlib's
    ``parseString`` together with a ``DummyDB``. The parsed page is written
    through a ``DocBookWriter``; the resulting tree is pretty-printed to
    stdout and the writer's ``asstring()`` result is returned.
    """
    wiki_db = DummyDB()
    parsed_page = parseString(title=title, raw=res, wikidb=wiki_db)
    writer = mwlib.docbookwriter.DocBookWriter()
    writer.writeBook(parsed_page)
    # Debug aid: dump the DocBook tree before serializing it.
    tree = writer.getTree()
    pprint.pprint(tree)
    return writer.asstring()
 
 
def getConTeXt(title,res):
    """Convert a wiki page into ConTeXt source.

    ``title`` and ``res`` (the raw wiki markup) are parsed with mwlib and
    written through a ``DocBookWriter``; the resulting DocBook tree is then
    walked and turned into ConTeXt commands.

    What gets emitted:

    * ``\\chapter`` from ``articleinfo/title``;
    * ``\\section`` from the FIRST top-level ``section`` only;
    * ``\\subsection`` / ``\\subsubsection`` for sections nested one and
      two levels below it;
    * ``para`` text followed by ``\\par``, plus one ``cfr~\\type{url}\\par``
      line per ``ulink`` child;
    * ``informaltable`` elements that are direct children of a
      second-level section, as ``\\bTABLE ... \\eTABLE``.

    Elements with missing text, and sections with a missing
    ``sectioninfo``/``title``, yield an empty string instead of raising.
    All titles are whitespace-stripped.

    Returns the generated lines joined with newlines.
    """
    db = DummyDB()
    parsed = parseString(title=title, raw=res, wikidb=db)
    dbw = mwlib.docbookwriter.DocBookWriter()
    dbw.writeBook(parsed)
    article = dbw.getTree()
    # Dedicated accumulator, so the ``res`` parameter (the wiki markup) is
    # not rebound to mean something else halfway through the function.
    out = []

    def text_of(node):
        # lxml reports absent text as None (e.g. a para that starts with a
        # child element), so normalise that to an empty string.
        if node is None or node.text is None:
            return ''
        return node.text.strip()

    def heading_of(node, info_tag):
        # Title text stored under <info_tag>/<title>, or '' when missing.
        info = node.find(info_tag)
        if info is None:
            return ''
        return text_of(info.find("title"))

    def managepara(c):
        if c.tag != 'para':
            return
        text = text_of(c)
        if text:
            out.append(text + r"\par")
        # An empty para simply yields no ulink children here.
        for link in c.iterchildren('ulink'):
            out.append(r'cfr~\type{%s}\par' % link.get('url'))

    def managetable(c):
        if c.tag != 'informaltable':
            return
        out.append(r'\bTABLE')
        for row in c.iterchildren():
            out.append(r'\bTR')
            for col in row.iterchildren():
                out.append(r'\bTD ' + text_of(col) + r'\eTD')
            out.append(r'\eTR')
        out.append(r'\eTABLE')

    def subsubsection_content(c):
        managepara(c)

    def subsection_content(c):
        managepara(c)
        if c.tag == 'section':
            out.append(r"\subsubsection{%s}" % heading_of(c, "sectioninfo"))
            for child in c.iterchildren():
                subsubsection_content(child)
        if c.tag == 'informaltable':
            managetable(c)

    def section_content(c):
        managepara(c)
        if c.tag == 'section':
            out.append(r"\subsection{%s}" % heading_of(c, "sectioninfo"))
            for child in c.iterchildren():
                subsection_content(child)

    out.append(r"\chapter{%s}" % heading_of(article, "articleinfo"))

    section = article.find("section")
    if section is None:
        # Nothing below the chapter heading to convert.
        return '\n'.join(out)

    out.append(r"\section{%s}" % heading_of(section, "sectioninfo"))
    for child in section.iterchildren():
        section_content(child)
    return '\n'.join(out)
 
 
 
 
 
def query(env=None,mgr=None,container=None,querystring='Foo'):
    """Run an XQuery under a shared read lock and return the results.

    Always check with queryPlan
    (for example matches is not optimized for indexes)

    Parameters:
        env: the opened ``DBEnv``; used to obtain and release the lock.
        mgr: the ``XmlManager`` used to create the transaction and query
            context and to run the query.
        container: not used by this function; kept for caller
            compatibility.
        querystring: the complete XQuery expression to evaluate.

    Returns:
        A list with ``asString()`` of every result value. If an
        ``XmlException`` is raised, the transaction is aborted, the error
        is printed and ``None`` is returned.

    The lock and the locker id are released on every exit path, including
    the successful ``return``.
    """
    locker_id = env.lock_id()
    try:
        lock = env.lock_get(locker_id, "shared lock", DB_LOCK_READ)
        # txn stays None if createTransaction itself fails, so the handler
        # below does not trip over an unbound name.
        txn = None
        try:
            txn = mgr.createTransaction()
            query_context = mgr.createQueryContext()
            results = mgr.query(txn, querystring, query_context)
            rows = [value.asString() for value in results]
            txn.commit()
            return rows
        except XmlException as inst:
            if txn is not None:
                txn.abort()
            print("XmlException ( %s ):  %s" % (inst.exceptionCode, inst.what))
            if inst.exceptionCode == DATABASE_ERROR:
                print("Database error code: %s" % (inst.dbError,))
        finally:
            env.lock_put(lock)
    finally:
        env.lock_id_free(locker_id)
    # Only reached after a handled XmlException; success returns above.
    print('OK exit')
 
def getArtitleByTitle(title):
    """Fetch the wiki text of matching pages and return it as ConTeXt.

    Opens the Berkeley DB environment ``wikienv`` and the container
    ``Data.dbxml``, then, while holding an exclusive ``flock`` on
    ``lock.kmgr``, selects ``revision/text/text()`` of every page whose
    title contains ``title``. Duplicate fragments are dropped, the rest are
    concatenated in query order, decoded as UTF-8 and passed to
    ``getConTeXt``.

    Returns the ConTeXt source, or ``None`` after printing the error if
    anything fails while querying or converting.

    The file lock is released and the lock file closed on every exit path.
    """
    env = DBEnv()
    env.set_cachesize(0, 64 * 1024 * 1024, 1)
    path2DbEnv = 'wikienv'
    env.open(path2DbEnv,DB_THREAD|DB_REGISTER|DB_RECOVER|DB_INIT_MPOOL|DB_CREATE|DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_TXN, 0)
    mgr = XmlManager(env,0)
    containerTxn = mgr.createTransaction()
    theContainer = "Data.dbxml"
    container = mgr.openContainer(containerTxn, theContainer)
    containerTxn.commit()

    lockfile = open("lock.kmgr", "w")
    try:
        fcntl.flock(lockfile, fcntl.LOCK_EX)
        try:
            # NOTE(review): title is interpolated verbatim into the XQuery;
            # a title containing a double quote will break the expression.
            querystring = 'collection("%s")/page[contains(title,"%s")]/revision/text/text()' % (theContainer,title)
            # De-duplicate while keeping the order returned by the query;
            # joining a set would concatenate the fragments arbitrarily.
            seen = set()
            fragments = []
            for fragment in query(env,mgr,container,querystring=querystring):
                if fragment not in seen:
                    seen.add(fragment)
                    fragments.append(fragment)
            wikitext = ''.join(fragments).decode('utf8')
            return getConTeXt(title,wikitext)
        except Exception as e:
            # Best-effort, as before: report and fall through to return None.
            print("error on read: %s" % (e,))
        finally:
            fcntl.flock(lockfile, fcntl.LOCK_UN)
    finally:
        lockfile.close()
 
def writeres(title,preamble,postamble,filename):
    """Write the ConTeXt rendering of a wiki page to ``filename``.

    The text returned by ``getArtitleByTitle(title)`` has every ``&``
    escaped as ``\\&`` and is written between ``preamble`` and
    ``postamble``, one per line group, separated by newlines.

    The document is encoded as UTF-8 explicitly; relying on the implicit
    conversion when writing text to a binary file fails on non-ASCII
    characters. The file is closed even if the write raises.
    """
    res = getArtitleByTitle(title=title)
    res = res.replace('&',r'\&')

    document = u'\n'.join((preamble,res,postamble))
    with open(filename,'wb') as out:
        out.write(document.encode('utf8'))
 
if __name__ == '__main__':
    # Page to export and the ConTeXt file to produce.
    title = "Primary mathematics/Numbers"
    filename = 'res.tex'

    # ConTeXt header: Gentium typescript at 10pt on A5 paper, then the
    # opening \starttext. The literal's lines must stay at column 0.
    preamble = r"""\usetypescriptfile[type-gentium]
\usetypescript[gentium]
\setupbodyfont[gentium,10pt]
\setuppapersize[A5][A5]
\setuplayout[height=middle,topspace=1cm,header={2\lineheight},footer=0pt,backspace=1cm,margin=1cm, width=middle]
\starttext"""

    # Closes the document opened by the preamble.
    postamble = r"""\stoptext"""

    writeres(title=title, preamble=preamble, postamble=postamble,
             filename=filename)
</pre>

Navigation menu