Changes

User:Luigi.scarso/luatex lunatic (view source)

Revision as of 00:50, 28 July 2009

5,467 bytes added , 00:50, 28 July 2009

== dbxml ==

~~Oracle Berkeley DB XML~~ From site [http://www.oracle.com/database/berkeley-db/xml/index.html (see here)] : ''Oracle Berkeley DB XML is an open source, embeddable XML database with XQuery-based access to documents stored in containers and indexed based on their content. Oracle Berkeley DB XML is built on top of Oracle Berkeley DB and inherits its rich features and attributes. Like Oracle Berkeley DB, it runs in process with the application with no need for human administration. Oracle Berkeley DB XML adds a document parser, XML indexer and XQuery engine on top of Oracle Berkeley DB to enable the fastest, most efficient retrieval of data.''

As a test, we can use a dump from Wikiversity [http://en.wikiversity.org/wiki/Getting_stats_out_of_Wikiversity_XML_dumps (see here)].

if __name__ == "__main__":

main()

</pre>

Then we use this to retrieve a page:

<pre>

from lxml import etree

from bsddb3.db import *

from dbxml import *

import StringIO

import fcntl

import os

import pprint

import sys

import time

import mwlib.docbookwriter

from mwlib.dummydb import DummyDB

from mwlib.uparser import parseString

def getXML(title,res):
    """Return the DocBook serialisation of a wiki page.

    ``res`` is the raw wiki markup of the page named ``title``.  It is parsed
    with ``parseString`` against a fresh ``DummyDB`` instance and handed to a
    ``mwlib.docbookwriter.DocBookWriter``.

    As a debugging aid the writer's tree is pretty-printed (``pprint``) before
    the result of ``asstring()`` is returned.
    """
    parsed = parseString(title=title, raw=res, wikidb=DummyDB())

    writer = mwlib.docbookwriter.DocBookWriter()
    writer.writeBook(parsed)

    # Debug dump of the generated tree; the return value does not depend on it.
    tree = writer.getTree()
    pprint.pprint(tree)

    return writer.asstring()

def getConTeXt(title,res):
    r"""Convert the wiki markup of one page into ConTeXt source.

    ``res`` is the raw wiki markup of the page named ``title``.  It is parsed
    with ``parseString`` (against a fresh ``DummyDB``), written through a
    ``mwlib.docbookwriter.DocBookWriter``, and the resulting tree is walked to
    emit ConTeXt commands:

    * the ``articleinfo/title`` text becomes ``\chapter{...}``;
    * the first ``section`` child of the article becomes ``\section{...}``;
      only this first section is processed;
    * nested ``section`` elements become ``\subsection`` and
      ``\subsubsection``;
    * ``para`` elements emit their leading text followed by ``\par`` and one
      ``cfr~\type{url}\par`` line per ``ulink`` child;
    * ``informaltable`` elements are translated (``\bTABLE`` ... ``\eTABLE``)
      only when they are direct children of a subsection.

    Returns the generated lines joined with newlines.

    Raises ``AttributeError`` if the tree lacks the expected
    ``articleinfo/title``, ``section`` or ``sectioninfo/title`` elements
    (``find`` returns ``None`` in that case).
    """
    db = DummyDB()
    r = parseString(title=title, raw=res, wikidb=db)
    dbw = mwlib.docbookwriter.DocBookWriter()
    dbw.writeBook(r)
    article = dbw.getTree()
    lines = []

    def stripped_text(node):
        # An element with no leading text (for instance a paragraph that
        # starts with a link, or an empty table cell) has ``text`` set to
        # None, so calling ``.strip()`` on it directly raises AttributeError.
        return (node.text or '').strip()

    def managepara(c,res):
        # Emit the paragraph's own text, then a reference line for each link.
        if c.tag != 'para':
            return
        text = stripped_text(c)
        if text:
            res.append(text+r"\par")
        for c1 in c.iterchildren('ulink'):
            res.append(r'cfr~\type{%s}\par' %c1.get('url'))

    def managetable(c,res):
        # Every child of the table is treated as a row and every grandchild
        # as a cell; only the cell's leading text is kept.
        if c.tag != 'informaltable':
            return
        res.append(r'\bTABLE')
        for row in c.iterchildren():
            res.append(r'\bTR')
            for col in row.iterchildren():
                res.append(r'\bTD '+ stripped_text(col)+ r'\eTD')
            res.append(r'\eTR')
        res.append(r'\eTABLE')

    def section_content(c,res):
        # Children of the top-level section: paragraphs and subsections.
        managepara(c,res)
        if c.tag == 'section' :
            subsection_title = stripped_text(c.find("sectioninfo").find("title"))
            res.append(r"\subsection{%s}" % subsection_title)
            for sc in c.iterchildren():
                subsection_content(sc,res)

    def subsection_content(c,res):
        # Children of a subsection: paragraphs, subsubsections and tables.
        managepara(c,res)
        if c.tag == 'section' :
            subsubsection_title = stripped_text(c.find("sectioninfo").find("title"))
            res.append(r"\subsubsection{%s}" % subsubsection_title)
            for sc in c.iterchildren():
                subsubsection_content(sc,res)
        if c.tag == 'informaltable' :
            managetable(c,res)

    def subsubsection_content(c,res):
        # Deepest supported level: only paragraphs are translated.
        managepara(c,res)

    chapter_title = article.find("articleinfo").find("title").text
    lines.append(r"\chapter{%s}" % chapter_title)

    section = article.find("section")
    section_title = section.find("sectioninfo").find("title").text
    lines.append(r"\section{%s}" % section_title)

    for c in section.iterchildren():
        section_content(c,lines)

    return '\n'.join(lines)

def query(env=None,mgr=None,container=None,querystring='Foo'):
    """Run ``querystring`` through ``mgr`` and return the results as strings.

    Always check with queryPlan
    (for example matches is not optimized for indexes)

    Parameters:
      env         -- environment used to obtain a locker id and a read lock
                     on the object named "shared lock" for the duration of
                     the query.
      mgr         -- manager used to create the transaction and the query
                     context, and to execute the query.
      container   -- accepted for the callers' benefit; not used here.
      querystring -- complete query expression passed to ``mgr.query``.

    Returns a list with ``asString()`` of every result.  If an
    ``XmlException`` is raised the transaction is aborted, a diagnostic is
    printed and ``None`` is returned.

    The read lock and the locker id are released on every exit path,
    including the successful ``return`` and exceptions that propagate.
    """
    anID = env.lock_id()
    lock = None
    txn = None
    try:
        lock = env.lock_get(anID, "shared lock", DB_LOCK_READ)
        txn = mgr.createTransaction()
        resultsContext = mgr.createQueryContext()
        results = mgr.query(txn, querystring, resultsContext)
        res = [item.asString() for item in results]
        txn.commit()
        return res
    except XmlException as inst:
        # The failure may have happened before the transaction existed.
        if txn is not None:
            txn.abort()
        print("XmlException ( %s ):  %s" % (inst.exceptionCode, inst.what))
        if inst.exceptionCode == DATABASE_ERROR:
            print("Database error code: %s" % (inst.dbError,))
    finally:
        # Runs for the early return above as well, so nothing is leaked.
        if lock is not None:
            env.lock_put(lock)
        env.lock_id_free(anID)
    # Only reached after a handled XmlException.
    print('OK exit')

def getArtitleByTitle(title):
    """Fetch the wiki text of matching pages and return it as ConTeXt source.

    Opens the container "Data.dbxml", takes an exclusive ``flock`` on the
    file "lock.kmgr", and queries for the revision text of every page whose
    title *contains* ``title``.  The distinct results are concatenated,
    decoded from UTF-8 and converted with ``getConTeXt``.

    Returns the ConTeXt source, or ``None`` when any exception occurs while
    querying or converting (the error is printed, not re-raised).

    The file lock is released and the lock file closed on every exit path.
    """
    env = DBEnv()
    env.set_cachesize(0, 64 * 1024 * 1024, 1)
    # NOTE(review): this path is assigned but never used, and the environment
    # is not opened anywhere in this function -- confirm whether an
    # ``env.open(path2DbEnv, ...)`` call is missing.
    path2DbEnv ='wikienv'
    mgr = XmlManager(env,0)
    containerTxn = mgr.createTransaction()
    theContainer = "Data.dbxml"
    container = mgr.openContainer(containerTxn, theContainer)
    containerTxn.commit()

    lockfile = open("lock.kmgr", "w")
    try:
        fcntl.flock(lockfile, fcntl.LOCK_EX)
        try:
            # NOTE(review): ``title`` is interpolated into the query unescaped;
            # a title containing a double quote will break the expression.
            querystring = 'collection("%s")/page[contains(title,"%s")]/revision/text/text()' % (theContainer,title)
            # A set drops duplicate results; when several pages match, the
            # order in which their texts are concatenated is unspecified.
            res = set().union(query(env,mgr,container,querystring=querystring))
            res = ''.join(res).decode('utf8')
            res = getConTeXt(title,res)
            return res
        except Exception as e:
            print("error on read: %s" % (e,))
        finally:
            fcntl.flock(lockfile, fcntl.LOCK_UN)
    finally:
        lockfile.close()

def writeres(title,preamble,postamble,filename):
    r"""Write a complete ConTeXt document for the page ``title``.

    The body is obtained from ``getArtitleByTitle``; every ``&`` in it is
    replaced by ``\&``.  ``preamble``, body and ``postamble`` are joined with
    newlines and written to ``filename`` in binary mode.

    The text is encoded as UTF-8 before writing, mirroring the UTF-8 decoding
    done when the page text is fetched; writing a non-ASCII text string
    directly to a binary file would otherwise fail.

    Raises ``AttributeError`` if ``getArtitleByTitle`` returns ``None``.
    """
    res = getArtitleByTitle(title=title)
    res = res.replace('&',r'\&')

    text = '\n'.join((preamble,res,postamble))
    if not isinstance(text, bytes):
        text = text.encode('utf-8')

    out = open(filename,'wb')
    try:
        out.write(text)
    finally:
        # Close explicitly so the data is flushed even if the write fails.
        out.close()

if __name__ == '__main__':
    # Script entry point: typeset one page into res.tex.
    title="Primary mathematics/Numbers"
    filename = 'res.tex'

    # Closing line of the generated ConTeXt document.
    postamble = r"""\stoptext"""

    # Font, paper and layout setup emitted ahead of the page body.
    preamble = r"""\usetypescriptfile[type-gentium]

\usetypescript[gentium]

\setupbodyfont[gentium,10pt]

\setuppapersize[A5][A5]

\setuplayout[height=middle,topspace=1cm,header={2\lineheight},footer=0pt,backspace=1cm,margin=1cm, width=middle]

\starttext"""

    writeres(title=title, preamble=preamble, postamble=postamble, filename=filename)

</pre>

Luigi.scarso

Administrators

685

edits

Changes

User:Luigi.scarso/luatex lunatic (view source)

Revision as of 00:50, 28 July 2009

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Main

Navigation

Indexes

Interaction

Tools