GMRKB.ReadMWDump Parser

From GM-RKB
Jump to navigation Jump to search

A GMRKB.ReadMWDump Parser is a MediaWiki XML File Parser.



References

2018

#ReadMWDump.py package definition
import xml.etree.ElementTree as etree
class GetAllPages(object):
    """Iterator over the ``<page>`` blocks of a MediaWiki XML dump.

    Each iteration step returns a dict describing one page:

    * ``'title'``    -- text of the page's ``<title>`` element (only set
      when the page contains one).
    * ``'text'``     -- text of the page's ``<text>`` element, or ``""``
      when it is missing or empty.
    * ``'redirect'`` -- ``title`` attribute of the page's ``<redirect>``
      element, or ``""`` when the page has none.

    The dump is read incrementally, one parser event at a time.
    """

    def __init__(self, inputFilename):
        """Start an incremental parse of *inputFilename*.

        *inputFilename* is passed unchanged to
        ``xml.etree.ElementTree.iterparse``.
        """
        self.tokenParser = etree.iterparse(inputFilename, events=('start', 'end'))
        # Fields collected so far for the page currently being read.
        self.currentPage = {}

    def __iter__(self):
        """Return the object itself; it is its own iterator."""
        return self

    def __next__(self):
        """Return the next page dict.

        Raises:
            StopIteration: once the whole dump has been consumed.
        """
        for event, elem in self.tokenParser:
            # Drop any "{namespace}" prefix so tags compare by local name.
            tagName = elem.tag.rpartition("}")[2]
            if event == 'end':
                if tagName == 'page':
                    # Found the end of </page> block
                    ret = self.currentPage
                    self.currentPage = {}
                    # Make sure that "text" is always populated, even for
                    # empty pages (0 bytes of text).
                    if not ret.get('text'):
                        ret['text'] = ""
                    if not ret.get('redirect'):
                        ret['redirect'] = ""
                    # Everything needed has been copied into ``ret``; free
                    # the finished subtree so memory use does not grow with
                    # the size of the dump.
                    elem.clear()
                    return ret
                elif tagName == 'title':
                    self.currentPage['title'] = elem.text
                elif tagName == 'text':
                    self.currentPage['text'] = elem.text
            elif event == 'start':
                # Attributes are already available on the 'start' event.
                if tagName == 'redirect':
                    self.currentPage['redirect'] = elem.attrib['title']
        # Entire XML dump was parsed, nothing more to read
        raise StopIteration

    # Python 2 iterator protocol name, kept so existing ``.next()`` callers
    # continue to work; Python 3 uses ``__next__``.
    next = __next__
def strip_tag_name(elem):
    """Return ``elem.tag`` without its leading ``{namespace}`` part.

    Everything up to and including the last ``}`` in the tag is removed.
    A tag that contains no ``}`` is returned unchanged.
    """
    _namespace, _brace, local_name = elem.tag.rpartition("}")
    return local_name
# example.py -- sample program that calls the ReadMWDump module.
# On MS Windows, set PYTHONIOENCODING=utf8 before running.
import ReadMWDump

# MediaWiki XML dump to read.
DUMP_FILE = 'rkb-mediawiki-20181005-1210.xml'

# Print one summary line per page found in the dump.
for page in ReadMWDump.GetAllPages(DUMP_FILE):
    summary = "".join((
        "Found page [", page['title'],
        "] with text [", page['text'],
        "], redirect=", page['redirect'],
    ))
    print(summary)