GMRKB.ReadMWDump Parser
Jump to navigation
Jump to search
A GMRKB.ReadMWDump Parser is a MediaWiki XML File Parser.
- See: xml.etree.ElementTree.
References
2018
#ReadMWDump.py package definition import xml.etree.ElementTree as etree
class GetAllPages(object):
    """Iterator over the <page> blocks of a MediaWiki XML dump.

    Each item is a dict with the keys 'title', 'text' and 'redirect'. All
    three are always present; a value that is missing or empty in the dump
    is reported as the empty string.

    The dump is parsed incrementally, and each <page> element is cleared once
    its dict has been built so its text is not kept alive while the rest of
    the dump is read.

    Args:
        inputFilename: passed unchanged to xml.etree.ElementTree.iterparse
            as the source to parse.
    """

    def __init__(self, inputFilename):
        # 'start' events are needed to read the attributes of <redirect>;
        # 'end' events are needed because element text is only complete then.
        self.tokenParser = etree.iterparse(inputFilename, events=('start', 'end'))
        self.currentPage = {}

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next page dict; raise StopIteration at end of dump."""
        for event, elem in self.tokenParser:
            # Drop the "{namespace}" prefix of qualified tags. rpartition
            # returns the whole tag at index 2 when there is no "}".
            tagName = elem.tag.rpartition('}')[2]

            if event == 'end':
                if tagName == 'page':
                    # Found the end of a </page> block: hand out what was
                    # collected and start a fresh dict for the next page.
                    ret = self.currentPage
                    self.currentPage = {}

                    # Make sure every key is always populated with a string,
                    # even for empty pages (0 bytes of text), non-redirects
                    # and empty titles.
                    for key in ('title', 'text', 'redirect'):
                        if not ret.get(key):
                            ret[key] = ""

                    # The values were already copied out as strings, so the
                    # element subtree can be released.
                    elem.clear()
                    return ret
                elif tagName == 'title':
                    self.currentPage['title'] = elem.text
                elif tagName == 'text':
                    self.currentPage['text'] = elem.text
            elif event == 'start':
                if tagName == 'redirect':
                    # Attributes are already available on the 'start' event.
                    # .get avoids a KeyError on a <redirect> without a title.
                    self.currentPage['redirect'] = elem.attrib.get('title', '')

        # Entire XML dump was parsed, nothing more to read
        raise StopIteration

    # Python 2 iterator protocol name, kept so existing callers of .next()
    # continue to work.
    next = __next__
def strip_tag_name(elem):
    """Return the tag of *elem* without its "{namespace}" prefix.

    Everything up to and including the last "}" in ``elem.tag`` is dropped.
    A tag that contains no "}" is returned unchanged.
    """
    # rpartition yields ('', '', tag) when "}" is absent, so index 2 is
    # either the part after the last "}" or the whole tag.
    return elem.tag.rpartition("}")[2]
# example.py calling program
# set PYTHONIOENCODING=utf8 in MSWindows
import ReadMWDump

# Walk every page of the dump and print its title, text and redirect target.
all_pages = ReadMWDump.GetAllPages('rkb-mediawiki-20181005-1210.xml')
for page in all_pages:
    title_part = "Found page [" + page['title']
    text_part = "] with text [" + page['text']
    redirect_part = "], redirect=" + page['redirect']
    print(title_part + text_part + redirect_part)