Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import streams, parsexml, re, strutils
# Wikitext handling
# -----------------

# Matches any single character at which there *might* be wiki syntax that
# has to be cleaned up. The pattern is compiled when the module is loaded
# and never reassigned, hence `let`.
let ANYTHING_INTERESTING_RE: Regex = re"[*#:{['=]"

# The contents of these HTML tags are skipped entirely; they do not nest
# inside each other.
const SKIP_SPANS = [
  "cite", "hiero", "gallery", "timeline", "noinclude",
  "caption", "ref", "references", "img", "source", "math"
]

# Matches simple wikitext formatting that is skipped over: runs of two or
# three apostrophes, leading spaces/list/indent markers at the start of a
# line, whole lines wrapped in `=` or spaces, and whole lines starting
# with `|`.
let FORMATTING_RE: Regex = re(
  r"('''|''|^[ *#:]+|^[ =]+.*[ =]+$|^[|].*$)", {reMultiLine})

# File name reported to the XML parser when the input is an in-memory string.
const FAKE_FILENAME = "<wikipage>"
proc skipNestedChars(text: string, pos: var int, open: char, close: char) =
  ## Advance `pos` through `text` past a run of nested `open` / `close`
  ## characters, stopping just after the `close` that balances the
  ## opening character.
  ##
  ## Precondition: text[pos] == open
  ## Postcondition: pos has increased by at least 1
  ##
  ## If the text contains no further delimiter before the nesting is
  ## balanced, `pos` is left just past the last delimiter that was seen,
  ## so the remaining text is not swallowed.
  let delimiters = {open, close}
  var depth = 1
  inc pos
  while depth > 0 and pos < text.len:
    let hit = text.find(delimiters, pos)
    if hit < 0:
      # Nothing left to balance against; give up and keep the rest.
      return
    if text[hit] == open:
      inc depth
    else:
      dec depth
    pos = hit + 1
# Forward declaration: extracting a link's text filters it as wikitext,
# and filtering wikitext in turn extracts links.
proc filterWikitext(text: string): string

proc extractInternalLink(linkText: string): string =
  ## Return the displayed text of an internal link.
  ##
  ## `linkText` is the complete link including its double brackets, e.g.
  ## `[[target|label]]`. The result is the filtered text after the last
  ## `|`, or the whole filtered contents when there is no `|`. Links
  ## containing a colon yield "".

  # Links with colons might be special MediaWiki syntax. Just throw them
  # all away.
  if linkText.contains(':'):
    return ""
  # A link cut short by the end of the text can be shorter than its four
  # bracket characters; there is nothing to display in that case, and
  # slicing it would be out of range.
  if linkText.len < 4:
    return ""
  # Drop the leading "[[" and the trailing "]]".
  let contents = filterWikitext(linkText[2 .. ^3])
  # rfind returns -1 when there is no '|', which makes the start index 0.
  let lastPart = contents.rfind('|') + 1
  return contents[lastPart .. ^1]
proc extractExternalLink(linkText: string): string =
  ## Return the displayed text of an external link.
  ##
  ## `linkText` is the complete link including its brackets, e.g.
  ## `[http://example.com label]`. The result is everything after the
  ## first space, minus the final character (the closing bracket). A link
  ## without a space has no label and yields "".
  let spacePos = linkText.find(' ')
  # No space at all, or a space as the very last character: no label.
  # The second case would otherwise produce an invalid slice.
  if spacePos == -1 or spacePos == linkText.high:
    return ""
  return linkText[spacePos + 1 .. ^2]
proc filterLink(text: string, pos: var int): string =
  ## Return the displayed text of the link starting at `text[pos]` and
  ## advance `pos` past the link.
  ##
  ## Precondition: text[pos] == '['
  let startPos = pos
  # No matter what, move pos to the end of the link.
  skipNestedChars(text, pos, '[', ']')
  let linkText = text[startPos ..< pos]
  # continuesWith never reads past the end of the text, so a lone '[' as
  # the final character is handled safely.
  if text.continuesWith("[[", startPos):
    # Get the displayed text out of the internal link.
    extractInternalLink(linkText)
  else:
    # Get the displayed text out of the external link.
    extractExternalLink(linkText)
proc filterHTML(text: string): string =
  ## Run `text` through the XML parser and return only its character data
  ## and whitespace. Elements named in SKIP_SPANS are dropped together
  ## with everything up to their matching end tag; all other markup
  ## events are ignored.
  result = ""
  var parser: XmlParser
  let source: StringStream = newStringStream(text)
  parser.open(source, FAKE_FILENAME, options={reportWhitespace})
  while true:
    parser.next()
    case parser.kind
    of xmlEof:
      break
    of xmlCharData, xmlWhitespace:
      result.add(parser.charData)
    of xmlElementStart, xmlElementOpen:
      if SKIP_SPANS.contains(parser.elementName):
        # Consume events until this element's end tag or the end of input.
        let skipTo: string = parser.elementName
        while true:
          parser.next()
          if parser.kind == xmlEof:
            break
          if parser.kind == xmlElementEnd and parser.elementName == skipTo:
            break
    else:
      discard
  # `result` is returned implicitly once the parser has been closed.
  parser.close()
proc filterWikitext(text: string): string =
  ## Given the complete wikitext of an article, filter it for the part
  ## that's meant to be read as plain text.
  ##
  ## The result is built incrementally while an index `pos` advances
  ## through the text. Some of the procedures this relies on advance
  ## `pos` themselves.
  result = ""
  var pos = 0
  while pos < text.len:
    # Skip to the next character that could be wiki syntax.
    let hit = text.find(ANYTHING_INTERESTING_RE, pos)
    let found = if hit == -1: text.len else: hit
    # Everything before that character is plain text.
    if found > pos:
      result.add(text[pos ..< found])
    # Figure out what's here and deal with it.
    pos = found
    if pos < text.len:
      # continuesWith never reads past the end of the text, so an
      # interesting character in the final position is handled safely.
      if text.continuesWith("{{", pos) or text.continuesWith("{|", pos):
        # Skip template invocations and tables.
        skipNestedChars(text, pos, '{', '}')
      elif text[pos] == '[':
        # pos gets updated by filterLink.
        result.add(filterLink(text, pos))
      else:
        # Skip over formatting.
        let matched = text.matchLen(FORMATTING_RE, pos)
        if matched > 0:
          pos += matched
        else:
          # None of the cases matched, so output one character and
          # proceed.
          result.add(text[pos])
          inc pos
# XML handling
# ------------
type
  # The pieces of a page that are collected from the XML input.
  TagType = enum
    TITLE, TEXT, REDIRECT, NS
  # One string per TagType, indexed by the enum value.
  ArticleData = array[TagType, string]

# Names of the XML elements whose text content is collected. The list is
# never modified, so it is a compile-time constant.
const RELEVANT_XML_TAGS = ["title", "text", "redirect", "ns"]
proc handleArticle(article: ArticleData) =
  ## Print one article to standard output: a "## title" heading preceded
  ## by a blank line, then the article text with HTML and wikitext
  ## filtered out. Articles whose namespace is not "0", or whose redirect
  ## field is non-empty, are skipped.
  if article[NS] != "0" or article[REDIRECT] != "":
    return
  echo("\n## ", article[TITLE])
  let plainText = filterWikitext(filterHTML(article[TEXT]))
  echo(plainText)
proc readMediaWikiXML(input: Stream, filename="<input>") =
  ## Parse XML from `input`, collecting the text content of the elements
  ## named in RELEVANT_XML_TAGS, and call handleArticle() each time a
  ## `page` element ends.
  ##
  ## `filename` is passed to the XML parser together with the stream.
  var xml: XmlParser
  var textBuffer = ""
  var article: ArticleData
  for tag in TagType:
    article[tag] = ""
  # True while we are inside an element whose text content we collect.
  var gettingText = false
  xml.open(input, filename, options={reportWhitespace})
  while true:
    xml.next()
    case xml.kind
    of xmlElementStart, xmlElementOpen:
      if RELEVANT_XML_TAGS.contains(xml.elementName):
        # Start collecting this element's text from an empty buffer.
        textBuffer.setLen(0)
        gettingText = true
      elif xml.elementName == "page":
        # A new page starts: clear the redirect status, which is the one
        # field that is not necessarily overwritten for every page.
        article[REDIRECT] = ""
    of xmlElementEnd:
      case xml.elementName
      of "title":
        article[TITLE] = textBuffer
      of "text":
        article[TEXT] = textBuffer
      of "redirect":
        # In MediaWiki dumps the redirect element is normally empty (the
        # target, if any, is in an attribute), so its text content is "".
        # Its mere presence marks the page as a redirect; store a
        # non-empty placeholder so handleArticle() skips the page.
        article[REDIRECT] =
          if textBuffer.len > 0: textBuffer else: "redirect"
      of "ns":
        article[NS] = textBuffer
      of "page":
        handleArticle(article)
      else:
        discard
      # Any closing tag ends text collection.
      gettingText = false
    of xmlCharData, xmlWhitespace:
      if gettingText:
        textBuffer.add(xml.charData)
    of xmlEof:
      break
    else:
      discard
  xml.close()
when isMainModule:
  # Run as a program: treat standard input as the XML to process.
  let dump = newFileStream(stdin)
  readMediaWikiXML(dump)
Advertisement
Add Comment
Please, Sign In to add comment