Guest User

experimental Nimrod code to filter wikitext

a guest
Feb 19th, 2015
562
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.87 KB | None | 0 0
  1. import streams, parsexml, re, strutils
  2.  
  3. # Wikitext handling
  4. # -----------------
  5.  
  6. # This regex matches anywhere in the text that there *might* be wiki syntax
  7. # that we have to clean up.
  8. var ANYTHING_INTERESTING_RE: Regex = re"[*#:{['=]"
  9.  
  10. # We skip the contents of these HTML tags entirely, and they don't nest
  11. # inside each other.
  12. var SKIP_SPANS = [
  13. "cite", "hiero", "gallery", "timeline", "noinclude",
  14. "caption", "ref", "references", "img", "source", "math"
  15. ]
  16.  
  17. # This regex is for matching and skipping over simple wikitext formatting.
  18. var FORMATTING_RE: Regex = re(r"('''|''|^[ *#:]+|^[ =]+.*[ =]+$|^[|].*$)", {reMultiLine})
  19.  
  20. var FAKE_FILENAME = "<wikipage>"
  21.  
  22.  
  23. proc skipNestedChars(text: string, pos: var int, open: char, close: char) =
  24. ## Move our position 'pos' forward in the text, to skip a number of
  25. ## matching instances of the characters 'open' and 'close'.
  26. ##
  27. ## Precondition: text[pos] == open
  28. ## Postcondition: pos will increase by at least 1
  29. pos += 1
  30. var count = 1
  31. while count > 0 and pos < text.len:
  32. var nextPos: int = text.find({open, close}, pos)
  33. if nextPos == -1:
  34. # We can't find any more closing characters in the text.
  35. # Abort here so that there's something left.
  36. return
  37. else:
  38. var nextChar: char = text[nextPos]
  39. if nextChar == open:
  40. count += 1
  41. else:
  42. count -= 1
  43. pos = nextPos + 1
  44.  
  45.  
  46. # forward declaration
  47. proc filterWikitext(text: string): string
  48.  
  49. proc extractInternalLink(linkText: string): string =
  50. # Links with colons might be special MediaWiki syntax. Just throw them
  51. # all away.
  52. if linkText.contains(':'):
  53. return ""
  54. var contents: string = filterWikitext(linkText[2 .. < -2])
  55. var lastPart: int = contents.rfind('|') + 1
  56. return contents[lastPart .. -1]
  57.  
  58.  
  59. proc extractExternalLink(linkText: string): string =
  60. var spacePos = linkText.find(' ')
  61. if spacePos == -1:
  62. return ""
  63. else:
  64. return linkText[spacePos + 1 .. < -1]
  65.  
  66.  
  67. proc filterLink(text: string, pos: var int): string =
  68. var startPos: int = pos
  69.  
  70. # No matter what, move pos to the end of the link
  71. skipNestedChars(text, pos, '[', ']')
  72.  
  73. # Figure out what we skipped. If it's an ugly pseudo-link, return
  74. # nothing.
  75. if text[startPos .. startPos + 1] == "[[":
  76. # Get the displayed text out of the internal link.
  77. return extractInternalLink(text[startPos .. <pos])
  78. else:
  79. # Get the displayed text out of the external link.
  80. return extractExternalLink(text[startPos .. <pos])
  81.  
  82.  
  83. proc filterHTML(text: string): string =
  84. var xml: XmlParser
  85. var tstream: StringStream = newStringStream(text)
  86. result = ""
  87. xml.open(tstream, FAKE_FILENAME, options={reportWhitespace})
  88. while true:
  89. xml.next()
  90. case xml.kind
  91. of xmlElementStart, xmlElementOpen:
  92. if SKIP_SPANS.contains(xml.elementName):
  93. var skipTo: string = xml.elementName
  94. while true:
  95. xml.next()
  96. if xml.kind == xmlElementEnd and xml.elementName == skipTo:
  97. break
  98. elif xml.kind == xmlEof:
  99. break
  100. of xmlCharData, xmlWhitespace:
  101. result.add(xml.charData)
  102. of xmlEof:
  103. break
  104. else:
  105. discard
  106.  
  107. # return result implicitly
  108. xml.close
  109.  
  110.  
  111. proc filterWikitext(text: string): string =
  112. ## Given the complete wikitext of an article, filter it for the part
  113. ## that's meant to be read as plain text.
  114.  
  115. # This method works by building a 'result' string incrementally, and
  116. # advancing an index called 'pos' through the text as it goes. Some
  117. # of the procedures this relies on will also advance 'pos' themselves.
  118. result = ""
  119. var pos = 0
  120. var matched: int
  121. while pos < text.len:
  122. # Skip to the next character that could be wiki syntax.
  123. var found: int = text.find(ANYTHING_INTERESTING_RE, pos)
  124. if found == -1:
  125. found = text.len
  126.  
  127. # Add everything up until then to the string.
  128. if found > pos:
  129. result.add(text[pos .. <found])
  130.  
  131. # Figure out what's here and deal with it.
  132. pos = found
  133. if pos < text.len:
  134. if text[pos .. pos+1] == "{{" or text[pos .. pos+1] == "{|":
  135. # skip template invocations
  136. skipNestedChars(text, pos, '{', '}')
  137.  
  138. elif text[pos] == '[':
  139. # pos gets updated by filterLink
  140. result.add(filterLink(text, pos))
  141.  
  142. else:
  143. # Skip over formatting
  144. matched = text.matchLen(FORMATTING_RE, pos)
  145. if matched > 0:
  146. pos += matched
  147. else:
  148. # We didn't match any of the cases, so output one character
  149. # and proceed
  150. result.add($(text[pos]))
  151. pos += 1
  152.  
  153. # XML handling
  154. # ------------
  155.  
  156. type
  157. TagType = enum
  158. TITLE, TEXT, REDIRECT, NS
  159. ArticleData = array[TagType, string]
  160.  
  161. var RELEVANT_XML_TAGS = ["title", "text", "redirect", "ns"]
  162.  
  163. proc handleArticle(article: ArticleData) =
  164. if article[NS] == "0" and article[REDIRECT] == "":
  165. echo("\n## ", article[TITLE])
  166. echo(filterWikitext(filterHTML(article[TEXT])))
  167.  
  168.  
  169. proc readMediaWikiXML(input: Stream, filename="<input>") =
  170. var xml: XmlParser
  171. var textBuffer: string = ""
  172. var article: ArticleData
  173. for tag in TITLE..NS:
  174. article[tag] = ""
  175. var gettingText: bool = false
  176. xml.open(input, filename, options={reportWhitespace})
  177. while true:
  178. xml.next()
  179. case xml.kind
  180. of xmlElementStart, xmlElementOpen:
  181. if RELEVANT_XML_TAGS.contains(xml.elementName):
  182. textBuffer.delete(0, textBuffer.len - 1)
  183. gettingText = true
  184. elif xml.elementName == "page":
  185. # clear redirect status
  186. article[REDIRECT] = ""
  187. of xmlElementEnd:
  188. case xml.elementName
  189. of "title":
  190. article[TITLE] = textBuffer
  191. of "text":
  192. article[TEXT] = textBuffer
  193. of "redirect":
  194. article[REDIRECT] = textBuffer
  195. of "ns":
  196. article[NS] = textBuffer
  197. of "page":
  198. handleArticle(article)
  199. else:
  200. discard
  201. gettingText = false
  202. of xmlCharData, xmlWhitespace:
  203. if gettingText:
  204. textBuffer.add(xml.charData)
  205. of xmlEof:
  206. break
  207. else:
  208. discard
  209. xml.close
  210.  
  211.  
  212. when isMainModule:
  213. readMediaWikiXML(newFileStream(stdin))
Advertisement
Add Comment
Please, Sign In to add comment