SHARE
TWEET

wiki2plain.py

a guest Dec 16th, 2010 4,747 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python
  2.  
  3. import re
  4.  
  5. class Wiki2Plain:
  6.     def __init__(self, wiki):
  7.         self.wiki = wiki
  8.        
  9.         self.text = wiki
  10.         self.text = self.unhtml(self.text)
  11.         self.text = self.unwiki(self.text)
  12.         self.text = self.punctuate(self.text)
  13.    
  14.     def __str__(self):
  15.         return self.text
  16.    
  17.     def unwiki(self, wiki):
  18.         """
  19.        Remove wiki markup from the text.
  20.        """
  21.         wiki = re.sub(r'(?i)\{\{IPA(\-[^\|\{\}]+)*?\|([^\|\{\}]+)(\|[^\{\}]+)*?\}\}', lambda m: m.group(2), wiki)
  22.         wiki = re.sub(r'(?i)\{\{Lang(\-[^\|\{\}]+)*?\|([^\|\{\}]+)(\|[^\{\}]+)*?\}\}', lambda m: m.group(2), wiki)
  23.         wiki = re.sub(r'\{\{[^\{\}]+\}\}', '', wiki)
  24.         wiki = re.sub(r'(?m)\{\{[^\{\}]+\}\}', '', wiki)
  25.         wiki = re.sub(r'(?m)\{\|[^\{\}]*?\|\}', '', wiki)
  26.         wiki = re.sub(r'(?i)\[\[Category:[^\[\]]*?\]\]', '', wiki)
  27.         wiki = re.sub(r'(?i)\[\[Image:[^\[\]]*?\]\]', '', wiki)
  28.         wiki = re.sub(r'(?i)\[\[File:[^\[\]]*?\]\]', '', wiki)
  29.         wiki = re.sub(r'\[\[[^\[\]]*?\|([^\[\]]*?)\]\]', lambda m: m.group(1), wiki)
  30.         wiki = re.sub(r'\[\[([^\[\]]+?)\]\]', lambda m: m.group(1), wiki)
  31.         wiki = re.sub(r'\[\[([^\[\]]+?)\]\]', '', wiki)
  32.         wiki = re.sub(r'(?i)File:[^\[\]]*?', '', wiki)
  33.         wiki = re.sub(r'\[[^\[\]]*? ([^\[\]]*?)\]', lambda m: m.group(1), wiki)
  34.         wiki = re.sub(r"''+", '', wiki)
  35.         wiki = re.sub(r'(?m)^\*$', '', wiki)
  36.        
  37.         return wiki
  38.    
  39.     def unhtml(self, html):
  40.         """
  41.        Remove HTML from the text.
  42.        """
  43.         html = re.sub(r'(?i) ', ' ', html)
  44.         html = re.sub(r'(?i)<br[ \\]*?>', '\n', html)
  45.         html = re.sub(r'(?m)<!--.*?--\s*>', '', html)
  46.         html = re.sub(r'(?i)<ref[^>]*>[^>]*<\/ ?ref>', '', html)
  47.         html = re.sub(r'(?m)<.*?>', '', html)
  48.         html = re.sub(r'(?i)&amp;', '&', html)
  49.        
  50.         return html
  51.    
  52.     def punctuate(self, text):
  53.         """
  54.        Convert every text part into well-formed one-space
  55.        separate paragraph.
  56.        """
  57.         text = re.sub(r'\r\n|\n|\r', '\n', text)
  58.         text = re.sub(r'\n\n+', '\n\n', text)
  59.        
  60.         parts = text.split('\n\n')
  61.         partsParsed = []
  62.        
  63.         for part in parts:
  64.             part = part.strip()
  65.            
  66.             if len(part) == 0:
  67.                 continue
  68.            
  69.             partsParsed.append(part)
  70.        
  71.         return '\n\n'.join(partsParsed)
  72.    
  73.     def image(self):
  74.         """
  75.        Retrieve the first image in the document.
  76.        """
  77.         # match = re.search(r'(?i)\|?\s*(image|img|image_flag)\s*=\s*(<!--.*-->)?\s*([^\\/:*?<>"|%]+\.[^\\/:*?<>"|%]{3,4})', self.wiki)
  78.         match = re.search(r'(?i)([^\\/:*?<>"|% =]+)\.(gif|jpg|jpeg|png|bmp)', self.wiki)
  79.        
  80.         if match:
  81.             return '%s.%s' % match.groups()
  82.        
  83.         return None
  84.  
  85. if __name__ == '__main__':
  86.     # @link http://simple.wikipedia.org/w/index.php?action=raw&title=Uruguay
  87.     wiki = """[[File:LocationUruguay.png|right|]]
  88. '''Uruguay''' is a country in [[South America]]. The language spoken there is Spanish. Its [[capital (city)|capital]] and largest [[city]] is [[Montevideo]]. Uruguay is bordered by two large neighbors, [[Brazil]] and [[Argentina]]. The only country in South America that is smaller than Uruguay is [[Suriname]].
  89. The land there is mostly flat and they have many farms there.
  90. {{geo-stub}}
  91.  
  92. {{South America}}
  93. {{Link FA|af}}
  94. {{Link FA|ast}}
  95. {{Link FA|ca}}
  96.  
  97. [[Category:Uruguay| ]]"""
  98.    
  99.     wiki2plain = Wiki2Plain(wiki)
  100.     content = wiki2plain.text
  101.     image = wiki2plain.image()
  102.    
  103.     print '---'
  104.     print content
  105.     print '---'
  106.     print image
  107.     print '---'
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Not a member of Pastebin yet?
Sign Up, it unlocks many cool features!
 
Top