Advertisement
FaceDeer

Text formatter for MediaWiki XML dump

Feb 14th, 2024
1,474
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.49 KB | Source Code | 0 0
import os
import re
import xml.etree.ElementTree as ET

import pypandoc

  6. redirect = re.compile("#redirect", re.I)
  7. disambig = re.compile("{{Disambig}}", re.I)
  8.  
  9. def extract_pages(xml_file):
  10.     # Parse the XML file
  11.     tree = ET.parse(xml_file)
  12.     root = tree.getroot()
  13.  
  14.     total_count = 1
  15.  
  16.     # Iterate over each <page> tag in the <mediawiki> tag
  17.     for page in root.findall('.//page'):
  18.         # Extract the text inside the <title> tag
  19.         title = page.find('title').text
  20.  
  21.         if ":" in title:
  22.             continue
  23.  
  24.         # Find the <revision> tag and then find the <text> tag within it
  25.         revision = page.find('revision')
  26.         if revision is not None:
  27.             text = revision.find('text').text
  28.  
  29.             if len(text) < 100:
  30.                 #print(text)
  31.                 continue
  32.  
  33.             if redirect.match(text):
  34.                 continue
  35.             if disambig.match(text):
  36.                 #print(text)
  37.                 continue
  38.  
  39.             # Replace any characters in the title that are invalid in file names
  40.             valid_title = "".join(c for c in title if c.isalnum() or c in (' ',)).rstrip()
  41.  
  42.             # Check if a file with the same name already exists
  43.             counter = 1
  44.             original_title = valid_title
  45.             while os.path.isfile(f'Masseffect/{valid_title}.txt'):
  46.                 valid_title = f"{original_title}_{counter}"
  47.                 counter += 1
  48.  
  49.             print(str(total_count) + ": " + valid_title)
  50.             total_count = total_count + 1
  51.  
  52.             #if total_count > 10:
  53.                 #return
  54.  
  55.             # Assuming 'wiki_text' contains your Mediawiki markdown
  56.             try:
  57.                 plain_text = pypandoc.convert_text(text, 'plain', format='mediawiki', extra_args=['--wrap=preserve'])
  58.                 plain_text = re.sub('(\r?\n){2,}', '\n', plain_text)
  59.             except:
  60.                 print("pandoc threw an exception processing " + valid_title)
  61.                 plain_text = text
  62.                 valid_title = valid_title + "_md"
  63.            
  64.             # Write the text to a .txt file named after the title
  65.             with open(f'Masseffect/{valid_title}.txt', 'w', encoding='utf-8') as f:
  66.                 f.write(plain_text)
  67.  
  68. # Call the function with the path to your XML file
  69. # Note: to simplify the code I edited the <manifest> tag in this document to remove the namespace from the tag, you'll need to do that to make this work.
  70. extract_pages('masseffect_pages_current.xml')
  71.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement