Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import os
import re
import xml.etree.ElementTree as ET

import pypandoc
# Matches the MediaWiki redirect directive "#redirect" in any letter case.
redirect = re.compile(r"#redirect", re.IGNORECASE)
# Matches the {{Disambig}} template marker in any letter case. The braces are
# escaped so they are always literal and never read as a repetition quantifier.
disambig = re.compile(r"\{\{Disambig\}\}", re.IGNORECASE)
def _sanitize_title(title):
    """Return *title* keeping only alphanumerics and spaces, trailing whitespace stripped."""
    return "".join(c for c in title if c.isalnum() or c == ' ').rstrip()


def _unique_stem(out_dir, stem):
    """Return *stem*, suffixed with ``_<n>`` if needed, so ``<out_dir>/<stem>.txt`` is unused."""
    candidate = stem
    counter = 1
    while os.path.isfile(os.path.join(out_dir, f'{candidate}.txt')):
        candidate = f"{stem}_{counter}"
        counter += 1
    return candidate


def extract_pages(xml_file, out_dir='Masseffect'):
    """Export each article page of an XML dump to its own plain-text file.

    Every ``<page>`` element found under the document root is examined. A page
    is skipped when:

    * it has no title, or the title contains ``":"``;
    * it has no ``<revision>``/``<text>`` content, or the text is shorter
      than 100 characters;
    * the text starts with ``#redirect`` or contains ``{{Disambig}}``
      (both case-insensitive).

    The remaining pages are converted from MediaWiki markup to plain text with
    pandoc (via ``pypandoc``) and written to ``<out_dir>/<title>.txt``. If the
    conversion fails, the raw markup is written instead and the file name gets
    an ``_md`` suffix. Existing files are never overwritten: a ``_<n>`` counter
    is appended to the name until it is unused.

    Args:
        xml_file: Path of the XML dump to parse. Tags are looked up without a
            namespace, so the dump must not declare a default XML namespace.
        out_dir: Directory receiving the ``.txt`` files; created if missing.
    """
    os.makedirs(out_dir, exist_ok=True)

    root = ET.parse(xml_file).getroot()
    total_count = 1

    for page in root.findall('.//page'):
        # findtext() yields None for a missing element and '' for an empty one,
        # so a single falsy check covers both.
        title = page.findtext('title')
        if not title or ":" in title:
            continue

        revision = page.find('revision')
        if revision is None:
            continue

        text = revision.findtext('text')
        if not text or len(text) < 100:
            continue

        # A redirect directive must open the page, but a disambiguation
        # template can sit anywhere in it, hence match() versus search().
        if redirect.match(text) or disambig.search(text):
            continue

        valid_title = _unique_stem(out_dir, _sanitize_title(title))

        print(str(total_count) + ": " + valid_title)
        total_count += 1

        try:
            plain_text = pypandoc.convert_text(
                text, 'plain', format='mediawiki', extra_args=['--wrap=preserve']
            )
        except Exception:
            # Best-effort fallback: keep the raw markup rather than lose the page.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            print("pandoc threw an exception processing " + valid_title)
            plain_text = text
            # Re-check uniqueness, otherwise two failing pages with the same
            # sanitized title would overwrite each other's "_md" file.
            valid_title = _unique_stem(out_dir, valid_title + "_md")
        else:
            # Collapse runs of blank lines into a single line break.
            plain_text = re.sub(r'(\r?\n){2,}', '\n', plain_text)

        with open(os.path.join(out_dir, f'{valid_title}.txt'), 'w', encoding='utf-8') as f:
            f.write(plain_text)
# Call the function with the path to your XML file.
# Note: pages are looked up by their plain tag name (no XML namespace), so the
# namespace declaration was removed by hand from the dump's root tag. This note
# originally called that tag <manifest>; it is presumably the <mediawiki> root
# element (TODO confirm). You'll need to make the same edit for this to work.
extract_pages('masseffect_pages_current.xml')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement