"""Text formatter for MediaWiki XML dump."""

import xml.etree.ElementTree as ET
import os
import re
import pypandoc

# Case-insensitive pattern for the literal "#redirect" marker.
redirect = re.compile(r"#redirect", flags=re.IGNORECASE)
# Case-insensitive pattern for the literal "{{Disambig}}" template tag.
disambig = re.compile(r"{{Disambig}}", flags=re.IGNORECASE)

def _safe_filename(title):
    """Return *title* reduced to alphanumerics and spaces, right-stripped."""
    return "".join(c for c in title if c.isalnum() or c == ' ').rstrip()


def _unique_stem(output_dir, stem):
    """Return *stem*, numbered if needed, so ``<stem>.txt`` is unused in *output_dir*."""
    candidate = stem
    counter = 1
    while os.path.isfile(os.path.join(output_dir, f'{candidate}.txt')):
        candidate = f"{stem}_{counter}"
        counter += 1
    return candidate


def _is_content_page(page, title):
    """Return True when *page* looks like a main-namespace (article) page."""
    # MediaWiki exports normally carry a per-page <ns> element where "0" is
    # the article namespace; prefer it because real article titles may
    # legitimately contain a colon.
    ns = page.findtext('ns')
    if ns is not None and ns.strip():
        return ns.strip() == "0"
    # No <ns> available: fall back to treating any "Prefix:Name" title as a
    # non-article page.
    return ":" not in title


def extract_pages(xml_file, output_dir='Masseffect'):
    """Write each article page of a MediaWiki XML dump to its own text file.

    Pages are skipped when they are outside the article namespace, have no
    revision text, have fewer than 100 characters of text, start with a
    redirect marker, or contain the disambiguation template. Remaining pages
    are converted from MediaWiki markup to plain text with pandoc; if the
    conversion fails, the raw markup is written instead and the file name
    gets an ``_md`` suffix.

    Relies on the module-level ``redirect`` and ``disambig`` patterns and on
    ``pypandoc``. Element lookups use un-namespaced tag names.

    Args:
        xml_file: Path (or file object) of the XML dump to parse.
        output_dir: Directory receiving the ``.txt`` files; created if
            missing. Defaults to ``'Masseffect'``.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Make sure the target directory exists before the first write.
    os.makedirs(output_dir, exist_ok=True)

    total_count = 1

    for page in root.findall('.//page'):
        # findtext() yields None for a missing element and "" for an empty
        # one, so neither case can crash the checks below.
        title = page.findtext('title')
        if not title or not _is_content_page(page, title):
            continue

        revision = page.find('revision')
        if revision is None:
            continue

        text = revision.findtext('text')
        if not text or len(text) < 100:
            continue

        # A redirect marker sits at the very start of the page, while the
        # disambiguation template may appear anywhere in it.
        if redirect.match(text) or disambig.search(text):
            continue

        stem = _unique_stem(output_dir, _safe_filename(title))

        print(str(total_count) + ": " + stem)
        total_count += 1

        try:
            plain_text = pypandoc.convert_text(
                text, 'plain', format='mediawiki',
                extra_args=['--wrap=preserve'])
        except Exception:
            # Best effort: keep the raw markup rather than dropping the page.
            print("pandoc threw an exception processing " + stem)
            plain_text = text
            # Re-check uniqueness so the suffixed name cannot overwrite a
            # file written earlier.
            stem = _unique_stem(output_dir, stem + "_md")
        else:
            # Collapse runs of blank lines into a single newline.
            plain_text = re.sub(r'(?:\r?\n){2,}', '\n', plain_text)

        out_path = os.path.join(output_dir, f'{stem}.txt')
        with open(out_path, 'w', encoding='utf-8') as f:
            f.write(plain_text)

# Script entry: convert the dump file named below whenever this module runs.
# NOTE: extract_pages looks elements up by their bare, un-namespaced tag names,
# so the namespace has to be removed by hand from the dump's top-level tag
# first (the original author's note refers to it as the <manifest> tag).
extract_pages(xml_file='masseffect_pages_current.xml')