# stats_updater_base.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
統計更新器共享基類
為維基詞典和維基百科統計更新器提供共享功能
"""

import re
import time
import csv
import io
import logging
from datetime import datetime, timezone, timedelta
from typing import List, Optional, Dict
import requests
import mwclient
from abc import ABC, abstractmethod

class BaseStatsUpdater(ABC):
    """
    Abstract base class shared by the statistics updaters.

    It logs in to a wiki, downloads per-language counts from a wikistats CSV
    API and appends them as a new row to the ``autoStat`` template on a
    configured page. Subclasses provide the edit summary and the bot flag.
    """

    def __init__(self, username: str, password: str, site_domain: str,
                 target_languages: List[str], csv_api_url: str,
                 page_title: str, log_filename: str):
        """
        初始化基類

        Args:
            username: 用戶名
            password: 密碼
            site_domain: 網站域名
            target_languages: 目標語言代碼列表
            csv_api_url: CSV API URL
            page_title: 目標頁面標題
            log_filename: 日誌文件名
        """
        self.username = username
        self.password = password
        self.site_domain = site_domain
        self.site = None
        self.target_languages = target_languages
        self.csv_url = csv_api_url
        self.page_title = page_title
        self.log_filename = log_filename

        # 設置日誌
        self._setup_logging()

        self.logger = logging.getLogger(f"{self.__class__.__name__}")

    def _setup_logging(self):
        """設置日誌配置"""
        # 清除可能存在的 handlers 避免重複
        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(self.log_filename, encoding='utf-8'),
                logging.StreamHandler()
            ]
        )

    def connect_to_wiki(self) -> bool:
        """連接到維基網站"""
        try:
            self.site = mwclient.Site(self.site_domain)
            self.site.login(self.username, self.password)
            self.logger.info(f"成功連接到 {self.site_domain}")
            return True
        except Exception as e:
            self.logger.error(f"連接 {self.site_domain} 失敗: {e}")
            return False

    def fetch_wikistats_data(self, max_retries: int = 5) -> Optional[List[int]]:
        """
        從 wikistats CSV API 獲取數據

        Args:
            max_retries: 最大重試次數

        Returns:
            各語言的條目數量列表，或 None（如果失敗）
        """
        for attempt in range(max_retries):
            try:
                self.logger.info(f"第 {attempt + 1} 次嘗試從 wikistats CSV API 獲取數據...")

                headers = {
                    'User-Agent': f'{self.__class__.__name__}/1.0 (https://{self.site_domain}/wiki/User:{self.username})'
                }
                response = requests.get(self.csv_url, headers=headers, timeout=30)
                response.raise_for_status()

                self.logger.info(f"成功獲取 CSV 數據，大小: {len(response.text)} 字符")

                # 解析 CSV 數據
                stats_data = self._parse_csv_data(response.text)
                if stats_data:
                    self.logger.info(f"成功解析統計數據: {stats_data}")
                    return stats_data
                else:
                    self.logger.error("無法解析CSV數據")
                    if attempt < max_retries - 1:
                        time.sleep(60)  # 等待1分鐘後重試
                        continue
                    else:
                        return None

            except Exception as e:
                self.logger.error(f"獲取數據失敗 (嘗試 {attempt + 1}): {e}")
                if attempt < max_retries - 1:
                    time.sleep(60)  # 等待1分鐘後重試

        return None

    def _parse_csv_data(self, csv_content: str) -> Optional[List[int]]:
        """解析CSV數據並提取目標語言的統計數據"""
        try:
            self.logger.info("開始解析 CSV 數據...")

            # 解析CSV
            csv_reader = csv.DictReader(io.StringIO(csv_content))

            # 記錄 CSV 欄位
            fieldnames = csv_reader.fieldnames
            self.logger.info(f"CSV 欄位: {fieldnames}")

            # 存儲找到的數據，按排名排序
            all_entries = []

            for row in csv_reader:
                try:
                    # 嘗試多種可能的欄位名稱
                    prefix = None
                    good_count = None

                    # 查找語言代碼欄位
                    for field in ['prefix', 'lang', 'language', 'code']:
                        if field in row and row[field]:
                            prefix = row[field].strip()
                            break

                    # 查找條目數量欄位
                    for field in ['good', 'articles', 'entries', 'pages']:
                        if field in row and row[field]:
                            try:
                                good_count = int(row[field])
                                break
                            except (ValueError, TypeError):
                                continue

                    if prefix and good_count is not None:
                        all_entries.append({'prefix': prefix, 'good': good_count, 'row': row})
                        self.logger.debug(f"找到條目: {prefix} = {good_count}")

                except Exception as e:
                    self.logger.debug(f"解析行時出錯: {e}, 行數據: {row}")
                    continue

            project_type = "維基詞典" if "wiktionar" in self.csv_url else "維基百科"
            self.logger.info(f"共解析到 {len(all_entries)} 個{project_type}條目")

            # 按條目數量排序（降序）
            all_entries.sort(key=lambda x: x['good'], reverse=True)

            # 顯示前20名以供參考（確保涵蓋所有目標語言）
            display_count = min(20, len(all_entries))
            self.logger.info(f"前{display_count}名{project_type}:")
            for i, entry in enumerate(all_entries[:display_count]):
                self.logger.info(f"  {i+1}. {entry['prefix']}: {entry['good']:,} 條目")

            # 查找目標語言並按排名獲取
            result = []
            found_languages = {}

            for entry in all_entries:
                if entry['prefix'] in self.target_languages:
                    found_languages[entry['prefix']] = entry['good']

            # 按照目標順序排列數據
            for lang in self.target_languages:
                if lang in found_languages:
                    result.append(found_languages[lang])
                    self.logger.info(f"目標語言 {lang}: {found_languages[lang]:,} 條目")
                else:
                    self.logger.error(f"未找到語言 {lang} 的統計數據")
                    # 顯示可用的語言代碼以供調試
                    available_codes = [entry['prefix'] for entry in all_entries[:30]]
                    self.logger.error(f"可用的前30個語言代碼: {available_codes}")
                    return None

            return result if len(result) == len(self.target_languages) else None

        except Exception as e:
            self.logger.error(f"解析CSV數據時出錯: {e}")
            # 顯示 CSV 內容的開頭部分以供調試
            if csv_content:
                self.logger.error(f"CSV 內容開頭: {csv_content[:500]}")
            return None

    def _is_data_line(self, line: str) -> bool:
        """
        判斷一行是否為數字數據行
        數據行特徵：以|開頭，主要包含純數字，而非文字描述
        """
        line = line.strip()
        if not line.startswith('|'):
            return False

        # 移除首尾的 |，分割內容
        content = line.strip('|').strip()
        if not content:
            return False

        # 分割各個字段
        fields = [field.strip() for field in content.split('|')]

        # 檢查是否主要包含純數字
        numeric_fields = 0
        total_fields = len(fields)

        for field in fields:
            if field.isdigit():
                numeric_fields += 1

        # 如果超過一半的字段是數字，則認為是數據行
        if total_fields > 0 and numeric_fields / total_fields > 0.5:
            return True

        # 額外檢查：如果行中包含明顯的中文描述詞，則不是數據行
        chinese_description_patterns = ['增长', '发展', '缓慢', '稳定', '增大', '缩小']
        for pattern in chinese_description_patterns:
            if pattern in content:
                return False

        return False

    def update_wiki_page(self, stats: List[int]) -> bool:
        """更新維基頁面"""
        try:
            page = self.site.pages[self.page_title]
            current_content = page.text()

            # 獲取當前日期
            today = datetime.now(timezone(timedelta(hours=8))).strftime('%Y-%m-%d')

            # 構建新的統計行
            stats_line = f"| {' | '.join(map(str, stats))}"

            # 查找 autoStat 模板
            autostat_pattern = r'({{autoStat\s*\n.*?\n)(}})'
            match = re.search(autostat_pattern, current_content, re.DOTALL)

            if not match:
                self.logger.error("未找到 autoStat 模板")
                # 提供頁面內容的調試信息
                self.logger.info("正在分析頁面內容格式...")

                # 檢查是否有其他可能的模板格式
                possible_templates = re.findall(r'{{[^}]*}}', current_content)
                if possible_templates:
                    self.logger.info(f"找到的模板: {possible_templates[:5]}")  # 只顯示前5個
                else:
                    self.logger.info("頁面中未找到任何模板")

                # 顯示頁面內容片段以供調試
                content_preview = current_content[:500] if len(current_content) > 500 else current_content
                self.logger.info(f"頁面內容預覽（前500字符）:\n{content_preview}")
                return False

            template_content = match.group(1)  # autoStat 模板內容

            # 檢查統計行是否已經存在
            if stats_line in template_content:
                self.logger.info(f"統計數據已存在於頁面中: {stats_line}")
                self.logger.info("跳過編輯，無需更新")
                return True

            # 檢查是否有相同數字組合但格式稍有不同的情況
            # 創建用於比較的數字字符串（去除空格）
            stats_numbers = ''.join(map(str, stats))
            template_lines = template_content.split('\n')

            for line in template_lines:
                if line.strip().startswith('|'):
                    # 提取該行的數字（去除空格和分隔符）
                    line_numbers = re.sub(r'[^\d]', '', line)
                    if line_numbers == stats_numbers:
                        self.logger.info(f"發現相同的統計數據（格式略有不同）:")
                        self.logger.info(f"  頁面中已有: {line.strip()}")
                        self.logger.info(f"  欲添加的: {stats_line}")
                        self.logger.info("跳過編輯，無需更新")
                        return True

            # 數據不存在，進行更新
            self.logger.info(f"統計數據不存在，準備添加: {stats_line}")

            # 找到最後一行數字數據的位置
            template_lines = template_content.split('\n')
            last_data_line_index = -1

            # 從後往前搜索，找到最後一行包含純數字數據的行
            for i in range(len(template_lines) - 1, -1, -1):
                line = template_lines[i].strip()
                if line.startswith('|') and self._is_data_line(line):
                    last_data_line_index = i
                    break

            if last_data_line_index == -1:
                self.logger.error("未找到數據行，無法插入新統計數據")
                return False

            self.logger.info(f"找到最後一行數據: {template_lines[last_data_line_index].strip()}")
            self.logger.info(f"將在第 {last_data_line_index + 1} 行後插入新數據")

            # 重構模板內容：在最後一行數據後插入新行
            new_template_lines = (
                template_lines[:last_data_line_index + 1] +  # 包含最後一行數據
                [stats_line] +                               # 新統計行
                template_lines[last_data_line_index + 1:]    # 其餘內容（如描述行）
            )

            new_template_content = '\n'.join(new_template_lines)

            # 構建完整的新內容
            before_template = current_content[:match.start()]
            template_end = match.group(2)      # }}
            after_template = current_content[match.end():]  # 模板之後的所有內容

            new_content = before_template + new_template_content + template_end + after_template

            self.logger.info(f"頁面更新預覽:")
            self.logger.info(f"  原始內容長度: {len(current_content)} 字符")
            self.logger.info(f"  新內容長度: {len(new_content)} 字符")
            self.logger.info(f"  添加的統計行: {stats_line}")

            # 保存頁面
            edit_summary = self._get_edit_summary(today)
            page.save(new_content, summary=edit_summary, minor=True, bot=self._is_bot_account())

            self.logger.info(f"成功更新頁面，添加數據: {stats_line}")
            return True

        except Exception as e:
            self.logger.error(f"更新維基頁面失敗: {e}")
            return False

    @abstractmethod
    def _get_edit_summary(self, date: str) -> str:
        """獲取編輯摘要（子類實現）"""
        pass

    @abstractmethod
    def _is_bot_account(self) -> bool:
        """是否為機器人帳號（子類實現）"""
        pass

    def run(self):
        """執行主要任務"""
        project_type = "維基詞典" if "wiktionary" in self.csv_url else "維基百科"
        self.logger.info(f"開始執行{project_type}統計更新任務")

        # 連接到維基
        if not self.connect_to_wiki():
            return False

        # 獲取統計數據
        stats = self.fetch_wikistats_data()
        if not stats:
            self.logger.error("無法獲取統計數據，任務失敗")
            return False

        # 更新頁面
        if self.update_wiki_page(stats):
            self.logger.info("任務成功完成！")
            return True
        else:
            self.logger.error("更新頁面失敗")
            return False