Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- 統計更新器共享基類
- 為維基詞典和維基百科統計更新器提供共享功能
- """
- import re
- import time
- import csv
- import io
- import logging
- from datetime import datetime, timezone, timedelta
- from typing import List, Optional, Dict
- import requests
- import mwclient
- from abc import ABC, abstractmethod
class BaseStatsUpdater(ABC):
    """Shared base class for the Wiktionary / Wikipedia statistics updaters.

    The workflow implemented by :meth:`run` is:

    1. log in to the wiki at ``site_domain`` (:meth:`connect_to_wiki`),
    2. download a CSV dump from ``csv_api_url`` and pull out the entry count
       of every language in ``target_languages``
       (:meth:`fetch_wikistats_data`),
    3. append those counts as a new ``| a | b | ...`` row to the
       ``{{autoStat ...}}`` template on ``page_title``
       (:meth:`update_wiki_page`).

    Subclasses supply the edit summary and the bot flag through
    :meth:`_get_edit_summary` and :meth:`_is_bot_account`.
    """

    # Stem shared by "wiktionary" and "wiktionaries"; a CSV URL containing it
    # is reported as a Wiktionary project, anything else as Wikipedia.
    _WIKTIONARY_URL_MARKER = "wiktionar"

    # Candidate CSV column names, tried in order, for the language code and
    # for the entry count.
    _PREFIX_FIELDS = ('prefix', 'lang', 'language', 'code')
    _COUNT_FIELDS = ('good', 'articles', 'entries', 'pages')

    # Words that mark a template row as a textual description, not data.
    _DESCRIPTION_WORDS = ('增长', '发展', '缓慢', '稳定', '增大', '缩小')

    # Group 1: the template from "{{autoStat" through the newline before the
    # closing braces. Group 2: the closing "}}".
    _AUTOSTAT_RE = re.compile(r'({{autoStat\s*\n.*?\n)(}})', re.DOTALL)

    def __init__(self, username: str, password: str, site_domain: str,
                 target_languages: List[str], csv_api_url: str,
                 page_title: str, log_filename: str):
        """Store the configuration and set up logging.

        Args:
            username: Wiki account name used for login.
            password: Password for that account.
            site_domain: Domain of the wiki to edit.
            target_languages: Language codes whose counts are collected; the
                order of this list is the order of the numbers in the row.
            csv_api_url: URL of the CSV statistics dump.
            page_title: Title of the wiki page holding the autoStat template.
            log_filename: Path of the log file to write.
        """
        self.username = username
        self.password = password
        self.site_domain = site_domain
        self.site = None  # set by connect_to_wiki()
        self.target_languages = target_languages
        self.csv_url = csv_api_url
        self.page_title = page_title
        self.log_filename = log_filename

        # Logging must be configured before the logger is first used.
        self._setup_logging()
        self.logger = logging.getLogger(f"{self.__class__.__name__}")

    def _setup_logging(self) -> None:
        """Reconfigure the root logger to write to the log file and stderr.

        Existing root handlers are removed first so that constructing several
        updaters does not duplicate every log record. Each removed handler is
        also closed so a previously opened log file is not leaked.
        """
        root_logger = logging.getLogger()
        for handler in root_logger.handlers[:]:
            root_logger.removeHandler(handler)
            handler.close()

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(self.log_filename, encoding='utf-8'),
                logging.StreamHandler(),
            ],
        )

    def _project_type(self) -> str:
        """Return the project label used in log messages for ``csv_url``."""
        if self._WIKTIONARY_URL_MARKER in self.csv_url:
            return "維基詞典"
        return "維基百科"

    def connect_to_wiki(self) -> bool:
        """Open the wiki site and log in.

        Returns:
            True on success, False if connecting or logging in raised.
        """
        try:
            self.site = mwclient.Site(self.site_domain)
            self.site.login(self.username, self.password)
            self.logger.info(f"成功連接到 {self.site_domain}")
            return True
        except Exception as e:
            self.logger.error(f"連接 {self.site_domain} 失敗: {e}")
            return False

    def fetch_wikistats_data(self, max_retries: int = 5,
                             retry_delay: float = 60) -> Optional[List[int]]:
        """Download the CSV dump and extract the target-language counts.

        Args:
            max_retries: Maximum number of download/parse attempts.
            retry_delay: Seconds to wait between two attempts.

        Returns:
            The entry counts in ``target_languages`` order, or None if every
            attempt failed to download or to parse.
        """
        headers = {
            'User-Agent': f'{self.__class__.__name__}/1.0 (https://{self.site_domain}/wiki/User:{self.username})'
        }

        for attempt in range(max_retries):
            try:
                self.logger.info(f"第 {attempt + 1} 次嘗試從 wikistats CSV API 獲取數據...")
                response = requests.get(self.csv_url, headers=headers, timeout=30)
                response.raise_for_status()
                self.logger.info(f"成功獲取 CSV 數據,大小: {len(response.text)} 字符")

                stats_data = self._parse_csv_data(response.text)
                if stats_data:
                    self.logger.info(f"成功解析統計數據: {stats_data}")
                    return stats_data
                self.logger.error("無法解析CSV數據")
            except Exception as e:
                self.logger.error(f"獲取數據失敗 (嘗試 {attempt + 1}): {e}")

            # No point in sleeping after the final attempt.
            if attempt < max_retries - 1:
                time.sleep(retry_delay)

        return None

    @classmethod
    def _extract_entry(cls, row: Dict[str, str]):
        """Return ``(language_code, count)`` for one CSV row, or None.

        The first non-empty candidate column is used for the code, and the
        first candidate column that parses as an integer for the count.
        """
        prefix = None
        for field in cls._PREFIX_FIELDS:
            value = row.get(field)
            if value:
                prefix = value.strip()
                break

        count = None
        for field in cls._COUNT_FIELDS:
            value = row.get(field)
            if not value:
                continue
            try:
                count = int(value)
                break
            except (ValueError, TypeError):
                continue

        if prefix and count is not None:
            return prefix, count
        return None

    def _parse_csv_data(self, csv_content: str) -> Optional[List[int]]:
        """Parse the CSV text and return the counts for the target languages.

        Args:
            csv_content: Raw CSV text including its header row.

        Returns:
            The counts in ``target_languages`` order, or None if the CSV could
            not be parsed or any target language is missing from it.
        """
        try:
            self.logger.info("開始解析 CSV 數據...")
            csv_reader = csv.DictReader(io.StringIO(csv_content))
            self.logger.info(f"CSV 欄位: {csv_reader.fieldnames}")

            all_entries = []
            for row in csv_reader:
                # Best effort: a malformed row is logged and skipped.
                try:
                    entry = self._extract_entry(row)
                except Exception as e:
                    self.logger.debug(f"解析行時出錯: {e}, 行數據: {row}")
                    continue
                if entry is not None:
                    all_entries.append(entry)
                    self.logger.debug(f"找到條目: {entry[0]} = {entry[1]}")

            project_type = self._project_type()
            self.logger.info(f"共解析到 {len(all_entries)} 個{project_type}條目")

            # Largest projects first; the top of the list is logged for reference.
            all_entries.sort(key=lambda item: item[1], reverse=True)
            display_count = min(20, len(all_entries))
            self.logger.info(f"前{display_count}名{project_type}:")
            for i, (prefix, count) in enumerate(all_entries[:display_count]):
                self.logger.info(f" {i+1}. {prefix}: {count:,} 條目")

            # If a code appears more than once, the later (smaller) entry wins.
            wanted = set(self.target_languages)
            found_languages: Dict[str, int] = {
                prefix: count for prefix, count in all_entries if prefix in wanted
            }

            result = []
            for lang in self.target_languages:
                if lang not in found_languages:
                    self.logger.error(f"未找到語言 {lang} 的統計數據")
                    # List some available codes to help diagnose a wrong code.
                    available_codes = [prefix for prefix, _ in all_entries[:30]]
                    self.logger.error(f"可用的前30個語言代碼: {available_codes}")
                    return None
                result.append(found_languages[lang])
                self.logger.info(f"目標語言 {lang}: {found_languages[lang]:,} 條目")
            return result
        except Exception as e:
            self.logger.error(f"解析CSV數據時出錯: {e}")
            # Show the start of the payload to help diagnose format changes.
            if csv_content:
                self.logger.error(f"CSV 內容開頭: {csv_content[:500]}")
            return None

    def _is_data_line(self, line: str) -> bool:
        """Tell whether a template line is a row of numeric data.

        A data line starts with ``|``, contains none of the known description
        words, and has more than half of its ``|``-separated cells made of
        digits only.
        """
        line = line.strip()
        if not line.startswith('|'):
            return False

        content = line.strip('|').strip()
        if not content:
            return False

        # Checked before the numeric test so a description row is rejected
        # even when most of its cells are numbers.
        if any(word in content for word in self._DESCRIPTION_WORDS):
            return False

        fields = [field.strip() for field in content.split('|')]
        numeric_fields = sum(1 for field in fields if field.isdigit())
        return 2 * numeric_fields > len(fields)

    def _log_missing_template(self, current_content: str) -> None:
        """Log diagnostics for a page where the autoStat template is missing."""
        self.logger.error("未找到 autoStat 模板")
        self.logger.info("正在分析頁面內容格式...")

        possible_templates = re.findall(r'{{[^}]*}}', current_content)
        if possible_templates:
            # Only the first five are shown.
            self.logger.info(f"找到的模板: {possible_templates[:5]}")
        else:
            self.logger.info("頁面中未找到任何模板")

        content_preview = current_content[:500]
        self.logger.info(f"頁面內容預覽(前500字符):\n{content_preview}")

    @staticmethod
    def _find_reformatted_stats_line(template_lines: List[str],
                                     stats: List[int]) -> Optional[str]:
        """Return the first row holding exactly ``stats``, ignoring formatting.

        Each ``|``-separated cell is reduced to its digits and cells without
        digits are dropped. The remaining cells must equal ``stats`` one by
        one, so ``[12, 3]`` does not match a row ``| 1 | 23``.
        """
        expected = [str(value) for value in stats]
        for line in template_lines:
            stripped = line.strip()
            if not stripped.startswith('|'):
                continue
            digit_cells = (re.sub(r'\D', '', cell) for cell in stripped.strip('|').split('|'))
            if [cell for cell in digit_cells if cell] == expected:
                return stripped
        return None

    def update_wiki_page(self, stats: List[int]) -> bool:
        """Append ``stats`` as a new row of the autoStat template.

        The row is inserted after the last numeric data row, so trailing
        description rows stay at the end of the template. Nothing is saved if
        a row with the same numbers is already present.

        Args:
            stats: Entry counts in ``target_languages`` order.

        Returns:
            True if the page was saved or already contained the numbers;
            False if ``stats`` is empty, the template or a data row could not
            be found, or reading/saving the page raised.
        """
        try:
            if not stats:
                self.logger.error("統計數據為空,無法更新頁面")
                return False

            page = self.site.pages[self.page_title]
            current_content = page.text()

            # Current date in UTC+8, handed to the edit summary.
            today = datetime.now(timezone(timedelta(hours=8))).strftime('%Y-%m-%d')
            stats_line = f"| {' | '.join(map(str, stats))}"

            match = self._AUTOSTAT_RE.search(current_content)
            if not match:
                self._log_missing_template(current_content)
                return False

            template_lines = match.group(1).split('\n')

            # Whole-line comparison: a substring test would wrongly treat
            # "| 100 | 20" as present when the page holds "| 100 | 200".
            if any(line.strip() == stats_line for line in template_lines):
                self.logger.info(f"統計數據已存在於頁面中: {stats_line}")
                self.logger.info("跳過編輯,無需更新")
                return True

            existing = self._find_reformatted_stats_line(template_lines, stats)
            if existing is not None:
                self.logger.info("發現相同的統計數據(格式略有不同):")
                self.logger.info(f" 頁面中已有: {existing}")
                self.logger.info(f" 欲添加的: {stats_line}")
                self.logger.info("跳過編輯,無需更新")
                return True

            self.logger.info(f"統計數據不存在,準備添加: {stats_line}")

            # Search backwards for the last numeric row.
            last_data_line_index = next(
                (i for i in range(len(template_lines) - 1, -1, -1)
                 if self._is_data_line(template_lines[i])),
                -1,
            )
            if last_data_line_index == -1:
                self.logger.error("未找到數據行,無法插入新統計數據")
                return False

            self.logger.info(f"找到最後一行數據: {template_lines[last_data_line_index].strip()}")
            self.logger.info(f"將在第 {last_data_line_index + 1} 行後插入新數據")

            insert_at = last_data_line_index + 1
            new_template_content = '\n'.join(
                template_lines[:insert_at] + [stats_line] + template_lines[insert_at:]
            )
            new_content = (
                current_content[:match.start()]
                + new_template_content
                + match.group(2)
                + current_content[match.end():]
            )

            self.logger.info("頁面更新預覽:")
            self.logger.info(f" 原始內容長度: {len(current_content)} 字符")
            self.logger.info(f" 新內容長度: {len(new_content)} 字符")
            self.logger.info(f" 添加的統計行: {stats_line}")

            edit_summary = self._get_edit_summary(today)
            page.save(new_content, summary=edit_summary, minor=True, bot=self._is_bot_account())
            self.logger.info(f"成功更新頁面,添加數據: {stats_line}")
            return True
        except Exception as e:
            self.logger.error(f"更新維基頁面失敗: {e}")
            return False

    @abstractmethod
    def _get_edit_summary(self, date: str) -> str:
        """Return the edit summary for an update made on ``date`` (subclass hook)."""

    @abstractmethod
    def _is_bot_account(self) -> bool:
        """Return whether edits should carry the bot flag (subclass hook)."""

    def run(self) -> bool:
        """Run the whole job: connect, fetch the statistics, update the page.

        Returns:
            True if the page was updated or already up to date, else False.
        """
        self.logger.info(f"開始執行{self._project_type()}統計更新任務")

        if not self.connect_to_wiki():
            return False

        stats = self.fetch_wikistats_data()
        if not stats:
            self.logger.error("無法獲取統計數據,任務失敗")
            return False

        if self.update_wiki_page(stats):
            self.logger.info("任務成功完成!")
            return True

        self.logger.error("更新頁面失敗")
        return False
Advertisement
Add Comment
Please, Sign In to add comment