# find_empty_images.py - extract JSON fields from problems whose images folder is empty

import os
import json

# Root directory of the dataset to scan
root_dir = '/home/xie.zhongwei/workspace/data/PhysReason/PhysReason-mini'


def find_empty_images_json_paths(base_dir):
    """Return the problem.json path of every folder whose ``images`` subfolder is empty.

    Walks ``base_dir`` recursively. For each visited folder that contains a
    subfolder named ``images`` with no entries, the path
    ``<folder>/problem.json`` is collected. The existence of problem.json
    itself is not checked here.

    Args:
        base_dir: Directory to walk. A missing directory yields an empty list,
            because ``os.walk`` ignores listing errors by default.

    Returns:
        A list of problem.json paths, in a deterministic (name-sorted)
        traversal order.
    """
    json_paths = []
    for current, subdirs, _files in os.walk(base_dir):
        # os.walk lists entries in arbitrary filesystem order; sorting the
        # list in place fixes both the descent order and the result order,
        # so repeated runs produce the same output.
        subdirs.sort()
        if 'images' not in subdirs:
            continue
        images_dir = os.path.join(current, 'images')
        if not os.listdir(images_dir):
            json_paths.append(os.path.join(current, 'problem.json'))
    return json_paths


# os.walk silently yields nothing for a missing root, which would otherwise
# look like "no empty images folders found" - surface it instead.
if not os.path.isdir(root_dir):
    print(f"警告: 根目录不存在或不是文件夹: {root_dir}")

# problem.json paths that sit next to an empty images folder
empty_images_json_paths = find_empty_images_json_paths(root_dir)

# One record per readable problem.json: question_structure, sub_questions, answer
extracted_data = []


def _sub_question_order(key):
    """Sort key that orders sub-question keys by their numeric suffix.

    Plain string sorting puts 'sub_question_10' before 'sub_question_2';
    comparing the trailing number as an int keeps them in natural order.
    Keys without a numeric suffix sort after the numbered ones, alphabetically.
    """
    suffix = key.rpartition('_')[2]
    if suffix.isdecimal():
        return (0, int(suffix), key)
    return (1, 0, key)


# Process every collected problem.json path
for json_path in empty_images_json_paths:
    try:
        # Only the parse needs the file handle; close it before processing
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        answer = data.get('answer')
        question_structure = data.get('question_structure', {})

        # An explicit JSON null means "no structure", not a read failure
        structure = {} if question_structure is None else question_structure

        # Keep only the sub-question entries, in natural numeric order
        sub_questions = {
            key: value
            for key, value in sorted(
                structure.items(), key=lambda item: _sub_question_order(item[0])
            )
            if key.startswith('sub_question')
        }

        extracted_data.append({
            'question_structure': question_structure,
            'sub_questions': sub_questions,
            'answer': answer
        })
    except Exception as e:
        # Best-effort batch: report the bad file (missing, invalid JSON,
        # unexpected shape) and continue with the remaining ones.
        print(f"读取 {json_path} 时出错: {e}")

# Destination file for the collected records
output_file = 'extracted_data.json'

# Render the records as indented, non-ASCII-escaped JSON and write them out
with open(output_file, mode='w', encoding='utf-8') as out_fh:
    out_fh.write(json.dumps(extracted_data, ensure_ascii=False, indent=4))

print(f"提取的数据已保存到 {output_file}")