Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Extract question/answer fields from problems whose ``images`` folder is empty.

The script walks ``root_dir``; for every directory that contains an empty
``images`` subdirectory it reads the sibling ``problem.json`` and collects
its ``question_structure``, the ``sub_question*`` entries of that structure,
and its ``answer``. All collected records are written to ``output_file``.
"""
import os
import json

# Root directory that is scanned for problem folders.
root_dir = '/home/xie.zhongwei/workspace/data/PhysReason/PhysReason-mini'
# Output file for the extracted records (relative to the working directory).
output_file = 'extracted_data.json'


def find_empty_images_json_paths(search_root):
    """Return ``problem.json`` paths for folders whose ``images`` dir is empty.

    Args:
        search_root: Directory to walk recursively.

    Returns:
        A list of ``<folder>/problem.json`` paths, one per folder that has an
        empty ``images`` subdirectory. The path is recorded whether or not
        ``problem.json`` actually exists; a missing file is reported later,
        when it is read.
    """
    json_paths = []
    for current, dirs, _files in os.walk(search_root):
        # Sorting in place steers os.walk's top-down traversal, so results
        # come out in a reproducible order instead of filesystem order.
        dirs.sort()
        if 'images' not in dirs:
            continue
        if not os.listdir(os.path.join(current, 'images')):
            json_paths.append(os.path.join(current, 'problem.json'))
    return json_paths


def extract_problem_fields(json_path):
    """Load one ``problem.json`` and return the fields of interest.

    Args:
        json_path: Path to a UTF-8 encoded JSON file.

    Returns:
        A dict with the keys ``question_structure``, ``sub_questions`` and
        ``answer``. ``answer`` is ``None`` when the file has no such key.

    Raises:
        OSError: The file cannot be opened or read.
        ValueError: The file is not valid JSON/UTF-8, or the top-level value
            or ``question_structure`` is not a JSON object.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # The file is closed here; everything below works on the parsed data.

    if not isinstance(data, dict):
        raise ValueError(
            f"expected a JSON object, got {type(data).__name__}"
        )
    question_structure = data.get('question_structure', {})
    if not isinstance(question_structure, dict):
        raise ValueError(
            "'question_structure' must be a JSON object, got "
            f"{type(question_structure).__name__}"
        )

    # Keys are ordered lexicographically, so 'sub_question_10' sorts before
    # 'sub_question_2'.
    sub_questions = {
        key: question_structure[key]
        for key in sorted(question_structure)
        if key.startswith('sub_question')
    }
    return {
        'question_structure': question_structure,
        'sub_questions': sub_questions,
        'answer': data.get('answer'),
    }


def extract_data(json_paths):
    """Extract a record from every readable path in ``json_paths``.

    Files that are missing, unreadable or malformed are reported on stdout
    and skipped, so one bad file does not abort the whole run.

    Args:
        json_paths: Iterable of ``problem.json`` paths.

    Returns:
        A list of records as produced by ``extract_problem_fields``.
    """
    records = []
    for json_path in json_paths:
        try:
            records.append(extract_problem_fields(json_path))
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            print(f"读取 {json_path} 时出错: {e}")
    return records


def main(search_root=root_dir, destination=output_file):
    """Scan ``search_root`` and write the extracted records to ``destination``.

    Args:
        search_root: Directory to scan; defaults to ``root_dir``.
        destination: JSON file to write; defaults to ``output_file``.
    """
    extracted_data = extract_data(find_empty_images_json_paths(search_root))
    with open(destination, 'w', encoding='utf-8') as f:
        json.dump(extracted_data, f, ensure_ascii=False, indent=4)
    print(f"提取的数据已保存到 {destination}")


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement