Advertisement
witw78

find_empty_images.py (用于提取空图片的json字段)

Apr 20th, 2025 (edited)
27
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.89 KB | None | 0 0
  1. import os
  2. import json
  3.  
  4. # 定义根目录
  5. root_dir = '/home/xie.zhongwei/workspace/data/PhysReason/PhysReason-mini'
  6.  
  7. # 用于存储空 images 文件夹对应的 problem.json 文件路径
  8. empty_images_json_paths = []
  9.  
  10. # 遍历根目录下的所有子文件夹
  11. for root, dirs, files in os.walk(root_dir):
  12.     # 检查是否存在 images 文件夹
  13.     if 'images' in dirs:
  14.         images_dir = os.path.join(root, 'images')
  15.         # 检查 images 文件夹是否为空
  16.         if not os.listdir(images_dir):
  17.             # 找到对应的 problem.json 文件路径
  18.             json_path = os.path.join(root, 'problem.json')
  19.             empty_images_json_paths.append(json_path)
  20.  
  21. # 用于存储提取的 question_structure 和 answer 字段
  22. extracted_data = []
  23.  
  24. # 遍历所有 problem.json 文件路径
  25. for json_path in empty_images_json_paths:
  26.     try:
  27.         with open(json_path, 'r', encoding='utf-8') as f:
  28.             data = json.load(f)
  29.             # 提取 question_structure 和 answer 字段
  30.             answer = data.get('answer')
  31.             question_structure = data.get('question_structure', {})
  32.  
  33.             # 提取子问题
  34.             sub_questions = {}
  35.             for key in sorted(question_structure.keys()):
  36.                 if key.startswith('sub_question'):
  37.                     sub_questions[key] = question_structure[key]
  38.  
  39.             extracted_data.append({
  40.                 'question_structure': question_structure,
  41.                 'sub_questions': sub_questions,
  42.                 'answer': answer
  43.             })
  44.     except Exception as e:
  45.         print(f"读取 {json_path} 时出错: {e}")
  46.  
  47. # 定义输出文件路径
  48. output_file = 'extracted_data.json'
  49.  
  50. # 将提取的数据写入输出文件
  51. with open(output_file, 'w', encoding='utf-8') as f:
  52.     json.dump(extracted_data, f, ensure_ascii=False, indent=4)
  53.  
  54. print(f"提取的数据已保存到 {output_file}")
  55.    
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement