Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Arguments required.
- # Prerequisites: 1. Format the dataset as explained by deboc.
- # 2. Create the ImageSet text file using 'ls Annotations/ -m | sed s/\\s/\\n/g | sed s/.xml//g | sed s/,//g > ImageSets/train.txt'
- # 1st argument = ImageSet file name. Example: train.txt
- # 2nd argument = path to the custom dataset. Example: ~/py-faster-rcnn/data/VOC_devkit/
- # Usage instructions:
- # $ python discard_shuffle_data.py [arg1] [arg2]
- import numpy as np
- import random, sys, os
- import cv2
- import xml.etree.ElementTree as ET
# Image file extensions probed, in this order, when resolving an image index
# to a file on disk.
ext = [
    '.png',
    '.jpg',
    '.jpeg',
]

# Image indexes rejected by the bounding-box check; these are left out when
# the image-set file is rewritten.
discard_list = []
def get_image_path_from_index(index):
    """Return the path of the image file that belongs to image *index*.

    Each extension in the module-level ``ext`` list is tried in order under
    ``<data_path>/data/Images`` and the first path that exists on disk is
    returned.

    Raises:
        AssertionError: if no candidate exists; the message names the last
            path that was tried.
    """
    images_dir = os.path.join(data_path, 'data', 'Images')
    for suffix in ext:
        candidate = os.path.join(images_dir, index + suffix)
        if os.path.exists(candidate):
            return candidate
    # No extension matched: report the last candidate that was probed.
    assert os.path.exists(candidate), \
        'Path does not exist: {}'.format(candidate)
    return candidate
def get_annotation_path_from_index(index):
    """Return the path of the XML annotation file for image *index*.

    The file is looked up at ``<data_path>/data/Annotations/<index>.xml``.

    Raises:
        AssertionError: if that file does not exist.
    """
    annotations_dir = os.path.join(data_path, 'data', 'Annotations')
    xml_path = os.path.join(annotations_dir, index + '.xml')
    assert os.path.exists(xml_path), \
        'Path does not exist: {}'.format(xml_path)
    return xml_path
def get_image_size(image_path):
    """Return the ``(height, width)`` of the image stored at *image_path*.

    The image is loaded with ``cv2.imread`` flag ``0`` (grayscale), since only
    its dimensions are needed.

    Raises:
        IOError: if ``cv2.imread`` cannot read the file and returns ``None``.
    """
    img = cv2.imread(image_path, 0)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise IOError('Could not read image: {}'.format(image_path))
    height, width = img.shape[:2]
    # The original computed these values but never returned them.
    return height, width
def get_bbox_size():
    """Placeholder for returning the bounding-box size.

    Intended to return the bbox size that is compared against the image
    height and width, with the XML parsed here. It is not implemented and
    currently returns ``None``.
    """
    return None
def _bbox_is_invalid(x1, y1, x2, y2, img_width, img_height):
    """Return True when a box fails the sanity check used to discard images.

    The coordinates are the annotation values minus one. A box is rejected
    when its top-left corner is below -1 on either axis, or when its width or
    height exceeds the image width or height declared in the annotation.
    """
    # NOTE(review): the thresholds are kept exactly as in the original script.
    # They do not reject x1 == -1 (xmin == 0), x2 < x1, or a box that extends
    # past the right/bottom edge while being narrower than the image --
    # confirm whether stricter bounds are wanted.
    return (x1 < -1 or y1 < -1
            or (x2 - x1) > img_width
            or (y2 - y1) > img_height)


def _annotation_has_invalid_bbox(annotation_path):
    """Return True if any <object> box in the annotation XML is invalid."""
    tree = ET.parse(annotation_path)
    img_width = int(tree.find('.//width').text)
    img_height = int(tree.find('.//height').text)
    for obj in tree.findall('object'):
        bbox = obj.find('bndbox')
        x1, y1, x2, y2 = (
            int(bbox.find(tag).text) - 1
            for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        )
        if _bbox_is_invalid(x1, y1, x2, y2, img_width, img_height):
            return True
    return False


def main(argv):
    """Drop images with bad bounding boxes from an image set and shuffle it.

    Args:
        argv: command-line arguments; ``argv[1]`` is the image-set file name
            (e.g. ``train.txt``) and ``argv[2]`` is the dataset path.

    The image-set file ``<dataset>/data/ImageSets/<name>`` is read, every
    index whose annotation contains an invalid box is recorded in the
    module-level ``discard_list``, and the file is rewritten in random order
    without the discarded indexes.

    Raises:
        SystemExit: if fewer than two arguments are supplied.
        AssertionError: if the image-set file, an image, or an annotation
            file does not exist.
    """
    # The path helpers read the dataset root from this module-level name.
    global data_path

    if len(argv) < 3:
        script = argv[0] if argv else 'discard_shuffle_data.py'
        sys.exit('Usage: python {} <imageset file name> <dataset path>'
                 .format(script))
    imageset_filename, data_path = argv[1], argv[2]

    imageset_file_path = os.path.join(
        data_path, 'data', 'ImageSets', imageset_filename)
    assert os.path.exists(imageset_file_path), \
        'Path does not exist: {}'.format(imageset_file_path)

    # Strip line endings on read and skip blank lines; a blank index would
    # otherwise fail the image-path lookup below.
    with open(imageset_file_path) as imageset_file:
        image_indexes = [
            line.strip() for line in imageset_file if line.strip()
        ]

    for image_index in image_indexes:
        # Called only for its check that an image file exists for the index.
        get_image_path_from_index(image_index)
        annotation_path = get_annotation_path_from_index(image_index)
        if _annotation_has_invalid_bbox(annotation_path):
            discard_list.append(image_index)
            print('Discarded image index :  {}'.format(image_index))
    print('No. of discarded indexes :  {}'.format(len(discard_list)))

    random.shuffle(image_indexes)
    discarded = set(discard_list)  # O(1) membership while writing
    # Every index is written with its own newline, so a source file whose
    # last line lacked one cannot fuse two indexes after the shuffle.
    with open(imageset_file_path, 'w') as imageset_file:
        imageset_file.writelines(
            index + '\n'
            for index in image_indexes
            if index not in discarded
        )


if __name__ == '__main__':
    main(sys.argv)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement