# Untitled

# Arguments required.
# Pre-requisites: 1. Format the dataset as explained by deboc
#                 2. create the Imageset text file using 'ls Annotations/ -m | sed s/\\s/\\n/g | sed s/.xml//g | sed s/,//g > ImageSets/train.txt'

# 1st argument = Imageset file name  Example: train.txt
# 2nd argument = path to custom dataset  Example: ~/py-faster-rcnn/data/VOC_devkit/

# Usage instructions
# $ python discard_shuffle_data.py [arg1] [arg2]

import numpy as np
import random, sys, os
import cv2
import xml.etree.ElementTree as ET

# Image file extensions tried, in this order, when resolving an image index
# to a file on disk.
ext = [
    '.png',
    '.jpg',
    '.jpeg',
]

# Image indexes rejected during bounding-box validation; filled in by the
# script body further down.
discard_list = list()

def get_image_path_from_index(index):
    """
    Resolve an image index to the path of its image file.

    Each extension in the module-level list ``ext`` is tried in order under
    <data_path>/data/Images/ and the first path that exists is returned.
    If none exists, the assertion fails and reports the last path tried.
    """
    images_dir = os.path.join(data_path, 'data', 'Images')
    for suffix in ext:
        candidate = os.path.join(images_dir, index + suffix)
        if os.path.exists(candidate):
            return candidate
    # No extension matched: 'candidate' still holds the last path tried.
    assert os.path.exists(candidate), \
        'Path does not exist: {}'.format(candidate)
    return candidate

def get_annotation_path_from_index(index):
    """Return the path of the annotation XML file for an image index.

    The path is built as <data_path>/data/Annotations/<index>.xml, where
    ``data_path`` is the module-level dataset root.

    Raises:
        AssertionError: if the annotation file does not exist.
    """
    # Indentation is spaces only: the original mixed tabs and spaces, which
    # Python 3 rejects with a TabError.
    annotation_path = os.path.join(data_path, 'data', 'Annotations', index + '.xml')
    assert os.path.exists(annotation_path), \
        'Path does not exist: {}'.format(annotation_path)
    return annotation_path


def get_image_size(image_path):
    """Return the (height, width) of the image stored at image_path.

    The image is loaded with ``cv2.imread(image_path, 0)`` and the first two
    entries of its ``shape`` are returned.

    Raises:
        IOError: if OpenCV cannot read the file (``cv2.imread`` returns None
            instead of raising on a missing or undecodable image).
    """
    img = cv2.imread(image_path, 0)
    if img is None:
        raise IOError('Could not read image: {}'.format(image_path))
    height, width = img.shape[:2]
    # The original computed these values but never returned them.
    return height, width

def get_bbox_size():
    """Placeholder, not implemented yet: currently does nothing.

    Intended to parse the annotation XML and return the bounding-box size so
    it can be compared against the image height and width.
    """
    return None


# ---------------------------------------------------------------------------
# Script body.
#   1. Read the image-set file named on the command line.
#   2. Parse the annotation of every listed image and discard the indexes
#      whose bounding boxes fail the sanity check below.
#   3. Rewrite the image-set file with the surviving indexes in random order.
# ---------------------------------------------------------------------------
if len(sys.argv) < 3:
    sys.exit('Usage: python discard_shuffle_data.py [imageset file name] [path to dataset]')

imageset_filename = sys.argv[1]
data_path = sys.argv[2]  # also read as a global by the path helper functions

imageset_file_path = os.path.join(data_path, 'data', 'ImageSets', imageset_filename)
# Explicit check rather than assert, so it is not stripped under "python -O".
if not os.path.exists(imageset_file_path):
    sys.exit('Path does not exist: {}'.format(imageset_file_path))

# Strip every entry and drop blank lines, so a trailing empty line does not
# turn into a lookup for an image with an empty name.
with open(imageset_file_path) as imageset_file:
    image_indexes = [
        entry for entry in (raw.strip() for raw in imageset_file) if entry
    ]

for image_index in image_indexes:
    # Called only for its existence check: aborts if no image file matches.
    get_image_path_from_index(image_index)
    annotation_path = get_annotation_path_from_index(image_index)
    tree = ET.parse(annotation_path)

    img_max_width = int(tree.find('.//width').text)
    img_max_height = int(tree.find('.//height').text)

    for obj in tree.findall('object'):
        bbox = obj.find('bndbox')
        # Each coordinate read from the annotation is shifted down by one.
        x1 = int(bbox.find('xmin').text) - 1
        y1 = int(bbox.find('ymin').text) - 1
        x2 = int(bbox.find('xmax').text) - 1
        y2 = int(bbox.find('ymax').text) - 1

        # Reject the image when a box starts at a negative annotated
        # coordinate or is wider/taller than the image itself.
        # NOTE(review): boxes that extend past the image border, or whose
        # max is smaller than their min, still pass this check -- confirm
        # whether they should be discarded as well.
        if (x1 < -1 or y1 < -1
                or (x2 - x1) > img_max_width
                or (y2 - y1) > img_max_height):
            discard_list.append(image_index)
            # Single-argument print form: same output on Python 2 and 3.
            print('Discarded image index :  {}'.format(image_index))
            break  # one bad box is enough to drop the whole image

print('No. of discarded indexes :  {}'.format(len(discard_list)))

# Set lookup keeps the filtering linear in the number of indexes.
discarded = set(discard_list)
kept_indexes = [idx for idx in image_indexes if idx not in discarded]
random.shuffle(kept_indexes)

# Each index gets its own newline, so a final line that lacked one in the
# input cannot be fused with another entry after shuffling.
with open(imageset_file_path, 'w') as text_file:
    text_file.writelines(idx + '\n' for idx in kept_indexes)