  1. """
  2. Mask R-CNN
  3. The main Mask R-CNN model implementation.
  4.  
  5. Copyright (c) 2017 Matterport, Inc.
  6. Licensed under the MIT License (see LICENSE for details)
  7. Written by Waleed Abdulla
  8. """
  9.  
  10. import os
  11. import random
  12. import datetime
  13. import re
  14. import math
  15. import logging
  16. from collections import OrderedDict
  17. import multiprocessing
  18. import numpy as np
  19. import tensorflow as tf
  20. import tensorflow.keras
  21. import tensorflow.keras.backend as K
  22. import tensorflow.keras.layers as KL
  23. import tensorflow.keras.layers as KE  # stand-in for the old keras.engine import; provides the Layer base class
  24. import tensorflow.keras.models as KM
  25. import tensorflow.keras.utils as KU
  26. from tensorflow.keras.callbacks import CSVLogger
  27.  
  28. from mrcnn import utils
  29.  
  30. # Requires TensorFlow 1.3+ and Keras 2.0.8+.
  31. from distutils.version import LooseVersion
  33. assert LooseVersion(tf.__version__) >= LooseVersion("1.3")
  34. #assert LooseVersion(keras.__version__) >= LooseVersion('2.0.8')
  35.  
  36.  
  37. ############################################################
  38. #  Utility Functions
  39. ############################################################
  40.  
  41. def log(text, array=None):
  42.     """Prints a text message and, optionally, if a Numpy array is provided,
  43.    prints its shape, min, max, and dtype.
  44.    """
  45.     if array is not None:
  46.         text = text.ljust(25)
  47.         text += ("shape: {:20}  ".format(str(array.shape)))
  48.         if array.size:
  49.             text += ("min: {:10.5f}  max: {:10.5f}".format(array.min(), array.max()))
  50.         else:
  51.             text += ("min: {:10}  max: {:10}".format("",""))
  52.         text += "  {}".format(array.dtype)
  53.     print(text)
  54.  
  55.  
  56. class BatchNorm(KL.BatchNormalization):
  57.     """Extends the Keras BatchNormalization class to allow a central place
  58.    to make changes if needed.
  59.  
  60.    Batch normalization has a negative effect on training if batches are small,
  61.    so this layer is often frozen (via a setting in the Config class) and then
  62.    functions as a linear layer.
  63.    """
  64.     def call(self, inputs, training=None):
  65.         """
  66.        Note about training values:
  67.            None: Train BN layers. This is the normal mode
  68.            False: Freeze BN layers. Good when batch size is small
  69.            True: (don't use). Sets the layer to training mode even when making inferences.
  70.        """
  71.         return super(self.__class__, self).call(inputs, training=training)
  72.  
  73.  
  74. def compute_backbone_shapes(config, image_shape):
  75.     """Computes the width and height of each stage of the backbone network.
  76.  
  77.    Returns:
  78.        [N, (height, width)], where N is the number of stages.
  79.    """
  80.     if callable(config.BACKBONE):
  81.         return config.COMPUTE_BACKBONE_SHAPE(image_shape)
  82.  
  83.     # Currently supports ResNet only
  84.     assert config.BACKBONE in ["resnet50", "resnet101"]
  85.     return np.array(
  86.         [[int(math.ceil(image_shape[0] / stride)),
  87.             int(math.ceil(image_shape[1] / stride))]
  88.             for stride in config.BACKBONE_STRIDES])
  89.  
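# Illustrative sketch of the formula above (assumes the usual BACKBONE_STRIDES
# default of [4, 8, 16, 32, 64]; _Cfg is a hypothetical minimal config):
#
#   >>> class _Cfg:
#   ...     BACKBONE = "resnet101"
#   ...     BACKBONE_STRIDES = [4, 8, 16, 32, 64]
#   >>> compute_backbone_shapes(_Cfg(), np.array([1024, 1024, 3]))
#   array([[256, 256],
#          [128, 128],
#          [ 64,  64],
#          [ 32,  32],
#          [ 16,  16]])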
  90.  
  91. ############################################################
  92. #  Resnet Graph
  93. ############################################################
  94.  
  95. # Code adapted from:
  96. # https://github.com/fchollet/deep-learning-models/blob/master/resnet50.py
  97.  
  98. def identity_block(input_tensor, kernel_size, filters, stage, block,
  99.                    use_bias=True, train_bn=True):
  100.     """The identity_block is the block that has no conv layer at the shortcut.
  101.    # Arguments
  102.        input_tensor: input tensor
  103.        kernel_size: default 3, the kernel size of the middle conv layer in the main path
  104.        filters: list of integers, the nb_filters of the 3 conv layers in the main path
  105.        stage: integer, current stage label, used for generating layer names
  106.        block: 'a','b'..., current block label, used for generating layer names
  107.        use_bias: Boolean. To use or not use a bias in conv layers.
  108.        train_bn: Boolean. Train or freeze Batch Norm layers
  109.    """
  110.     nb_filter1, nb_filter2, nb_filter3 = filters
  111.     conv_name_base = 'res' + str(stage) + block + '_branch'
  112.     bn_name_base = 'bn' + str(stage) + block + '_branch'
  113.  
  114.     x = KL.Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a',
  115.                   use_bias=use_bias)(input_tensor)
  116.     x = BatchNorm(name=bn_name_base + '2a')(x, training=train_bn)
  117.     x = KL.Activation('relu')(x)
  118.  
  119.     x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same',
  120.                   name=conv_name_base + '2b', use_bias=use_bias)(x)
  121.     x = BatchNorm(name=bn_name_base + '2b')(x, training=train_bn)
  122.     x = KL.Activation('relu')(x)
  123.  
  124.     x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c',
  125.                   use_bias=use_bias)(x)
  126.     x = BatchNorm(name=bn_name_base + '2c')(x, training=train_bn)
  127.  
  128.     x = KL.Add()([x, input_tensor])
  129.     x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x)
  130.     return x
  131.  
  132.  
  133. def conv_block(input_tensor, kernel_size, filters, stage, block,
  134.                strides=(2, 2), use_bias=True, train_bn=True):
  135.     """conv_block is the block that has a conv layer at the shortcut.
  136.    # Arguments
  137.        input_tensor: input tensor
  138.        kernel_size: default 3, the kernel size of the middle conv layer in the main path
  139.        filters: list of integers, the nb_filters of the 3 conv layers in the main path
  140.        stage: integer, current stage label, used for generating layer names
  141.        block: 'a','b'..., current block label, used for generating layer names
  142.        use_bias: Boolean. To use or not use a bias in conv layers.
  143.        train_bn: Boolean. Train or freeze Batch Norm layers
  144.    Note that from stage 3, the first conv layer in the main path uses strides=(2, 2),
  145.    and the shortcut does as well.
  146.    """
  147.     nb_filter1, nb_filter2, nb_filter3 = filters
  148.     conv_name_base = 'res' + str(stage) + block + '_branch'
  149.     bn_name_base = 'bn' + str(stage) + block + '_branch'
  150.  
  151.     x = KL.Conv2D(nb_filter1, (1, 1), strides=strides,
  152.                   name=conv_name_base + '2a', use_bias=use_bias)(input_tensor)
  153.     x = BatchNorm(name=bn_name_base + '2a')(x, training=train_bn)
  154.     x = KL.Activation('relu')(x)
  155.  
  156.     x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same',
  157.                   name=conv_name_base + '2b', use_bias=use_bias)(x)
  158.     x = BatchNorm(name=bn_name_base + '2b')(x, training=train_bn)
  159.     x = KL.Activation('relu')(x)
  160.  
  161.     x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base +
  162.                   '2c', use_bias=use_bias)(x)
  163.     x = BatchNorm(name=bn_name_base + '2c')(x, training=train_bn)
  164.  
  165.     shortcut = KL.Conv2D(nb_filter3, (1, 1), strides=strides,
  166.                          name=conv_name_base + '1', use_bias=use_bias)(input_tensor)
  167.     shortcut = BatchNorm(name=bn_name_base + '1')(shortcut, training=train_bn)
  168.  
  169.     x = KL.Add()([x, shortcut])
  170.     x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x)
  171.     return x
  172.  
  173.  
  174. def resnet_graph(input_image, architecture, stage5=False, train_bn=True):
  175.     """Build a ResNet graph.
  176.        architecture: Can be resnet50 or resnet101
  177.        stage5: Boolean. If False, stage5 of the network is not created
  178.        train_bn: Boolean. Train or freeze Batch Norm layers
  179.    """
  180.     assert architecture in ["resnet50", "resnet101"]
  181.     # Stage 1
  182.     x = KL.ZeroPadding2D((3, 3))(input_image)
  183.     x = KL.Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=True)(x)
  184.     x = BatchNorm(name='bn_conv1')(x, training=train_bn)
  185.     x = KL.Activation('relu')(x)
  186.     C1 = x = KL.MaxPooling2D((3, 3), strides=(2, 2), padding="same")(x)
  187.     # Stage 2
  188.     x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1), train_bn=train_bn)
  189.     x = identity_block(x, 3, [64, 64, 256], stage=2, block='b', train_bn=train_bn)
  190.     C2 = x = identity_block(x, 3, [64, 64, 256], stage=2, block='c', train_bn=train_bn)
  191.     # Stage 3
  192.     x = conv_block(x, 3, [128, 128, 512], stage=3, block='a', train_bn=train_bn)
  193.     x = identity_block(x, 3, [128, 128, 512], stage=3, block='b', train_bn=train_bn)
  194.     x = identity_block(x, 3, [128, 128, 512], stage=3, block='c', train_bn=train_bn)
  195.     C3 = x = identity_block(x, 3, [128, 128, 512], stage=3, block='d', train_bn=train_bn)
  196.     # Stage 4
  197.     x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a', train_bn=train_bn)
  198.     block_count = {"resnet50": 5, "resnet101": 22}[architecture]
  199.     for i in range(block_count):
  200.         x = identity_block(x, 3, [256, 256, 1024], stage=4, block=chr(98 + i), train_bn=train_bn)
  201.     C4 = x
  202.     # Stage 5
  203.     if stage5:
  204.         x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a', train_bn=train_bn)
  205.         x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b', train_bn=train_bn)
  206.         C5 = x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c', train_bn=train_bn)
  207.     else:
  208.         C5 = None
  209.     return [C1, C2, C3, C4, C5]
  210.  
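# Usage sketch (an assumption, mirroring how the backbone is typically wired
# into the FPN when the full model graph is built):
#
#   >>> input_image = KL.Input(shape=[None, None, 3], name="input_image")
#   >>> C1, C2, C3, C4, C5 = resnet_graph(input_image, "resnet101",
#   ...                                   stage5=True, train_bn=False)
#
# C2..C5 then feed the top-down FPN pathway that produces P2..P6.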
  211.  
  212. ############################################################
  213. #  Proposal Layer
  214. ############################################################
  215.  
  216. def apply_box_deltas_graph(boxes, deltas):
  217.     """Applies the given deltas to the given boxes.
  218.    boxes: [N, (y1, x1, y2, x2)] boxes to update
  219.    deltas: [N, (dy, dx, log(dh), log(dw))] refinements to apply
  220.    """
  221.     # Convert to y, x, h, w
  222.     height = boxes[:, 2] - boxes[:, 0]
  223.     width = boxes[:, 3] - boxes[:, 1]
  224.     center_y = boxes[:, 0] + 0.5 * height
  225.     center_x = boxes[:, 1] + 0.5 * width
  226.     # Apply deltas
  227.     center_y += deltas[:, 0] * height
  228.     center_x += deltas[:, 1] * width
  229.     height *= tf.exp(deltas[:, 2])
  230.     width *= tf.exp(deltas[:, 3])
  231.     # Convert back to y1, x1, y2, x2
  232.     y1 = center_y - 0.5 * height
  233.     x1 = center_x - 0.5 * width
  234.     y2 = y1 + height
  235.     x2 = x1 + width
  236.     result = tf.stack([y1, x1, y2, x2], axis=1, name="apply_box_deltas_out")
  237.     return result
  238.  
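# Worked example of the delta math above: a delta that keeps the center and
# doubles both height and width:
#   box    = [0.0, 0.0, 1.0, 1.0]        -> center (0.5, 0.5), h = w = 1
#   delta  = [0.0, 0.0, log(2), log(2)]  -> h and w become 2
#   result = [-0.5, -0.5, 1.5, 1.5]
# In normalized coordinates such a box is afterwards clipped back to the image
# window by clip_boxes_graph() below.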
  239.  
  240. def clip_boxes_graph(boxes, window):
  241.     """
  242.    boxes: [N, (y1, x1, y2, x2)]
  243.    window: [4] in the form y1, x1, y2, x2
  244.    """
  245.     # Split
  246.     wy1, wx1, wy2, wx2 = tf.split(window, 4)
  247.     y1, x1, y2, x2 = tf.split(boxes, 4, axis=1)
  248.     # Clip
  249.     y1 = tf.maximum(tf.minimum(y1, wy2), wy1)
  250.     x1 = tf.maximum(tf.minimum(x1, wx2), wx1)
  251.     y2 = tf.maximum(tf.minimum(y2, wy2), wy1)
  252.     x2 = tf.maximum(tf.minimum(x2, wx2), wx1)
  253.     clipped = tf.concat([y1, x1, y2, x2], axis=1, name="clipped_boxes")
  254.     clipped.set_shape((clipped.shape[0], 4))
  255.     return clipped
  256.  
  257.  
  258. class ProposalLayer(KE.Layer):
  259.     """Receives anchor scores and selects a subset to pass as proposals
  260.    to the second stage. Filtering is done based on anchor scores and
  261.    non-max suppression to remove overlaps. It also applies bounding
  262.    box refinement deltas to anchors.
  263.  
  264.    Inputs:
  265.        rpn_probs: [batch, num_anchors, (bg prob, fg prob)]
  266.        rpn_bbox: [batch, num_anchors, (dy, dx, log(dh), log(dw))]
  267.        anchors: [batch, num_anchors, (y1, x1, y2, x2)] anchors in normalized coordinates
  268.  
  269.    Returns:
  270.        Proposals in normalized coordinates [batch, rois, (y1, x1, y2, x2)]
  271.    """
  272.  
  273.     def __init__(self, proposal_count, nms_threshold, config=None, **kwargs):
  274.         super(ProposalLayer, self).__init__(**kwargs)
  275.         self.config = config
  276.         self.proposal_count = proposal_count
  277.         self.nms_threshold = nms_threshold
  278.        
  279.     def get_config(self):
  280.         config = super(ProposalLayer, self).get_config()
  281.         config["config"] = self.config.to_dict()
  282.         config["proposal_count"] = self.proposal_count
  283.         config["nms_threshold"] = self.nms_threshold
  284.         return config
  285.  
  286.     def call(self, inputs):
  287.         # Box Scores. Use the foreground class confidence. [Batch, num_rois, 1]
  288.         scores = inputs[0][:, :, 1]
  289.         # Box deltas [batch, num_rois, 4]
  290.         deltas = inputs[1]
  291.         deltas = deltas * np.reshape(self.config.RPN_BBOX_STD_DEV, [1, 1, 4])
  292.         # Anchors
  293.         anchors = inputs[2]
  294.  
  295.         # Improve performance by trimming to top anchors by score
  296.         # and doing the rest on the smaller subset.
  297.         pre_nms_limit = tf.minimum(self.config.PRE_NMS_LIMIT, tf.shape(anchors)[1])
  298.         ix = tf.nn.top_k(scores, pre_nms_limit, sorted=True,
  299.                          name="top_anchors").indices
  300.         scores = utils.batch_slice([scores, ix], lambda x, y: tf.gather(x, y),
  301.                                    self.config.IMAGES_PER_GPU)
  302.         deltas = utils.batch_slice([deltas, ix], lambda x, y: tf.gather(x, y),
  303.                                    self.config.IMAGES_PER_GPU)
  304.         pre_nms_anchors = utils.batch_slice([anchors, ix], lambda a, x: tf.gather(a, x),
  305.                                     self.config.IMAGES_PER_GPU,
  306.                                     names=["pre_nms_anchors"])
  307.  
  308.         # Apply deltas to anchors to get refined anchors.
  309.         # [batch, N, (y1, x1, y2, x2)]
  310.         boxes = utils.batch_slice([pre_nms_anchors, deltas],
  311.                                   lambda x, y: apply_box_deltas_graph(x, y),
  312.                                   self.config.IMAGES_PER_GPU,
  313.                                   names=["refined_anchors"])
  314.  
  315.         # Clip to image boundaries. Since we're in normalized coordinates,
  316.         # clip to 0..1 range. [batch, N, (y1, x1, y2, x2)]
  317.         window = np.array([0, 0, 1, 1], dtype=np.float32)
  318.         boxes = utils.batch_slice(boxes,
  319.                                   lambda x: clip_boxes_graph(x, window),
  320.                                   self.config.IMAGES_PER_GPU,
  321.                                   names=["refined_anchors_clipped"])
  322.  
  323.         # Filter out small boxes
  324.         # According to Xinlei Chen's paper, this reduces detection accuracy
  325.         # for small objects, so we're skipping it.
  326.  
  327.         # Non-max suppression
  328.         def nms(boxes, scores):
  329.             indices = tf.image.non_max_suppression(
  330.                 boxes, scores, self.proposal_count,
  331.                 self.nms_threshold, name="rpn_non_max_suppression")
  332.             proposals = tf.gather(boxes, indices)
  333.             # Pad if needed
  334.             padding = tf.maximum(self.proposal_count - tf.shape(proposals)[0], 0)
  335.             proposals = tf.pad(proposals, [(0, padding), (0, 0)])
  336.             return proposals
  337.         proposals = utils.batch_slice([boxes, scores], nms,
  338.                                       self.config.IMAGES_PER_GPU)
  339.         return proposals
  340.  
  341.     def compute_output_shape(self, input_shape):
  342.         return (None, self.proposal_count, 4)
  343.  
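# Usage sketch (an assumption, mirroring how this layer is typically
# instantiated when the model graph is built):
#
#   >>> rpn_rois = ProposalLayer(
#   ...     proposal_count=config.POST_NMS_ROIS_TRAINING,
#   ...     nms_threshold=config.RPN_NMS_THRESHOLD,
#   ...     name="ROI",
#   ...     config=config)([rpn_class, rpn_bbox, anchors])
#
# The output has shape [batch, proposal_count, 4] in normalized coordinates.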
  344.  
  345. ############################################################
  346. #  ROIAlign Layer
  347. ############################################################
  348.  
  349. def log2_graph(x):
  350.     """Implementation of Log2. TF doesn't have a native implementation."""
  351.     return tf.log(x) / tf.log(2.0)
  352.  
  353.  
  354. class PyramidROIAlign(KE.Layer):
  355.     """Implements ROI Pooling on multiple levels of the feature pyramid.
  356.  
  357.    Params:
  358.    - pool_shape: [pool_height, pool_width] of the output pooled regions. Usually [7, 7]
  359.  
  360.    Inputs:
  361.    - boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized
  362.             coordinates. Possibly padded with zeros if not enough
  363.             boxes to fill the array.
  364.    - image_meta: [batch, (meta data)] Image details. See compose_image_meta()
  365.    - feature_maps: List of feature maps from different levels of the pyramid.
  366.                    Each is [batch, height, width, channels]
  367.  
  368.    Output:
  369.    Pooled regions in the shape: [batch, num_boxes, pool_height, pool_width, channels].
  370.    The width and height are those specified in the pool_shape in the layer
  371.    constructor.
  372.    """
  373.  
  374.     def __init__(self, pool_shape, **kwargs):
  375.         super(PyramidROIAlign, self).__init__(**kwargs)
  376.         self.pool_shape = tuple(pool_shape)
  377.  
  378.     def call(self, inputs):
  379.         # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
  380.         boxes = inputs[0]
  381.  
  382.         # Image meta
  383.         # Holds details about the image. See compose_image_meta()
  384.         image_meta = inputs[1]
  385.  
  386.         # Feature Maps. List of feature maps from different levels of the
  387.         # feature pyramid. Each is [batch, height, width, channels]
  388.         feature_maps = inputs[2:]
  389.  
  390.         # Assign each ROI to a level in the pyramid based on the ROI area.
  391.         y1, x1, y2, x2 = tf.split(boxes, 4, axis=2)
  392.         h = y2 - y1
  393.         w = x2 - x1
  394.         # Use shape of first image. Images in a batch must have the same size.
  395.         image_shape = parse_image_meta_graph(image_meta)['image_shape'][0]
  396.         # Equation 1 in the Feature Pyramid Networks paper. Account for
  397.         # the fact that our coordinates are normalized here.
  398.         # e.g. a 224x224 ROI (in pixels) maps to P4
  399.         image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)
  400.         roi_level = log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area)))
  401.         roi_level = tf.minimum(5, tf.maximum(
  402.             2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
  403.         roi_level = tf.squeeze(roi_level, 2)
  404.  
  405.         # Loop through levels and apply ROI pooling to each. P2 to P5.
  406.         pooled = []
  407.         box_to_level = []
  408.         for i, level in enumerate(range(2, 6)):
  409.             ix = tf.where(tf.equal(roi_level, level))
  410.             level_boxes = tf.gather_nd(boxes, ix)
  411.  
  412.             # Box indices for crop_and_resize.
  413.             box_indices = tf.cast(ix[:, 0], tf.int32)
  414.  
  415.             # Keep track of which box is mapped to which level
  416.             box_to_level.append(ix)
  417.  
  418.             # Stop gradient propagation to ROI proposals
  419.             level_boxes = tf.stop_gradient(level_boxes)
  420.             box_indices = tf.stop_gradient(box_indices)
  421.  
  422.             # Crop and Resize
  423.             # From Mask R-CNN paper: "We sample four regular locations, so
  424.             # that we can evaluate either max or average pooling. In fact,
  425.             # interpolating only a single value at each bin center (without
  426.             # pooling) is nearly as effective."
  427.             #
  428.             # Here we use the simplified approach of a single value per bin,
  429.             # which is how it's done in tf.crop_and_resize()
  430.             # Result: [batch * num_boxes, pool_height, pool_width, channels]
  431.             pooled.append(tf.image.crop_and_resize(
  432.                 feature_maps[i], level_boxes, box_indices, self.pool_shape,
  433.                 method="bilinear"))
  434.  
  435.         # Pack pooled features into one tensor
  436.         pooled = tf.concat(pooled, axis=0)
  437.  
  438.         # Pack box_to_level mapping into one array and add another
  439.         # column representing the order of pooled boxes
  440.         box_to_level = tf.concat(box_to_level, axis=0)
  441.         box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1)
  442.         box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range],
  443.                                  axis=1)
  444.  
  445.         # Rearrange pooled features to match the order of the original boxes
  446.         # Sort box_to_level by batch then box index
  447.         # TF doesn't have a way to sort by two columns, so merge them and sort.
  448.         sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1]
  449.         ix = tf.nn.top_k(sorting_tensor, k=tf.shape(
  450.             box_to_level)[0]).indices[::-1]
  451.         ix = tf.gather(box_to_level[:, 2], ix)
  452.         pooled = tf.gather(pooled, ix)
  453.  
  454.         # Re-add the batch dimension
  455.         shape = tf.concat([tf.shape(boxes)[:2], tf.shape(pooled)[1:]], axis=0)
  456.         pooled = tf.reshape(pooled, shape)
  457.         return pooled
  458.  
  459.     def compute_output_shape(self, input_shape):
  460.         return input_shape[0][:2] + self.pool_shape + (input_shape[2][-1], )
  461.  
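# Worked example of the level assignment above: for a 1024x1024 image, a ROI
# covering about 224x224 pixels has sqrt(h * w) = 224/1024 in normalized
# coordinates, and 224.0 / sqrt(image_area) = 224/1024 as well, so
# log2(1) = 0 and the ROI maps to level 4 + 0 = P4, matching Equation 1 of the
# FPN paper. A ROI of roughly 448x448 pixels gives log2(2) = 1 and maps to P5.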
  462.  
  463. ############################################################
  464. #  Detection Target Layer
  465. ############################################################
  466.  
  467. def overlaps_graph(boxes1, boxes2):
  468.     """Computes IoU overlaps between two sets of boxes.
  469.    boxes1, boxes2: [N, (y1, x1, y2, x2)].
  470.    """
  471.     # 1. Tile boxes2 and repeat boxes1. This allows us to compare
  472.     # every box in boxes1 against every box in boxes2 without loops.
  473.     # TF doesn't have an equivalent to np.repeat() so simulate it
  474.     # using tf.tile() and tf.reshape.
  475.     b1 = tf.reshape(tf.tile(tf.expand_dims(boxes1, 1),
  476.                             [1, 1, tf.shape(boxes2)[0]]), [-1, 4])
  477.     b2 = tf.tile(boxes2, [tf.shape(boxes1)[0], 1])
  478.     # 2. Compute intersections
  479.     b1_y1, b1_x1, b1_y2, b1_x2 = tf.split(b1, 4, axis=1)
  480.     b2_y1, b2_x1, b2_y2, b2_x2 = tf.split(b2, 4, axis=1)
  481.     y1 = tf.maximum(b1_y1, b2_y1)
  482.     x1 = tf.maximum(b1_x1, b2_x1)
  483.     y2 = tf.minimum(b1_y2, b2_y2)
  484.     x2 = tf.minimum(b1_x2, b2_x2)
  485.     intersection = tf.maximum(x2 - x1, 0) * tf.maximum(y2 - y1, 0)
  486.     # 3. Compute unions
  487.     b1_area = (b1_y2 - b1_y1) * (b1_x2 - b1_x1)
  488.     b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)
  489.     union = b1_area + b2_area - intersection
  490.     # 4. Compute IoU and reshape to [boxes1, boxes2]
  491.     iou = intersection / union
  492.     overlaps = tf.reshape(iou, [tf.shape(boxes1)[0], tf.shape(boxes2)[0]])
  493.     return overlaps
  494.  
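# Worked IoU example: for boxes1 = [[0, 0, 1, 1]] and boxes2 = [[0, 0, 0.5, 1]],
# the intersection area is 0.5 * 1 = 0.5 and the union is
# 1.0 + 0.5 - 0.5 = 1.0, so overlaps_graph() returns [[0.5]].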
  495.  
  496. def detection_targets_graph(proposals, gt_class_ids, gt_boxes, gt_masks, config):
  497.     """Generates detection targets for one image. Subsamples proposals and
  498.    generates target class IDs, bounding box deltas, and masks for each.
  499.  
  500.    Inputs:
  501.    proposals: [POST_NMS_ROIS_TRAINING, (y1, x1, y2, x2)] in normalized coordinates. Might
  502.               be zero padded if there are not enough proposals.
  503.    gt_class_ids: [MAX_GT_INSTANCES] int class IDs
  504.    gt_boxes: [MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized coordinates.
  505.    gt_masks: [height, width, MAX_GT_INSTANCES] of boolean type.
  506.  
  507.    Returns: Target ROIs and corresponding class IDs, bounding box shifts,
  508.    and masks.
  509.    rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates
  510.    class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs. Zero padded.
  511.    deltas: [TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw))]
  512.    masks: [TRAIN_ROIS_PER_IMAGE, height, width]. Masks cropped to bbox
  513.           boundaries and resized to neural network output size.
  514.  
  515.    Note: Returned arrays might be zero padded if not enough target ROIs.
  516.    """
  517.     # Assertions
  518.     asserts = [
  519.         tf.Assert(tf.greater(tf.shape(proposals)[0], 0), [proposals],
  520.                   name="roi_assertion"),
  521.     ]
  522.     with tf.control_dependencies(asserts):
  523.         proposals = tf.identity(proposals)
  524.  
  525.     # Remove zero padding
  526.     proposals, _ = trim_zeros_graph(proposals, name="trim_proposals")
  527.     gt_boxes, non_zeros = trim_zeros_graph(gt_boxes, name="trim_gt_boxes")
  528.     gt_class_ids = tf.boolean_mask(gt_class_ids, non_zeros,
  529.                                    name="trim_gt_class_ids")
  530.     gt_masks = tf.gather(gt_masks, tf.where(non_zeros)[:, 0], axis=2,
  531.                          name="trim_gt_masks")
  532.  
  533.     # Handle COCO crowds
  534.     # A crowd box in COCO is a bounding box around several instances. Exclude
  535.     # them from training. A crowd box is given a negative class ID.
  536.     crowd_ix = tf.where(gt_class_ids < 0)[:, 0]
  537.     non_crowd_ix = tf.where(gt_class_ids > 0)[:, 0]
  538.     crowd_boxes = tf.gather(gt_boxes, crowd_ix)
  539.     gt_class_ids = tf.gather(gt_class_ids, non_crowd_ix)
  540.     gt_boxes = tf.gather(gt_boxes, non_crowd_ix)
  541.     gt_masks = tf.gather(gt_masks, non_crowd_ix, axis=2)
  542.  
  543.     # Compute overlaps matrix [proposals, gt_boxes]
  544.     overlaps = overlaps_graph(proposals, gt_boxes)
  545.  
  546.     # Compute overlaps with crowd boxes [proposals, crowd_boxes]
  547.     crowd_overlaps = overlaps_graph(proposals, crowd_boxes)
  548.     crowd_iou_max = tf.reduce_max(crowd_overlaps, axis=1)
  549.     no_crowd_bool = (crowd_iou_max < 0.001)
  550.  
  551.     # Determine positive and negative ROIs
  552.     roi_iou_max = tf.reduce_max(overlaps, axis=1)
  553.     # 1. Positive ROIs are those with >= 0.5 IoU with a GT box
  554.     positive_roi_bool = (roi_iou_max >= 0.5)
  555.     positive_indices = tf.where(positive_roi_bool)[:, 0]
  556.     # 2. Negative ROIs are those with < 0.5 with every GT box. Skip crowds.
  557.     negative_indices = tf.where(tf.logical_and(roi_iou_max < 0.5, no_crowd_bool))[:, 0]
  558.  
  559.     # Subsample ROIs. Aim for 33% positive
  560.     # Positive ROIs
  561.     positive_count = int(config.TRAIN_ROIS_PER_IMAGE *
  562.                          config.ROI_POSITIVE_RATIO)
  563.     positive_indices = tf.random_shuffle(positive_indices)[:positive_count]
  564.     positive_count = tf.shape(positive_indices)[0]
  565.     # Negative ROIs. Add enough to maintain positive:negative ratio.
  566.     r = 1.0 / config.ROI_POSITIVE_RATIO
  567.     negative_count = tf.cast(r * tf.cast(positive_count, tf.float32), tf.int32) - positive_count
  568.     negative_indices = tf.random_shuffle(negative_indices)[:negative_count]
  569.     # Gather selected ROIs
  570.     positive_rois = tf.gather(proposals, positive_indices)
  571.     negative_rois = tf.gather(proposals, negative_indices)
  572.  
  573.     # Assign positive ROIs to GT boxes.
  574.     positive_overlaps = tf.gather(overlaps, positive_indices)
  575.     roi_gt_box_assignment = tf.cond(
  576.         tf.greater(tf.shape(positive_overlaps)[1], 0),
  577.         true_fn = lambda: tf.argmax(positive_overlaps, axis=1),
  578.         false_fn = lambda: tf.cast(tf.constant([]),tf.int64)
  579.     )
  580.     roi_gt_boxes = tf.gather(gt_boxes, roi_gt_box_assignment)
  581.     roi_gt_class_ids = tf.gather(gt_class_ids, roi_gt_box_assignment)
  582.  
  583.     # Compute bbox refinement for positive ROIs
  584.     deltas = utils.box_refinement_graph(positive_rois, roi_gt_boxes)
  585.     deltas /= config.BBOX_STD_DEV
  586.  
  587.     # Assign positive ROIs to GT masks
  588.     # Permute masks to [N, height, width, 1]
  589.     transposed_masks = tf.expand_dims(tf.transpose(gt_masks, [2, 0, 1]), -1)
  590.     # Pick the right mask for each ROI
  591.     roi_masks = tf.gather(transposed_masks, roi_gt_box_assignment)
  592.  
  593.     # Compute mask targets
  594.     boxes = positive_rois
  595.     if config.USE_MINI_MASK:
  596.         # Transform ROI coordinates from normalized image space
  597.         # to normalized mini-mask space.
  598.         y1, x1, y2, x2 = tf.split(positive_rois, 4, axis=1)
  599.         gt_y1, gt_x1, gt_y2, gt_x2 = tf.split(roi_gt_boxes, 4, axis=1)
  600.         gt_h = gt_y2 - gt_y1
  601.         gt_w = gt_x2 - gt_x1
  602.         y1 = (y1 - gt_y1) / gt_h
  603.         x1 = (x1 - gt_x1) / gt_w
  604.         y2 = (y2 - gt_y1) / gt_h
  605.         x2 = (x2 - gt_x1) / gt_w
  606.         boxes = tf.concat([y1, x1, y2, x2], 1)
  607.     box_ids = tf.range(0, tf.shape(roi_masks)[0])
  608.     masks = tf.image.crop_and_resize(tf.cast(roi_masks, tf.float32), boxes,
  609.                                      box_ids,
  610.                                      config.MASK_SHAPE)
  611.     # Remove the extra dimension from masks.
  612.     masks = tf.squeeze(masks, axis=3)
  613.  
  614.     # Threshold mask pixels at 0.5 to have GT masks be 0 or 1 to use with
  615.     # binary cross entropy loss.
  616.     masks = tf.round(masks)
  617.  
  618.     # Append negative ROIs and pad bbox deltas and masks that
  619.     # are not used for negative ROIs with zeros.
  620.     rois = tf.concat([positive_rois, negative_rois], axis=0)
  621.     N = tf.shape(negative_rois)[0]
  622.     P = tf.maximum(config.TRAIN_ROIS_PER_IMAGE - tf.shape(rois)[0], 0)
  623.     rois = tf.pad(rois, [(0, P), (0, 0)])
  624.     roi_gt_boxes = tf.pad(roi_gt_boxes, [(0, N + P), (0, 0)])
  625.     roi_gt_class_ids = tf.pad(roi_gt_class_ids, [(0, N + P)])
  626.     deltas = tf.pad(deltas, [(0, N + P), (0, 0)])
  627.     masks = tf.pad(masks, [[0, N + P], (0, 0), (0, 0)])
  628.  
  629.     return rois, roi_gt_class_ids, deltas, masks
  630.  
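# Subsampling arithmetic used above (a sketch assuming the usual defaults of
# TRAIN_ROIS_PER_IMAGE = 200 and ROI_POSITIVE_RATIO = 0.33): at most
# int(200 * 0.33) = 66 positive ROIs are kept, the negative count is
# int((1 / 0.33) * 66) - 66 = 134, and any shortfall up to 200 ROIs is
# zero padded.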
  631.  
  632. class DetectionTargetLayer(KE.Layer):
  633.     """Subsamples proposals and generates target box refinement, class_ids,
  634.    and masks for each.
  635.  
  636.    Inputs:
  637.    proposals: [batch, N, (y1, x1, y2, x2)] in normalized coordinates. Might
  638.               be zero padded if there are not enough proposals.
  639.    gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs.
  640.    gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized
  641.              coordinates.
  642.    gt_masks: [batch, height, width, MAX_GT_INSTANCES] of boolean type
  643.  
  644.    Returns: Target ROIs and corresponding class IDs, bounding box shifts,
  645.    and masks.
  646.    rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized
  647.          coordinates
  648.    target_class_ids: [batch, TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
  649.    target_deltas: [batch, TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw))]
  650.    target_mask: [batch, TRAIN_ROIS_PER_IMAGE, height, width]
  651.                 Masks cropped to bbox boundaries and resized to neural
  652.                 network output size.
  653.  
  654.    Note: Returned arrays might be zero padded if not enough target ROIs.
  655.    """
  656.  
  657.     def __init__(self, config, **kwargs):
  658.         super(DetectionTargetLayer, self).__init__(**kwargs)
  659.         self.config = config
  660.  
  661.     def call(self, inputs):
  662.         proposals = inputs[0]
  663.         gt_class_ids = inputs[1]
  664.         gt_boxes = inputs[2]
  665.         gt_masks = inputs[3]
  666.  
  667.         # Slice the batch and run a graph for each slice
  668.         # TODO: Rename target_bbox to target_deltas for clarity
  669.         names = ["rois", "target_class_ids", "target_bbox", "target_mask"]
  670.         outputs = utils.batch_slice(
  671.             [proposals, gt_class_ids, gt_boxes, gt_masks],
  672.             lambda w, x, y, z: detection_targets_graph(
  673.                 w, x, y, z, self.config),
  674.             self.config.IMAGES_PER_GPU, names=names)
  675.         return outputs
  676.  
  677.     def compute_output_shape(self, input_shape):
  678.         return [
  679.             (None, self.config.TRAIN_ROIS_PER_IMAGE, 4),  # rois
  680.             (None, self.config.TRAIN_ROIS_PER_IMAGE),  # class_ids
  681.             (None, self.config.TRAIN_ROIS_PER_IMAGE, 4),  # deltas
  682.             (None, self.config.TRAIN_ROIS_PER_IMAGE, self.config.MASK_SHAPE[0],
  683.              self.config.MASK_SHAPE[1])  # masks
  684.         ]
  685.  
  686.     def compute_mask(self, inputs, mask=None):
  687.         return [None, None, None, None]
  688.  
  689.  
  690. ############################################################
  691. #  Detection Layer
  692. ############################################################
  693.  
  694. def refine_detections_graph(rois, probs, deltas, window, config):
  695.     """Refines classified proposals, filters overlaps, and returns the final
  696.    detections.
  697.  
  698.    Inputs:
  699.        rois: [N, (y1, x1, y2, x2)] in normalized coordinates
  700.        probs: [N, num_classes]. Class probabilities.
  701.        deltas: [N, num_classes, (dy, dx, log(dh), log(dw))]. Class-specific
  702.                bounding box deltas.
  703.        window: (y1, x1, y2, x2) in normalized coordinates. The part of the image
  704.            that contains the image excluding the padding.
  705.  
  706.    Returns detections shaped: [num_detections, (y1, x1, y2, x2, class_id, score)] where
  707.        coordinates are normalized.
  708.    """
  709.     # Class IDs per ROI
  710.     class_ids = tf.argmax(probs, axis=1, output_type=tf.int32)
  711.     # Class probability of the top class of each ROI
  712.     indices = tf.stack([tf.range(probs.shape[0]), class_ids], axis=1)
  713.     class_scores = tf.gather_nd(probs, indices)
  714.     # Class-specific bounding box deltas
  715.     deltas_specific = tf.gather_nd(deltas, indices)
  716.     # Apply bounding box deltas
  717.     # Shape: [boxes, (y1, x1, y2, x2)] in normalized coordinates
  718.     refined_rois = apply_box_deltas_graph(
  719.         rois, deltas_specific * config.BBOX_STD_DEV)
  720.     # Clip boxes to image window
  721.     refined_rois = clip_boxes_graph(refined_rois, window)
  722.  
  723.     # TODO: Filter out boxes with zero area
  724.  
  725.     # Filter out background boxes
  726.     keep = tf.where(class_ids > 0)[:, 0]
  727.     # Filter out low confidence boxes
  728.     if config.DETECTION_MIN_CONFIDENCE:
  729.         conf_keep = tf.where(class_scores >= config.DETECTION_MIN_CONFIDENCE)[:, 0]
  730.         keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
  731.                                         tf.expand_dims(conf_keep, 0))
  732.         keep = tf.sparse_tensor_to_dense(keep)[0]
  733.  
  734.     # Apply per-class NMS
  735.     # 1. Prepare variables
  736.     pre_nms_class_ids = tf.gather(class_ids, keep)
  737.     pre_nms_scores = tf.gather(class_scores, keep)
  738.     pre_nms_rois = tf.gather(refined_rois,   keep)
  739.     unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0]
  740.  
  741.     def nms_keep_map(class_id):
  742.         """Apply Non-Maximum Suppression on ROIs of the given class."""
  743.         # Indices of ROIs of the given class
  744.         ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0]
  745.         # Apply NMS
  746.         class_keep = tf.image.non_max_suppression(
  747.                 tf.gather(pre_nms_rois, ixs),
  748.                 tf.gather(pre_nms_scores, ixs),
  749.                 max_output_size=config.DETECTION_MAX_INSTANCES,
  750.                 iou_threshold=config.DETECTION_NMS_THRESHOLD)
  751.         # Map indices
  752.         class_keep = tf.gather(keep, tf.gather(ixs, class_keep))
  753.         # Pad with -1 so returned tensors have the same shape
  754.         gap = config.DETECTION_MAX_INSTANCES - tf.shape(class_keep)[0]
  755.         class_keep = tf.pad(class_keep, [(0, gap)],
  756.                             mode='CONSTANT', constant_values=-1)
  757.         # Set shape so map_fn() can infer result shape
  758.         class_keep.set_shape([config.DETECTION_MAX_INSTANCES])
  759.         return class_keep
  760.  
  761.     # 2. Map over class IDs
  762.     nms_keep = tf.map_fn(nms_keep_map, unique_pre_nms_class_ids,
  763.                          dtype=tf.int64)
  764.     # 3. Merge results into one list, and remove -1 padding
  765.     nms_keep = tf.reshape(nms_keep, [-1])
  766.     nms_keep = tf.gather(nms_keep, tf.where(nms_keep > -1)[:, 0])
  767.     # 4. Compute intersection between keep and nms_keep
  768.     keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
  769.                                     tf.expand_dims(nms_keep, 0))
  770.     keep = tf.sparse_tensor_to_dense(keep)[0]
  771.     # Keep top detections
  772.     roi_count = config.DETECTION_MAX_INSTANCES
  773.     class_scores_keep = tf.gather(class_scores, keep)
  774.     num_keep = tf.minimum(tf.shape(class_scores_keep)[0], roi_count)
  775.     top_ids = tf.nn.top_k(class_scores_keep, k=num_keep, sorted=True)[1]
  776.     keep = tf.gather(keep, top_ids)
  777.  
  778.     # Arrange output as [N, (y1, x1, y2, x2, class_id, score)]
  779.     # Coordinates are normalized.
  780.     detections = tf.concat([
  781.         tf.gather(refined_rois, keep),
  782.         tf.to_float(tf.gather(class_ids, keep))[..., tf.newaxis],
  783.         tf.gather(class_scores, keep)[..., tf.newaxis]
  784.         ], axis=1)
  785.  
  786.     # Pad with zeros if detections < DETECTION_MAX_INSTANCES
  787.     gap = config.DETECTION_MAX_INSTANCES - tf.shape(detections)[0]
  788.     detections = tf.pad(detections, [(0, gap), (0, 0)], "CONSTANT")
  789.     return detections
  790.  
  791.  
  792. class DetectionLayer(KE.Layer):
  793.     """Takes classified proposal boxes and their bounding box deltas and
  794.    returns the final detection boxes.
  795.  
  796.    Returns:
  797.    [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] where
  798.    coordinates are normalized.
  799.    """
  800.  
  801.     def __init__(self, config=None, **kwargs):
  802.         super(DetectionLayer, self).__init__(**kwargs)
  803.         self.config = config
  804.        
  805.     def get_config(self):
  806.         config = super(DetectionLayer, self).get_config()
  807.         config["config"] = self.config.to_dict()
  808.         return config
  809.  
  810.  
  811.     def call(self, inputs):
  812.         rois = inputs[0]
  813.         mrcnn_class = inputs[1]
  814.         mrcnn_bbox = inputs[2]
  815.         image_meta = inputs[3]
  816.  
  817.         # Get windows of images in normalized coordinates. Windows are the area
  818.         # in the image that excludes the padding.
  819.         # Use the shape of the first image in the batch to normalize the window
  820.         # because we know that all images get resized to the same size.
  821.         m = parse_image_meta_graph(image_meta)
  822.         image_shape = m['image_shape'][0]
  823.         window = norm_boxes_graph(m['window'], image_shape[:2])
  824.  
  825.         # Run detection refinement graph on each item in the batch
  826.         detections_batch = utils.batch_slice(
  827.             [rois, mrcnn_class, mrcnn_bbox, window],
  828.             lambda x, y, w, z: refine_detections_graph(x, y, w, z, self.config),
  829.             self.config.IMAGES_PER_GPU)
  830.  
  831.         # Reshape output
  832.         # [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] in
  833.         # normalized coordinates
  834.         return tf.reshape(
  835.             detections_batch,
  836.             #[self.config.BATCH_SIZE, self.config.DETECTION_MAX_INSTANCES, 6])
  837.             [self.config.IMAGES_PER_GPU, self.config.DETECTION_MAX_INSTANCES, 6])
  838.  
  839.    
  840.     def compute_output_shape(self, input_shape):
  841.         return (None, self.config.DETECTION_MAX_INSTANCES, 6)
  842.  
  843.  
  844. ############################################################
  845. #  Region Proposal Network (RPN)
  846. ############################################################
  847.  
  848. def rpn_graph(feature_map, anchors_per_location, anchor_stride):
  849.     """Builds the computation graph of Region Proposal Network.
  850.  
  851.    feature_map: backbone features [batch, height, width, depth]
  852.    anchors_per_location: number of anchors per pixel in the feature map
  853.    anchor_stride: Controls the density of anchors. Typically 1 (anchors for
  854.                   every pixel in the feature map), or 2 (every other pixel).
  855.  
  856.    Returns:
  857.        rpn_class_logits: [batch, H * W * anchors_per_location, 2] Anchor classifier logits (before softmax)
  858.        rpn_probs: [batch, H * W * anchors_per_location, 2] Anchor classifier probabilities.
  859.        rpn_bbox: [batch, H * W * anchors_per_location, (dy, dx, log(dh), log(dw))] Deltas to be
  860.                  applied to anchors.
  861.    """
  862.     # TODO: check if stride of 2 causes alignment issues if the feature map
  863.     # is not even.
  864.     # Shared convolutional base of the RPN
  865.     shared = KL.Conv2D(512, (3, 3), padding='same', activation='relu',
  866.                        strides=anchor_stride,
  867.                        name='rpn_conv_shared')(feature_map)
  868.  
  869.     # Anchor Score. [batch, height, width, anchors per location * 2].
  870.     x = KL.Conv2D(2 * anchors_per_location, (1, 1), padding='valid',
  871.                   activation='linear', name='rpn_class_raw')(shared)
  872.  
  873.     # Reshape to [batch, anchors, 2]
  874.     rpn_class_logits = KL.Lambda(
  875.         lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 2]))(x)
  876.  
  877.     # Softmax on last dimension of BG/FG.
  878.     rpn_probs = KL.Activation(
  879.         "softmax", name="rpn_class_xxx")(rpn_class_logits)
  880.  
  881.     # Bounding box refinement. [batch, H, W, anchors per location * depth]
  882.     # where depth is [x, y, log(w), log(h)]
  883.     x = KL.Conv2D(anchors_per_location * 4, (1, 1), padding="valid",
  884.                   activation='linear', name='rpn_bbox_pred')(shared)
  885.  
  886.     # Reshape to [batch, anchors, 4]
  887.     rpn_bbox = KL.Lambda(lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 4]))(x)
  888.  
  889.     return [rpn_class_logits, rpn_probs, rpn_bbox]
  890.  
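# Shape arithmetic for the outputs above: a single 64x64 feature map with
# anchors_per_location = 3 and anchor_stride = 1 yields 64 * 64 * 3 = 12288
# anchors, so rpn_class_logits and rpn_probs are [batch, 12288, 2] and
# rpn_bbox is [batch, 12288, 4].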
  891.  
  892. def build_rpn_model(anchor_stride, anchors_per_location, depth):
  893.     """Builds a Keras model of the Region Proposal Network.
  894.    It wraps the RPN graph so it can be used multiple times with shared
  895.    weights.
  896.  
  897.    anchors_per_location: number of anchors per pixel in the feature map
  898.    anchor_stride: Controls the density of anchors. Typically 1 (anchors for
  899.                   every pixel in the feature map), or 2 (every other pixel).
  900.    depth: Depth of the backbone feature map.
  901.  
  902.    Returns a Keras Model object. The model outputs, when called, are:
  903.    rpn_class_logits: [batch, H * W * anchors_per_location, 2] Anchor classifier logits (before softmax)
  904.    rpn_probs: [batch, H * W * anchors_per_location, 2] Anchor classifier probabilities.
  905.    rpn_bbox: [batch, H * W * anchors_per_location, (dy, dx, log(dh), log(dw))] Deltas to be
  906.                applied to anchors.
  907.    """
  908.     input_feature_map = KL.Input(shape=[None, None, depth],
  909.                                  name="input_rpn_feature_map")
  910.     outputs = rpn_graph(input_feature_map, anchors_per_location, anchor_stride)
  911.     return KM.Model([input_feature_map], outputs, name="rpn_model")
  912.  
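# Usage sketch (an assumption, mirroring how the shared RPN is typically
# applied to every pyramid level):
#
#   >>> rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE,
#   ...                       len(config.RPN_ANCHOR_RATIOS),
#   ...                       config.TOP_DOWN_PYRAMID_SIZE)
#   >>> layer_outputs = [rpn([p]) for p in [P2, P3, P4, P5, P6]]
#
# The per-level outputs are then concatenated along the anchors axis.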
  913.  
  914. ############################################################
  915. #  Feature Pyramid Network Heads
  916. ############################################################
  917.  
  918. def fpn_classifier_graph(rois, feature_maps, image_meta,
  919.                          pool_size, num_classes, train_bn=True,
  920.                          fc_layers_size=1024):
  921.     """Builds the computation graph of the feature pyramid network classifier
  922.    and regressor heads.
  923.  
  924.    rois: [batch, num_rois, (y1, x1, y2, x2)] Proposal boxes in normalized
  925.          coordinates.
  926.    feature_maps: List of feature maps from different layers of the pyramid,
  927.                  [P2, P3, P4, P5]. Each has a different resolution.
  928.    image_meta: [batch, (meta data)] Image details. See compose_image_meta()
  929.    pool_size: The width of the square feature map generated from ROI Pooling.
  930.    num_classes: number of classes, which determines the depth of the results
  931.    train_bn: Boolean. Train or freeze Batch Norm layers
  932.    fc_layers_size: Size of the 2 FC layers
  933.  
  934.    Returns:
  935.        logits: [batch, num_rois, NUM_CLASSES] classifier logits (before softmax)
  936.        probs: [batch, num_rois, NUM_CLASSES] classifier probabilities
  937.        bbox_deltas: [batch, num_rois, NUM_CLASSES, (dy, dx, log(dh), log(dw))] Deltas to apply to
  938.                     proposal boxes
  939.    """
  940.     # ROI Pooling
  941.     # Shape: [batch, num_rois, POOL_SIZE, POOL_SIZE, channels]
  942.     x = PyramidROIAlign([pool_size, pool_size],
  943.                         name="roi_align_classifier")([rois, image_meta] + feature_maps)
  944.     # Two 1024 FC layers (implemented with Conv2D for consistency)
  945.     x = KL.TimeDistributed(KL.Conv2D(fc_layers_size, (pool_size, pool_size), padding="valid"),
  946.                            name="mrcnn_class_conv1")(x)
  947.     x = KL.TimeDistributed(BatchNorm(), name='mrcnn_class_bn1')(x, training=train_bn)
  948.     x = KL.Activation('relu')(x)
  949.     x = KL.TimeDistributed(KL.Conv2D(fc_layers_size, (1, 1)),
  950.                            name="mrcnn_class_conv2")(x)
  951.     x = KL.TimeDistributed(BatchNorm(), name='mrcnn_class_bn2')(x, training=train_bn)
  952.     x = KL.Activation('relu')(x)
  953.  
  954.     shared = KL.Lambda(lambda x: K.squeeze(K.squeeze(x, 3), 2),
  955.                        name="pool_squeeze")(x)
  956.  
  957.     # Classifier head
  958.     mrcnn_class_logits = KL.TimeDistributed(KL.Dense(num_classes),
  959.                                             name='mrcnn_class_logits')(shared)
  960.     mrcnn_probs = KL.TimeDistributed(KL.Activation("softmax"),
  961.                                      name="mrcnn_class")(mrcnn_class_logits)
  962.  
  963.     # BBox head
  964.     # [batch, num_rois, NUM_CLASSES * (dy, dx, log(dh), log(dw))]
  965.     x = KL.TimeDistributed(KL.Dense(num_classes * 4, activation='linear'),
  966.                            name='mrcnn_bbox_fc')(shared)
  967.     # Reshape to [batch, num_rois, NUM_CLASSES, (dy, dx, log(dh), log(dw))]
  968.     s = K.int_shape(x)
  969.     if s[1] is None:
  970.         mrcnn_bbox = KL.Reshape((-1, num_classes, 4), name="mrcnn_bbox")(x)
  971.     else:
  972.         mrcnn_bbox = KL.Reshape((s[1], num_classes, 4), name="mrcnn_bbox")(x)
  973.  
  974.     return mrcnn_class_logits, mrcnn_probs, mrcnn_bbox
  975.  
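# Usage sketch (an assumption based on how the classifier head is typically
# wired during model building):
#
#   >>> mrcnn_class_logits, mrcnn_class, mrcnn_bbox = fpn_classifier_graph(
#   ...     rois, mrcnn_feature_maps, input_image_meta,
#   ...     config.POOL_SIZE, config.NUM_CLASSES,
#   ...     train_bn=config.TRAIN_BN,
#   ...     fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)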
  976.  
  977. def build_fpn_mask_graph(rois, feature_maps, image_meta,
  978.                          pool_size, num_classes, train_bn=True):
  979.     """Builds the computation graph of the mask head of Feature Pyramid Network.
  980.  
  981.    rois: [batch, num_rois, (y1, x1, y2, x2)] Proposal boxes in normalized
  982.          coordinates.
  983.    feature_maps: List of feature maps from different layers of the pyramid,
  984.                  [P2, P3, P4, P5]. Each has a different resolution.
  985.    image_meta: [batch, (meta data)] Image details. See compose_image_meta()
  986.    pool_size: The width of the square feature map generated from ROI Pooling.
  987.    num_classes: number of classes, which determines the depth of the results
  988.    train_bn: Boolean. Train or freeze Batch Norm layers
  989.  
  990.    Returns: Masks [batch, num_rois, MASK_POOL_SIZE, MASK_POOL_SIZE, NUM_CLASSES]
  991.    """
  992.     # ROI Pooling
  993.     # Shape: [batch, num_rois, MASK_POOL_SIZE, MASK_POOL_SIZE, channels]
  994.     x = PyramidROIAlign([pool_size, pool_size],
  995.                         name="roi_align_mask")([rois, image_meta] + feature_maps)
  996.  
  997.     # Conv layers
  998.     x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"),
  999.                            name="mrcnn_mask_conv1")(x)
  1000.     x = KL.TimeDistributed(BatchNorm(),
  1001.                            name='mrcnn_mask_bn1')(x, training=train_bn)
  1002.     x = KL.Activation('relu')(x)
  1003.  
  1004.     x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"),
  1005.                            name="mrcnn_mask_conv2")(x)
  1006.     x = KL.TimeDistributed(BatchNorm(),
  1007.                            name='mrcnn_mask_bn2')(x, training=train_bn)
  1008.     x = KL.Activation('relu')(x)
  1009.  
  1010.     x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"),
  1011.                            name="mrcnn_mask_conv3")(x)
  1012.     x = KL.TimeDistributed(BatchNorm(),
  1013.                            name='mrcnn_mask_bn3')(x, training=train_bn)
  1014.     x = KL.Activation('relu')(x)
  1015.  
  1016.     x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"),
  1017.                            name="mrcnn_mask_conv4")(x)
  1018.     x = KL.TimeDistributed(BatchNorm(),
  1019.                            name='mrcnn_mask_bn4')(x, training=train_bn)
  1020.     x = KL.Activation('relu')(x)
  1021.  
  1022.     x = KL.TimeDistributed(KL.Conv2DTranspose(256, (2, 2), strides=2, activation="relu"),
  1023.                            name="mrcnn_mask_deconv")(x)
  1024.     x = KL.TimeDistributed(KL.Conv2D(num_classes, (1, 1), strides=1, activation="sigmoid"),
  1025.                            name="mrcnn_mask")(x)
  1026.     return x
  1027.  
  1028.  
  1029. ############################################################
  1030. #  Loss Functions
  1031. ############################################################
  1032.  
  1033. def smooth_l1_loss(y_true, y_pred):
  1034.     """Implements Smooth-L1 loss.
  1035.    y_true and y_pred are typically: [N, 4], but could be any shape.
  1036.    """
  1037.     diff = K.abs(y_true - y_pred)
  1038.     less_than_one = K.cast(K.less(diff, 1.0), "float32")
  1039.     loss = (less_than_one * 0.5 * diff**2) + (1 - less_than_one) * (diff - 0.5)
  1040.     return loss
  1041.  
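# Worked example: Smooth-L1 is quadratic below a difference of 1 and linear
# above it:
#   |y_true - y_pred| = 0.5  ->  0.5 * 0.5**2 = 0.125
#   |y_true - y_pred| = 2.0  ->  2.0 - 0.5    = 1.5
# which keeps gradients bounded for large box errors.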
  1042.  
  1043. def rpn_class_loss_graph(rpn_match, rpn_class_logits):
  1044.     """RPN anchor classifier loss.
  1045.  
  1046.    rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive,
  1047.               -1=negative, 0=neutral anchor.
  1048.    rpn_class_logits: [batch, anchors, 2]. RPN classifier logits for BG/FG.
  1049.    """
  1050.     # Squeeze last dim to simplify
  1051.     rpn_match = tf.squeeze(rpn_match, -1)
  1052.     # Get anchor classes. Convert the -1/+1 match to 0/1 values.
  1053.     anchor_class = K.cast(K.equal(rpn_match, 1), tf.int32)
  1054.     # Positive and Negative anchors contribute to the loss,
  1055.     # but neutral anchors (match value = 0) don't.
  1056.     indices = tf.where(K.not_equal(rpn_match, 0))
  1057.     # Pick rows that contribute to the loss and filter out the rest.
  1058.     rpn_class_logits = tf.gather_nd(rpn_class_logits, indices)
  1059.     anchor_class = tf.gather_nd(anchor_class, indices)
  1060.     # Cross entropy loss
  1061.     loss = K.sparse_categorical_crossentropy(target=anchor_class,
  1062.                                              output=rpn_class_logits,
  1063.                                              from_logits=True)
  1064.     loss = K.switch(tf.size(loss) > 0, K.mean(loss), tf.constant(0.0))
  1065.     return loss
  1066.  
  1067.  
  1068. def rpn_bbox_loss_graph(config, target_bbox, rpn_match, rpn_bbox):
  1069.     """Return the RPN bounding box loss graph.
  1070.  
  1071.    config: the model config object.
  1072.    target_bbox: [batch, max positive anchors, (dy, dx, log(dh), log(dw))].
  1073.        Uses 0 padding to fill in unused bbox deltas.
  1074.    rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive,
  1075.               -1=negative, 0=neutral anchor.
  1076.    rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))]
  1077.    """
  1078.     # Positive anchors contribute to the loss, but negative and
  1079.     # neutral anchors (match value of 0 or -1) don't.
  1080.     rpn_match = K.squeeze(rpn_match, -1)
  1081.     indices = tf.where(K.equal(rpn_match, 1))
  1082.  
  1083.     # Pick bbox deltas that contribute to the loss
  1084.     rpn_bbox = tf.gather_nd(rpn_bbox, indices)
  1085.  
  1086.     # Trim target bounding box deltas to the same length as rpn_bbox.
  1087.     batch_counts = K.sum(K.cast(K.equal(rpn_match, 1), tf.int32), axis=1)
  1088.     target_bbox = batch_pack_graph(target_bbox, batch_counts,
  1089.                                    config.IMAGES_PER_GPU)
  1090.  
  1091.     loss = smooth_l1_loss(target_bbox, rpn_bbox)
  1092.    
  1093.     loss = K.switch(tf.size(loss) > 0, K.mean(loss), tf.constant(0.0))
  1094.     return loss
  1095.  
  1096.  
  1097. def mrcnn_class_loss_graph(target_class_ids, pred_class_logits,
  1098.                            active_class_ids):
  1099.     """Loss for the classifier head of Mask RCNN.
  1100.  
  1101.    target_class_ids: [batch, num_rois]. Integer class IDs. Uses zero
  1102.        padding to fill in the array.
  1103.    pred_class_logits: [batch, num_rois, num_classes]
  1104.    active_class_ids: [batch, num_classes]. Has a value of 1 for
  1105.        classes that are in the dataset of the image, and 0
  1106.        for classes that are not in the dataset.
  1107.    """
  1108.     # During model building, Keras calls this function with
  1109.     # target_class_ids of type float32. Unclear why. Cast it
  1110.     # to int to get around it.
  1111.     target_class_ids = tf.cast(target_class_ids, 'int64')
  1112.  
  1113.     # Find predictions of classes that are not in the dataset.
  1114.     pred_class_ids = tf.argmax(pred_class_logits, axis=2)
  1115.     # TODO: Update this line to work with batch > 1. Right now it assumes all
  1116.     #       images in a batch have the same active_class_ids
  1117.     pred_active = tf.gather(active_class_ids[0], pred_class_ids)
  1118.  
  1119.     # Loss
  1120.     loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
  1121.         labels=target_class_ids, logits=pred_class_logits)
  1122.  
  1123.     # Erase losses of predictions of classes that are not in the active
  1124.     # classes of the image.
  1125.     loss = loss * pred_active
  1126.  
  1127.     # Compute the loss mean. Use only predictions that contribute
  1128.     # to the loss to get a correct mean.
  1129.     loss = tf.reduce_sum(loss) / tf.reduce_sum(pred_active)
  1130.     return loss
  1131.  
  1132.  
  1133. def mrcnn_bbox_loss_graph(target_bbox, target_class_ids, pred_bbox):
  1134.     """Loss for Mask R-CNN bounding box refinement.
  1135.  
  1136.    target_bbox: [batch, num_rois, (dy, dx, log(dh), log(dw))]
  1137.    target_class_ids: [batch, num_rois]. Integer class IDs.
  1138.    pred_bbox: [batch, num_rois, num_classes, (dy, dx, log(dh), log(dw))]
  1139.    """
  1140.     # Reshape to merge batch and roi dimensions for simplicity.
  1141.     target_class_ids = K.reshape(target_class_ids, (-1,))
  1142.     target_bbox = K.reshape(target_bbox, (-1, 4))
  1143.     pred_bbox = K.reshape(pred_bbox, (-1, K.int_shape(pred_bbox)[2], 4))
  1144.  
  1145.     # Only positive ROIs contribute to the loss. And only
  1146.     # the right class_id of each ROI. Get their indices.
  1147.     positive_roi_ix = tf.where(target_class_ids > 0)[:, 0]
  1148.     positive_roi_class_ids = tf.cast(
  1149.         tf.gather(target_class_ids, positive_roi_ix), tf.int64)
  1150.     indices = tf.stack([positive_roi_ix, positive_roi_class_ids], axis=1)
  1151.  
  1152.     # Gather the deltas (predicted and true) that contribute to loss
  1153.     target_bbox = tf.gather(target_bbox, positive_roi_ix)
  1154.     pred_bbox = tf.gather_nd(pred_bbox, indices)
  1155.  
  1156.     # Smooth-L1 Loss
  1157.     loss = K.switch(tf.size(target_bbox) > 0,
  1158.                     smooth_l1_loss(y_true=target_bbox, y_pred=pred_bbox),
  1159.                     tf.constant(0.0))
  1160.     loss = K.mean(loss)
  1161.     return loss
  1162.  
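# Illustrative sketch of the class-specific gather above (hypothetical shapes):
#   pred = np.arange(3 * 5 * 4).reshape(3, 5, 4)   # 3 ROIs, 5 classes, 4 deltas
#   indices = np.array([[0, 2], [2, 1]])           # ROI 0 -> class 2, ROI 2 -> class 1
#   picked = pred[indices[:, 0], indices[:, 1]]    # shape (2, 4), like tf.gather_nd
# so only the deltas predicted for each positive ROI's own class enter the loss.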
  1163.  
  1164. def mrcnn_mask_loss_graph(target_masks, target_class_ids, pred_masks):
  1165.     """Mask binary cross-entropy loss for the masks head.
  1166.  
  1167.    target_masks: [batch, num_rois, height, width].
  1168.        A float32 tensor of values 0 or 1. Uses zero padding to fill array.
  1169.    target_class_ids: [batch, num_rois]. Integer class IDs. Zero padded.
  1170.    pred_masks: [batch, proposals, height, width, num_classes] float32 tensor
  1171.                with values from 0 to 1.
  1172.    """
  1173.     # Reshape for simplicity. Merge first two dimensions into one.
  1174.     target_class_ids = K.reshape(target_class_ids, (-1,))
  1175.     mask_shape = tf.shape(target_masks)
  1176.     target_masks = K.reshape(target_masks, (-1, mask_shape[2], mask_shape[3]))
  1177.     pred_shape = tf.shape(pred_masks)
  1178.     pred_masks = K.reshape(pred_masks,
  1179.                            (-1, pred_shape[2], pred_shape[3], pred_shape[4]))
  1180.     # Permute predicted masks to [N, num_classes, height, width]
  1181.     pred_masks = tf.transpose(pred_masks, [0, 3, 1, 2])
  1182.  
  1183.     # Only positive ROIs contribute to the loss. And only
  1184.     # the class specific mask of each ROI.
  1185.     positive_ix = tf.where(target_class_ids > 0)[:, 0]
  1186.     positive_class_ids = tf.cast(
  1187.         tf.gather(target_class_ids, positive_ix), tf.int64)
  1188.     indices = tf.stack([positive_ix, positive_class_ids], axis=1)
  1189.  
  1190.     # Gather the masks (predicted and true) that contribute to loss
  1191.     y_true = tf.gather(target_masks, positive_ix)
  1192.     y_pred = tf.gather_nd(pred_masks, indices)
  1193.  
  1194.     # Compute binary cross entropy. If no positive ROIs, then return 0.
  1195.     # shape: [batch, roi, num_classes]
  1196.     loss = K.switch(tf.size(y_true) > 0,
  1197.                     K.binary_crossentropy(target=y_true, output=y_pred),
  1198.                     tf.constant(0.0))
  1199.     loss = K.mean(loss)
  1200.     return loss
  1201.  
  1202.  
  1203. ############################################################
  1204. #  Data Generator
  1205. ############################################################
  1206.  
  1207. def load_image_gt(dataset, config, image_id, augment=False, augmentation=None,
  1208.                   use_mini_mask=False):
  1209.     """Load and return ground truth data for an image (image, mask, bounding boxes).
  1210.  
  1211.    augment: (deprecated. Use augmentation instead). If true, apply random
  1212.        image augmentation. Currently, only horizontal flipping is offered.
  1213.    augmentation: Optional. An imgaug (https://github.com/aleju/imgaug) augmentation.
  1214.        For example, passing imgaug.augmenters.Fliplr(0.5) flips images
  1215.        right/left 50% of the time.
  1216.    use_mini_mask: If False, returns full-size masks that are the same height
  1217.        and width as the original image. These can be big, for example
  1218.        1024x1024x100 (for 100 instances). Mini masks are smaller, typically,
  1219.        224x224 and are generated by extracting the bounding box of the
  1220.        object and resizing it to MINI_MASK_SHAPE.
  1221.  
  1222.    Returns:
  1223.    image: [height, width, 3]
  1224.    image_meta: image details (see compose_image_meta()), including the
  1225.        original shape of the image before resizing and cropping.
  1225.    class_ids: [instance_count] Integer class IDs
  1226.    bbox: [instance_count, (y1, x1, y2, x2)]
  1227.    mask: [height, width, instance_count]. The height and width are those
  1228.        of the image unless use_mini_mask is True, in which case they are
  1229.        defined in MINI_MASK_SHAPE.
  1230.    """
  1231.     # Load image and mask
  1232.     image = dataset.load_image(image_id)
  1233.     mask, class_ids = dataset.load_mask(image_id)
  1234.     original_shape = image.shape
  1235.     image, window, scale, padding, crop = utils.resize_image(
  1236.         image,
  1237.         min_dim=config.IMAGE_MIN_DIM,
  1238.         min_scale=config.IMAGE_MIN_SCALE,
  1239.         max_dim=config.IMAGE_MAX_DIM,
  1240.         mode=config.IMAGE_RESIZE_MODE)
  1241.     mask = utils.resize_mask(mask, scale, padding, crop)
  1242.  
  1243.     # Random horizontal flips.
  1244.     # TODO: will be removed in a future update in favor of augmentation
  1245.     if augment:
  1246.         logging.warning("'augment' is deprecated. Use 'augmentation' instead.")
  1247.         if random.randint(0, 1):
  1248.             image = np.fliplr(image)
  1249.             mask = np.fliplr(mask)
  1250.  
  1251.     # Augmentation
  1252.     # This requires the imgaug lib (https://github.com/aleju/imgaug)
  1253.     if augmentation:
  1254.         import imgaug
  1255.  
  1256.         # Augmenters that are safe to apply to masks
  1257.         # Some, such as Affine, have settings that make them unsafe, so always
  1258.         # test your augmentation on masks
  1259.         MASK_AUGMENTERS = ["Sequential", "SomeOf", "OneOf", "Sometimes",
  1260.                            "Fliplr", "Flipud", "CropAndPad",
  1261.                            "Affine", "PiecewiseAffine"]
  1262.  
  1263.         def hook(images, augmenter, parents, default):
  1264.             """Determines which augmenters to apply to masks."""
  1265.             return augmenter.__class__.__name__ in MASK_AUGMENTERS
  1266.  
  1267.         # Store shapes before augmentation to compare
  1268.         image_shape = image.shape
  1269.         mask_shape = mask.shape
  1270.         # Make augmenters deterministic to apply similarly to images and masks
  1271.         det = augmentation.to_deterministic()
  1272.         image = det.augment_image(image)
  1273.         # Change mask to np.uint8 because imgaug doesn't support np.bool
  1274.         mask = det.augment_image(mask.astype(np.uint8),
  1275.                                  hooks=imgaug.HooksImages(activator=hook))
  1276.         # Verify that shapes didn't change
  1277.         assert image.shape == image_shape, "Augmentation shouldn't change image size"
  1278.         assert mask.shape == mask_shape, "Augmentation shouldn't change mask size"
  1279.         # Change mask back to bool
  1280.         mask = mask.astype(bool)
  1281.  
  1282.     # Some instances may end up with empty (all-zero) masks if they were
  1283.     # cropped out during resizing/augmentation, so filter them out here.
  1284.     _idx = np.sum(mask, axis=(0, 1)) > 0
  1285.     mask = mask[:, :, _idx]
  1286.     class_ids = class_ids[_idx]
  1287.     # Bounding boxes. Note that some boxes might be all zeros
  1288.     # if the corresponding mask got cropped out.
  1289.     # bbox: [num_instances, (y1, x1, y2, x2)]
  1290.     bbox = utils.extract_bboxes(mask)
  1291.  
  1292.     # Active classes
  1293.     # Different datasets have different classes, so track the
  1294.     # classes supported in the dataset of this image.
  1295.     active_class_ids = np.zeros([dataset.num_classes], dtype=np.int32)
  1296.     source_class_ids = dataset.source_class_ids[dataset.image_info[image_id]["source"]]
  1297.     active_class_ids[source_class_ids] = 1
  1298.  
  1299.     # Resize masks to smaller size to reduce memory usage
  1300.     if use_mini_mask:
  1301.         mask = utils.minimize_mask(bbox, mask, config.MINI_MASK_SHAPE)
  1302.  
  1303.     # Image meta data
  1304.     image_meta = compose_image_meta(image_id, original_shape, image.shape,
  1305.                                     window, scale, active_class_ids)
  1306.  
  1307.     return image, image_meta, class_ids, bbox, mask
  1308.  
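# Usage sketch (assumes a prepared Dataset subclass instance `dataset` and a
# Config object `config`; both are placeholders):
#   image, image_meta, class_ids, bbox, mask = load_image_gt(
#       dataset, config, dataset.image_ids[0],
#       use_mini_mask=config.USE_MINI_MASK)
#   log("image", image)
#   log("mask", mask)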
  1309.  
  1310. def build_detection_targets(rpn_rois, gt_class_ids, gt_boxes, gt_masks, config):
  1311.     """Generate targets for training Stage 2 classifier and mask heads.
  1312.    This is not used in normal training. It's useful for debugging or to train
  1313.    the Mask RCNN heads without using the RPN head.
  1314.  
  1315.    Inputs:
  1316.    rpn_rois: [N, (y1, x1, y2, x2)] proposal boxes.
  1317.    gt_class_ids: [instance count] Integer class IDs
  1318.    gt_boxes: [instance count, (y1, x1, y2, x2)]
  1319.    gt_masks: [height, width, instance count] Ground truth masks. Can be full
  1320.              size or mini-masks.
  1321.  
  1322.    Returns:
  1323.    rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)]
  1324.    class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
  1325.    bboxes: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (y, x, log(h), log(w))]. Class-specific
  1326.            bbox refinements.
  1327.    masks: [TRAIN_ROIS_PER_IMAGE, height, width, NUM_CLASSES]. Class-specific masks cropped
  1328.           to bbox boundaries and resized to neural network output size.
  1329.    """
  1330.     assert rpn_rois.shape[0] > 0
  1331.     assert gt_class_ids.dtype == np.int32, "Expected int but got {}".format(
  1332.         gt_class_ids.dtype)
  1333.     assert gt_boxes.dtype == np.int32, "Expected int but got {}".format(
  1334.         gt_boxes.dtype)
  1335.     assert gt_masks.dtype == np.bool_, "Expected bool but got {}".format(
  1336.         gt_masks.dtype)
  1337.  
  1338.     # It's common to add GT Boxes to ROIs but we don't do that here because
  1339.     # according to XinLei Chen's paper, it doesn't help.
  1340.  
  1341.     # Trim empty padding in gt_boxes and gt_masks parts
  1342.     instance_ids = np.where(gt_class_ids > 0)[0]
  1343.     assert instance_ids.shape[0] > 0, "Image must contain instances."
  1344.     gt_class_ids = gt_class_ids[instance_ids]
  1345.     gt_boxes = gt_boxes[instance_ids]
  1346.     gt_masks = gt_masks[:, :, instance_ids]
  1347.  
  1348.     # Compute areas of ROIs and ground truth boxes.
  1349.     rpn_roi_area = (rpn_rois[:, 2] - rpn_rois[:, 0]) * \
  1350.         (rpn_rois[:, 3] - rpn_rois[:, 1])
  1351.     gt_box_area = (gt_boxes[:, 2] - gt_boxes[:, 0]) * \
  1352.         (gt_boxes[:, 3] - gt_boxes[:, 1])
  1353.  
  1354.     # Compute overlaps [rpn_rois, gt_boxes]
  1355.     overlaps = np.zeros((rpn_rois.shape[0], gt_boxes.shape[0]))
  1356.     for i in range(overlaps.shape[1]):
  1357.         gt = gt_boxes[i]
  1358.         overlaps[:, i] = utils.compute_iou(
  1359.             gt, rpn_rois, gt_box_area[i], rpn_roi_area)
  1360.  
  1361.     # Assign ROIs to GT boxes
  1362.     rpn_roi_iou_argmax = np.argmax(overlaps, axis=1)
  1363.     rpn_roi_iou_max = overlaps[np.arange(
  1364.         overlaps.shape[0]), rpn_roi_iou_argmax]
  1365.     # GT box assigned to each ROI
  1366.     rpn_roi_gt_boxes = gt_boxes[rpn_roi_iou_argmax]
  1367.     rpn_roi_gt_class_ids = gt_class_ids[rpn_roi_iou_argmax]
  1368.  
  1369.     # Positive ROIs are those with >= 0.5 IoU with a GT box.
  1370.     fg_ids = np.where(rpn_roi_iou_max > 0.5)[0]
  1371.  
  1372.     # Negative ROIs are those with max IoU 0.1-0.5 (hard example mining)
  1373.     # TODO: To hard example mine or not to hard example mine, that's the question
  1374.     # bg_ids = np.where((rpn_roi_iou_max >= 0.1) & (rpn_roi_iou_max < 0.5))[0]
  1375.     bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
  1376.  
  1377.     # Subsample ROIs. Aim for 33% foreground.
  1378.     # FG
  1379.     fg_roi_count = int(config.TRAIN_ROIS_PER_IMAGE * config.ROI_POSITIVE_RATIO)
  1380.     if fg_ids.shape[0] > fg_roi_count:
  1381.         keep_fg_ids = np.random.choice(fg_ids, fg_roi_count, replace=False)
  1382.     else:
  1383.         keep_fg_ids = fg_ids
  1384.     # BG
  1385.     remaining = config.TRAIN_ROIS_PER_IMAGE - keep_fg_ids.shape[0]
  1386.     if bg_ids.shape[0] > remaining:
  1387.         keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
  1388.     else:
  1389.         keep_bg_ids = bg_ids
  1390.     # Combine indices of ROIs to keep
  1391.     keep = np.concatenate([keep_fg_ids, keep_bg_ids])
  1392.     # Need more?
  1393.     remaining = config.TRAIN_ROIS_PER_IMAGE - keep.shape[0]
  1394.     if remaining > 0:
  1395.         # Looks like we don't have enough samples to maintain the desired
  1396.         # balance. Reduce requirements and fill in the rest. This is
  1397.         # likely different from the Mask RCNN paper.
  1398.  
  1399.         # There is a small chance we have neither fg nor bg samples.
  1400.         if keep.shape[0] == 0:
  1401.             # Pick bg regions with easier IoU threshold
  1402.             bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
  1403.             assert bg_ids.shape[0] >= remaining
  1404.             keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
  1405.             assert keep_bg_ids.shape[0] == remaining
  1406.             keep = np.concatenate([keep, keep_bg_ids])
  1407.         else:
  1408.             # Fill the rest with repeated bg rois.
  1409.             keep_extra_ids = np.random.choice(
  1410.                 keep_bg_ids, remaining, replace=True)
  1411.             keep = np.concatenate([keep, keep_extra_ids])
  1412.     assert keep.shape[0] == config.TRAIN_ROIS_PER_IMAGE, \
  1413.         "keep doesn't match ROI batch size {}, {}".format(
  1414.             keep.shape[0], config.TRAIN_ROIS_PER_IMAGE)
  1415.  
  1416.     # Reset the gt boxes assigned to BG ROIs.
  1417.     rpn_roi_gt_boxes[keep_bg_ids, :] = 0
  1418.     rpn_roi_gt_class_ids[keep_bg_ids] = 0
  1419.  
  1420.     # For each kept ROI, assign a class_id, and for FG ROIs also add bbox refinement.
  1421.     rois = rpn_rois[keep]
  1422.     roi_gt_boxes = rpn_roi_gt_boxes[keep]
  1423.     roi_gt_class_ids = rpn_roi_gt_class_ids[keep]
  1424.     roi_gt_assignment = rpn_roi_iou_argmax[keep]
  1425.  
  1426.     # Class-aware bbox deltas. [y, x, log(h), log(w)]
  1427.     bboxes = np.zeros((config.TRAIN_ROIS_PER_IMAGE,
  1428.                        config.NUM_CLASSES, 4), dtype=np.float32)
  1429.     pos_ids = np.where(roi_gt_class_ids > 0)[0]
  1430.     bboxes[pos_ids, roi_gt_class_ids[pos_ids]] = utils.box_refinement(
  1431.         rois[pos_ids], roi_gt_boxes[pos_ids, :4])
  1432.     # Normalize bbox refinements
  1433.     bboxes /= config.BBOX_STD_DEV
  1434.  
  1435.     # Generate class-specific target masks
  1436.     masks = np.zeros((config.TRAIN_ROIS_PER_IMAGE, config.MASK_SHAPE[0], config.MASK_SHAPE[1], config.NUM_CLASSES),
  1437.                      dtype=np.float32)
  1438.     for i in pos_ids:
  1439.         class_id = roi_gt_class_ids[i]
  1440.         assert class_id > 0, "class id must be greater than 0"
  1441.         gt_id = roi_gt_assignment[i]
  1442.         class_mask = gt_masks[:, :, gt_id]
  1443.  
  1444.         if config.USE_MINI_MASK:
  1445.             # Create a mask placeholder, the size of the image
  1446.             placeholder = np.zeros(config.IMAGE_SHAPE[:2], dtype=bool)
  1447.             # GT box
  1448.             gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[gt_id]
  1449.             gt_w = gt_x2 - gt_x1
  1450.             gt_h = gt_y2 - gt_y1
  1451.             # Resize mini mask to size of GT box
  1452.             placeholder[gt_y1:gt_y2, gt_x1:gt_x2] = \
  1453.                 np.round(utils.resize(class_mask, (gt_h, gt_w))).astype(bool)
  1454.             # Place the resized mini mask in the placeholder
  1455.             class_mask = placeholder
  1456.  
  1457.         # Pick part of the mask and resize it
  1458.         y1, x1, y2, x2 = rois[i].astype(np.int32)
  1459.         m = class_mask[y1:y2, x1:x2]
  1460.         mask = utils.resize(m, config.MASK_SHAPE)
  1461.         masks[i, :, :, class_id] = mask
  1462.  
  1463.     return rois, roi_gt_class_ids, bboxes, masks
  1464.  
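# Debugging sketch (this helper is only used when training the heads without
# the RPN; inputs as returned by load_image_gt() above, `config` is a
# placeholder Config object):
#   rpn_rois = generate_random_rois(image.shape, 1000, class_ids, bbox)
#   rois, roi_class_ids, deltas, masks = build_detection_targets(
#       rpn_rois, class_ids, bbox, mask, config)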
  1465.  
  1466. def build_rpn_targets(image_shape, anchors, gt_class_ids, gt_boxes, config):
  1467.     """Given the anchors and GT boxes, compute overlaps and identify positive
  1468.    anchors and deltas to refine them to match their corresponding GT boxes.
  1469.  
  1470.    anchors: [num_anchors, (y1, x1, y2, x2)]
  1471.    gt_class_ids: [num_gt_boxes] Integer class IDs.
  1472.    gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]
  1473.  
  1474.    Returns:
  1475.    rpn_match: [N] (int32) matches between anchors and GT boxes.
  1476.               1 = positive anchor, -1 = negative anchor, 0 = neutral
  1477.    rpn_bbox: [N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
  1478.    """
  1479.     # RPN Match: 1 = positive anchor, -1 = negative anchor, 0 = neutral
  1480.     rpn_match = np.zeros([anchors.shape[0]], dtype=np.int32)
  1481.     # RPN bounding boxes: [max anchors per image, (dy, dx, log(dh), log(dw))]
  1482.     rpn_bbox = np.zeros((config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4))
  1483.  
  1484.     # Handle COCO crowds
  1485.     # A crowd box in COCO is a bounding box around several instances. Exclude
  1486.     # them from training. A crowd box is given a negative class ID.
  1487.     crowd_ix = np.where(gt_class_ids < 0)[0]
  1488.     if crowd_ix.shape[0] > 0:
  1489.         # Filter out crowds from ground truth class IDs and boxes
  1490.         non_crowd_ix = np.where(gt_class_ids > 0)[0]
  1491.         crowd_boxes = gt_boxes[crowd_ix]
  1492.         gt_class_ids = gt_class_ids[non_crowd_ix]
  1493.         gt_boxes = gt_boxes[non_crowd_ix]
  1494.         # Compute overlaps with crowd boxes [anchors, crowds]
  1495.         crowd_overlaps = utils.compute_overlaps(anchors, crowd_boxes)
  1496.         crowd_iou_max = np.amax(crowd_overlaps, axis=1)
  1497.         no_crowd_bool = (crowd_iou_max < 0.001)
  1498.     else:
  1499.         # All anchors don't intersect a crowd
  1500.         no_crowd_bool = np.ones([anchors.shape[0]], dtype=bool)
  1501.  
  1502.     # Compute overlaps [num_anchors, num_gt_boxes]
  1503.     overlaps = utils.compute_overlaps(anchors, gt_boxes)
  1504.  
  1505.     # Match anchors to GT Boxes
  1506.     # If an anchor overlaps a GT box with IoU >= 0.7 then it's positive.
  1507.     # If an anchor overlaps a GT box with IoU < 0.3 then it's negative.
  1508.     # Neutral anchors are those that don't match the conditions above,
  1509.     # and they don't influence the loss function.
  1510.     # However, don't keep any GT box unmatched (rare, but happens). Instead,
  1511.     # match it to the closest anchor (even if its max IoU is < 0.3).
  1512.     #
  1513.     # 1. Set negative anchors first. They get overwritten below if a GT box is
  1514.     # matched to them. Skip boxes in crowd areas.
  1515.     anchor_iou_argmax = np.argmax(overlaps, axis=1)
  1516.     anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax]
  1517.     rpn_match[(anchor_iou_max < 0.3) & (no_crowd_bool)] = -1
  1518.     # 2. Set an anchor for each GT box (regardless of IoU value).
  1519.     # If multiple anchors have the same IoU match all of them
  1520.     gt_iou_argmax = np.argwhere(overlaps == np.max(overlaps, axis=0))[:,0]
  1521.     rpn_match[gt_iou_argmax] = 1
  1522.     # 3. Set anchors with high overlap as positive.
  1523.     rpn_match[anchor_iou_max >= 0.7] = 1
  1524.  
  1525.     # Subsample to balance positive and negative anchors
  1526.     # Don't let positives be more than half the anchors
  1527.     ids = np.where(rpn_match == 1)[0]
  1528.     extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE // 2)
  1529.     if extra > 0:
  1530.         # Reset the extra ones to neutral
  1531.         ids = np.random.choice(ids, extra, replace=False)
  1532.         rpn_match[ids] = 0
  1533.     # Same for negative proposals
  1534.     ids = np.where(rpn_match == -1)[0]
  1535.     extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE -
  1536.                         np.sum(rpn_match == 1))
  1537.     if extra > 0:
  1538.         # Reset the extra ones to neutral
  1539.         ids = np.random.choice(ids, extra, replace=False)
  1540.         rpn_match[ids] = 0
  1541.  
  1542.     # For positive anchors, compute shift and scale needed to transform them
  1543.     # to match the corresponding GT boxes.
  1544.     ids = np.where(rpn_match == 1)[0]
  1545.     ix = 0  # index into rpn_bbox
  1546.     # TODO: use box_refinement() rather than duplicating the code here
  1547.     for i, a in zip(ids, anchors[ids]):
  1548.         # Closest gt box (it might have IoU < 0.7)
  1549.         gt = gt_boxes[anchor_iou_argmax[i]]
  1550.  
  1551.         # Convert coordinates to center plus width/height.
  1552.         # GT Box
  1553.         gt_h = gt[2] - gt[0]
  1554.         gt_w = gt[3] - gt[1]
  1555.         gt_center_y = gt[0] + 0.5 * gt_h
  1556.         gt_center_x = gt[1] + 0.5 * gt_w
  1557.         # Anchor
  1558.         a_h = a[2] - a[0]
  1559.         a_w = a[3] - a[1]
  1560.         a_center_y = a[0] + 0.5 * a_h
  1561.         a_center_x = a[1] + 0.5 * a_w
  1562.  
  1563.         # Compute the bbox refinement that the RPN should predict.
  1564.         rpn_bbox[ix] = [
  1565.             (gt_center_y - a_center_y) / a_h,
  1566.             (gt_center_x - a_center_x) / a_w,
  1567.             np.log(gt_h / a_h),
  1568.             np.log(gt_w / a_w),
  1569.         ]
  1570.         # Normalize
  1571.         rpn_bbox[ix] /= config.RPN_BBOX_STD_DEV
  1572.         ix += 1
  1573.  
  1574.     return rpn_match, rpn_bbox
  1575.  
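# Worked example of the delta encoding above (illustrative numbers): an anchor
# (y1, x1, y2, x2) = (0, 0, 10, 10) matched to a GT box (2, 2, 12, 12) has the
# same size, so before dividing by RPN_BBOX_STD_DEV:
#   dy = (7 - 5) / 10 = 0.2,  dx = 0.2,  log(dh) = log(10/10) = 0,  log(dw) = 0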
  1576.  
  1577. def generate_random_rois(image_shape, count, gt_class_ids, gt_boxes):
  1578.     """Generates ROI proposals similar to what a region proposal network
  1579.    would generate.
  1580.  
  1581.    image_shape: [Height, Width, Depth]
  1582.    count: Number of ROIs to generate
  1583.    gt_class_ids: [N] Integer ground truth class IDs
  1584.    gt_boxes: [N, (y1, x1, y2, x2)] Ground truth boxes in pixels.
  1585.  
  1586.    Returns: [count, (y1, x1, y2, x2)] ROI boxes in pixels.
  1587.    """
  1588.     # placeholder
  1589.     rois = np.zeros((count, 4), dtype=np.int32)
  1590.  
  1591.     # Generate random ROIs around GT boxes (90% of count)
  1592.     rois_per_box = int(0.9 * count / gt_boxes.shape[0])
  1593.     for i in range(gt_boxes.shape[0]):
  1594.         gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[i]
  1595.         h = gt_y2 - gt_y1
  1596.         w = gt_x2 - gt_x1
  1597.         # random boundaries
  1598.         r_y1 = max(gt_y1 - h, 0)
  1599.         r_y2 = min(gt_y2 + h, image_shape[0])
  1600.         r_x1 = max(gt_x1 - w, 0)
  1601.         r_x2 = min(gt_x2 + w, image_shape[1])
  1602.  
  1603.         # To avoid generating boxes with zero area, we generate double what
  1604.         # we need and filter out the extra. If we get fewer valid boxes
  1605.         # than we need, we loop and try again.
  1606.         while True:
  1607.             y1y2 = np.random.randint(r_y1, r_y2, (rois_per_box * 2, 2))
  1608.             x1x2 = np.random.randint(r_x1, r_x2, (rois_per_box * 2, 2))
  1609.             # Filter out zero area boxes
  1610.             threshold = 1
  1611.             y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >=
  1612.                         threshold][:rois_per_box]
  1613.             x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >=
  1614.                         threshold][:rois_per_box]
  1615.             if y1y2.shape[0] == rois_per_box and x1x2.shape[0] == rois_per_box:
  1616.                 break
  1617.  
  1618.         # Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape
  1619.         # into x1, y1, x2, y2 order
  1620.         x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
  1621.         y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
  1622.         box_rois = np.hstack([y1, x1, y2, x2])
  1623.         rois[rois_per_box * i:rois_per_box * (i + 1)] = box_rois
  1624.  
  1625.     # Generate random ROIs anywhere in the image (10% of count)
  1626.     remaining_count = count - (rois_per_box * gt_boxes.shape[0])
  1627.     # To avoid generating boxes with zero area, we generate double what
  1628.     # we need and filter out the extra. If we get fewer valid boxes
  1629.     # than we need, we loop and try again.
  1630.     while True:
  1631.         y1y2 = np.random.randint(0, image_shape[0], (remaining_count * 2, 2))
  1632.         x1x2 = np.random.randint(0, image_shape[1], (remaining_count * 2, 2))
  1633.         # Filter out zero area boxes
  1634.         threshold = 1
  1635.         y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >=
  1636.                     threshold][:remaining_count]
  1637.         x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >=
  1638.                     threshold][:remaining_count]
  1639.         if y1y2.shape[0] == remaining_count and x1x2.shape[0] == remaining_count:
  1640.             break
  1641.  
  1642.     # Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape
  1643.     # into x1, y1, x2, y2 order
  1644.     x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
  1645.     y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
  1646.     global_rois = np.hstack([y1, x1, y2, x2])
  1647.     rois[-remaining_count:] = global_rois
  1648.     return rois
  1649.  
  1650.  
  1651. def data_generator(dataset, config, shuffle=True, augment=False, augmentation=None,
  1652.                    random_rois=0, batch_size=1, detection_targets=False,
  1653.                    no_augmentation_sources=None):
  1654.     """A generator that returns images and corresponding target class ids,
  1655.    bounding box deltas, and masks.
  1656.  
  1657.    dataset: The Dataset object to pick data from
  1658.    config: The model config object
  1659.    shuffle: If True, shuffles the samples before every epoch
  1660.    augment: (deprecated. Use augmentation instead). If true, apply random
  1661.        image augmentation. Currently, only horizontal flipping is offered.
  1662.    augmentation: Optional. An imgaug (https://github.com/aleju/imgaug) augmentation.
  1663.        For example, passing imgaug.augmenters.Fliplr(0.5) flips images
  1664.        right/left 50% of the time.
  1665.    random_rois: If > 0 then generate proposals to be used to train the
  1666.                 network classifier and mask heads. Useful if training
  1667.                 the Mask RCNN part without the RPN.
  1668.    batch_size: How many images to return in each call
  1669.    detection_targets: If True, generate detection targets (class IDs, bbox
  1670.        deltas, and masks). Typically for debugging or visualizations because
  1671.        in training detection targets are generated by DetectionTargetLayer.
  1672.    no_augmentation_sources: Optional. List of sources to exclude for
  1673.        augmentation. A source is a string that identifies a dataset and is
  1674.        defined in the Dataset class.
  1675.  
  1676.    Returns a Python generator. Upon calling next() on it, the
  1677.    generator returns two lists, inputs and outputs. The contents
  1678.    of the lists differ depending on the received arguments:
  1679.    inputs list:
  1680.    - images: [batch, H, W, C]
  1681.    - image_meta: [batch, (meta data)] Image details. See compose_image_meta()
  1682.    - rpn_match: [batch, N] Integer (1=positive anchor, -1=negative, 0=neutral)
  1683.    - rpn_bbox: [batch, N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
  1684.    - gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs
  1685.    - gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)]
  1686.    - gt_masks: [batch, height, width, MAX_GT_INSTANCES]. The height and width
  1687.                are those of the image unless use_mini_mask is True, in which
  1688.                case they are defined in MINI_MASK_SHAPE.
  1689.  
  1690.    outputs list: Usually empty in regular training. But if detection_targets
  1691.        is True then the outputs list contains target class_ids, bbox deltas,
  1692.        and masks.
  1693.    """
  1694.     b = 0  # batch item index
  1695.     image_index = -1
  1696.     image_ids = np.copy(dataset.image_ids)
  1697.     error_count = 0
  1698.     no_augmentation_sources = no_augmentation_sources or []
  1699.  
  1700.     # Anchors
  1701.     # [anchor_count, (y1, x1, y2, x2)]
  1702.     backbone_shapes = compute_backbone_shapes(config, config.IMAGE_SHAPE)
  1703.     anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES,
  1704.                                              config.RPN_ANCHOR_RATIOS,
  1705.                                              backbone_shapes,
  1706.                                              config.BACKBONE_STRIDES,
  1707.                                              config.RPN_ANCHOR_STRIDE)
  1708.  
  1709.     # Keras requires a generator to run indefinitely.
  1710.     while True:
  1711.         try:
  1712.             # Increment index to pick next image. Shuffle if at the start of an epoch.
  1713.             image_index = (image_index + 1) % len(image_ids)
  1714.             if shuffle and image_index == 0:
  1715.                 np.random.shuffle(image_ids)
  1716.  
  1717.             # Get GT bounding boxes and masks for image.
  1718.             image_id = image_ids[image_index]
  1719.  
  1720.             # If the image source is not to be augmented pass None as augmentation
  1721.             if dataset.image_info[image_id]['source'] in no_augmentation_sources:
  1722.                 image, image_meta, gt_class_ids, gt_boxes, gt_masks = \
  1723.                 load_image_gt(dataset, config, image_id, augment=augment,
  1724.                               augmentation=None,
  1725.                               use_mini_mask=config.USE_MINI_MASK)
  1726.             else:
  1727.                 image, image_meta, gt_class_ids, gt_boxes, gt_masks = \
  1728.                     load_image_gt(dataset, config, image_id, augment=augment,
  1729.                                 augmentation=augmentation,
  1730.                                 use_mini_mask=config.USE_MINI_MASK)
  1731.  
  1732.             # Skip images that have no instances. This can happen in cases
  1733.             # where we train on a subset of classes and the image doesn't
  1734.             # have any of the classes we care about.
  1735.             if not np.any(gt_class_ids > 0):
  1736.                 continue
  1737.  
  1738.             # RPN Targets
  1739.             rpn_match, rpn_bbox = build_rpn_targets(image.shape, anchors,
  1740.                                                     gt_class_ids, gt_boxes, config)
  1741.  
  1742.             # Mask R-CNN Targets
  1743.             if random_rois:
  1744.                 rpn_rois = generate_random_rois(
  1745.                     image.shape, random_rois, gt_class_ids, gt_boxes)
  1746.                 if detection_targets:
  1747.                     rois, mrcnn_class_ids, mrcnn_bbox, mrcnn_mask =\
  1748.                         build_detection_targets(
  1749.                             rpn_rois, gt_class_ids, gt_boxes, gt_masks, config)
  1750.  
  1751.             # Init batch arrays
  1752.             if b == 0:
  1753.                 batch_image_meta = np.zeros(
  1754.                     (batch_size,) + image_meta.shape, dtype=image_meta.dtype)
  1755.                 batch_rpn_match = np.zeros(
  1756.                     [batch_size, anchors.shape[0], 1], dtype=rpn_match.dtype)
  1757.                 batch_rpn_bbox = np.zeros(
  1758.                     [batch_size, config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4], dtype=rpn_bbox.dtype)
  1759.                 batch_images = np.zeros(
  1760.                     (batch_size,) + image.shape, dtype=np.float32)
  1761.                 batch_gt_class_ids = np.zeros(
  1762.                     (batch_size, config.MAX_GT_INSTANCES), dtype=np.int32)
  1763.                 batch_gt_boxes = np.zeros(
  1764.                     (batch_size, config.MAX_GT_INSTANCES, 4), dtype=np.int32)
  1765.                 batch_gt_masks = np.zeros(
  1766.                     (batch_size, gt_masks.shape[0], gt_masks.shape[1],
  1767.                      config.MAX_GT_INSTANCES), dtype=gt_masks.dtype)
  1768.                 if random_rois:
  1769.                     batch_rpn_rois = np.zeros(
  1770.                         (batch_size, rpn_rois.shape[0], 4), dtype=rpn_rois.dtype)
  1771.                     if detection_targets:
  1772.                         batch_rois = np.zeros(
  1773.                             (batch_size,) + rois.shape, dtype=rois.dtype)
  1774.                         batch_mrcnn_class_ids = np.zeros(
  1775.                             (batch_size,) + mrcnn_class_ids.shape, dtype=mrcnn_class_ids.dtype)
  1776.                         batch_mrcnn_bbox = np.zeros(
  1777.                             (batch_size,) + mrcnn_bbox.shape, dtype=mrcnn_bbox.dtype)
  1778.                         batch_mrcnn_mask = np.zeros(
  1779.                             (batch_size,) + mrcnn_mask.shape, dtype=mrcnn_mask.dtype)
  1780.  
  1781.             # If there are more instances than fit in the array, sub-sample from them.
  1782.             if gt_boxes.shape[0] > config.MAX_GT_INSTANCES:
  1783.                 ids = np.random.choice(
  1784.                     np.arange(gt_boxes.shape[0]), config.MAX_GT_INSTANCES, replace=False)
  1785.                 gt_class_ids = gt_class_ids[ids]
  1786.                 gt_boxes = gt_boxes[ids]
  1787.                 gt_masks = gt_masks[:, :, ids]
  1788.  
  1789.             # Add to batch
  1790.             batch_image_meta[b] = image_meta
  1791.             batch_rpn_match[b] = rpn_match[:, np.newaxis]
  1792.             batch_rpn_bbox[b] = rpn_bbox
  1793.             batch_images[b] = mold_image(image.astype(np.float32), config)
  1794.             batch_gt_class_ids[b, :gt_class_ids.shape[0]] = gt_class_ids
  1795.             batch_gt_boxes[b, :gt_boxes.shape[0]] = gt_boxes
  1796.             batch_gt_masks[b, :, :, :gt_masks.shape[-1]] = gt_masks
  1797.             if random_rois:
  1798.                 batch_rpn_rois[b] = rpn_rois
  1799.                 if detection_targets:
  1800.                     batch_rois[b] = rois
  1801.                     batch_mrcnn_class_ids[b] = mrcnn_class_ids
  1802.                     batch_mrcnn_bbox[b] = mrcnn_bbox
  1803.                     batch_mrcnn_mask[b] = mrcnn_mask
  1804.             b += 1
  1805.  
  1806.             # Batch full?
  1807.             if b >= batch_size:
  1808.                 inputs = [batch_images, batch_image_meta, batch_rpn_match, batch_rpn_bbox,
  1809.                           batch_gt_class_ids, batch_gt_boxes, batch_gt_masks]
  1810.                 outputs = []
  1811.  
  1812.                 if random_rois:
  1813.                     inputs.extend([batch_rpn_rois])
  1814.                     if detection_targets:
  1815.                         inputs.extend([batch_rois])
  1816.                         # Keras requires that output and targets have the same number of dimensions
  1817.                         batch_mrcnn_class_ids = np.expand_dims(
  1818.                             batch_mrcnn_class_ids, -1)
  1819.                         outputs.extend(
  1820.                             [batch_mrcnn_class_ids, batch_mrcnn_bbox, batch_mrcnn_mask])
  1821.  
  1822.                 yield inputs, outputs
  1823.  
  1824.                 # start a new batch
  1825.                 b = 0
  1826.         except (GeneratorExit, KeyboardInterrupt):
  1827.             raise
  1828.         except:
  1829.             # Log it and skip the image
  1830.             logging.exception("Error processing image {}".format(
  1831.                 dataset.image_info[image_id]))
  1832.             error_count += 1
  1833.             if error_count > 5:
  1834.                 raise
  1835.  
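# Usage sketch (hypothetical `dataset` and `config` objects): the generator
# runs indefinitely, so it can be stepped manually or passed to Keras:
#   train_gen = data_generator(dataset, config, shuffle=True,
#                              batch_size=config.BATCH_SIZE)
#   inputs, outputs = next(train_gen)
#   # inputs = [images, image_meta, rpn_match, rpn_bbox,
#   #           gt_class_ids, gt_boxes, gt_masks]; outputs == [] in regular training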
  1836.  
  1837. ############################################################
  1838. #  MaskRCNN Class
  1839. ############################################################
  1840.  
  1841. class MaskRCNN():
  1842.     """Encapsulates the Mask RCNN model functionality.
  1843.  
  1844.    The actual Keras model is in the keras_model property.
  1845.    """
  1846.  
  1847.     def __init__(self, mode, config, model_dir):
  1848.         """
  1849.        mode: Either "training" or "inference"
  1850.        config: A Sub-class of the Config class
  1851.        model_dir: Directory to save training logs and trained weights
  1852.        """
  1853.         assert mode in ['training', 'inference']
  1854.         self.mode = mode
  1855.         self.config = config
  1856.         self.model_dir = model_dir
  1857.         self.set_log_dir()
  1858.         self.keras_model = self.build(mode=mode, config=config)
  1859.  
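    # Construction sketch (the constructor above calls build() below; MyConfig
    # is a placeholder Config subclass and "./logs" a placeholder directory):
    #   model = MaskRCNN(mode="training", config=MyConfig(), model_dir="./logs")
    #   model.keras_model.summary()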
  1860.     def build(self, mode, config):
  1861.         """Build Mask R-CNN architecture.
  1862.            input_shape: The shape of the input image.
  1863.            mode: Either "training" or "inference". The inputs and
  1864.                outputs of the model differ accordingly.
  1865.        """
  1866.         assert mode in ['training', 'inference']
  1867.  
  1868.         # Image size must be divisible by 2 multiple times
  1869.         h, w = config.IMAGE_SHAPE[:2]
  1870.         if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
  1871.             raise Exception("Image size must be divisible by 2 at least 6 times "
  1872.                             "to avoid fractions when downscaling and upscaling. "
  1873.                             "For example, use 256, 320, 384, 448, 512, ... etc.")
  1874.  
  1875.         # Inputs
  1876.         input_image = KL.Input(
  1877.             shape=[None, None, config.IMAGE_SHAPE[2]], name="input_image")
  1878.         input_image_meta = KL.Input(shape=[config.IMAGE_META_SIZE],
  1879.                                     name="input_image_meta")
  1880.         if mode == "training":
  1881.             # RPN GT
  1882.             input_rpn_match = KL.Input(
  1883.                 shape=[None, 1], name="input_rpn_match", dtype=tf.int32)
  1884.             input_rpn_bbox = KL.Input(
  1885.                 shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32)
  1886.  
  1887.             # Detection GT (class IDs, bounding boxes, and masks)
  1888.             # 1. GT Class IDs (zero padded)
  1889.             input_gt_class_ids = KL.Input(
  1890.                 shape=[None], name="input_gt_class_ids", dtype=tf.int32)
  1891.             # 2. GT Boxes in pixels (zero padded)
  1892.             # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
  1893.             input_gt_boxes = KL.Input(
  1894.                 shape=[None, 4], name="input_gt_boxes", dtype=tf.float32)
  1895.             # Normalize coordinates
  1896.             gt_boxes = KL.Lambda(lambda x: norm_boxes_graph(
  1897.                 x, K.shape(input_image)[1:3]))(input_gt_boxes)
  1898.             # 3. GT Masks (zero padded)
  1899.             # [batch, height, width, MAX_GT_INSTANCES]
  1900.             if config.USE_MINI_MASK:
  1901.                 input_gt_masks = KL.Input(
  1902.                     shape=[config.MINI_MASK_SHAPE[0],
  1903.                            config.MINI_MASK_SHAPE[1], None],
  1904.                     name="input_gt_masks", dtype=bool)
  1905.             else:
  1906.                 input_gt_masks = KL.Input(
  1907.                     shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None],
  1908.                     name="input_gt_masks", dtype=bool)
  1909.         elif mode == "inference":
  1910.             # Anchors in normalized coordinates
  1911.             input_anchors = KL.Input(shape=[None, 4], name="input_anchors")
  1912.  
  1913.         # Build the shared convolutional layers.
  1914.         # Bottom-up Layers
  1915.         # Returns a list of the last layers of each stage, 5 in total.
  1916.         # Stage 5 (C5) is included here (stage5=True) because the FPN top-down path starts from it.
  1917.         if callable(config.BACKBONE):
  1918.             _, C2, C3, C4, C5 = config.BACKBONE(input_image, stage5=True,
  1919.                                                 train_bn=config.TRAIN_BN)
  1920.         else:
  1921.             _, C2, C3, C4, C5 = resnet_graph(input_image, config.BACKBONE,
  1922.                                              stage5=True, train_bn=config.TRAIN_BN)
  1923.         # Top-down Layers
  1924.         # TODO: add assert to verify feature map sizes match what's in config
  1925.         P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c5p5')(C5)
  1926.         P4 = KL.Add(name="fpn_p4add")([
  1927.             KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),
  1928.             KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c4p4')(C4)])
  1929.         P3 = KL.Add(name="fpn_p3add")([
  1930.             KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
  1931.             KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c3p3')(C3)])
  1932.         P2 = KL.Add(name="fpn_p2add")([
  1933.             KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
  1934.             KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c2p2')(C2)])
  1935.         # Attach 3x3 conv to all P layers to get the final feature maps.
  1936.         P2 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p2")(P2)
  1937.         P3 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p3")(P3)
  1938.         P4 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p4")(P4)
  1939.         P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p5")(P5)
  1940.         # P6 is used for the 5th anchor scale in RPN. Generated by
  1941.         # subsampling from P5 with stride of 2.
  1942.         P6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5)
  1943.  
  1944.         # Note that P6 is used in RPN, but not in the classifier heads.
  1945.         rpn_feature_maps = [P2, P3, P4, P5, P6]
  1946.         mrcnn_feature_maps = [P2, P3, P4, P5]
  1947.  
  1948.         # Anchors
  1949.         if mode == "training":
  1950.             anchors = self.get_anchors(config.IMAGE_SHAPE)
  1951.             # Duplicate across the batch dimension because Keras requires it
  1952.             # TODO: can this be optimized to avoid duplicating the anchors?
  1953.             anchors = np.broadcast_to(anchors, (config.BATCH_SIZE,) + anchors.shape)
  1954.             # A hack to get around Keras's bad support for constants
  1955.             #anchors = KL.Lambda(lambda x: tf.Variable(anchors), name="anchors")(input_image)
  1956.             anchors = tf.constant(anchors, name="anchors")
  1957.         else:
  1958.             anchors = input_anchors
  1959.  
  1960.         # RPN Model
  1961.         rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE,
  1962.                               len(config.RPN_ANCHOR_RATIOS), config.TOP_DOWN_PYRAMID_SIZE)
  1963.         # Loop through pyramid layers
  1964.         layer_outputs = []  # list of lists
  1965.         for p in rpn_feature_maps:
  1966.             layer_outputs.append(rpn([p]))
  1967.         # Concatenate layer outputs
  1968.         # Convert from list of lists of level outputs to list of lists
  1969.         # of outputs across levels.
  1970.         # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
  1971.         output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
  1972.         outputs = list(zip(*layer_outputs))
  1973.         outputs = [KL.Concatenate(axis=1, name=n)(list(o))
  1974.                    for o, n in zip(outputs, output_names)]
  1975.  
  1976.         rpn_class_logits, rpn_class, rpn_bbox = outputs
  1977.  
  1978.         # Generate proposals
  1979.         # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
  1980.         # and zero padded.
  1981.         proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training"\
  1982.             else config.POST_NMS_ROIS_INFERENCE
  1983.         rpn_rois = ProposalLayer(
  1984.             proposal_count=proposal_count,
  1985.             nms_threshold=config.RPN_NMS_THRESHOLD,
  1986.             name="ROI",
  1987.             config=config)([rpn_class, rpn_bbox, anchors])
  1988.  
  1989.         if mode == "training":
  1990.             # Class ID mask to mark class IDs supported by the dataset the image
  1991.             # came from.
  1992.             active_class_ids = KL.Lambda(
  1993.                 lambda x: parse_image_meta_graph(x)["active_class_ids"]
  1994.                 )(input_image_meta)
  1995.  
  1996.             if not config.USE_RPN_ROIS:
  1997.                 # Ignore predicted ROIs and use ROIs provided as an input.
  1998.                 input_rois = KL.Input(shape=[config.POST_NMS_ROIS_TRAINING, 4],
  1999.                                       name="input_roi", dtype=np.int32)
  2000.                 # Normalize coordinates
  2001.                 target_rois = KL.Lambda(lambda x: norm_boxes_graph(
  2002.                     x, K.shape(input_image)[1:3]))(input_rois)
  2003.             else:
  2004.                 target_rois = rpn_rois
  2005.  
  2006.             # Generate detection targets
  2007.             # Subsamples proposals and generates target outputs for training
  2008.             # Note that proposal class IDs, gt_boxes, and gt_masks are zero
  2009.             # padded. Equally, returned rois and targets are zero padded.
  2010.             rois, target_class_ids, target_bbox, target_mask =\
  2011.                 DetectionTargetLayer(config, name="proposal_targets")([
  2012.                     target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])
  2013.  
  2014.             # Network Heads
  2015.             # TODO: verify that this handles zero padded ROIs
  2016.             mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\
  2017.                 fpn_classifier_graph(rois, mrcnn_feature_maps, input_image_meta,
  2018.                                      config.POOL_SIZE, config.NUM_CLASSES,
  2019.                                      train_bn=config.TRAIN_BN,
  2020.                                      fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)
  2021.  
  2022.             mrcnn_mask = build_fpn_mask_graph(rois, mrcnn_feature_maps,
  2023.                                               input_image_meta,
  2024.                                               config.MASK_POOL_SIZE,
  2025.                                               config.NUM_CLASSES,
  2026.                                               train_bn=config.TRAIN_BN)
  2027.  
  2028.             # TODO: clean up (use tf.identity if necessary)
  2029.             output_rois = KL.Lambda(lambda x: x * 1, name="output_rois")(rois)
  2030.  
  2031.             # Losses
  2032.             rpn_class_loss = KL.Lambda(lambda x: rpn_class_loss_graph(*x), name="rpn_class_loss")(
  2033.                 [input_rpn_match, rpn_class_logits])
  2034.             rpn_bbox_loss = KL.Lambda(lambda x: rpn_bbox_loss_graph(config, *x), name="rpn_bbox_loss")(
  2035.                 [input_rpn_bbox, input_rpn_match, rpn_bbox])
  2036.             class_loss = KL.Lambda(lambda x: mrcnn_class_loss_graph(*x), name="mrcnn_class_loss")(
  2037.                 [target_class_ids, mrcnn_class_logits, active_class_ids])
  2038.             bbox_loss = KL.Lambda(lambda x: mrcnn_bbox_loss_graph(*x), name="mrcnn_bbox_loss")(
  2039.                 [target_bbox, target_class_ids, mrcnn_bbox])
  2040.             mask_loss = KL.Lambda(lambda x: mrcnn_mask_loss_graph(*x), name="mrcnn_mask_loss")(
  2041.                 [target_mask, target_class_ids, mrcnn_mask])
  2042.  
  2043.             # Model
  2044.             inputs = [input_image, input_image_meta,
  2045.                       input_rpn_match, input_rpn_bbox, input_gt_class_ids, input_gt_boxes, input_gt_masks]
  2046.             if not config.USE_RPN_ROIS:
  2047.                 inputs.append(input_rois)
  2048.             outputs = [rpn_class_logits, rpn_class, rpn_bbox,
  2049.                        mrcnn_class_logits, mrcnn_class, mrcnn_bbox, mrcnn_mask,
  2050.                        rpn_rois, output_rois,
  2051.                        rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss]
  2052.             model = KM.Model(inputs, outputs, name='mask_rcnn')
  2053.         else:
  2054.             # Network Heads
  2055.             # Proposal classifier and BBox regressor heads
  2056.             mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\
  2057.                 fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, input_image_meta,
  2058.                                      config.POOL_SIZE, config.NUM_CLASSES,
  2059.                                      train_bn=config.TRAIN_BN,
  2060.                                      fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)
  2061.  
  2062.             # Detections
  2063.             # output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in
  2064.             # normalized coordinates
  2065.             detections = DetectionLayer(config, name="mrcnn_detection")(
  2066.                 [rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta])
  2067.  
  2068.             # Create masks for detections
  2069.             detection_boxes = KL.Lambda(lambda x: x[..., :4])(detections)
  2070.             mrcnn_mask = build_fpn_mask_graph(detection_boxes, mrcnn_feature_maps,
  2071.                                               input_image_meta,
  2072.                                               config.MASK_POOL_SIZE,
  2073.                                               config.NUM_CLASSES,
  2074.                                               train_bn=config.TRAIN_BN)
  2075.  
  2076.             model = KM.Model([input_image, input_image_meta, input_anchors],
  2077.                              [detections, mrcnn_class, mrcnn_bbox,
  2078.                                  mrcnn_mask, rpn_rois, rpn_class, rpn_bbox],
  2079.                              name='mask_rcnn')
  2080.  
  2081.         # Add multi-GPU support.
  2082.         if config.GPU_COUNT > 1:
  2083.             from mrcnn.parallel_model import ParallelModel
  2084.             model = ParallelModel(model, config.GPU_COUNT)
  2085.  
  2086.         return model
  2087.  
  2088.     def find_last(self):
  2089.         """Finds the last checkpoint file of the last trained model in the
  2090.        model directory.
  2091.        Returns:
  2092.            The path of the last checkpoint file
  2093.        """
  2094.         # Get directory names. Each directory corresponds to a model
  2095.         dir_names = next(os.walk(self.model_dir))[1]
  2096.         key = self.config.NAME.lower()
  2097.         dir_names = filter(lambda f: f.startswith(key), dir_names)
  2098.         dir_names = sorted(dir_names)
  2099.         if not dir_names:
  2100.             import errno
  2101.             raise FileNotFoundError(
  2102.                 errno.ENOENT,
  2103.                 "Could not find model directory under {}".format(self.model_dir))
  2104.         # Pick last directory
  2105.         dir_name = os.path.join(self.model_dir, dir_names[-1])
  2106.         # Find the last checkpoint
  2107.         checkpoints = next(os.walk(dir_name))[2]
  2108.         checkpoints = filter(lambda f: f.startswith("mask_rcnn"), checkpoints)
  2109.         checkpoints = sorted(checkpoints)
  2110.         if not checkpoints:
  2111.             import errno
  2112.             raise FileNotFoundError(
  2113.                 errno.ENOENT, "Could not find weight files in {}".format(dir_name))
  2114.         checkpoint = os.path.join(dir_name, checkpoints[-1])
  2115.         return checkpoint
  2116.  
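    # Resume-training sketch: the path returned by find_last() can be fed
    # straight back into load_weights() below:
    #   model.load_weights(model.find_last(), by_name=True)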
  2117.     def load_weights(self, filepath, by_name=False, exclude=None):
  2118.         """Modified version of the corresponding Keras function with
  2119.        the addition of multi-GPU support and the ability to exclude
  2120.        some layers from loading.
  2121.        exclude: list of layer names to exclude
  2122.        """
  2123.         import h5py
  2124.         # The HDF5 weight-loading helpers have moved between TF/Keras
  2125.         # releases, so try the known module locations in order.
  2126.         try:
  2127.             from tensorflow.python.keras.saving import hdf5_format as saving
  2128.         except ImportError:
  2129.             # Older TF releases kept these helpers under keras.engine.
  2130.             from tensorflow.python.keras.engine import saving
  2131.  
  2132.         if exclude:
  2133.             by_name = True
  2134.  
  2135.         if h5py is None:
  2136.             raise ImportError('`load_weights` requires h5py.')
  2137.         f = h5py.File(filepath, mode='r')
  2138.         if 'layer_names' not in f.attrs and 'model_weights' in f:
  2139.             f = f['model_weights']
  2140.  
  2141.         # In multi-GPU training, we wrap the model. Get layers
  2142.         # of the inner model because they have the weights.
  2143.         keras_model = self.keras_model
  2144.         layers = keras_model.inner_model.layers if hasattr(keras_model, "inner_model")\
  2145.             else keras_model.layers
  2146.  
  2147.         # Exclude some layers
  2148.         if exclude:
  2149.             layers = filter(lambda l: l.name not in exclude, layers)
  2150.  
  2151.         if by_name:
  2152.             saving.load_weights_from_hdf5_group_by_name(f, layers)
  2153.         else:
  2154.             saving.load_weights_from_hdf5_group(f, layers)
  2155.         if hasattr(f, 'close'):
  2156.             f.close()
  2157.  
  2158.         # Update the log directory
  2159.         self.set_log_dir(filepath)
  2160.  
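    # Fine-tuning sketch: when NUM_CLASSES differs from the checkpoint, skip
    # the class-specific head layers (layer names as defined elsewhere in this
    # file). `coco_weights_path` is a placeholder path:
    #   model.load_weights(coco_weights_path, by_name=True, exclude=[
    #       "mrcnn_class_logits", "mrcnn_bbox_fc", "mrcnn_bbox", "mrcnn_mask"])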
  2161.  
  2162.     def get_imagenet_weights(self):
  2163.         """Downloads ImageNet trained weights from Keras.
  2164.        Returns path to weights file.
  2165.        """
  2166.         from tensorflow.keras.utils import get_file
  2167.         TF_WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/'\
  2168.                                  'releases/download/v0.2/'\
  2169.                                  'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'
  2170.         weights_path = get_file('resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5',
  2171.                                 TF_WEIGHTS_PATH_NO_TOP,
  2172.                                 cache_subdir='models',
  2173.                                 md5_hash='a268eb855778b3df3c7506639542a6af')
  2174.         return weights_path
  2175.  
  2176.     def compile(self, learning_rate, momentum):
  2177.         """Gets the model ready for training. Adds losses, regularization, and
  2178.        metrics. Then calls the Keras compile() function.
  2179.        """
  2180.         # Optimizer object
  2181.         optimizer = tensorflow.keras.optimizers.SGD(
  2182.             lr=learning_rate, momentum=momentum,
  2183.             clipnorm=self.config.GRADIENT_CLIP_NORM)
  2184.         # Add Losses
  2185.         # First, clear previously set losses to avoid duplication
  2186.         self.keras_model._losses = []
  2187.         self.keras_model._per_input_losses = {}
  2188.         loss_names = [
  2189.             "rpn_class_loss",  "rpn_bbox_loss",
  2190.             "mrcnn_class_loss", "mrcnn_bbox_loss", "mrcnn_mask_loss"]
  2191.         for name in loss_names:
  2192.             layer = self.keras_model.get_layer(name)
  2193.             if layer.output in self.keras_model.losses:
  2194.                 continue
  2195.             loss = (
  2196.                 tf.reduce_mean(layer.output, keepdims=True)
  2197.                 * self.config.LOSS_WEIGHTS.get(name, 1.))
  2198.             self.keras_model.add_loss(loss)
  2199.  
  2200.         # Add L2 Regularization
  2201.         # Skip gamma and beta weights of batch normalization layers.
  2202.         reg_losses = [
  2203.             tensorflow.keras.regularizers.l2(self.config.WEIGHT_DECAY)(w) / tf.cast(tf.size(w), tf.float32)
  2204.             for w in self.keras_model.trainable_weights
  2205.             if 'gamma' not in w.name and 'beta' not in w.name]
  2206.         self.keras_model.add_loss(tf.add_n(reg_losses))
  2207.  
  2208.         # Compile
  2209.         self.keras_model.compile(
  2210.             optimizer=optimizer,
  2211.             loss=[None] * len(self.keras_model.outputs))
  2212.  
  2213.         # Add metrics for losses
  2214.         for name in loss_names:
  2215.             if name in self.keras_model.metrics_names:
  2216.                 continue
  2217.             layer = self.keras_model.get_layer(name)
  2218.             self.keras_model.metrics_names.append(name)
  2219.             loss = (
  2220.                 tf.reduce_mean(layer.output, keepdims=True)
  2221.                 * self.config.LOSS_WEIGHTS.get(name, 1.))
  2222.             self.keras_model.metrics_tensors.append(loss)
  2223.  
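    # Note (illustrative, not from the original code): the per-loss scaling in
    # compile() comes from config.LOSS_WEIGHTS, so the relative contribution of
    # each head can be tuned from the Config subclass alone. A hypothetical
    # override, assuming the project's Config class from mrcnn.config:
    #
    #   class MyConfig(Config):
    #       LOSS_WEIGHTS = {
    #           "rpn_class_loss": 1., "rpn_bbox_loss": 1.,
    #           "mrcnn_class_loss": 1., "mrcnn_bbox_loss": 1.,
    #           "mrcnn_mask_loss": 2.,   # emphasize mask quality
    #       }
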
  2224.     def set_trainable(self, layer_regex, keras_model=None, indent=0, verbose=1):
  2225.         """Sets model layers as trainable if their names match
  2226.        the given regular expression.
  2227.        """
  2228.         # Print message on the first call (but not on recursive calls)
  2229.         if verbose > 0 and keras_model is None:
  2230.             log("Selecting layers to train")
  2231.  
  2232.         keras_model = keras_model or self.keras_model
  2233.  
  2234.         # In multi-GPU training, we wrap the model. Get layers
  2235.         # of the inner model because they have the weights.
  2236.         layers = keras_model.inner_model.layers if hasattr(keras_model, "inner_model")\
  2237.             else keras_model.layers
  2238.  
  2239.         for layer in layers:
  2240.             # Is the layer a model?
  2241.             if layer.__class__.__name__ == 'Model':
  2242.                 print("In model: ", layer.name)
  2243.                 self.set_trainable(
  2244.                     layer_regex, keras_model=layer, indent=indent + 4)
  2245.                 continue
  2246.  
  2247.             if not layer.weights:
  2248.                 continue
  2249.             # Is it trainable?
  2250.             trainable = bool(re.fullmatch(layer_regex, layer.name))
  2251.             # Update layer. If layer is a container, update inner layer.
  2252.             if layer.__class__.__name__ == 'TimeDistributed':
  2253.                 layer.layer.trainable = trainable
  2254.             else:
  2255.                 layer.trainable = trainable
  2256.             # Print trainable layer names
  2257.             if trainable and verbose > 0:
  2258.                 log("{}{:20}   ({})".format(" " * indent, layer.name,
  2259.                                             layer.__class__.__name__))
  2260.  
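    # Illustrative usage (sketch): freeze everything except the network heads,
    # which matches the "heads" preset defined in train() below.
    #
    #   model.set_trainable(r"(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)")
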
  2261.     def set_log_dir(self, model_path=None):
  2262.         """Sets the model log directory and epoch counter.
  2263.  
  2264.        model_path: If None, or in a format different from what this code uses,
  2265.            set a new log directory and start epochs from 0. Otherwise,
  2266.            extract the log directory and the epoch counter from the file
  2267.            name.
  2268.        """
  2269.         # Set date and epoch counter as if starting a new model
  2270.         self.epoch = 0
  2271.         now = datetime.datetime.now()
  2272.  
  2273.         # If we have a model path with date and epochs use them
  2274.         if model_path:
  2275.             # Continue from where we left off. Get epoch and date from the file name
  2276.             # A sample model path might look like:
  2277.             # \path\to\logs\coco20171029T2315\mask_rcnn_coco_0001.h5 (Windows)
  2278.             # /path/to/logs/coco20171029T2315/mask_rcnn_coco_0001.h5 (Linux)
  2279.             regex = r".*[/\\][\w-]+(\d{4})(\d{2})(\d{2})T(\d{2})(\d{2})[/\\]mask\_rcnn\_[\w-]+(\d{4})\.h5"
  2280.             m = re.match(regex, model_path)
  2281.             if m:
  2282.                 now = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
  2283.                                         int(m.group(4)), int(m.group(5)))
  2284.                 # Epoch number in file is 1-based, and in Keras code it's 0-based.
  2285.                 # So, adjust for that then increment by one to start from the next epoch
  2286.                 self.epoch = int(m.group(6)) - 1 + 1
  2287.                 print('Re-starting from epoch %d' % self.epoch)
  2288.  
  2289.         # Directory for training logs
  2290.         self.log_dir = os.path.join(self.model_dir, "{}{:%Y%m%dT%H%M}".format(
  2291.             self.config.NAME.lower(), now))
  2292.  
  2293.         # Path to save after each epoch. Include placeholders that get filled by Keras.
  2294.         self.checkpoint_path = os.path.join(self.log_dir, "mask_rcnn_{}_*epoch*.h5".format(
  2295.             self.config.NAME.lower()))
  2296.         self.checkpoint_path = self.checkpoint_path.replace(
  2297.             "*epoch*", "{epoch:04d}")
  2298.        
  2299.     def train(self, train_dataset, val_dataset, learning_rate, epochs, layers,
  2300.               augmentation=None, custom_callbacks=None, no_augmentation_sources=None, best_only=False):
  2301.         """Train the model.
  2302.        train_dataset, val_dataset: Training and validation Dataset objects.
  2303.        learning_rate: The learning rate to train with
  2304.        epochs: Number of training epochs. Note that previous training epochs
  2305.                are considered to be done already, so this actually determines
  2306.                the total number of epochs to train rather than the number for
  2307.                this particular call.
  2308.        layers: Allows selecting which layers to train. It can be:
  2309.            - A regular expression to match layer names to train
  2310.            - One of these predefined values:
  2311.              heads: The RPN, classifier and mask heads of the network
  2312.              all: All the layers
  2313.              3+: Train Resnet stage 3 and up
  2314.              4+: Train Resnet stage 4 and up
  2315.              5+: Train Resnet stage 5 and up
  2316.        augmentation: Optional. An imgaug (https://github.com/aleju/imgaug)
  2317.            augmentation. For example, passing imgaug.augmenters.Fliplr(0.5)
  2318.            flips images right/left 50% of the time. You can pass complex
  2319.            augmentations as well. This augmentation applies 50% of the
  2320.            time, and when it does it flips images right/left half the time
  2321.            and adds a Gaussian blur with a random sigma in range 0 to 5.
  2322.  
  2323.                augmentation = imgaug.augmenters.Sometimes(0.5, [
  2324.                    imgaug.augmenters.Fliplr(0.5),
  2325.                    imgaug.augmenters.GaussianBlur(sigma=(0.0, 5.0))
  2326.                ])
  2327.        custom_callbacks: Optional. Add custom callbacks to be called
  2328.            with the keras fit_generator method. Must be a list of type keras.callbacks.
  2329.        no_augmentation_sources: Optional. List of sources to exclude for
  2330.            augmentation. A source is a string that identifies a dataset and is
  2331.            defined in the Dataset class.
  2332.        best_only: If True, save only the best model as defined by the minimum
  2333.            validation loss. Default is False.
  2334.        """
  2335.         assert self.mode == "training", "Create model in training mode."
  2336.  
  2337.         # Pre-defined layer regular expressions
  2338.         layer_regex = {
  2339.             # all layers but the backbone
  2340.             "heads": r"(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)",
  2341.             # From a specific Resnet stage and up
  2342.             "3+": r"(res3.*)|(bn3.*)|(res4.*)|(bn4.*)|(res5.*)|(bn5.*)|(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)",
  2343.             "4+": r"(res4.*)|(bn4.*)|(res5.*)|(bn5.*)|(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)",
  2344.             "5+": r"(res5.*)|(bn5.*)|(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)",
  2345.             # All layers
  2346.             "all": ".*",
  2347.         }
  2348.         if layers in layer_regex.keys():
  2349.             layers = layer_regex[layers]
  2350.  
  2351.         # Data generators
  2352.         train_generator = data_generator(train_dataset, self.config, shuffle=True,
  2353.                                          augmentation=augmentation,
  2354.                                          batch_size=self.config.BATCH_SIZE,
  2355.                                          no_augmentation_sources=no_augmentation_sources)
  2356.         val_generator = data_generator(val_dataset, self.config, shuffle=True,
  2357.                                        batch_size=self.config.BATCH_SIZE)
  2358.  
  2359.         # Create log_dir if it does not exist
  2360.         if not os.path.exists(self.log_dir):
  2361.             os.makedirs(self.log_dir)
  2362.                    
  2363.         if best_only:
  2364.             log("Saving best model only...")
  2365.             # Callbacks
  2366.             callbacks = [
  2367.                 tensorflow.keras.callbacks.TensorBoard(log_dir=self.log_dir,
  2368.                                             histogram_freq=0, write_graph=True, write_images=False),
  2369.                 tensorflow.keras.callbacks.ModelCheckpoint(self.checkpoint_path,
  2370.                                                 verbose=0, save_weights_only=True, monitor="val_loss", save_best_only=True, mode="min"),
  2371.             ]
  2372.         else:
  2373.             log("Saving all model epochs...")
  2374.             # Callbacks
  2375.             callbacks = [
  2376.                 tensorflow.keras.callbacks.TensorBoard(log_dir=self.log_dir,
  2377.                                             histogram_freq=0, write_graph=True, write_images=False),
  2378.                 tensorflow.keras.callbacks.ModelCheckpoint(self.checkpoint_path,
  2379.                                                 verbose=0, save_weights_only=True),
  2380.             ]
  2381.  
  2382.         # Add custom callbacks to the list
  2383.         if custom_callbacks:
  2384.             callbacks += custom_callbacks
  2385.  
  2386.         # Train
  2387.         log("\nStarting at epoch {}. LR={}\n".format(self.epoch, learning_rate))
  2388.         log("Checkpoint Path: {}".format(self.checkpoint_path))
  2389.         self.set_trainable(layers)
  2390.         self.compile(learning_rate, self.config.LEARNING_MOMENTUM)
  2391.  
  2392.         # Work-around for Windows: Keras fails on Windows when using
  2393.         # multiprocessing workers. See discussion here:
  2394.         # https://github.com/matterport/Mask_RCNN/issues/13#issuecomment-353124009
  2395.         if os.name == 'nt':
  2396.             workers = 0
  2397.         else:
  2398.             workers = multiprocessing.cpu_count()
  2399.  
  2400.         self.keras_model.fit_generator(
  2401.             train_generator,
  2402.             initial_epoch=self.epoch,
  2403.             epochs=epochs,
  2404.             steps_per_epoch=self.config.STEPS_PER_EPOCH,
  2405.             callbacks=callbacks,
  2406.             validation_data=val_generator,
  2407.             validation_steps=self.config.VALIDATION_STEPS,
  2408.             max_queue_size=100,
  2409.             workers=workers,
  2410.             use_multiprocessing=True,
  2411.         )
  2412.        
  2413.    
  2414.         self.epoch = max(self.epoch, epochs)
  2415.  
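    # Illustrative usage (sketch): a typical two-stage schedule, training the
    # heads first and then fine-tuning all layers at a lower learning rate.
    # `dataset_train` and `dataset_val` are assumed to be prepared Dataset objects.
    #
    #   model.train(dataset_train, dataset_val,
    #               learning_rate=config.LEARNING_RATE, epochs=20, layers="heads")
    #   model.train(dataset_train, dataset_val,
    #               learning_rate=config.LEARNING_RATE / 10, epochs=40, layers="all")
    #
    # Because `epochs` is the total epoch count, the second call trains epochs 21-40.
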
  2416.     def mold_inputs(self, images):
  2417.         """Takes a list of images and modifies them to the format expected
  2418.        as an input to the neural network.
  2419.        images: List of image matrices [height,width,depth]. Images can have
  2420.            different sizes.
  2421.  
  2422.        Returns 3 Numpy matrices:
  2423.        molded_images: [N, h, w, 3]. Images resized and normalized.
  2424.        image_metas: [N, length of meta data]. Details about each image.
  2425.        windows: [N, (y1, x1, y2, x2)]. The portion of the image that has the
  2426.            original image (padding excluded).
  2427.        """
  2428.         molded_images = []
  2429.         image_metas = []
  2430.         windows = []
  2431.         for image in images:
  2432.             # Resize image
  2433.             # TODO: move resizing to mold_image()
  2434.             molded_image, window, scale, padding, crop = utils.resize_image(
  2435.                 image,
  2436.                 min_dim=self.config.IMAGE_MIN_DIM,
  2437.                 min_scale=self.config.IMAGE_MIN_SCALE,
  2438.                 max_dim=self.config.IMAGE_MAX_DIM,
  2439.                 mode=self.config.IMAGE_RESIZE_MODE)
  2440.             molded_image = mold_image(molded_image, self.config)
  2441.             # Build image_meta
  2442.             image_meta = compose_image_meta(
  2443.                 0, image.shape, molded_image.shape, window, scale,
  2444.                 np.zeros([self.config.NUM_CLASSES], dtype=np.int32))
  2445.             # Append
  2446.             molded_images.append(molded_image)
  2447.             windows.append(window)
  2448.             image_metas.append(image_meta)
  2449.         # Pack into arrays
  2450.         molded_images = np.stack(molded_images)
  2451.         image_metas = np.stack(image_metas)
  2452.         windows = np.stack(windows)
  2453.         return molded_images, image_metas, windows
  2454.  
  2455.     def unmold_detections(self, detections, mrcnn_mask, original_image_shape,
  2456.                           image_shape, window):
  2457.         """Reformats the detections of one image from the format of the neural
  2458.        network output to a format suitable for use in the rest of the
  2459.        application.
  2460.  
  2461.        detections: [N, (y1, x1, y2, x2, class_id, score)] in normalized coordinates
  2462.        mrcnn_mask: [N, height, width, num_classes]
  2463.        original_image_shape: [H, W, C] Original image shape before resizing
  2464.        image_shape: [H, W, C] Shape of the image after resizing and padding
  2465.        window: [y1, x1, y2, x2] Pixel coordinates of box in the image where the real
  2466.                image is excluding the padding.
  2467.  
  2468.        Returns:
  2469.        boxes: [N, (y1, x1, y2, x2)] Bounding boxes in pixels
  2470.        class_ids: [N] Integer class IDs for each bounding box
  2471.        scores: [N] Float probability scores of the class_id
  2472.        masks: [height, width, num_instances] Instance masks
  2473.        """
  2474.         # How many detections do we have?
  2475.         # Detections array is padded with zeros. Find the first class_id == 0.
  2476.         zero_ix = np.where(detections[:, 4] == 0)[0]
  2477.         N = zero_ix[0] if zero_ix.shape[0] > 0 else detections.shape[0]
  2478.  
  2479.         # Extract boxes, class_ids, scores, and class-specific masks
  2480.         boxes = detections[:N, :4]
  2481.         class_ids = detections[:N, 4].astype(np.int32)
  2482.         scores = detections[:N, 5]
  2483.         masks = mrcnn_mask[np.arange(N), :, :, class_ids]
  2484.  
  2485.         # Translate normalized coordinates in the resized image to pixel
  2486.         # coordinates in the original image before resizing
  2487.         window = utils.norm_boxes(window, image_shape[:2])
  2488.         wy1, wx1, wy2, wx2 = window
  2489.         shift = np.array([wy1, wx1, wy1, wx1])
  2490.         wh = wy2 - wy1  # window height
  2491.         ww = wx2 - wx1  # window width
  2492.         scale = np.array([wh, ww, wh, ww])
  2493.         # Convert boxes to normalized coordinates on the window
  2494.         boxes = np.divide(boxes - shift, scale)
  2495.         # Convert boxes to pixel coordinates on the original image
  2496.         boxes = utils.denorm_boxes(boxes, original_image_shape[:2])
  2497.  
  2498.         # Filter out detections with zero area. Happens in early training when
  2499.         # network weights are still random
  2500.         exclude_ix = np.where(
  2501.             (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) <= 0)[0]
  2502.         if exclude_ix.shape[0] > 0:
  2503.             boxes = np.delete(boxes, exclude_ix, axis=0)
  2504.             class_ids = np.delete(class_ids, exclude_ix, axis=0)
  2505.             scores = np.delete(scores, exclude_ix, axis=0)
  2506.             masks = np.delete(masks, exclude_ix, axis=0)
  2507.             N = class_ids.shape[0]
  2508.  
  2509.         # Resize masks to original image size and set boundary threshold.
  2510.         full_masks = []
  2511.         for i in range(N):
  2512.             # Convert neural network mask to full size mask
  2513.             full_mask = utils.unmold_mask(masks[i], boxes[i], original_image_shape)
  2514.             full_masks.append(full_mask)
  2515.         full_masks = np.stack(full_masks, axis=-1)\
  2516.             if full_masks else np.empty(original_image_shape[:2] + (0,))
  2517.  
  2518.         return boxes, class_ids, scores, full_masks
  2519.  
  2520.     def detect(self, images, verbose=0):
  2521.         """Runs the detection pipeline.
  2522.  
  2523.        images: List of images, potentially of different sizes.
  2524.  
  2525.        Returns a list of dicts, one dict per image. The dict contains:
  2526.        rois: [N, (y1, x1, y2, x2)] detection bounding boxes
  2527.        class_ids: [N] int class IDs
  2528.        scores: [N] float probability scores for the class IDs
  2529.        masks: [H, W, N] instance binary masks
  2530.        """
  2531.         assert self.mode == "inference", "Create model in inference mode."
  2532.         assert len(
  2533.             images) == self.config.BATCH_SIZE, "len(images) must be equal to BATCH_SIZE"
  2534.  
  2535.         if verbose:
  2536.             log("Processing {} images".format(len(images)))
  2537.             for image in images:
  2538.                 log("image", image)
  2539.  
  2540.         # Mold inputs to format expected by the neural network
  2541.         molded_images, image_metas, windows = self.mold_inputs(images)
  2542.  
  2543.         # Validate image sizes
  2544.         # All images in a batch MUST be of the same size
  2545.         image_shape = molded_images[0].shape
  2546.         for g in molded_images[1:]:
  2547.             assert g.shape == image_shape,\
  2548.                 "After resizing, all images must have the same size. Check IMAGE_RESIZE_MODE and image sizes."
  2549.  
  2550.         # Anchors
  2551.         anchors = self.get_anchors(image_shape)
  2552.         # Duplicate across the batch dimension because Keras requires it
  2553.         # TODO: can this be optimized to avoid duplicating the anchors?
  2554.         anchors = np.broadcast_to(anchors, (self.config.BATCH_SIZE,) + anchors.shape)
  2555.  
  2556.         if verbose:
  2557.             log("molded_images", molded_images)
  2558.             log("image_metas", image_metas)
  2559.             log("anchors", anchors)
  2560.         # Run object detection
  2561.         detections, _, _, mrcnn_mask, _, _, _ =\
  2562.             self.keras_model.predict([molded_images, image_metas, anchors], verbose=0)
  2563.         # Process detections
  2564.         results = []
  2565.         for i, image in enumerate(images):
  2566.             final_rois, final_class_ids, final_scores, final_masks =\
  2567.                 self.unmold_detections(detections[i], mrcnn_mask[i],
  2568.                                        image.shape, molded_images[i].shape,
  2569.                                        windows[i])
  2570.             results.append({
  2571.                 "rois": final_rois,
  2572.                 "class_ids": final_class_ids,
  2573.                 "scores": final_scores,
  2574.                 "masks": final_masks,
  2575.             })
  2576.         return results
  2577.  
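    # Illustrative usage (sketch): with BATCH_SIZE == 1, run detection on a
    # single image and unpack the per-image result dict.
    #
    #   results = model.detect([image], verbose=1)
    #   r = results[0]
    #   # r["rois"], r["class_ids"], r["scores"], r["masks"]
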
  2578.     def detect_molded(self, molded_images, image_metas, verbose=0):
  2579.         """Runs the detection pipeline, but expects inputs that are
  2580.        already molded. Used mostly for debugging and inspecting
  2581.        the model.
  2582.  
  2583.        molded_images: List of images loaded using load_image_gt()
  2584.        image_metas: image meta data, also returned by load_image_gt()
  2585.  
  2586.        Returns a list of dicts, one dict per image. The dict contains:
  2587.        rois: [N, (y1, x1, y2, x2)] detection bounding boxes
  2588.        class_ids: [N] int class IDs
  2589.        scores: [N] float probability scores for the class IDs
  2590.        masks: [H, W, N] instance binary masks
  2591.        """
  2592.         assert self.mode == "inference", "Create model in inference mode."
  2593.         assert len(molded_images) == self.config.BATCH_SIZE,\
  2594.             "Number of images must be equal to BATCH_SIZE"
  2595.  
  2596.         if verbose:
  2597.             log("Processing {} images".format(len(molded_images)))
  2598.             for image in molded_images:
  2599.                 log("image", image)
  2600.  
  2601.         # Validate image sizes
  2602.         # All images in a batch MUST be of the same size
  2603.         image_shape = molded_images[0].shape
  2604.         for g in molded_images[1:]:
  2605.             assert g.shape == image_shape, "Images must have the same size"
  2606.  
  2607.         # Anchors
  2608.         anchors = self.get_anchors(image_shape)
  2609.         # Duplicate across the batch dimension because Keras requires it
  2610.         # TODO: can this be optimized to avoid duplicating the anchors?
  2611.         anchors = np.broadcast_to(anchors, (self.config.BATCH_SIZE,) + anchors.shape)
  2612.  
  2613.         if verbose:
  2614.             log("molded_images", molded_images)
  2615.             log("image_metas", image_metas)
  2616.             log("anchors", anchors)
  2617.         # Run object detection
  2618.         detections, _, _, mrcnn_mask, _, _, _ =\
  2619.             self.keras_model.predict([molded_images, image_metas, anchors], verbose=0)
  2620.         # Process detections
  2621.         results = []
  2622.         for i, image in enumerate(molded_images):
  2623.             window = [0, 0, image.shape[0], image.shape[1]]
  2624.             final_rois, final_class_ids, final_scores, final_masks =\
  2625.                 self.unmold_detections(detections[i], mrcnn_mask[i],
  2626.                                        image.shape, molded_images[i].shape,
  2627.                                        window)
  2628.             results.append({
  2629.                 "rois": final_rois,
  2630.                 "class_ids": final_class_ids,
  2631.                 "scores": final_scores,
  2632.                 "masks": final_masks,
  2633.             })
  2634.         return results
  2635.  
  2636.     def get_anchors(self, image_shape):
  2637.         """Returns anchor pyramid for the given image size."""
  2638.         backbone_shapes = compute_backbone_shapes(self.config, image_shape)
  2639.         # Cache anchors and reuse if image shape is the same
  2640.         if not hasattr(self, "_anchor_cache"):
  2641.             self._anchor_cache = {}
  2642.         if not tuple(image_shape) in self._anchor_cache:
  2643.             # Generate Anchors
  2644.             a = utils.generate_pyramid_anchors(
  2645.                 self.config.RPN_ANCHOR_SCALES,
  2646.                 self.config.RPN_ANCHOR_RATIOS,
  2647.                 backbone_shapes,
  2648.                 self.config.BACKBONE_STRIDES,
  2649.                 self.config.RPN_ANCHOR_STRIDE)
  2650.             # Keep a copy of the latest anchors in pixel coordinates because
  2651.             # it's used in inspect_model notebooks.
  2652.             # TODO: Remove this after the notebook are refactored to not use it
  2653.             self.anchors = a
  2654.             # Normalize coordinates
  2655.             self._anchor_cache[tuple(image_shape)] = utils.norm_boxes(a, image_shape[:2])
  2656.         return self._anchor_cache[tuple(image_shape)]
  2657.  
  2658.     def ancestor(self, tensor, name, checked=None):
  2659.         """Finds the ancestor of a TF tensor in the computation graph.
  2660.        tensor: TensorFlow symbolic tensor.
  2661.        name: Name of ancestor tensor to find
  2662.        checked: For internal use. A list of tensors that were already
  2663.                 searched to avoid loops in traversing the graph.
  2664.        """
  2665.         checked = checked if checked is not None else []
  2666.         # Put a limit on how deep we go to avoid very long loops
  2667.         if len(checked) > 500:
  2668.             return None
  2669.         # Convert name to a regex and allow matching a number prefix
  2670.         # because Keras adds them automatically
  2671.         if isinstance(name, str):
  2672.             name = re.compile(name.replace("/", r"(\_\d+)*/"))
  2673.  
  2674.         parents = tensor.op.inputs
  2675.         for p in parents:
  2676.             if p in checked:
  2677.                 continue
  2678.             if bool(re.fullmatch(name, p.name)):
  2679.                 return p
  2680.             checked.append(p)
  2681.             a = self.ancestor(p, name, checked)
  2682.             if a is not None:
  2683.                 return a
  2684.         return None
  2685.  
  2686.     def find_trainable_layer(self, layer):
  2687.         """If a layer is encapsulated by another layer, this function
  2688.        digs through the encapsulation and returns the layer that holds
  2689.        the weights.
  2690.        """
  2691.         if layer.__class__.__name__ == 'TimeDistributed':
  2692.             return self.find_trainable_layer(layer.layer)
  2693.         return layer
  2694.  
  2695.     def get_trainable_layers(self):
  2696.         """Returns a list of layers that have weights."""
  2697.         layers = []
  2698.         # Loop through all layers
  2699.         for l in self.keras_model.layers:
  2700.             # If layer is a wrapper, find inner trainable layer
  2701.             l = self.find_trainable_layer(l)
  2702.             # Include layer if it has weights
  2703.             if l.get_weights():
  2704.                 layers.append(l)
  2705.         return layers
  2706.  
  2707.     def run_graph(self, images, outputs, image_metas=None):
  2708.         """Runs a subset of the computation graph that computes the given
  2709.        outputs.
  2710.  
  2711.        image_metas: If provided, the images are assumed to be already
  2712.            molded (i.e. resized, padded, and normalized)
  2713.  
  2714.        outputs: List of tuples (name, tensor) to compute. The tensors are
  2715.            symbolic TensorFlow tensors and the names are for easy tracking.
  2716.  
  2717.        Returns an ordered dict of results. Keys are the names received in the
  2718.        input and values are Numpy arrays.
  2719.        """
  2720.         model = self.keras_model
  2721.  
  2722.         # Organize desired outputs into an ordered dict
  2723.         outputs = OrderedDict(outputs)
  2724.         for o in outputs.values():
  2725.             assert o is not None
  2726.  
  2727.         # Build a Keras function to run parts of the computation graph
  2728.         inputs = model.inputs
  2729.         if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
  2730.             inputs += [K.learning_phase()]
  2731.         kf = K.function(model.inputs, list(outputs.values()))
  2732.  
  2733.         # Prepare inputs
  2734.         if image_metas is None:
  2735.             molded_images, image_metas, _ = self.mold_inputs(images)
  2736.         else:
  2737.             molded_images = images
  2738.         image_shape = molded_images[0].shape
  2739.         # Anchors
  2740.         anchors = self.get_anchors(image_shape)
  2741.         # Duplicate across the batch dimension because Keras requires it
  2742.         # TODO: can this be optimized to avoid duplicating the anchors?
  2743.         anchors = np.broadcast_to(anchors, (self.config.BATCH_SIZE,) + anchors.shape)
  2744.         model_in = [molded_images, image_metas, anchors]
  2745.  
  2746.         # Run inference
  2747.         if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
  2748.             model_in.append(0.)
  2749.         outputs_np = kf(model_in)
  2750.  
  2751.         # Pack the generated Numpy arrays into a dict and log the results.
  2752.         outputs_np = OrderedDict([(k, v)
  2753.                                   for k, v in zip(outputs.keys(), outputs_np)])
  2754.         for k, v in outputs_np.items():
  2755.             log(k, v)
  2756.         return outputs_np
  2757.  
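    # Illustrative usage (sketch; layer names come from the graph builders
    # elsewhere in this module): fetch intermediate tensors for one image.
    #
    #   out = model.run_graph([image], [
    #       ("proposals", model.keras_model.get_layer("ROI").output),
    #       ("probs", model.keras_model.get_layer("mrcnn_class").output),
    #   ])
    #   # out is an OrderedDict of Numpy arrays keyed by the names above.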
  2758.  
  2759. ############################################################
  2760. #  Data Formatting
  2761. ############################################################
  2762.  
  2763. def compose_image_meta(image_id, original_image_shape, image_shape,
  2764.                        window, scale, active_class_ids):
  2765.     """Takes attributes of an image and puts them in one 1D array.
  2766.  
  2767.    image_id: An int ID of the image. Useful for debugging.
  2768.    original_image_shape: [H, W, C] before resizing or padding.
  2769.    image_shape: [H, W, C] after resizing and padding
  2770.    window: (y1, x1, y2, x2) in pixels. The area of the image where the real
  2771.            image is (excluding the padding)
  2772.    scale: The scaling factor applied to the original image (float32)
  2773.    active_class_ids: List of class_ids available in the dataset from which
  2774.        the image came. Useful if training on images from multiple datasets
  2775.        where not all classes are present in all datasets.
  2776.    """
  2777.     meta = np.array(
  2778.         [image_id] +                  # size=1
  2779.         list(original_image_shape) +  # size=3
  2780.         list(image_shape) +           # size=3
  2781.         list(window) +                # size=4 (y1, x1, y2, x2) in image coordinates
  2782.         [scale] +                     # size=1
  2783.         list(active_class_ids)        # size=num_classes
  2784.     )
  2785.     return meta
  2786.  
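# Layout of the meta vector built above (0-based indices), which the two
# parse functions below slice accordingly:
#   meta[0]      image_id
#   meta[1:4]    original_image_shape (H, W, C)
#   meta[4:7]    image_shape (H, W, C)
#   meta[7:11]   window (y1, x1, y2, x2)
#   meta[11]     scale
#   meta[12:]    active_class_ids (one entry per class)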
  2787.  
  2788. def parse_image_meta(meta):
  2789.     """Parses an array that contains image attributes to its components.
  2790.    See compose_image_meta() for more details.
  2791.  
  2792.    meta: [batch, meta length] where meta length depends on NUM_CLASSES
  2793.  
  2794.    Returns a dict of the parsed values.
  2795.    """
  2796.     image_id = meta[:, 0]
  2797.     original_image_shape = meta[:, 1:4]
  2798.     image_shape = meta[:, 4:7]
  2799.     window = meta[:, 7:11]  # (y1, x1, y2, x2) window of image in pixels
  2800.     scale = meta[:, 11]
  2801.     active_class_ids = meta[:, 12:]
  2802.     return {
  2803.         "image_id": image_id.astype(np.int32),
  2804.         "original_image_shape": original_image_shape.astype(np.int32),
  2805.         "image_shape": image_shape.astype(np.int32),
  2806.         "window": window.astype(np.int32),
  2807.         "scale": scale.astype(np.float32),
  2808.         "active_class_ids": active_class_ids.astype(np.int32),
  2809.     }
  2810.  
  2811.  
  2812. def parse_image_meta_graph(meta):
  2813.     """Parses a tensor that contains image attributes to its components.
  2814.    See compose_image_meta() for more details.
  2815.  
  2816.    meta: [batch, meta length] where meta length depends on NUM_CLASSES
  2817.  
  2818.    Returns a dict of the parsed tensors.
  2819.    """
  2820.     image_id = meta[:, 0]
  2821.     original_image_shape = meta[:, 1:4]
  2822.     image_shape = meta[:, 4:7]
  2823.     window = meta[:, 7:11]  # (y1, x1, y2, x2) window of image in pixels
  2824.     scale = meta[:, 11]
  2825.     active_class_ids = meta[:, 12:]
  2826.     return {
  2827.         "image_id": image_id,
  2828.         "original_image_shape": original_image_shape,
  2829.         "image_shape": image_shape,
  2830.         "window": window,
  2831.         "scale": scale,
  2832.         "active_class_ids": active_class_ids,
  2833.     }
  2834.  
  2835.  
  2836. def mold_image(images, config):
  2837.     """Expects an RGB image (or array of images) and subtracts
  2838.    the mean pixel and converts it to float. Expects image
  2839.    colors in RGB order.
  2840.    """
  2841.     return images.astype(np.float32) - config.MEAN_PIXEL
  2842.  
  2843.  
  2844. def unmold_image(normalized_images, config):
  2845.     """Takes an image normalized with mold_image() and returns the original."""
  2846.     return (normalized_images + config.MEAN_PIXEL).astype(np.uint8)
  2847.  
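# Illustrative round trip (sketch): molding subtracts config.MEAN_PIXEL and casts
# to float32, unmolding adds it back and casts to uint8, so for in-range pixels
#
#   unmold_image(mold_image(image, config), config) == image
#
# holds element-wise.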
  2848.  
  2849. ############################################################
  2850. #  Miscellaneous Graph Functions
  2851. ############################################################
  2852.  
  2853. def trim_zeros_graph(boxes, name='trim_zeros'):
  2854.     """Often boxes are represented with matrices of shape [N, 4] and
  2855.    are padded with zeros. This removes zero boxes.
  2856.  
  2857.    boxes: [N, 4] matrix of boxes.
  2858.    non_zeros: [N] a 1D boolean mask identifying the rows to keep
  2859.    """
  2860.     non_zeros = tf.cast(tf.reduce_sum(tf.abs(boxes), axis=1), tf.bool)
  2861.     boxes = tf.boolean_mask(boxes, non_zeros, name=name)
  2862.     return boxes, non_zeros
  2863.  
  2864.  
  2865. def batch_pack_graph(x, counts, num_rows):
  2866.     """Picks different number of values from each row
  2867.    in x depending on the values in counts.
  2868.    """
  2869.     outputs = []
  2870.     for i in range(num_rows):
  2871.         outputs.append(x[i, :counts[i]])
  2872.     return tf.concat(outputs, axis=0)
  2873.  
  2874.  
  2875. def norm_boxes_graph(boxes, shape):
  2876.     """Converts boxes from pixel coordinates to normalized coordinates.
  2877.    boxes: [..., (y1, x1, y2, x2)] in pixel coordinates
  2878.    shape: [..., (height, width)] in pixels
  2879.  
  2880.    Note: In pixel coordinates (y2, x2) is outside the box. But in normalized
  2881.    coordinates it's inside the box.
  2882.  
  2883.    Returns:
  2884.        [..., (y1, x1, y2, x2)] in normalized coordinates
  2885.    """
  2886.     h, w = tf.split(tf.cast(shape, tf.float32), 2)
  2887.     scale = tf.concat([h, w, h, w], axis=-1) - tf.constant(1.0)
  2888.     shift = tf.constant([0., 0., 1., 1.])
  2889.     return tf.divide(boxes - shift, scale)
  2890.  
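# Worked example (sketch): for a 1024x1024 image, a pixel box (0, 0, 1024, 1024)
# covering the whole image maps to (0, 0, 1, 1) in normalized coordinates:
#
#   scale = [1023, 1023, 1023, 1023]   # (h - 1, w - 1, h - 1, w - 1)
#   shift = [0, 0, 1, 1]
#   ((0, 0, 1024, 1024) - shift) / scale = (0, 0, 1, 1)
#
# denorm_boxes_graph() below applies the inverse mapping.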
  2891.  
  2892. def denorm_boxes_graph(boxes, shape):
  2893.     """Converts boxes from normalized coordinates to pixel coordinates.
  2894.    boxes: [..., (y1, x1, y2, x2)] in normalized coordinates
  2895.    shape: [..., (height, width)] in pixels
  2896.  
  2897.    Note: In pixel coordinates (y2, x2) is outside the box. But in normalized
  2898.    coordinates it's inside the box.
  2899.  
  2900.    Returns:
  2901.        [..., (y1, x1, y2, x2)] in pixel coordinates
  2902.    """
  2903.     h, w = tf.split(tf.cast(shape, tf.float32), 2)
  2904.     scale = tf.concat([h, w, h, w], axis=-1) - tf.constant(1.0)
  2905.     shift = tf.constant([0., 0., 1., 1.])
  2906.     return tf.cast(tf.round(tf.multiply(boxes, scale) + shift), tf.int32)