Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
- import apache_beam as beam
- import google.cloud.ml as ml
- import google.cloud.ml.io as io
- from google.cloud.ml.features._features import GraphOptions
- import tensorflow as tf
def main(hack):
    """Build and run a local Beam pipeline that preprocesses one fake CSV row.

    Args:
        hack: when truthy, patch the generated metadata with
            metadata_image_hack before attempting to build the parsing graph.
    """
    p = beam.Pipeline('DirectPipelineRunner')
    header = ('id', 'image_filename',)
    # 'key' comes straight from the CSV 'id' column; 'image' is an image
    # feature read from the CSV's 'image_filename' column, resized to 256x256.
    feature_set = {
        'key': ml.features.key('id'),
        'image': ml.features.image('image_filename').image(
            target_size=(256, 256)),
    }
    # Stand-in for a real CSV source: a single in-memory row.
    rows = p | beam.Create([
        {'id': 1, 'image_filename': 'image.jpg'}
    ])
    # Preprocess turns raw rows into features and also emits the metadata
    # describing how to parse them back out of serialized examples.
    metadata, features = (
        rows
        | 'Preprocess' >> ml.Preprocess(
            feature_set,
            input_format='csv',
            format_metadata={
                'headers': header
            }))
    # Optional workaround that rewrites the image feature size in the
    # metadata; without it the next step fails.
    if hack:
        metadata = metadata | beam.Map(metadata_image_hack)
    # Build a Tensor that accepts serialized examples as a feed_dict.
    # This is the step the original author reports as breaking.
    metadata | beam.Map(create_tensor)
    p.run()
def create_tensor(metadata):
    """Build a TF graph that decodes the batched 'image' feature to images.

    Args:
        metadata: preprocessing metadata dict (as emitted by ml.Preprocess);
            wrapped below so its keys are attribute-accessible.

    Returns:
        A Tensor of decoded uint8 images, one per input example.
    """
    # GraphOptions exposes the metadata dict's keys as object attributes,
    # which is the form FeatureMetadata.parse_features expects.
    opts = GraphOptions(metadata)
    # Placeholder fed with tf.Example-encoded strings, one per example.
    examples = tf.placeholder(tf.string, name='input', shape=(None,))
    # Map feature names to parsed feature Tensors (each with dtype/shape).
    parsed = ml.features.FeatureMetadata.parse_features(opts, examples)
    # Batch of JPEG strings, one per example.
    jpegs = parsed['image']
    batch_size = jpegs.get_shape()[0]  # unused; kept from the original
    # NOTE(review): parse_features gives the 'image' feature a (None, >1)
    # shape, so reshaping each element to a scalar cannot work — this is
    # where the script breaks (hence the metadata hack).
    return tf.map_fn(lambda s: tf.image.decode_jpeg(tf.reshape(s, [])),
                     jpegs, dtype=tf.uint8)
def metadata_image_hack(metadata_raw):
    """Work around a feature-size bug in Google ML's ImageTransform.

    _transforms.ImageTransform.transform() serializes images to
    string-encoded JPEG/PNG scalars, but sets
    _transforms.ImageTransform.feature_size to
    (8 * target_size[0] * target_size[1]).  When deserializing (in
    _features.FeatureMetadata.parse_features()), feature_size needs to be 1
    or parse_features breaks.

    Args:
        metadata_raw: metadata dict with 'columns' (name -> attrs with
            'type'/'transform') and 'features' (name -> attrs with
            'columns'/'size') mappings.

    Returns:
        A deep copy of metadata_raw in which every feature backed by an
        image-transformed column has its 'size' forced to 1.  The input
        dict is left unmodified.
    """
    import copy

    # Names of columns produced by the image transform.
    image_col_names = {
        col_name
        for col_name, col_attrs in metadata_raw['columns'].items()
        if col_attrs['type'] == 'image' and col_attrs['transform'] == 'image'
    }
    # Work on a copy so the caller's metadata is untouched.
    metadata = copy.deepcopy(metadata_raw)
    for feat_name, feat_attrs in metadata['features'].items():
        # Patch any feature that draws on at least one image column.
        if not image_col_names.isdisjoint(feat_attrs['columns']):
            print("changing size for image feature: %s" % feat_name)
            feat_attrs['size'] = 1
    return metadata
if __name__ == "__main__":
    # Opt into the metadata workaround with: python <script> --hack
    main(len(sys.argv) > 1 and sys.argv[1] == '--hack')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement