36667-tensorflow-catgeorical-data-with-vocabulary-list-expec
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import matplotlib.pyplot as plt
import math
import json

from pandas.io.json import json_normalize
from sklearn import metrics  # needed for metrics.log_loss in train_linear
from tensorflow.python.data import Dataset

import os

debugMode = False

# Fraction of the data held out for validation.
validation_percentage = 0.15
def _create_data_frame(path):
    # Load a JSON file of records into a DataFrame.
    if debugMode:
        print('Files in Input Directory', os.listdir("../input"))
    with open(path) as f:
        data = json.load(f)
        df = pd.DataFrame.from_records(data)
        return df
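
# Hedged sketch of the record shape this code assumes, inferred from how the
# "cuisine" and "ingredients" columns are used below (values are made up):
#   [{"id": 1, "cuisine": "greek", "ingredients": ["feta cheese", "olive oil"]},
#    {"id": 2, "cuisine": "indian", "ingredients": ["cumin", "garlic"]}]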
def create_vocabulary_list(dataframe, vocab_column):
    # Build the vocabulary of unique ingredient strings and, in the same
    # pass, replace each row's ingredient list with vocabulary indices.
    vocab = []
    vocab_index = {}  # ingredient -> position in vocab; avoids O(n) list.index() lookups
    for i in range(len(dataframe[vocab_column])):
        v = dataframe[vocab_column].values[i]
        index_array = []
        for ingredient in v:
            if ingredient not in vocab_index:
                vocab_index[ingredient] = len(vocab)
                vocab.append(ingredient)
            index_array.append(vocab_index[ingredient])
        dataframe[vocab_column].values[i] = [index_array]
    return set(vocab), dataframe
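# Illustrative example (hypothetical data): for a frame whose "ingredients"
# column holds [["salt", "pepper"], ["salt", "garlic"]], this returns the
# vocabulary {"salt", "pepper", "garlic"} and rewrites the two cells as
# [[0, 1]] and [[0, 2]], i.e. indices into the vocabulary in first-seen order.
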
def create_training_frames(path):
    # Shuffle the data, then split off the last validation_percentage
    # of rows as a validation set.
    train_frame = _create_data_frame(path)
    train_frame = train_frame.reindex(np.random.permutation(train_frame.index))
    count_of_records = train_frame.count()[0]
    count_of_validation = int(round(count_of_records * validation_percentage))
    count_of_training = count_of_records - count_of_validation
    if debugMode:
        print("Count of Records:", count_of_records,
              "Count for Training:", count_of_training,
              "Count for Validation:", count_of_validation)
    valid_frame = train_frame.tail(count_of_validation)
    train_frame = train_frame.head(count_of_training)
    return train_frame, valid_frame
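# Worked example of the split: with 10,000 shuffled records and
# validation_percentage = 0.15, round(10000 * 0.15) = 1,500 rows go to the
# validation frame (tail) and the remaining 8,500 to the training frame (head).
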
def create_test_frame(path):
    return _create_data_frame(path)

def split_frame_to_labels_and_features(frame, label_cols, feature_cols):
    features = frame[feature_cols].copy()
    labels = frame[label_cols].copy()
    return features, labels

def create_feature_columns(key_vocab_dict):
    # One indicator (multi-hot) column per categorical vocabulary column.
    final_columns = []
    for key, val in key_vocab_dict.items():
        categorical = tf.feature_column.categorical_column_with_vocabulary_list(key, val)
        final_columns.append(tf.feature_column.indicator_column(categorical))
    return final_columns
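
# Sketch of the resulting encoding, assuming a three-word vocabulary
# {"salt", "pepper", "garlic"}: the categorical column maps each ingredient
# string to an id, and indicator_column() turns those ids into a multi-hot
# vector of vocabulary size, e.g. a recipe containing salt and garlic becomes
# [1, 0, 1] (component order follows the vocabulary order).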
def input_fn(features, labels, batch_size, num_epochs=None, shuffle=True):
    # Convert the pandas data into the dict-of-arrays form the Dataset expects.
    processed_features = {"ingredients": np.array(features)}
    processed_labels = np.array(labels)
    ds = Dataset.from_tensor_slices((processed_features, processed_labels)) # warning: 2GB limit
    ds = ds.batch(batch_size).repeat(num_epochs)

    if shuffle:
        # Note: shuffling after batch() shuffles whole batches, not examples.
        ds = ds.shuffle(10000)

    # Return the next batch of data.
    feature_batch, label_batch = ds.make_one_shot_iterator().get_next()
    return feature_batch, label_batch
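# Minimal usage sketch (hypothetical data, TF 1.x graph mode): the returned
# tensors yield a fresh batch on every session.run() call.
#   feats = pd.DataFrame({"ingredients": ["salt", "pepper"]})
#   labs = pd.DataFrame({"cuisine": ["greek", "indian"]})
#   batch, label = input_fn(feats, labs, batch_size=2, num_epochs=1)
#   with tf.Session() as sess:
#       print(sess.run([batch, label]))
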
def train_linear(
        learning_rate,
        steps,
        batch_size,
        n_classes,
        feature_cols,
        training_example,
        training_target,
        validation_example,
        validation_target):

    periods = 10
    steps_per_period = steps // periods

    training_func = lambda: input_fn(training_example, training_target, batch_size)
    validati_func = lambda: input_fn(validation_example, validation_target, batch_size)
    # Prediction inputs must stop after one pass; with num_epochs=None the
    # dataset repeats forever and classifier.predict() would never terminate.
    predict_training_func = lambda: input_fn(training_example, training_target, batch_size, num_epochs=1, shuffle=False)
    predict_validati_func = lambda: input_fn(validation_example, validation_target, batch_size, num_epochs=1, shuffle=False)

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    optimizer = tf.contrib.estimator.clip_gradients_by_norm(optimizer, 5.0)

    classifier = tf.estimator.LinearClassifier(
        feature_columns=feature_cols,
        n_classes=n_classes,
        optimizer=optimizer,
        config=tf.estimator.RunConfig(keep_checkpoint_max=1)
    )
    print("Training model...")
    print("LogLoss error (on validation data):")
    training_errors = []
    validation_errors = []
    for period in range(periods):
        classifier.train(
            input_fn=training_func,
            steps=steps_per_period
        )

        training_predictions = list(classifier.predict(input_fn=predict_training_func))
        training_probabilities = np.array([item['probabilities'] for item in training_predictions])
        training_pred_class_id = np.array([item['class_ids'][0] for item in training_predictions])
        training_pred_one_hot = tf.keras.utils.to_categorical(training_pred_class_id, n_classes)

        validation_predictions = list(classifier.predict(input_fn=predict_validati_func))
        validation_probabilities = np.array([item['probabilities'] for item in validation_predictions])
        validation_pred_class_id = np.array([item['class_ids'][0] for item in validation_predictions])
        validation_pred_one_hot = tf.keras.utils.to_categorical(validation_pred_class_id, n_classes)

        training_log_loss = metrics.log_loss(training_target, training_pred_one_hot)
        validation_log_loss = metrics.log_loss(validation_target, validation_pred_one_hot)
        print("  period %02d : %0.2f" % (period, validation_log_loss))
        training_errors.append(training_log_loss)
        validation_errors.append(validation_log_loss)
    print("Model training finished.")

    evaluation_metrics = classifier.evaluate(
        input_fn=validati_func,
        steps=1000
    )
    print("Validation set metrics:")
    for m in evaluation_metrics:
        print(m, evaluation_metrics[m])
    print("---")
    return classifier
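# matplotlib is imported above but never used; a minimal sketch for plotting
# the per-period losses (would require train_linear to also return
# training_errors and validation_errors):
#   plt.ylabel("LogLoss")
#   plt.xlabel("Periods")
#   plt.plot(training_errors, label="training")
#   plt.plot(validation_errors, label="validation")
#   plt.legend()
#   plt.show()
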
train_frame, valid_frame = create_training_frames('../input/train.json')
test_frame = create_test_frame('../input/test.json')

# Note: each frame builds its own vocabulary here, so the validation indices
# are not guaranteed to line up with the training vocabulary.
train_vocabulary, train_frame = create_vocabulary_list(train_frame, "ingredients")
valid_vocabulary, valid_frame = create_vocabulary_list(valid_frame, "ingredients")

train_features, train_labels = split_frame_to_labels_and_features(train_frame, ["cuisine"], ["ingredients"])
valid_features, valid_labels = split_frame_to_labels_and_features(valid_frame, ["cuisine"], ["ingredients"])

feature_columns = create_feature_columns({"ingredients": train_vocabulary})

# n_classes is the number of target cuisines, not the ingredient vocabulary
# size. (String cuisine labels would additionally need label_vocabulary on
# the LinearClassifier, since the labels here are names, not integer ids.)
train_linear(
        learning_rate=0.001,
        steps=1000,
        batch_size=50,
        n_classes=train_labels["cuisine"].nunique(),
        feature_cols=feature_columns,
        training_example=train_features,
        training_target=train_labels,
        validation_example=valid_features,
        validation_target=valid_labels)
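
# test_frame is loaded above but never used; a hedged sketch of scoring it
# (assumes the test ingredients are first indexed against train_vocabulary,
# which the code above does not do yet):
#   classifier = train_linear(...)  # capture the return value first
#   dummy_labels = pd.DataFrame({"cuisine": [""] * len(test_frame)})
#   test_input = lambda: input_fn(test_frame[["ingredients"]], dummy_labels,
#                                 batch_size=50, num_epochs=1, shuffle=False)
#   test_pred_ids = [p['class_ids'][0] for p in classifier.predict(input_fn=test_input)]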