"""Wide-and-deep classifier definition: CSV schema, feature columns and estimator builder."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import multiprocessing
import six
import tensorflow as tf

# Schema of the input CSV, in file order. Every column is listed here,
# including those the model never consumes.
CSV_COLUMNS = [
    'comment_english',
    'comment_sentiment',
    'keyword',
    'syntax_lemma',
    'section_name',
    'section_code',
]

# One default per CSV column; every column defaults to the empty string.
CSV_COLUMN_DEFAULTS = [[''] for _ in CSV_COLUMNS]

# Name of the CSV column holding the target class.
LABEL_COLUMN = 'section_code'

# All known target classes. The position of a value in this list is
# significant, so the order must not change. Each value carries a leading
# space (presumably because the CSV separates fields with ", " -- confirm
# against the data files).
LABELS = [
    ' TS4CFS6', ' TS3CFS6', ' TS6CFS6', ' TS7CSQ4', ' TS7CT3', ' TS2CC8', ' TS2CC4',
    ' TS7CT2', ' TS7CSQ2', ' TS2CP5', ' TS7CHP1', ' TS1CSQ4', ' TS2CC9', ' TS7CSQ1',
    ' TS2CC1', ' TS2CC5', ' TS3CFS2', ' TS4CFS2', ' TS6CFS2', ' TS3CFS1', ' TS1CSQ3',
    ' TS6CSQ2', ' TS2CC3', ' TS6CSQ4', ' TS2CC2', ' TS1CPS3', ' TS2CTR2',
    ' TS6CSQ1', ' TS1CPS1', ' TS1CSQ1', ' TS5CSQ2', ' TS3CSQ2', ' TS4CSQ2', ' TS3CTR1',
    ' TS6CFS1', ' TS6CSQ5', ' TS6CTR1', ' TS1CPS2', ' TS4CSQ4', ' TS3CFS3', ' TS6CFS3',
    ' TS4CFS3', ' TS5CFS3', ' TS3CSQ1', ' TS4CTR1', ' TS2CP2', ' TS1CPS5', ' TS7CT1',
    ' TS2CC6', ' TS7CT4', ' TS2CP4', ' TS5CTR1', ' TS4CSQ1', ' TS5CSQ1', ' TS2CP1',
    ' TS5CSQ4', ' TS5CSQ5', ' TS5CFS1', ' TS3CSQ4', ' TS7CSQ3', ' TS5CFS5', ' TS4CSQ3',
    ' TS1CTR1', ' TS2CTR1', ' TS2CC7', ' TS6CSQ3', ' TS5CSQ3', ' TS3CSQ3', ' TS1CSQ2',
    ' TS3CSQ5', ' TS3CFS4', ' TS5CFS4', ' TS4CFS4', ' TS6CFS4', ' TS5CFS2', ' TS4CSQ5',
    ' TS6CFS5', ' TS3CFS5', ' TS4CFS5', ' TS4CFS1', ' TS2CP3', ' TS1CPS4',
]

# Define the initial ingestion of each feature used by the model, together
# with metadata about the feature.
#
# NOTE: build_estimator() unpacks this list positionally, so the order of the
# entries here must stay in sync with that unpacking.
INPUT_COLUMNS = [
    # Free-text comment: the set of values is open-ended, so hash the raw
    # string into a fixed number of buckets.
    tf.feature_column.categorical_column_with_hash_bucket(
        'comment_english', hash_bucket_size=1000, dtype=tf.string),

    # Sentiment: small, known vocabulary, so list the values up front.
    # The leading space in each value is part of the vocabulary entry.
    tf.feature_column.categorical_column_with_vocabulary_list(
        'comment_sentiment', [' NEGATIVE', ' POSITIVE', ' MIXED']),

    # Keyword and lemma: many or unknown values, so hash them into buckets.
    tf.feature_column.categorical_column_with_hash_bucket(
        'keyword', hash_bucket_size=100, dtype=tf.string),
    tf.feature_column.categorical_column_with_hash_bucket(
        'syntax_lemma', hash_bucket_size=100, dtype=tf.string),
]

# CSV columns that are neither fed to the model as features nor used as the
# label.
UNUSED_COLUMNS = set(CSV_COLUMNS).difference(
    (col.name for col in INPUT_COLUMNS), [LABEL_COLUMN])

def build_estimator(config, embedding_size=4, hidden_units=None):
    """Build a wide-and-deep model that predicts the section code of a comment.

    The prebuilt ``tf.estimator.DNNLinearCombinedClassifier`` does the heavy
    lifting; this function only decides which (possibly transformed) feature
    columns feed the linear ("wide") part and which feed the DNN ("deep")
    part of the model.

    Args:
      config: tf.estimator.RunConfig defining the runtime environment for the
        estimator (including model_dir).
      embedding_size: int, the number of dimensions used to represent the
        embedded categorical features when providing them as inputs to the
        DNN.
      hidden_units: [int], the layer sizes of the DNN (input layer first).
        Falls back to [100, 70, 50, 25] when None or empty.

    Returns:
      A DNNLinearCombinedClassifier with one output class per entry in LABELS.
    """
    # Positional unpacking: relies on the order of INPUT_COLUMNS.
    (comment_english, comment_sentiment, keyword, syntax_lemma) = INPUT_COLUMNS

    # Wide (linear) part: the raw categorical columns plus one feature cross.
    wide_columns = [
        # Interactions between different categorical features can be added
        # as new virtual features.
        tf.feature_column.crossed_column(
            ['keyword', 'syntax_lemma'], hash_bucket_size=int(1e4)),
        comment_english,
        comment_sentiment,
        keyword,
        syntax_lemma,
    ]

    # Deep (DNN) part: categorical columns must be made dense first.
    deep_columns = [
        # One-hot (indicator) encoding.
        # NOTE(review): comment_english is hashed into 1000 buckets, so its
        # indicator column is a 1000-wide one-hot vector; an embedding may be
        # a better fit -- left unchanged to preserve the model.
        tf.feature_column.indicator_column(comment_english),
        tf.feature_column.indicator_column(comment_sentiment),

        # Learned embeddings of size `embedding_size`.
        tf.feature_column.embedding_column(
            keyword, dimension=embedding_size),
        tf.feature_column.embedding_column(
            syntax_lemma, dimension=embedding_size),
    ]

    return tf.estimator.DNNLinearCombinedClassifier(
        config=config,
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=hidden_units or [100, 70, 50, 25],
        # The classifier defaults to n_classes=2; this is a multi-class
        # problem with one class per LABELS entry.
        # NOTE(review): assumes the input pipeline feeds labels as integer
        # indices into LABELS -- confirm against the input function.
        n_classes=len(LABELS),
    )