Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import multiprocessing

import six
import tensorflow as tf
# Define the format of your input data including unused columns.
CSV_COLUMNS = ['comment_english', 'comment_sentiment', 'keyword',
               'syntax_lemma', 'section_name',
               'section_code']

# One empty-string default per entry in CSV_COLUMNS (same length, same order).
CSV_COLUMN_DEFAULTS = [[''], [''], [''], [''], [''], ['']]

# Name of the column the model is trained to predict.
LABEL_COLUMN = 'section_code'

# All known values of LABEL_COLUMN.
# NOTE(review): every label keeps a leading space; presumably the CSV is
# written with ", " separators -- confirm before stripping.
LABELS = [
    ' TS4CFS6', ' TS3CFS6', ' TS6CFS6', ' TS7CSQ4', ' TS7CT3', ' TS2CC8', ' TS2CC4',
    ' TS7CT2', ' TS7CSQ2', ' TS2CP5', ' TS7CHP1', ' TS1CSQ4', ' TS2CC9', ' TS7CSQ1',
    ' TS2CC1', ' TS2CC5', ' TS3CFS2', ' TS4CFS2', ' TS6CFS2', ' TS3CFS1', ' TS1CSQ3',
    ' TS6CSQ2', ' TS2CC3', ' TS6CSQ4', ' TS2CC2', ' TS1CPS3', ' TS2CTR2',
    ' TS6CSQ1', ' TS1CPS1', ' TS1CSQ1', ' TS5CSQ2', ' TS3CSQ2', ' TS4CSQ2', ' TS3CTR1',
    ' TS6CFS1', ' TS6CSQ5', ' TS6CTR1', ' TS1CPS2', ' TS4CSQ4', ' TS3CFS3', ' TS6CFS3',
    ' TS4CFS3', ' TS5CFS3', ' TS3CSQ1', ' TS4CTR1', ' TS2CP2', ' TS1CPS5', ' TS7CT1',
    ' TS2CC6', ' TS7CT4', ' TS2CP4', ' TS5CTR1', ' TS4CSQ1', ' TS5CSQ1', ' TS2CP1',
    ' TS5CSQ4', ' TS5CSQ5', ' TS5CFS1', ' TS3CSQ4', ' TS7CSQ3', ' TS5CFS5', ' TS4CSQ3',
    ' TS1CTR1', ' TS2CTR1', ' TS2CC7', ' TS6CSQ3', ' TS5CSQ3', ' TS3CSQ3', ' TS1CSQ2',
    ' TS3CSQ5', ' TS3CFS4', ' TS5CFS4', ' TS4CFS4', ' TS6CFS4', ' TS5CFS2', ' TS4CSQ5',
    ' TS6CFS5', ' TS3CFS5', ' TS4CFS5', ' TS4CFS1', ' TS2CP3', ' TS1CPS4',
]
# Define the initial ingestion of each feature used by your model.
# Additionally, provide metadata about the feature.
#
# The order of this list matters: build_estimator unpacks it positionally as
# (comment_english, comment_sentiment, keyword, syntax_lemma).
INPUT_COLUMNS = [
    # For columns with a large number of values, or unknown values,
    # we can use a hash function to convert to categories.
    tf.feature_column.categorical_column_with_hash_bucket(
        'comment_english', hash_bucket_size=1000, dtype=tf.string),

    # Categorical base columns.
    # For categorical columns with known values we can provide lists
    # of values ahead of time.
    tf.feature_column.categorical_column_with_vocabulary_list(
        'comment_sentiment', [' NEGATIVE', ' POSITIVE', ' MIXED']),

    # For columns with a large number of values, or unknown values,
    # we can use a hash function to convert to categories.
    tf.feature_column.categorical_column_with_hash_bucket(
        'keyword', hash_bucket_size=100, dtype=tf.string),
    tf.feature_column.categorical_column_with_hash_bucket(
        'syntax_lemma', hash_bucket_size=100, dtype=tf.string),
]

# CSV columns that are neither a model input nor the label.
UNUSED_COLUMNS = set(CSV_COLUMNS) - {col.name for col in INPUT_COLUMNS} - {LABEL_COLUMN}
def build_estimator(config, embedding_size=4, hidden_units=None):
    """Build a wide and deep model for predicting the section code.

    The prebuilt DNNLinearCombinedClassifier is used, so only the feature
    columns particular to this dataset are defined here and assigned to
    either the linear ("wide") or the DNN ("deep") part of the model.

    Args:
      config: RunConfig describing the runtime environment; passed through
        unchanged to the estimator as `config`.
      embedding_size: int, the number of dimensions used for the embedding
        columns fed to the DNN.
      hidden_units: [int], the layer sizes of the DNN (input layer first).
        Defaults to [100, 70, 50, 25] when None or empty.

    Returns:
      A tf.estimator.DNNLinearCombinedClassifier.
    """
    # Positional unpack: relies on the order in which INPUT_COLUMNS is declared.
    (comment_english, comment_sentiment, keyword, syntax_lemma) = INPUT_COLUMNS

    # Wide columns and deep columns.
    wide_columns = [
        # Interactions between different categorical features can also
        # be added as new virtual features.
        # tf.feature_column.crossed_column(
        #     ['comment_english', 'keyword', 'syntax_lemma'], hash_bucket_size=int(1e6)),
        tf.feature_column.crossed_column(
            ['keyword', 'syntax_lemma'], hash_bucket_size=int(1e4)),
        comment_english,
        comment_sentiment,
        keyword,
        syntax_lemma,
    ]

    deep_columns = [
        # Indicator (one-hot) columns.
        # NOTE(review): comment_english is hashed into 1000 buckets, so its
        # indicator column is 1000 wide; an embedding may be a better fit.
        tf.feature_column.indicator_column(comment_english),
        tf.feature_column.indicator_column(comment_sentiment),

        # Embedding columns.
        tf.feature_column.embedding_column(
            keyword, dimension=embedding_size),
        tf.feature_column.embedding_column(
            syntax_lemma, dimension=embedding_size),
    ]

    return tf.estimator.DNNLinearCombinedClassifier(
        config=config,
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=hidden_units or [100, 70, 50, 25],
        # The classifier defaults to n_classes=2; this task has one class
        # per entry in LABELS.
        # NOTE(review): if the input function yields raw label strings rather
        # than integer class ids, label_vocabulary=LABELS is needed as well.
        n_classes=len(LABELS),
    )
Add Comment
Please, Sign In to add comment