from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import multiprocessing
import six
import tensorflow as tf

# Define the format of your input data, including unused columns.
CSV_COLUMNS = ['comment_english', 'comment_sentiment', 'keyword',
               'syntax_lemma', 'section_name', 'section_code']
CSV_COLUMN_DEFAULTS = [[''], [''], [''], [''], [''], ['']]
LABEL_COLUMN = 'section_code'
LABELS = [' TS4CFS6', ' TS3CFS6', ' TS6CFS6', ' TS7CSQ4', ' TS7CT3', ' TS2CC8', ' TS2CC4',
          ' TS7CT2', ' TS7CSQ2', ' TS2CP5', ' TS7CHP1', ' TS1CSQ4', ' TS2CC9', ' TS7CSQ1',
          ' TS2CC1', ' TS2CC5', ' TS3CFS2', ' TS4CFS2', ' TS6CFS2', ' TS3CFS1', ' TS1CSQ3',
          ' TS6CSQ2', ' TS2CC3', ' TS6CSQ4', ' TS2CC2', ' TS1CPS3', ' TS2CTR2',
          ' TS6CSQ1', ' TS1CPS1', ' TS1CSQ1', ' TS5CSQ2', ' TS3CSQ2', ' TS4CSQ2', ' TS3CTR1',
          ' TS6CFS1', ' TS6CSQ5', ' TS6CTR1', ' TS1CPS2', ' TS4CSQ4', ' TS3CFS3', ' TS6CFS3',
          ' TS4CFS3', ' TS5CFS3', ' TS3CSQ1', ' TS4CTR1', ' TS2CP2', ' TS1CPS5', ' TS7CT1',
          ' TS2CC6', ' TS7CT4', ' TS2CP4', ' TS5CTR1', ' TS4CSQ1', ' TS5CSQ1', ' TS2CP1',
          ' TS5CSQ4', ' TS5CSQ5', ' TS5CFS1', ' TS3CSQ4', ' TS7CSQ3', ' TS5CFS5', ' TS4CSQ3',
          ' TS1CTR1', ' TS2CTR1', ' TS2CC7', ' TS6CSQ3', ' TS5CSQ3', ' TS3CSQ3', ' TS1CSQ2',
          ' TS3CSQ5', ' TS3CFS4', ' TS5CFS4', ' TS4CFS4', ' TS6CFS4', ' TS5CFS2', ' TS4CSQ5',
          ' TS6CFS5', ' TS3CFS5', ' TS4CFS5', ' TS4CFS1', ' TS2CP3', ' TS1CPS4']

# Define the initial ingestion of each feature used by your model.
# Additionally, provide metadata about the feature.
INPUT_COLUMNS = [
    # For columns with a large number of values, or unknown values,
    # we can use a hash function to convert them to categories.
    tf.feature_column.categorical_column_with_hash_bucket(
        'comment_english', hash_bucket_size=1000, dtype=tf.string),

    # Categorical base columns.

    # For categorical columns with known values we can provide lists
    # of values ahead of time.
    tf.feature_column.categorical_column_with_vocabulary_list(
        'comment_sentiment', [' NEGATIVE', ' POSITIVE', ' MIXED']),

    # For columns with a large number of values, or unknown values,
    # we can use a hash function to convert them to categories.
    tf.feature_column.categorical_column_with_hash_bucket(
        'keyword', hash_bucket_size=100, dtype=tf.string),
    tf.feature_column.categorical_column_with_hash_bucket(
        'syntax_lemma', hash_bucket_size=100, dtype=tf.string),
]

UNUSED_COLUMNS = set(CSV_COLUMNS) - {col.name for col in INPUT_COLUMNS} - {LABEL_COLUMN}
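
# --- Illustrative sketch (an assumption, not part of the original paste) ---
# The constants above are typically consumed by an input function that reads
# the CSV files, drops UNUSED_COLUMNS, and pops LABEL_COLUMN off as the label.
# csv_input_fn below is a minimal, hypothetical example of that pattern using
# the tf.data API; the batch size and shuffle buffer sizes are placeholders.
def csv_input_fn(filenames, batch_size=64, shuffle=True):
    def _parse_csv(line):
        # Decode one CSV line into a {column name: tensor} dict.
        columns = tf.decode_csv(line, record_defaults=CSV_COLUMN_DEFAULTS)
        features = dict(zip(CSV_COLUMNS, columns))
        # Drop columns the model does not consume and separate out the label.
        for col in UNUSED_COLUMNS:
            features.pop(col)
        label = features.pop(LABEL_COLUMN)
        return features, label

    dataset = tf.data.TextLineDataset(filenames).map(
        _parse_csv, num_parallel_calls=multiprocessing.cpu_count())
    if shuffle:
        dataset = dataset.shuffle(buffer_size=10 * batch_size)
    return dataset.batch(batch_size)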

def build_estimator(config, embedding_size=4, hidden_units=None):
    """Build a wide and deep model for predicting a comment's section code.

    To define the model we can use the prebuilt DNNLinearCombinedClassifier
    class, and need only define the data transformations particular to our
    dataset, then assign these (potentially) transformed features to either
    the DNN or the linear portion of the model.

    Args:
      config: tf.estimator.RunConfig defining the runtime environment for the
        estimator (including model_dir).
      embedding_size: int, the number of dimensions used to represent
        categorical features when providing them as inputs to the DNN.
      hidden_units: [int], the layer sizes of the DNN (input layer first).

    Returns:
      A DNNLinearCombinedClassifier.
    """
    (comment_english, comment_sentiment, keyword, syntax_lemma) = INPUT_COLUMNS
    # Build an estimator.

    # Reused transformations.
    # Continuous columns can be converted to categorical columns by
    # bucketizing them, for example:
    # age_buckets = tf.feature_column.bucketized_column(
    #     age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    # Wide columns and deep columns.
    wide_columns = [
        # Interactions between different categorical features can also
        # be added as new virtual features.
        # tf.feature_column.crossed_column(
        #     ['comment_english', 'keyword', 'syntax_lemma'], hash_bucket_size=int(1e6)),
        tf.feature_column.crossed_column(
            ['keyword', 'syntax_lemma'], hash_bucket_size=int(1e4)),
        comment_english,
        comment_sentiment,
        keyword,
        syntax_lemma,
    ]

    deep_columns = [
        # Use indicator columns for low-dimensional vocabularies.
        tf.feature_column.indicator_column(comment_english),
        tf.feature_column.indicator_column(comment_sentiment),

        # Use embedding columns for high-dimensional vocabularies.
        tf.feature_column.embedding_column(keyword, dimension=embedding_size),
        tf.feature_column.embedding_column(syntax_lemma, dimension=embedding_size),
    ]

    return tf.estimator.DNNLinearCombinedClassifier(
        config=config,
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=hidden_units or [100, 70, 50, 25],
        # The label column holds one of the string codes in LABELS, so the
        # classifier must be configured for multi-class classification over
        # that vocabulary.
        n_classes=len(LABELS),
        label_vocabulary=LABELS,
    )
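
# --- Illustrative usage sketch (an assumption, not part of the original paste) ---
# A caller would typically construct a RunConfig, build the estimator, and
# train/evaluate it. The model directory, file names, and step count below are
# placeholders, and csv_input_fn is the hypothetical input function sketched
# above.
if __name__ == '__main__':
    run_config = tf.estimator.RunConfig(model_dir='/tmp/section_code_model')
    estimator = build_estimator(run_config, embedding_size=8,
                                hidden_units=[100, 70, 50, 25])
    estimator.train(input_fn=lambda: csv_input_fn(['train.csv']),
                    max_steps=1000)
    print(estimator.evaluate(
        input_fn=lambda: csv_input_fn(['eval.csv'], shuffle=False)))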