Advertisement
Guest User

Untitled

a guest
Mar 26th, 2017
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.60 KB | None | 0 0
  1. import xgboost as xgb
  2. import numpy as np # linear algebra
  3. import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
  4. import os
  5. import gc
  6. import matplotlib.pyplot as plt
  7. import seaborn as sns
  8. from nltk.corpus import stopwords
  9. from collections import Counter
  10. from sklearn.cross_validation import train_test_split
  11.  
  def tfidf_word_match_share(row):
      """Return the weighted share of non-stopword tokens common to both questions.

      Both ``row['question1']`` and ``row['question2']`` are converted with
      ``str()``, lower-cased and split on whitespace.  Tokens found in the
      module-level ``stops`` collection are dropped and duplicates are
      collapsed.  Each remaining token is weighted by the module-level
      ``weights`` mapping (tokens missing from it count as 0).

      Args:
          row: Mapping-like object indexable by ``'question1'`` and
              ``'question2'``.

      Returns:
          ``sum(weights of shared tokens, counted once per question) /
          sum(weights of all tokens of both questions)``.  Returns 0 when
          either question has no non-stopword token, or when the total weight
          is 0.
      """
      # dict.fromkeys de-duplicates while keeping first-seen order, so the
      # weights are summed in the same order as the tokens appear.
      q1words = dict.fromkeys(
          w for w in str(row['question1']).lower().split() if w not in stops
      )
      q2words = dict.fromkeys(
          w for w in str(row['question2']).lower().split() if w not in stops
      )
      if not q1words or not q2words:
          # The computer-generated chaff includes a few questions that are
          # nothing but stopwords
          return 0

      shared_weights = (
          [weights.get(w, 0) for w in q1words if w in q2words]
          + [weights.get(w, 0) for w in q2words if w in q1words]
      )
      total_weights = (
          [weights.get(w, 0) for w in q1words]
          + [weights.get(w, 0) for w in q2words]
      )

      total = np.sum(total_weights)
      if total == 0:
          # Every remaining token has weight 0; dividing would give 0/0 = NaN.
          return 0
      return np.sum(shared_weights) / total
  30.  
  def word_match_share(row):
      """Return the fraction of non-stopword tokens shared by both questions.

      Both ``row['question1']`` and ``row['question2']`` are converted with
      ``str()``, lower-cased and split on whitespace.  Tokens found in the
      module-level ``stops`` collection are dropped and duplicates are
      collapsed.

      Args:
          row: Mapping-like object indexable by ``'question1'`` and
              ``'question2'``.

      Returns:
          ``2 * |q1 & q2| / (|q1| + |q2|)`` over the unique remaining tokens,
          or 0 when either question has no non-stopword token.
      """
      q1words = {w for w in str(row['question1']).lower().split() if w not in stops}
      q2words = {w for w in str(row['question2']).lower().split() if w not in stops}
      if not q1words or not q2words:
          # The computer-generated chaff includes a few questions that are
          # nothing but stopwords
          return 0

      # Each shared token is counted once for each question it appears in.
      shared = len(q1words & q2words)
      return (2 * shared) / (len(q1words) + len(q2words))
  47.  
  48. # Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
  def get_weight(count, eps=10000, min_count=2):
      """Return the inverse-frequency weight for a word seen ``count`` times.

      Args:
          count: Number of occurrences of the word.
          eps: Smoothing constant added to ``count`` in the denominator.
          min_count: Words seen fewer than this many times get weight 0.

      Returns:
          0 if ``count < min_count``, otherwise ``1 / (count + eps)``.
      """
      if count < min_count:
          return 0
      return 1 / (count + eps)
  54.  
  # ---- Load the training and test question pairs ----
  df_train = pd.read_csv('../data/train.csv')
  df_test = pd.read_csv('../data/test.csv')

  # Pool both question columns of each data set into one string Series.
  train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)
  # NOTE(review): test_qs is not used anywhere below in this script.
  test_qs = pd.Series(df_test['question1'].tolist() + df_test['question2'].tolist()).astype(str)

  # ---- Word weights from training-set word counts ----
  # NOTE(review): eps is assigned here but never passed to get_weight, so the
  # weights below are computed with get_weight's own default eps (10000).
  # Confirm which value is intended before changing either.
  eps = 5000
  words = (" ".join(train_qs)).lower().split()
  counts = Counter(words)
  weights = {word: get_weight(count) for word, count in counts.items()}

  stops = set(stopwords.words("english"))

  # ---- Feature extraction ----
  # The feature functions index each row by column name
  # (row['question1'] / row['question2']), so rows must be passed as Series.
  # raw=True would hand them a bare ndarray, which cannot be indexed by label.
  train_word_match = df_train.apply(word_match_share, axis=1)
  tfidf_train_word_match = df_train.apply(tfidf_word_match_share, axis=1)

  x_train = pd.DataFrame()
  x_test = pd.DataFrame()
  x_train['word_match'] = train_word_match
  x_train['tfidf_word_match'] = tfidf_train_word_match
  x_test['word_match'] = df_test.apply(word_match_share, axis=1)
  x_test['tfidf_word_match'] = df_test.apply(tfidf_word_match_share, axis=1)
  y_train = df_train['is_duplicate'].values

  # Hold out 20% of the training rows for validation (fixed seed).
  # NOTE(review): train_test_split is imported from sklearn.cross_validation
  # at the top of this file; newer scikit-learn releases provide it from
  # sklearn.model_selection instead.
  x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=4242)

  # ---- Model training ----
  # Set our parameters for xgboost
  params = {}
  params['objective'] = 'binary:logistic'
  params['eval_metric'] = 'logloss'
  params['eta'] = 0.02
  params['max_depth'] = 4

  d_train = xgb.DMatrix(x_train, label=y_train)
  d_valid = xgb.DMatrix(x_valid, label=y_valid)

  # Both sets are evaluated during training; 'valid' is listed last.
  watchlist = [(d_train, 'train'), (d_valid, 'valid')]

  # Up to 400 boosting rounds, early stopping after 50 rounds, log every 10.
  bst = xgb.train(params, d_train, 400, watchlist, early_stopping_rounds=50, verbose_eval=10)

  # ---- Predict on the test set and write the submission file ----
  d_test = xgb.DMatrix(x_test)
  p_test = bst.predict(d_test)

  sub = pd.DataFrame()
  sub['test_id'] = df_test['test_id']
  sub['is_duplicate'] = p_test
  sub.to_csv('simple_xgb.csv', index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement