Advertisement
sreejith2904

Untitled

Apr 11th, 2017
132
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.03 KB | None | 0 0
  1. import pandas as pd
  2. pd.set_option('max_rows', 7)
  3. pd.set_option('expand_frame_repr', False)
  4. import numpy as np
  5. from sklearn.model_selection import train_test_split
  6. #from fuzzy import fuzz
  7.  
  8. import preprocessing as pp
  9. import encoding
  10. import classifier
  11. import features
  12. import visualization as viz
  13.  
  14. bodies = "../../data/train_bodies.csv"
  15. stances = "../../data/train_stances.csv"
  16.  
  17. content = pd.read_csv(bodies, sep=",")
  18. headlines = pd.read_csv(stances, sep=",")
  19.  
  20.  
  21. ## generate necessary token features for dnews heading and news body
  22. content['content_tokens'] = content.articleBody.apply(lambda x : pp.process(x))
  23. headlines['headline_tokens'] = headlines.Headline.apply(lambda x: pp.process(x))
  24.  
  25.  
  26. # ## Begin sentence embedding
  27. header_vectors = np.zeros((headlines.shape[0], 300))
  28. for i, q in enumerate(headlines.headline_tokens.values):
  29.     header_vectors[i, :] = encoding.tovector(q)
  30.  
  31. # ## create the content vector
  32. content_vectors  = np.zeros((content.shape[0], 300))
  33. for i, q in enumerate(content.content_tokens.values):
  34.     content_vectors[i, :] = encoding.tovector(q)
  35.  
  36.  
  37. header_series = pd.Series(header_vectors.tolist())
  38. headlines['headline_vector'] = header_series.values
  39.  
  40. content_series = pd.Series(content_vectors.tolist())
  41. content['content_vector'] = content_series.values
  42.  
  43.  
  44. data = pd.merge(content, headlines, how="left", on="Body ID")
  45.  
  46. data['char_length_body']=data['articleBody'].str.len()
  47. data['char_length_headline']=data['Headline'].str.len()
  48.  
  49.  
  50. #Feature 1 - Words overlapping between headline and content
  51. data['overlapping'] = data[['headline_tokens','content_tokens']].apply(lambda x: features.overlapping(*x), axis=1)
  52. data['phrase_reoccurance'] = data[['headline_tokens','content_tokens']].apply(lambda x: features.freqency_features(*x), axis=1)
  53.  
  54. ## stupid code - boo !
  55. reoccurance_cols = ["reoccur1", "reoccur2", "reoccur3", "reoccur4", "reoccur5", "reoccur6"]
  56. for i in range(0,6) :
  57.     data[reoccurance_cols[i]] = data['phrase_reoccurance'].apply(lambda x: x[i])
  58.  
  59.  
  60. ## Cosine similarity between word vectors
  61. data['cosine'] = data[['headline_vector','content_vector']].apply(lambda x: features.cosine(*x), axis=1)
  62. data['wmdistance'] = data[['headline_tokens','content_tokens']].apply(lambda x: features.wmdistance(*x), axis=1)
  63. data['euclidean'] = data[['headline_vector','content_vector']].apply(lambda x: features.euclidean(*x), axis=1)
  64.  
  65.  
  66. # 80/20 Train-Test Split keeping splits consistent for future runs
  67. train, test = train_test_split(data, test_size = 0.2,random_state= 55)
  68.  
  69. # ----------------------------------------------- Training Data Exploration/Visualization --------------------------------- #
  70.  
  71. #viz.summaryStatistics(train)
  72. # viz.plot_overlapping(train)
  73. # viz.plot_HLS(train)
  74. # viz.plot_CLS(train)
  75.  
  76. # viz.feature_bodyLength(train)
  77. # viz.countPlot_headline_article(train)
  78. # viz.pointPlot(train)
  79. # viz.pairPlot(train)
  80. #viz.dataFrame_CSV(train)
  81. # ---------------------------------------------------------------------------------------------------------------------#
  82. ## XGBoost classifier
  83. gbm = classifier.train_XGB(train, test)
  84. print("XGBoost classifier built...")
  85.  
  86.  
  87. # XGBoost only accepts numerical fields - So I'm gonna remove the rest from test data
  88. # we need to confirm this
  89.  
  90.  
  91. # clf = classifier.train_SVM(train)
  92. # print("SVM Classifier")
  93.  
  94. # _test = test[["overlapping", "reoccur1", "reoccur2", "reoccur3", "reoccur4", "reoccur5", "reoccur6","euclidean"]]
  95.  
  96. # _predictions = clf.predict(_test)
  97.  
  98. # predictions = pd.Series(_predictions.tolist())
  99. # test["predicted_SVM"] = predictions.values
  100.  
  101. # test["is_correct_prediction_SVM"] = test["Stance"] == test["predicted_SVM"]
  102. # correctly_predicted_rows = test[test['is_correct_prediction_SVM'] == True]
  103.  
  104. # print("Accuracy : ", float(len(correctly_predicted_rows))/len(test))
  105.  
  106. # # ---------------------------------------- Random Forest Classifier -----------------------------------------------------------#
  107.  
  108. # print("Random Forest classifier building...")
  109. # rfc = classifier.randomForest(train,test)
  110. # print("Random Forest classifier built ...")
  111.  
  112. # # ----------------------------------------- Cross Tabulation --------------------------------------------------------------------#
  113.  
  114. # print("\n Cross Tabulation for XGBOOST ",)
  115. # print (pd.crosstab(test.Stance, test.predicted_XGB,margins=True))
  116.  
  117. # print("\n Cross Tabulation for RANDOM FOREST ")
  118. # print (pd.crosstab(test.Stance, test.predicted_RF,margins=True))
  119.  
  120. # print("\n Cross Tabulation for SVM  ")
  121. # print (pd.crosstab(test.Stance, test.predicted_SVM,margins=True))
  122.  
  123. # # ----------------------------------------  Test Data Visualization / Plots  ------------------------------------------------------------#
  124.  
  125. # #viz.summaryStatistics(test)
  126.  
  127. # # Bar Plot for comparing counts of  Actual Stances vs Predicted Stances in Test Data on Random Forest model
  128. # viz.countPlot(test)
  129.  
  130. # # Compare Countplots of Random Forest, XGBoost, SVM on test set
  131. # viz.compare_countPlots(test)
  132.  
  133. # Swarm Plot for comparing counts of  Actual Stances vs Predicted Stances in Test Data on Random Forest model
  134. #viz.swarmPlot(test)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement