Advertisement
Guest User

Untitled

a guest
Oct 18th, 2019
99
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 12.18 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "# Building Machine Learning Classifiers: Model selection"
  8. ]
  9. },
  10. {
  11. "cell_type": "markdown",
  12. "metadata": {},
  13. "source": [
  14. "### Read in & clean text"
  15. ]
  16. },
  17. {
  18. "cell_type": "code",
  19. "execution_count": 1,
  20. "metadata": {},
  21. "outputs": [],
  22. "source": [
  23. "import nltk\n",
  24. "import pandas as pd\n",
  25. "import re\n",
  26. "from sklearn.feature_extraction.text import TfidfVectorizer\n",
  27. "import string\n",
  28. "\n",
  29. "stopwords = nltk.corpus.stopwords.words('english')\n",
  30. "ps = nltk.PorterStemmer()\n",
  31. "\n",
  32. "data = pd.read_csv(\"SMSSpamCollection.tsv\", sep='\\t')\n",
  33. "data.columns = ['label', 'body_text']\n",
  34. "\n",
  35. "def count_punct(text):\n",
  36. " count = sum([1 for char in text if char in string.punctuation])\n",
  37. " return round(count/(len(text) - text.count(\" \")), 3)*100\n",
  38. "\n",
  39. "data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(\" \"))\n",
  40. "data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))\n",
  41. "\n",
  42. "def clean_text(text):\n",
  43. " text = \"\".join([word.lower() for word in text if word not in string.punctuation])\n",
  44. " tokens = re.split('\\W+', text)\n",
  45. " text = [ps.stem(word) for word in tokens if word not in stopwords]\n",
  46. " return text"
  47. ]
  48. },
  49. {
  50. "cell_type": "markdown",
  51. "metadata": {},
  52. "source": [
  53. "### Split into train/test"
  54. ]
  55. },
  56. {
  57. "cell_type": "code",
  58. "execution_count": 2,
  59. "metadata": {},
  60. "outputs": [],
  61. "source": [
  62. "from sklearn.model_selection import train_test_split\n",
  63. "\n",
  64. "X_train, X_test, y_train, y_test = train_test_split(data[['body_text', 'body_len', 'punct%']], data['label'], test_size=0.2)"
  65. ]
  66. },
  67. {
  68. "cell_type": "markdown",
  69. "metadata": {},
  70. "source": [
  71. "### Vectorize text"
  72. ]
  73. },
  74. {
  75. "cell_type": "code",
  76. "execution_count": 3,
  77. "metadata": {},
  78. "outputs": [
  79. {
  80. "data": {
  81. "text/html": [
  82. "<div>\n",
  83. "<style scoped>\n",
  84. " .dataframe tbody tr th:only-of-type {\n",
  85. " vertical-align: middle;\n",
  86. " }\n",
  87. "\n",
  88. " .dataframe tbody tr th {\n",
  89. " vertical-align: top;\n",
  90. " }\n",
  91. "\n",
  92. " .dataframe thead th {\n",
  93. " text-align: right;\n",
  94. " }\n",
  95. "</style>\n",
  96. "<table border=\"1\" class=\"dataframe\">\n",
  97. " <thead>\n",
  98. " <tr style=\"text-align: right;\">\n",
  99. " <th></th>\n",
  100. " <th>body_len</th>\n",
  101. " <th>punct%</th>\n",
  102. " <th>0</th>\n",
  103. " <th>1</th>\n",
  104. " <th>2</th>\n",
  105. " <th>3</th>\n",
  106. " <th>4</th>\n",
  107. " <th>5</th>\n",
  108. " <th>6</th>\n",
  109. " <th>7</th>\n",
  110. " <th>...</th>\n",
  111. " <th>7112</th>\n",
  112. " <th>7113</th>\n",
  113. " <th>7114</th>\n",
  114. " <th>7115</th>\n",
  115. " <th>7116</th>\n",
  116. " <th>7117</th>\n",
  117. " <th>7118</th>\n",
  118. " <th>7119</th>\n",
  119. " <th>7120</th>\n",
  120. " <th>7121</th>\n",
  121. " </tr>\n",
  122. " </thead>\n",
  123. " <tbody>\n",
  124. " <tr>\n",
  125. " <th>0</th>\n",
  126. " <td>328</td>\n",
  127. " <td>8.5</td>\n",
  128. " <td>0.0</td>\n",
  129. " <td>0.0</td>\n",
  130. " <td>0.0</td>\n",
  131. " <td>0.0</td>\n",
  132. " <td>0.0</td>\n",
  133. " <td>0.0</td>\n",
  134. " <td>0.0</td>\n",
  135. " <td>0.0</td>\n",
  136. " <td>...</td>\n",
  137. " <td>0.0</td>\n",
  138. " <td>0.0</td>\n",
  139. " <td>0.0</td>\n",
  140. " <td>0.0</td>\n",
  141. " <td>0.0</td>\n",
  142. " <td>0.0</td>\n",
  143. " <td>0.0</td>\n",
  144. " <td>0.000000</td>\n",
  145. " <td>0.0</td>\n",
  146. " <td>0.0</td>\n",
  147. " </tr>\n",
  148. " <tr>\n",
  149. " <th>1</th>\n",
  150. " <td>25</td>\n",
  151. " <td>0.0</td>\n",
  152. " <td>0.0</td>\n",
  153. " <td>0.0</td>\n",
  154. " <td>0.0</td>\n",
  155. " <td>0.0</td>\n",
  156. " <td>0.0</td>\n",
  157. " <td>0.0</td>\n",
  158. " <td>0.0</td>\n",
  159. " <td>0.0</td>\n",
  160. " <td>...</td>\n",
  161. " <td>0.0</td>\n",
  162. " <td>0.0</td>\n",
  163. " <td>0.0</td>\n",
  164. " <td>0.0</td>\n",
  165. " <td>0.0</td>\n",
  166. " <td>0.0</td>\n",
  167. " <td>0.0</td>\n",
  168. " <td>0.000000</td>\n",
  169. " <td>0.0</td>\n",
  170. " <td>0.0</td>\n",
  171. " </tr>\n",
  172. " <tr>\n",
  173. " <th>2</th>\n",
  174. " <td>112</td>\n",
  175. " <td>2.7</td>\n",
  176. " <td>0.0</td>\n",
  177. " <td>0.0</td>\n",
  178. " <td>0.0</td>\n",
  179. " <td>0.0</td>\n",
  180. " <td>0.0</td>\n",
  181. " <td>0.0</td>\n",
  182. " <td>0.0</td>\n",
  183. " <td>0.0</td>\n",
  184. " <td>...</td>\n",
  185. " <td>0.0</td>\n",
  186. " <td>0.0</td>\n",
  187. " <td>0.0</td>\n",
  188. " <td>0.0</td>\n",
  189. " <td>0.0</td>\n",
  190. " <td>0.0</td>\n",
  191. " <td>0.0</td>\n",
  192. " <td>0.000000</td>\n",
  193. " <td>0.0</td>\n",
  194. " <td>0.0</td>\n",
  195. " </tr>\n",
  196. " <tr>\n",
  197. " <th>3</th>\n",
  198. " <td>24</td>\n",
  199. " <td>8.3</td>\n",
  200. " <td>0.0</td>\n",
  201. " <td>0.0</td>\n",
  202. " <td>0.0</td>\n",
  203. " <td>0.0</td>\n",
  204. " <td>0.0</td>\n",
  205. " <td>0.0</td>\n",
  206. " <td>0.0</td>\n",
  207. " <td>0.0</td>\n",
  208. " <td>...</td>\n",
  209. " <td>0.0</td>\n",
  210. " <td>0.0</td>\n",
  211. " <td>0.0</td>\n",
  212. " <td>0.0</td>\n",
  213. " <td>0.0</td>\n",
  214. " <td>0.0</td>\n",
  215. " <td>0.0</td>\n",
  216. " <td>0.645353</td>\n",
  217. " <td>0.0</td>\n",
  218. " <td>0.0</td>\n",
  219. " </tr>\n",
  220. " <tr>\n",
  221. " <th>4</th>\n",
  222. " <td>94</td>\n",
  223. " <td>3.2</td>\n",
  224. " <td>0.0</td>\n",
  225. " <td>0.0</td>\n",
  226. " <td>0.0</td>\n",
  227. " <td>0.0</td>\n",
  228. " <td>0.0</td>\n",
  229. " <td>0.0</td>\n",
  230. " <td>0.0</td>\n",
  231. " <td>0.0</td>\n",
  232. " <td>...</td>\n",
  233. " <td>0.0</td>\n",
  234. " <td>0.0</td>\n",
  235. " <td>0.0</td>\n",
  236. " <td>0.0</td>\n",
  237. " <td>0.0</td>\n",
  238. " <td>0.0</td>\n",
  239. " <td>0.0</td>\n",
  240. " <td>0.000000</td>\n",
  241. " <td>0.0</td>\n",
  242. " <td>0.0</td>\n",
  243. " </tr>\n",
  244. " </tbody>\n",
  245. "</table>\n",
  246. "<p>5 rows × 7124 columns</p>\n",
  247. "</div>"
  248. ],
  249. "text/plain": [
  250. " body_len punct% 0 1 2 3 4 5 6 7 ... 7112 7113 \\\n",
  251. "0 328 8.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
  252. "1 25 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
  253. "2 112 2.7 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
  254. "3 24 8.3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
  255. "4 94 3.2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
  256. "\n",
  257. " 7114 7115 7116 7117 7118 7119 7120 7121 \n",
  258. "0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 \n",
  259. "1 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 \n",
  260. "2 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 \n",
  261. "3 0.0 0.0 0.0 0.0 0.0 0.645353 0.0 0.0 \n",
  262. "4 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 \n",
  263. "\n",
  264. "[5 rows x 7124 columns]"
  265. ]
  266. },
  267. "execution_count": 3,
  268. "metadata": {},
  269. "output_type": "execute_result"
  270. }
  271. ],
  272. "source": [
  273. "tfidf_vect = TfidfVectorizer(analyzer=clean_text)\n",
  274. "tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])\n",
  275. "\n",
  276. "tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])\n",
  277. "tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])\n",
  278. "\n",
  279. "X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True), \n",
  280. " pd.DataFrame(tfidf_train.toarray())], axis=1)\n",
  281. "X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True), \n",
  282. " pd.DataFrame(tfidf_test.toarray())], axis=1)\n",
  283. "\n",
  284. "X_train_vect.head()"
  285. ]
  286. },
  287. {
  288. "cell_type": "markdown",
  289. "metadata": {},
  290. "source": [
  291. "### Final evaluation of models"
  292. ]
  293. },
  294. {
  295. "cell_type": "code",
  296. "execution_count": 4,
  297. "metadata": {},
  298. "outputs": [],
  299. "source": [
  300. "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
  301. "from sklearn.metrics import precision_recall_fscore_support as score\n",
  302. "import time"
  303. ]
  304. },
  305. {
  306. "cell_type": "code",
  307. "execution_count": 5,
  308. "metadata": {},
  309. "outputs": [
  310. {
  311. "name": "stdout",
  312. "output_type": "stream",
  313. "text": [
  314. "Fit Time: 1.924, Predict Time: 0.152, Precision: 1.0 / Recall: 0.839 / Accuracy: 0.978\n",
  315. "Precision: 1.0 / Recall: 0.839 / Accuracy: 0.978\n"
  316. ]
  317. }
  318. ],
  319. "source": [
  320. "rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)\n",
  321. "\n",
  322. "rf_model = rf.fit(X_train_vect, y_train)\n",
  323. "y_pred = rf_model.predict(X_test_vect)\n",
  324. "rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)\n",
  325. "\n",
  326. "start = time.time()\n",
  327. "rf_model = rf.fit(X_train_vect, y_train)\n",
  328. "end = time.time()\n",
  329. "fit_time = (end - start)\n",
  330. "\n",
  331. "start = time.time()\n",
  332. "y_pred = rf_model.predict(X_test_vect)\n",
  333. "end = time.time()\n",
  334. "pred_time = (end - start)\n",
  335. "\n",
  336. "precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')\n",
  337. "print('Fit Time: {}, Predict Time: {}, Precision: {} / Recall: {} / Accuracy: {}'.format(\n",
  338. " round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))\n",
  339. "# precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')\n",
  340. "# print('Precision: {} / Recall: {} / Accuracy: {}'.format(\n",
  341. "# round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))"
  342. ]
  343. },
  344. {
  345. "cell_type": "code",
  346. "execution_count": 6,
  347. "metadata": {},
  348. "outputs": [
  349. {
  350. "name": "stdout",
  351. "output_type": "stream",
  352. "text": [
  353. "Fit Time: 218.932, Predict Time: 0.142, Precision: 0.915 / Recall: 0.866 / Accuracy: 0.971\n"
  354. ]
  355. }
  356. ],
  357. "source": [
  358. "gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)\n",
  359. "\n",
  360. "start = time.time()\n",
  361. "gb_model = gb.fit(X_train_vect, y_train)\n",
  362. "end = time.time()\n",
  363. "fit_time = (end - start)\n",
  364. "\n",
  365. "start = time.time()\n",
  366. "y_pred = gb_model.predict(X_test_vect)\n",
  367. "end = time.time()\n",
  368. "pred_time = (end - start)\n",
  369. "\n",
  370. "precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')\n",
  371. "print('Fit Time: {}, Predict Time: {}, Precision: {} / Recall: {} / Accuracy: {}'.format(\n",
  372. " round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))"
  373. ]
  374. },
  375. {
  376. "cell_type": "code",
  377. "execution_count": null,
  378. "metadata": {},
  379. "outputs": [],
  380. "source": []
  381. }
  382. ],
  383. "metadata": {
  384. "kernelspec": {
  385. "display_name": "Python 3",
  386. "language": "python",
  387. "name": "python3"
  388. },
  389. "language_info": {
  390. "codemirror_mode": {
  391. "name": "ipython",
  392. "version": 3
  393. },
  394. "file_extension": ".py",
  395. "mimetype": "text/x-python",
  396. "name": "python",
  397. "nbconvert_exporter": "python",
  398. "pygments_lexer": "ipython3",
  399. "version": "3.6.9"
  400. }
  401. },
  402. "nbformat": 4,
  403. "nbformat_minor": 2
  404. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement