Advertisement
Guest User

Untitled

a guest
Jun 30th, 2016
31
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.70 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 17,
  6. "metadata": {
  7. "collapsed": true
  8. },
  9. "outputs": [],
  10. "source": [
  11. "import random\n",
  12. "import pandas as pd"
  13. ]
  14. },
  15. {
  16. "cell_type": "code",
  17. "execution_count": 21,
  18. "metadata": {
  19. "collapsed": false
  20. },
  21. "outputs": [],
  22. "source": [
  23. "# Construct the sample data (1,000,000 rows)\n",
  24. "data = []\n",
  25. "for i in range(1000000):\n",
  26. " row = []\n",
  27. " row.append(random.choice(['A1', 'A2', 'A3', 'A4', 'A5']))\n",
  28. " row.append(random.choice(['B1', 'B2', 'B3', 'B4', 'B5']))\n",
  29. " row.append(random.choice(['C1', 'C2', 'C3', 'C4', 'C5']))\n",
  30. " row.append(random.choice(['D1', 'D2', 'D3', 'D4', 'D5']))\n",
  31. " row.append(random.randint(1000, 100000))\n",
  32. " data.append(row)\n",
  33. "df = pd.DataFrame(data, columns=['A','B','C','D', 'val']) "
  34. ]
  35. },
  36. {
  37. "cell_type": "code",
  38. "execution_count": 22,
  39. "metadata": {
  40. "collapsed": false
  41. },
  42. "outputs": [
  43. {
  44. "data": {
  45. "text/html": [
  46. "<div>\n",
  47. "<table border=\"1\" class=\"dataframe\">\n",
  48. " <thead>\n",
  49. " <tr style=\"text-align: right;\">\n",
  50. " <th></th>\n",
  51. " <th>A</th>\n",
  52. " <th>B</th>\n",
  53. " <th>C</th>\n",
  54. " <th>D</th>\n",
  55. " <th>val</th>\n",
  56. " </tr>\n",
  57. " </thead>\n",
  58. " <tbody>\n",
  59. " <tr>\n",
  60. " <th>812419</th>\n",
  61. " <td>A1</td>\n",
  62. " <td>B1</td>\n",
  63. " <td>C2</td>\n",
  64. " <td>D4</td>\n",
  65. " <td>31882</td>\n",
  66. " </tr>\n",
  67. " <tr>\n",
  68. " <th>964167</th>\n",
  69. " <td>A5</td>\n",
  70. " <td>B3</td>\n",
  71. " <td>C2</td>\n",
  72. " <td>D2</td>\n",
  73. " <td>65951</td>\n",
  74. " </tr>\n",
  75. " <tr>\n",
  76. " <th>505576</th>\n",
  77. " <td>A1</td>\n",
  78. " <td>B2</td>\n",
  79. " <td>C4</td>\n",
  80. " <td>D3</td>\n",
  81. " <td>29178</td>\n",
  82. " </tr>\n",
  83. " <tr>\n",
  84. " <th>729472</th>\n",
  85. " <td>A4</td>\n",
  86. " <td>B5</td>\n",
  87. " <td>C2</td>\n",
  88. " <td>D4</td>\n",
  89. " <td>46592</td>\n",
  90. " </tr>\n",
  91. " <tr>\n",
  92. " <th>43541</th>\n",
  93. " <td>A4</td>\n",
  94. " <td>B3</td>\n",
  95. " <td>C3</td>\n",
  96. " <td>D2</td>\n",
  97. " <td>92247</td>\n",
  98. " </tr>\n",
  99. " </tbody>\n",
  100. "</table>\n",
  101. "</div>"
  102. ],
  103. "text/plain": [
  104. " A B C D val\n",
  105. "812419 A1 B1 C2 D4 31882\n",
  106. "964167 A5 B3 C2 D2 65951\n",
  107. "505576 A1 B2 C4 D3 29178\n",
  108. "729472 A4 B5 C2 D4 46592\n",
  109. "43541 A4 B3 C3 D2 92247"
  110. ]
  111. },
  112. "execution_count": 22,
  113. "metadata": {},
  114. "output_type": "execute_result"
  115. }
  116. ],
  117. "source": [
  118. "# Sample output\n",
  119. "df.sample(5)"
  120. ]
  121. },
  122. {
  123. "cell_type": "code",
  124. "execution_count": 23,
  125. "metadata": {
  126. "collapsed": false
  127. },
  128. "outputs": [
  129. {
  130. "data": {
  131. "text/plain": [
  132. "(1000000, 5)"
  133. ]
  134. },
  135. "execution_count": 23,
  136. "metadata": {},
  137. "output_type": "execute_result"
  138. }
  139. ],
  140. "source": [
  141. "df.shape"
  142. ]
  143. },
  144. {
  145. "cell_type": "code",
  146. "execution_count": 24,
  147. "metadata": {
  148. "collapsed": false
  149. },
  150. "outputs": [
  151. {
  152. "name": "stdout",
  153. "output_type": "stream",
  154. "text": [
  155. "<class 'pandas.core.frame.DataFrame'>\n",
  156. "RangeIndex: 1000000 entries, 0 to 999999\n",
  157. "Data columns (total 5 columns):\n",
  158. "A 1000000 non-null object\n",
  159. "B 1000000 non-null object\n",
  160. "C 1000000 non-null object\n",
  161. "D 1000000 non-null object\n",
  162. "val 1000000 non-null int64\n",
  163. "dtypes: int64(1), object(4)\n",
  164. "memory usage: 232.7 MB\n"
  165. ]
  166. }
  167. ],
  168. "source": [
  169. "# Memory usage with deep introspection\n",
  170. "df.info(memory_usage='deep')"
  171. ]
  172. },
  173. {
  174. "cell_type": "code",
  175. "execution_count": 25,
  176. "metadata": {
  177. "collapsed": false
  178. },
  179. "outputs": [
  180. {
  181. "name": "stdout",
  182. "output_type": "stream",
  183. "text": [
  184. "10 loops, best of 3: 57 ms per loop\n"
  185. ]
  186. }
  187. ],
  188. "source": [
  189. "%%timeit\n",
  190. "df[(df['A']== 'A1')]['val'].mean()"
  191. ]
  192. },
  193. {
  194. "cell_type": "code",
  195. "execution_count": 26,
  196. "metadata": {
  197. "collapsed": false
  198. },
  199. "outputs": [
  200. {
  201. "name": "stdout",
  202. "output_type": "stream",
  203. "text": [
  204. "10 loops, best of 3: 80.9 ms per loop\n"
  205. ]
  206. }
  207. ],
  208. "source": [
  209. "%%timeit\n",
  210. "df[(df['A']== 'A2') & (df['B'] == 'B2')]['val'].mean()"
  211. ]
  212. },
  213. {
  214. "cell_type": "code",
  215. "execution_count": 27,
  216. "metadata": {
  217. "collapsed": false
  218. },
  219. "outputs": [
  220. {
  221. "name": "stdout",
  222. "output_type": "stream",
  223. "text": [
  224. "10 loops, best of 3: 119 ms per loop\n"
  225. ]
  226. }
  227. ],
  228. "source": [
  229. "%%timeit\n",
  230. "df[(df['A']== 'A4') & (df['C'] == 'C4') & (df['D'] == 'D4')]['val'].mean()"
  231. ]
  232. },
  233. {
  234. "cell_type": "code",
  235. "execution_count": 28,
  236. "metadata": {
  237. "collapsed": false
  238. },
  239. "outputs": [],
  240. "source": [
  241. "for col in df.columns.tolist()[:-1]:\n",
  242. " df[col] = df[col].astype('category')"
  243. ]
  244. },
  245. {
  246. "cell_type": "code",
  247. "execution_count": 29,
  248. "metadata": {
  249. "collapsed": false
  250. },
  251. "outputs": [
  252. {
  253. "name": "stdout",
  254. "output_type": "stream",
  255. "text": [
  256. "<class 'pandas.core.frame.DataFrame'>\n",
  257. "RangeIndex: 1000000 entries, 0 to 999999\n",
  258. "Data columns (total 5 columns):\n",
  259. "A 1000000 non-null category\n",
  260. "B 1000000 non-null category\n",
  261. "C 1000000 non-null category\n",
  262. "D 1000000 non-null category\n",
  263. "val 1000000 non-null int64\n",
  264. "dtypes: category(4), int64(1)\n",
  265. "memory usage: 11.4 MB\n"
  266. ]
  267. }
  268. ],
  269. "source": [
  270. "df.info(memory_usage='deep')"
  271. ]
  272. },
  273. {
  274. "cell_type": "code",
  275. "execution_count": 30,
  276. "metadata": {
  277. "collapsed": false
  278. },
  279. "outputs": [
  280. {
  281. "name": "stdout",
  282. "output_type": "stream",
  283. "text": [
  284. "100 loops, best of 3: 9.36 ms per loop\n"
  285. ]
  286. }
  287. ],
  288. "source": [
  289. "%%timeit\n",
  290. "df[(df['A']== 'A1')]['val'].mean()"
  291. ]
  292. },
  293. {
  294. "cell_type": "code",
  295. "execution_count": 31,
  296. "metadata": {
  297. "collapsed": false
  298. },
  299. "outputs": [
  300. {
  301. "name": "stdout",
  302. "output_type": "stream",
  303. "text": [
  304. "100 loops, best of 3: 6.79 ms per loop\n"
  305. ]
  306. }
  307. ],
  308. "source": [
  309. "%%timeit\n",
  310. "df[(df['A']== 'A2') & (df['B'] == 'B2')]['val'].mean()"
  311. ]
  312. },
  313. {
  314. "cell_type": "code",
  315. "execution_count": 32,
  316. "metadata": {
  317. "collapsed": false
  318. },
  319. "outputs": [
  320. {
  321. "name": "stdout",
  322. "output_type": "stream",
  323. "text": [
  324. "100 loops, best of 3: 9.37 ms per loop\n"
  325. ]
  326. }
  327. ],
  328. "source": [
  329. "%%timeit\n",
  330. "df[(df['A']== 'A4') & (df['C'] == 'C4') & (df['D'] == 'D4')]['val'].mean()"
  331. ]
  332. },
  333. {
  334. "cell_type": "code",
  335. "execution_count": null,
  336. "metadata": {
  337. "collapsed": true
  338. },
  339. "outputs": [],
  340. "source": []
  341. }
  342. ],
  343. "metadata": {
  344. "kernelspec": {
  345. "display_name": "Python 3",
  346. "language": "python",
  347. "name": "python3"
  348. },
  349. "language_info": {
  350. "codemirror_mode": {
  351. "name": "ipython",
  352. "version": 3
  353. },
  354. "file_extension": ".py",
  355. "mimetype": "text/x-python",
  356. "name": "python",
  357. "nbconvert_exporter": "python",
  358. "pygments_lexer": "ipython3",
  359. "version": "3.5.1"
  360. }
  361. },
  362. "nbformat": 4,
  363. "nbformat_minor": 0
  364. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement