Advertisement
Guest User

asdasdasd

a guest
Jan 22nd, 2020
104
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 10.34 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 7,
  6. "metadata": {},
  7. "outputs": [
  8. {
  9. "name": "stdout",
  10. "output_type": "stream",
  11. "text": [
  12. "\u001b[32m\u001b[1mActivating\u001b[22m\u001b[39m environment at `~/Project.toml`\n"
  13. ]
  14. }
  15. ],
  16. "source": [
  17. "] activate ../"
  18. ]
  19. },
  20. {
  21. "cell_type": "markdown",
  22. "metadata": {},
  23. "source": [
  24. "## Problem"
  25. ]
  26. },
  27. {
  28. "cell_type": "markdown",
  29. "metadata": {},
  30. "source": [
  31. "Optimize the following function."
  32. ]
  33. },
  34. {
  35. "cell_type": "code",
  36. "execution_count": 10,
  37. "metadata": {},
  38. "outputs": [
  39. {
  40. "data": {
  41. "text/plain": [
  42. "work (generic function with 1 method)"
  43. ]
  44. },
  45. "execution_count": 10,
  46. "metadata": {},
  47. "output_type": "execute_result"
  48. }
  49. ],
  50. "source": [
  51. "function work(mat, s, v, N)\n",
  52. " val = 0.0\n",
  53. " for i in 1:N\n",
  54. " for j in 1:N\n",
  55. " val = mod(v[i],256);\n",
  56. " mat[i,j] = s[i,j]*(sin(val)*sin(val)-cos(val)*cos(val));\n",
  57. " end\n",
  58. " end;\n",
  59. "end"
  60. ]
  61. },
  62. {
  63. "cell_type": "code",
  64. "execution_count": 11,
  65. "metadata": {},
  66. "outputs": [
  67. {
  68. "data": {
  69. "text/plain": [
  70. "\u001b[32m\u001b[1mTest Passed\u001b[22m\u001b[39m"
  71. ]
  72. },
  73. "execution_count": 11,
  74. "metadata": {},
  75. "output_type": "execute_result"
  76. }
  77. ],
  78. "source": [
  79. "using Test\n",
  80. "x = rand()\n",
  81. "@test 1-2*cos(x)*cos(x) ≈ sin(x)*sin(x)-cos(x)*cos(x)\n",
  82. "@test -cos(2*x) ≈ sin(x)*sin(x)-cos(x)*cos(x)"
  83. ]
  84. },
  85. {
  86. "cell_type": "code",
  87. "execution_count": 12,
  88. "metadata": {},
  89. "outputs": [
  90. {
  91. "data": {
  92. "text/plain": [
  93. "opt1 (generic function with 1 method)"
  94. ]
  95. },
  96. "execution_count": 12,
  97. "metadata": {},
  98. "output_type": "execute_result"
  99. }
  100. ],
  101. "source": [
  102. "# hoist the loop-invariant value out of the inner loop + analytical simplification (sin^2 - cos^2 = -cos(2x))\n",
  103. "function opt1(mat, s, v, N)\n",
  104. " val = 0.0\n",
  105. " @inbounds for i in 1:N\n",
  106. " val = mod(v[i],256);\n",
  107. " val = -cos(2*val)\n",
  108. " for j in 1:N\n",
  109. " mat[i,j] = s[i,j]*val;\n",
  110. " end\n",
  111. " end;\n",
  112. " mat\n",
  113. "end"
  114. ]
  115. },
  116. {
  117. "cell_type": "code",
  118. "execution_count": 13,
  119. "metadata": {},
  120. "outputs": [
  121. {
  122. "data": {
  123. "text/plain": [
  124. "opt12 (generic function with 1 method)"
  125. ]
  126. },
  127. "execution_count": 13,
  128. "metadata": {},
  129. "output_type": "execute_result"
  130. }
  131. ],
  132. "source": [
  133. "# hoisting + analytical simplification + precompute all values into a vector\n",
  134. "function opt12(mat, s, v, N)\n",
  135. " val = Vector{Float64}(undef, length(v))\n",
  136. " @inbounds for i in eachindex(val)\n",
  137. " val[i] = -cos(2*mod(v[i],256));\n",
  138. " end\n",
  139. " \n",
  140. " @inbounds for i in 1:N\n",
  141. " for j in 1:N\n",
  142. " mat[i,j] = s[i,j]*val[i];\n",
  143. " end\n",
  144. " end;\n",
  145. " mat\n",
  146. "end"
  147. ]
  148. },
  149. {
  150. "cell_type": "code",
  151. "execution_count": 14,
  152. "metadata": {},
  153. "outputs": [
  154. {
  155. "data": {
  156. "text/plain": [
  157. "opt2 (generic function with 1 method)"
  158. ]
  159. },
  160. "execution_count": 14,
  161. "metadata": {},
  162. "output_type": "execute_result"
  163. }
  164. ],
  165. "source": [
  166. "# reordering loops so the inner loop runs over the first index (Julia arrays are column-major)\n",
  167. "function opt2(mat, s, v, N)\n",
  168. " val = 0.0\n",
  169. " @inbounds for j in 1:N\n",
  170. " for i in 1:N\n",
  171. " val = mod(v[i],256);\n",
  172. " val = -cos(2*val)\n",
  173. " mat[i,j] = s[i,j]*val;\n",
  174. " end\n",
  175. " end;\n",
  176. "end"
  177. ]
  178. },
  179. {
  180. "cell_type": "code",
  181. "execution_count": 15,
  182. "metadata": {},
  183. "outputs": [
  184. {
  185. "data": {
  186. "text/plain": [
  187. "opt22 (generic function with 1 method)"
  188. ]
  189. },
  190. "execution_count": 15,
  191. "metadata": {},
  192. "output_type": "execute_result"
  193. }
  194. ],
  195. "source": [
  196. "# reordering loops + precompute all values into a vector\n",
  197. "function opt22(mat, s, v, N)\n",
  198. " val = Vector{Float64}(undef, length(v))\n",
  199. " @inbounds for i in eachindex(val)\n",
  200. " val[i] = -cos(2*mod(v[i],256));\n",
  201. " end\n",
  202. " \n",
  203. " @inbounds for j in 1:N\n",
  204. " for i in 1:N\n",
  205. " mat[i,j] = s[i,j]*val[i];\n",
  206. " end\n",
  207. " end;\n",
  208. "end"
  209. ]
  210. },
  211. {
  212. "cell_type": "markdown",
  213. "metadata": {},
  214. "source": [
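  215. "Further possible optimizations: cache blocking, a lookup table for the values of `val` (for integer `v`, `mod(v[i],256)` takes at most 256 distinct values), ..."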
  216. ]
  217. },
  218. {
  219. "cell_type": "markdown",
  220. "metadata": {},
  221. "source": [
  222. "## Single thread benchmarks"
  223. ]
  224. },
  225. {
  226. "cell_type": "code",
  227. "execution_count": 22,
  228. "metadata": {},
  229. "outputs": [],
  230. "source": [
  231. "using BenchmarkTools\n",
  232. "N = 4000\n",
  233. "mat = zeros(N,N)\n",
  234. "s = rand(N,N)\n",
  235. "v = rand(Int, N);"
  236. ]
  237. },
  238. {
  239. "cell_type": "code",
  240. "execution_count": 24,
  241. "metadata": {},
  242. "outputs": [
  243. {
  244. "name": "stdout",
  245. "output_type": "stream",
  246. "text": [
  247. "Performance: 8.832933179745996 MIt/s\n"
  248. ]
  249. }
  250. ],
  251. "source": [
  252. "runtime = @belapsed work($mat, $s, $v, $N);\n",
  253. "perf = N*N*1e-6/runtime # MIt/s\n",
  254. "println(\"Performance: $perf MIt/s\")"
  255. ]
  256. },
  257. {
  258. "cell_type": "code",
  259. "execution_count": 25,
  260. "metadata": {},
  261. "outputs": [
  262. {
  263. "name": "stdout",
  264. "output_type": "stream",
  265. "text": [
  266. "Performance: 18.954256315496306 MIt/s\n"
  267. ]
  268. }
  269. ],
  270. "source": [
  271. "runtime = @belapsed opt1($mat, $s, $v, $N);\n",
  272. "perf = N*N*1e-6/runtime # MIt/s\n",
  273. "println(\"Performance: $perf MIt/s\")"
  274. ]
  275. },
  276. {
  277. "cell_type": "code",
  278. "execution_count": 26,
  279. "metadata": {},
  280. "outputs": [
  281. {
  282. "name": "stdout",
  283. "output_type": "stream",
  284. "text": [
  285. "Performance: 18.667771913881896 MIt/s\n"
  286. ]
  287. }
  288. ],
  289. "source": [
  290. "runtime = @belapsed opt12($mat, $s, $v, $N);\n",
  291. "perf = N*N*1e-6/runtime # MIt/s\n",
  292. "println(\"Performance: $perf MIt/s\")"
  293. ]
  294. },
  295. {
  296. "cell_type": "code",
  297. "execution_count": 27,
  298. "metadata": {},
  299. "outputs": [
  300. {
  301. "name": "stdout",
  302. "output_type": "stream",
  303. "text": [
  304. "Performance: 39.56193729495743 MIt/s\n"
  305. ]
  306. }
  307. ],
  308. "source": [
  309. "runtime = @belapsed opt2($mat, $s, $v, $N);\n",
  310. "perf = N*N*1e-6/runtime # MIt/s\n",
  311. "println(\"Performance: $perf MIt/s\")"
  312. ]
  313. },
  314. {
  315. "cell_type": "code",
  316. "execution_count": 28,
  317. "metadata": {},
  318. "outputs": [
  319. {
  320. "name": "stdout",
  321. "output_type": "stream",
  322. "text": [
  323. "Performance: 641.2379740326675 MIt/s\n"
  324. ]
  325. }
  326. ],
  327. "source": [
  328. "runtime = @belapsed opt22($mat, $s, $v, $N);\n",
  329. "perf = N*N*1e-6/runtime # MIt/s\n",
  330. "println(\"Performance: $perf MIt/s\")"
  331. ]
  332. },
  333. {
  334. "cell_type": "code",
  335. "execution_count": 29,
  336. "metadata": {},
  337. "outputs": [
  338. {
  339. "data": {
  340. "text/plain": [
  341. "52.957615277130884"
  342. ]
  343. },
  344. "execution_count": 29,
  345. "metadata": {},
  346. "output_type": "execute_result"
  347. }
  348. ],
  349. "source": [
  350. "641/21.47"
  351. ]
  352. },
  353. {
  354. "cell_type": "markdown",
  355. "metadata": {},
  356. "source": [
  357. "## Multi-threading"
  358. ]
  359. },
  360. {
  361. "cell_type": "code",
  362. "execution_count": 34,
  363. "metadata": {},
  364. "outputs": [
  365. {
  366. "data": {
  367. "text/plain": [
  368. "8"
  369. ]
  370. },
  371. "execution_count": 34,
  372. "metadata": {},
  373. "output_type": "execute_result"
  374. }
  375. ],
  376. "source": [
  377. "using Hwloc\n",
  378. "Hwloc.num_physical_cores()"
  379. ]
  380. },
  381. {
  382. "cell_type": "code",
  383. "execution_count": 35,
  384. "metadata": {},
  385. "outputs": [
  386. {
  387. "data": {
  388. "text/plain": [
  389. "8"
  390. ]
  391. },
  392. "execution_count": 35,
  393. "metadata": {},
  394. "output_type": "execute_result"
  395. }
  396. ],
  397. "source": [
  398. "Base.Threads.nthreads()"
  399. ]
  400. },
  401. {
  402. "cell_type": "code",
  403. "execution_count": null,
  404. "metadata": {},
  405. "outputs": [],
  406. "source": [
  407. "function opt22_threaded(mat, s, v, N)\n",
  408. " val = Vector{Float64}(undef, length(v))\n",
  409. " @inbounds for i in eachindex(val)\n",
  410. " val[i] = -cos(2*mod(v[i],256));\n",
  411. " end\n",
  412. " \n",
  413. " @inbounds Threads.@threads for j in 1:N\n",
  414. " for i in 1:N\n",
  415. " mat[i,j] = s[i,j]*val[i];\n",
  416. " end\n",
  417. " end;\n",
  418. " mat\n",
  419. "end"
  420. ]
  421. },
  422. {
  423. "cell_type": "code",
  424. "execution_count": 30,
  425. "metadata": {},
  426. "outputs": [
  427. {
  428. "name": "stdout",
  429. "output_type": "stream",
  430. "text": [
  431. "Performance: 1710.8674621986506 MIt/s\n"
  432. ]
  433. }
  434. ],
  435. "source": [
  436. "runtime = @belapsed opt22_threaded($mat, $s, $v, $N);\n",
  437. "perf = N*N*1e-6/runtime # MIt/s\n",
  438. "println(\"Performance: $perf MIt/s\")\n",
  439. "# Performance (1 thread): 678.7915812874131 MIt/s\n",
  440. "# Performance (4 threads): 1172.5993721170087 MIt/s"
  441. ]
  442. },
  443. {
  444. "cell_type": "markdown",
  445. "metadata": {},
  446. "source": [
  447. "## Maximal performance?"
  448. ]
  449. },
  450. {
  451. "cell_type": "markdown",
  452. "metadata": {},
  453. "source": [
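  454. "Upper bound from the \"roofline model\": assuming the kernel is memory-bound, the achievable rate is roughly the memory bandwidth divided by the bytes moved per iteration."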
  455. ]
  456. },
  457. {
  458. "cell_type": "code",
  459. "execution_count": 84,
  460. "metadata": {},
  461. "outputs": [
  462. {
  463. "name": "stdout",
  464. "output_type": "stream",
  465. "text": [
  466. "Memory bounded performance: 2.08 GIt/s\n"
  467. ]
  468. }
  469. ],
  470. "source": [
  471. "bs = 50 # [GB/s] max memory bandwidth (only an estimate for the unknown CPU model)\n",
  472. "traffic = 24 # [B/iter] in each iteration we have: LOAD s, LOAD + STORE mat, each contributing 8 B\n",
  473. "println(\"Memory bounded performance: \", round(bs/traffic, digits=2), \" GIt/s\")"
  474. ]
  475. }
  476. ],
  477. "metadata": {
  478. "kernelspec": {
  479. "display_name": "Julia 1.3.1",
  480. "language": "julia",
  481. "name": "julia-1.3"
  482. },
  483. "language_info": {
  484. "file_extension": ".jl",
  485. "mimetype": "application/julia",
  486. "name": "julia",
  487. "version": "1.3.1"
  488. }
  489. },
  490. "nbformat": 4,
  491. "nbformat_minor": 4
  492. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement