Advertisement
Guest User

Untitled

a guest
Jul 22nd, 2019
145
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 49.18 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "# Pairwise distance benchmark, CPU vs GPU (update 2019-07-11)\n",
  8. "\n",
  9. "Benchmark two pairwise distance functions, city block (a.k.a. Manhattan) and squared Euclidean, computed between all pairs of columns of a 2D array. Compare: (1) parallel CPU implementation, (2) SciPy implementation, (3) GPU implementation."
  10. ]
  11. },
  12. {
  13. "cell_type": "code",
  14. "execution_count": 1,
  15. "metadata": {},
  16. "outputs": [],
  17. "source": [
  18. "import numpy as np\n",
  19. "from scipy.spatial.distance import pdist as sp_pdist, squareform\n",
  20. "import numba\n",
  21. "from numba import cuda\n",
  22. "import math\n",
  23. "import dask.array as da"
  24. ]
  25. },
  26. {
  27. "cell_type": "code",
  28. "execution_count": 2,
  29. "metadata": {},
  30. "outputs": [],
  31. "source": [
  32. "# create some input data, use float32 throughout\n",
  33. "X = np.random.randint(0, 4, size=(2000, 2000), dtype='i1').astype('f4')\n",
  34. "XF = np.asfortranarray(X)"
  35. ]
  36. },
  37. {
  38. "cell_type": "markdown",
  39. "metadata": {},
  40. "source": [
  41. "## Parallel CPU implementations\n",
  42. "\n",
  43. "Compare two algorithms for passing through the data, one row-wise, one column-wise, on both row-major and column-major data, to investigate the impact of data layout in memory."
  44. ]
  45. },
  46. {
  47. "cell_type": "code",
  48. "execution_count": 3,
  49. "metadata": {},
  50. "outputs": [],
  51. "source": [
  52. "@numba.jit(numba.float32[:, :](numba.float32[:, :]), nopython=True, parallel=True)\n",
  53. "def pdist_cityblock_rowm_nb(x):\n",
  54. " m = x.shape[0]\n",
  55. " n = x.shape[1]\n",
  56. " out = np.zeros((m, n), dtype=np.float32)\n",
  57. " for i in numba.prange(m):\n",
  58. " for j in range(n):\n",
  59. " vj = x[i, j]\n",
  60. " for k in range(j + 1, n):\n",
  61. " vk = x[i, k]\n",
  62. " d = math.fabs(vk - vj)\n",
  63. " out[j, k] += d\n",
  64. " return out"
  65. ]
  66. },
  67. {
  68. "cell_type": "code",
  69. "execution_count": 4,
  70. "metadata": {},
  71. "outputs": [],
  72. "source": [
  73. "@numba.jit(numba.float32[:, :](numba.float32[:, :]), nopython=True, parallel=True)\n",
  74. "def pdist_sqeuclid_rowm_nb(x):\n",
  75. " m = x.shape[0]\n",
  76. " n = x.shape[1]\n",
  77. " out = np.zeros((m, n), dtype=np.float32)\n",
  78. " for i in numba.prange(m):\n",
  79. " for j in range(n):\n",
  80. " vj = x[i, j]\n",
  81. " for k in range(j + 1, n):\n",
  82. " vk = x[i, k]\n",
  83. " d = (vk - vj)**2\n",
  84. " out[j, k] += d\n",
  85. " return out"
  86. ]
  87. },
  88. {
  89. "cell_type": "code",
  90. "execution_count": 5,
  91. "metadata": {},
  92. "outputs": [],
  93. "source": [
  94. "@numba.jit(numba.float32[:, :](numba.float32[:, :]), nopython=True, parallel=True)\n",
  95. "def pdist_cityblock_colm_nb(x):\n",
  96. " m = x.shape[0]\n",
  97. " n = x.shape[1]\n",
  98. " out = np.zeros((m, n), dtype=np.float32)\n",
  99. " for j in numba.prange(n):\n",
  100. " for k in range(j + 1, n):\n",
  101. " d = np.float32(0)\n",
  102. " for i in range(m):\n",
  103. " vj = x[i, j]\n",
  104. " vk = x[i, k]\n",
  105. " d += math.fabs(vk - vj)\n",
  106. " out[j, k] = d\n",
  107. " return out"
  108. ]
  109. },
  110. {
  111. "cell_type": "code",
  112. "execution_count": 6,
  113. "metadata": {},
  114. "outputs": [],
  115. "source": [
  116. "@numba.jit(numba.float32[:, :](numba.float32[:, :]), nopython=True, parallel=True)\n",
  117. "def pdist_sqeuclid_colm_nb(x):\n",
  118. " m = x.shape[0]\n",
  119. " n = x.shape[1]\n",
  120. " out = np.zeros((m, n), dtype=np.float32)\n",
  121. " for j in numba.prange(n):\n",
  122. " for k in range(j + 1, n):\n",
  123. " d = np.float32(0)\n",
  124. " for i in range(m):\n",
  125. " vj = x[i, j]\n",
  126. " vk = x[i, k]\n",
  127. " d += (vk - vj)**2\n",
  128. " out[j, k] = d\n",
  129. " return out\n"
  130. ]
  131. },
  132. {
  133. "cell_type": "code",
  134. "execution_count": 7,
  135. "metadata": {},
  136. "outputs": [
  137. {
  138. "name": "stdout",
  139. "output_type": "stream",
  140. "text": [
  141. "CPU times: user 9.65 s, sys: 26.6 ms, total: 9.68 s\n",
  142. "Wall time: 1.25 s\n"
  143. ]
  144. },
  145. {
  146. "data": {
  147. "text/plain": [
  148. "array([[ 0., 2465., 2480., ..., 2429., 2506., 2528.],\n",
  149. " [ 0., 0., 2520., ..., 2467., 2460., 2482.],\n",
  150. " [ 0., 0., 0., ..., 2475., 2512., 2514.],\n",
  151. " ...,\n",
  152. " [ 0., 0., 0., ..., 0., 2493., 2494.],\n",
  153. " [ 0., 0., 0., ..., 0., 0., 2444.],\n",
  154. " [ 0., 0., 0., ..., 0., 0., 0.]], dtype=float32)"
  155. ]
  156. },
  157. "execution_count": 7,
  158. "metadata": {},
  159. "output_type": "execute_result"
  160. }
  161. ],
  162. "source": [
  163. "%%time\n",
  164. "pdist_cityblock_rowm_nb(X)"
  165. ]
  166. },
  167. {
  168. "cell_type": "code",
  169. "execution_count": 8,
  170. "metadata": {},
  171. "outputs": [
  172. {
  173. "name": "stdout",
  174. "output_type": "stream",
  175. "text": [
  176. "CPU times: user 9.74 s, sys: 24.1 ms, total: 9.77 s\n",
  177. "Wall time: 1.28 s\n"
  178. ]
  179. },
  180. {
  181. "data": {
  182. "text/plain": [
  183. "array([[ 0., 4863., 4916., ..., 4815., 5018., 4968.],\n",
  184. " [ 0., 0., 5004., ..., 4983., 4840., 4944.],\n",
  185. " [ 0., 0., 0., ..., 4903., 5020., 4996.],\n",
  186. " ...,\n",
  187. " [ 0., 0., 0., ..., 0., 4958., 4994.],\n",
  188. " [ 0., 0., 0., ..., 0., 0., 4901.],\n",
  189. " [ 0., 0., 0., ..., 0., 0., 0.]], dtype=float32)"
  190. ]
  191. },
  192. "execution_count": 8,
  193. "metadata": {},
  194. "output_type": "execute_result"
  195. }
  196. ],
  197. "source": [
  198. "%%time\n",
  199. "pdist_sqeuclid_rowm_nb(X)"
  200. ]
  201. },
  202. {
  203. "cell_type": "code",
  204. "execution_count": 9,
  205. "metadata": {},
  206. "outputs": [
  207. {
  208. "name": "stdout",
  209. "output_type": "stream",
  210. "text": [
  211. "CPU times: user 5.66 s, sys: 40.1 ms, total: 5.7 s\n",
  212. "Wall time: 1.32 s\n"
  213. ]
  214. },
  215. {
  216. "data": {
  217. "text/plain": [
  218. "array([[ 0., 2476., 2482., ..., 2429., 2506., 2528.],\n",
  219. " [ 0., 0., 2520., ..., 2467., 2460., 2482.],\n",
  220. " [ 0., 0., 0., ..., 2475., 2512., 2514.],\n",
  221. " ...,\n",
  222. " [ 0., 0., 0., ..., 0., 2535., 2499.],\n",
  223. " [ 0., 0., 0., ..., 0., 0., 2502.],\n",
  224. " [ 0., 0., 0., ..., 0., 0., 0.]], dtype=float32)"
  225. ]
  226. },
  227. "execution_count": 9,
  228. "metadata": {},
  229. "output_type": "execute_result"
  230. }
  231. ],
  232. "source": [
  233. "%%time\n",
  234. "pdist_cityblock_colm_nb(XF)"
  235. ]
  236. },
  237. {
  238. "cell_type": "code",
  239. "execution_count": 10,
  240. "metadata": {},
  241. "outputs": [
  242. {
  243. "name": "stdout",
  244. "output_type": "stream",
  245. "text": [
  246. "CPU times: user 5.28 s, sys: 43.7 ms, total: 5.32 s\n",
  247. "Wall time: 1.24 s\n"
  248. ]
  249. },
  250. {
  251. "data": {
  252. "text/plain": [
  253. "array([[ 0., 4864., 4920., ..., 4815., 5018., 4968.],\n",
  254. " [ 0., 0., 5004., ..., 4983., 4840., 4944.],\n",
  255. " [ 0., 0., 0., ..., 4903., 5020., 4996.],\n",
  256. " ...,\n",
  257. " [ 0., 0., 0., ..., 0., 5093., 5015.],\n",
  258. " [ 0., 0., 0., ..., 0., 0., 4956.],\n",
  259. " [ 0., 0., 0., ..., 0., 0., 0.]], dtype=float32)"
  260. ]
  261. },
  262. "execution_count": 10,
  263. "metadata": {},
  264. "output_type": "execute_result"
  265. }
  266. ],
  267. "source": [
  268. "%%time\n",
  269. "pdist_sqeuclid_colm_nb(XF)"
  270. ]
  271. },
  272. {
  273. "cell_type": "markdown",
  274. "metadata": {},
  275. "source": [
  276. "## SciPy implementations\n",
  277. "\n",
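  278. "Compare with the scipy.spatial.distance implementations, which are hand-optimised C. N.B., scipy's pdist computes distances between the rows of its input, whereas the functions above compute distances between columns, so the data are transposed (`.T`) before each call. It also returns the distances as a condensed 1-D float64 vector rather than a square float32 matrix."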
  279. ]
  280. },
  281. {
  282. "cell_type": "code",
  283. "execution_count": 11,
  284. "metadata": {},
  285. "outputs": [
  286. {
  287. "name": "stdout",
  288. "output_type": "stream",
  289. "text": [
  290. "CPU times: user 2.77 s, sys: 20.1 ms, total: 2.79 s\n",
  291. "Wall time: 2.79 s\n"
  292. ]
  293. },
  294. {
  295. "data": {
  296. "text/plain": [
  297. "array([2476., 2482., 2491., ..., 2535., 2499., 2502.])"
  298. ]
  299. },
  300. "execution_count": 11,
  301. "metadata": {},
  302. "output_type": "execute_result"
  303. }
  304. ],
  305. "source": [
  306. "%%time\n",
  307. "sp_pdist(X.T, metric='cityblock')"
  308. ]
  309. },
  310. {
  311. "cell_type": "code",
  312. "execution_count": 12,
  313. "metadata": {},
  314. "outputs": [
  315. {
  316. "name": "stdout",
  317. "output_type": "stream",
  318. "text": [
  319. "CPU times: user 2.68 s, sys: 16.1 ms, total: 2.7 s\n",
  320. "Wall time: 2.7 s\n"
  321. ]
  322. },
  323. {
  324. "data": {
  325. "text/plain": [
  326. "array([2476., 2482., 2491., ..., 2535., 2499., 2502.])"
  327. ]
  328. },
  329. "execution_count": 12,
  330. "metadata": {},
  331. "output_type": "execute_result"
  332. }
  333. ],
  334. "source": [
  335. "%%time\n",
  336. "sp_pdist(XF.T, metric='cityblock')"
  337. ]
  338. },
  339. {
  340. "cell_type": "code",
  341. "execution_count": 13,
  342. "metadata": {},
  343. "outputs": [
  344. {
  345. "name": "stdout",
  346. "output_type": "stream",
  347. "text": [
  348. "CPU times: user 2.69 s, sys: 16.1 ms, total: 2.71 s\n",
  349. "Wall time: 2.71 s\n"
  350. ]
  351. },
  352. {
  353. "data": {
  354. "text/plain": [
  355. "array([4864., 4920., 5005., ..., 5093., 5015., 4956.])"
  356. ]
  357. },
  358. "execution_count": 13,
  359. "metadata": {},
  360. "output_type": "execute_result"
  361. }
  362. ],
  363. "source": [
  364. "%%time\n",
  365. "sp_pdist(X.T, metric='sqeuclidean')"
  366. ]
  367. },
  368. {
  369. "cell_type": "code",
  370. "execution_count": 14,
  371. "metadata": {},
  372. "outputs": [
  373. {
  374. "name": "stdout",
  375. "output_type": "stream",
  376. "text": [
  377. "CPU times: user 2.67 s, sys: 3.87 ms, total: 2.67 s\n",
  378. "Wall time: 2.67 s\n"
  379. ]
  380. },
  381. {
  382. "data": {
  383. "text/plain": [
  384. "array([4864., 4920., 5005., ..., 5093., 5015., 4956.])"
  385. ]
  386. },
  387. "execution_count": 14,
  388. "metadata": {},
  389. "output_type": "execute_result"
  390. }
  391. ],
  392. "source": [
  393. "%%time\n",
  394. "sp_pdist(XF.T, metric='sqeuclidean')"
  395. ]
  396. },
  397. {
  398. "cell_type": "markdown",
  399. "metadata": {},
  400. "source": [
  401. "## GPU implementations"
  402. ]
  403. },
  404. {
  405. "cell_type": "code",
  406. "execution_count": 15,
  407. "metadata": {},
  408. "outputs": [],
  409. "source": [
  410. "@cuda.jit(numba.void(numba.float32[:, :], numba.float32[:, :]))\n",
  411. "def pdist_cityblock_cuda(x, out):\n",
  412. " m = x.shape[0]\n",
  413. " n = x.shape[1]\n",
  414. " j, k = cuda.grid(2)\n",
  415. " if j < n and k < n and j < k:\n",
  416. " dd = np.float32(0)\n",
  417. " for i in range(m):\n",
  418. " vj = x[i, j]\n",
  419. " vk = x[i, k]\n",
  420. " d = math.fabs(vj - vk)\n",
  421. " dd += d\n",
  422. " out[j, k] = dd\n",
  423. "\n"
  424. ]
  425. },
  426. {
  427. "cell_type": "code",
  428. "execution_count": 16,
  429. "metadata": {},
  430. "outputs": [],
  431. "source": [
  432. "@cuda.jit(numba.void(numba.float32[:, :], numba.float32[:, :]))\n",
  433. "def pdist_sqeuclid_cuda(x, out):\n",
  434. " m = x.shape[0]\n",
  435. " n = x.shape[1]\n",
  436. " j, k = cuda.grid(2)\n",
  437. " if j < n and k < n and j < k:\n",
  438. " dd = np.float32(0)\n",
  439. " for i in range(m):\n",
  440. " vj = x[i, j]\n",
  441. " vk = x[i, k]\n",
  442. " d = (vj - vk)**2\n",
  443. " dd += d\n",
  444. " out[j, k] = dd\n",
  445. "\n"
  446. ]
  447. },
  448. {
  449. "cell_type": "code",
  450. "execution_count": 17,
  451. "metadata": {},
  452. "outputs": [
  453. {
  454. "data": {
  455. "text/plain": [
  456. "(125, 125)"
  457. ]
  458. },
  459. "execution_count": 17,
  460. "metadata": {},
  461. "output_type": "execute_result"
  462. }
  463. ],
  464. "source": [
  465. "threads = (16, 16)\n",
  466. "blocks = (\n",
  467. " math.ceil(X.shape[1] / threads[0]),\n",
  468. " math.ceil(X.shape[1] / threads[1]),\n",
  469. ")\n",
  470. "blocks"
  471. ]
  472. },
  473. {
  474. "cell_type": "code",
  475. "execution_count": 18,
  476. "metadata": {},
  477. "outputs": [
  478. {
  479. "name": "stdout",
  480. "output_type": "stream",
  481. "text": [
  482. "CPU times: user 14 ms, sys: 31 µs, total: 14 ms\n",
  483. "Wall time: 13 ms\n"
  484. ]
  485. }
  486. ],
  487. "source": [
  488. "%%time\n",
  489. "out = np.zeros((X.shape[1], X.shape[1]), dtype='f4')\n",
  490. "X_device = cuda.to_device(X)\n",
  491. "XF_device = cuda.to_device(XF)\n",
  492. "out_device = cuda.to_device(out)"
  493. ]
  494. },
  495. {
  496. "cell_type": "code",
  497. "execution_count": 19,
  498. "metadata": {},
  499. "outputs": [
  500. {
  501. "name": "stdout",
  502. "output_type": "stream",
  503. "text": [
  504. "154 ms ± 10.7 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)\n"
  505. ]
  506. }
  507. ],
  508. "source": [
  509. "%%timeit -n1 -r10\n",
  510. "pdist_cityblock_cuda[blocks, threads](X_device, out_device)\n",
  511. "cuda.synchronize()"
  512. ]
  513. },
  514. {
  515. "cell_type": "code",
  516. "execution_count": 20,
  517. "metadata": {},
  518. "outputs": [
  519. {
  520. "name": "stdout",
  521. "output_type": "stream",
  522. "text": [
  523. "CPU times: user 4.4 ms, sys: 8 ms, total: 12.4 ms\n",
  524. "Wall time: 11.5 ms\n"
  525. ]
  526. },
  527. {
  528. "data": {
  529. "text/plain": [
  530. "array([[ 0., 2476., 2482., ..., 2429., 2506., 2528.],\n",
  531. " [ 0., 0., 2520., ..., 2467., 2460., 2482.],\n",
  532. " [ 0., 0., 0., ..., 2475., 2512., 2514.],\n",
  533. " ...,\n",
  534. " [ 0., 0., 0., ..., 0., 2535., 2499.],\n",
  535. " [ 0., 0., 0., ..., 0., 0., 2502.],\n",
  536. " [ 0., 0., 0., ..., 0., 0., 0.]], dtype=float32)"
  537. ]
  538. },
  539. "execution_count": 20,
  540. "metadata": {},
  541. "output_type": "execute_result"
  542. }
  543. ],
  544. "source": [
  545. "%%time\n",
  546. "out_device.copy_to_host()"
  547. ]
  548. },
  549. {
  550. "cell_type": "code",
  551. "execution_count": 21,
  552. "metadata": {},
  553. "outputs": [
  554. {
  555. "name": "stdout",
  556. "output_type": "stream",
  557. "text": [
  558. "150 ms ± 7.12 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)\n"
  559. ]
  560. }
  561. ],
  562. "source": [
  563. "%%timeit -n1 -r10\n",
  564. "pdist_sqeuclid_cuda[blocks, threads](X_device, out_device)\n",
  565. "cuda.synchronize()"
  566. ]
  567. },
  568. {
  569. "cell_type": "code",
  570. "execution_count": 22,
  571. "metadata": {},
  572. "outputs": [
  573. {
  574. "data": {
  575. "text/plain": [
  576. "array([[ 0., 4864., 4920., ..., 4815., 5018., 4968.],\n",
  577. " [ 0., 0., 5004., ..., 4983., 4840., 4944.],\n",
  578. " [ 0., 0., 0., ..., 4903., 5020., 4996.],\n",
  579. " ...,\n",
  580. " [ 0., 0., 0., ..., 0., 5093., 5015.],\n",
  581. " [ 0., 0., 0., ..., 0., 0., 4956.],\n",
  582. " [ 0., 0., 0., ..., 0., 0., 0.]], dtype=float32)"
  583. ]
  584. },
  585. "execution_count": 22,
  586. "metadata": {},
  587. "output_type": "execute_result"
  588. }
  589. ],
  590. "source": [
  591. "out_device.copy_to_host()"
  592. ]
  593. },
  594. {
  595. "cell_type": "code",
  596. "execution_count": 23,
  597. "metadata": {},
  598. "outputs": [
  599. {
  600. "name": "stdout",
  601. "output_type": "stream",
  602. "text": [
  603. "546 ms ± 10.5 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)\n"
  604. ]
  605. }
  606. ],
  607. "source": [
  608. "%%timeit -n1 -r3\n",
  609. "pdist_cityblock_cuda[blocks, threads](XF_device, out_device)\n",
  610. "cuda.synchronize()"
  611. ]
  612. },
  613. {
  614. "cell_type": "code",
  615. "execution_count": 24,
  616. "metadata": {},
  617. "outputs": [
  618. {
  619. "name": "stdout",
  620. "output_type": "stream",
  621. "text": [
  622. "548 ms ± 11.9 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)\n"
  623. ]
  624. }
  625. ],
  626. "source": [
  627. "%%timeit -n1 -r3\n",
  628. "pdist_sqeuclid_cuda[blocks, threads](XF_device, out_device)\n",
  629. "cuda.synchronize()"
  630. ]
  631. },
  632. {
  633. "cell_type": "markdown",
  634. "metadata": {},
  635. "source": [
  636. "## Diagnostics"
  637. ]
  638. },
  639. {
  640. "cell_type": "code",
  641. "execution_count": 25,
  642. "metadata": {},
  643. "outputs": [
  644. {
  645. "name": "stdout",
  646. "output_type": "stream",
  647. "text": [
  648. "System info:\n",
  649. "--------------------------------------------------------------------------------\n",
  650. "__Time Stamp__\n",
  651. "2019-07-11 11:40:03.855282\n",
  652. "\n",
  653. "__Hardware Information__\n",
  654. "Machine : x86_64\n",
  655. "CPU Name : skylake\n",
  656. "Number of accessible CPU cores : 8\n",
  657. "Listed accessible CPUs cores : 0-7\n",
  658. "CFS restrictions : None\n",
  659. "CPU Features : \n",
  660. "64bit adx aes avx avx2 bmi bmi2 clflushopt cmov cx16 f16c fma fsgsbase invpcid\n",
  661. "lzcnt mmx movbe pclmul popcnt prfchw rdrnd rdseed rtm sahf sgx sse sse2 sse3\n",
  662. "sse4.1 sse4.2 ssse3 xsave xsavec xsaveopt xsaves\n",
  663. "\n",
  664. "__OS Information__\n",
  665. "Platform : Linux-4.15.0-54-generic-x86_64-with-debian-buster-sid\n",
  666. "Release : 4.15.0-54-generic\n",
  667. "System Name : Linux\n",
  668. "Version : #58-Ubuntu SMP Mon Jun 24 10:55:24 UTC 2019\n",
  669. "OS specific info : debianbuster/sid\n",
  670. "glibc info : glibc 2.10\n",
  671. "\n",
  672. "__Python Information__\n",
  673. "Python Compiler : GCC 7.3.0\n",
  674. "Python Implementation : CPython\n",
  675. "Python Version : 3.7.3\n",
  676. "Python Locale : en_GB UTF-8\n",
  677. "\n",
  678. "__LLVM information__\n",
  679. "LLVM version : 8.0.0\n",
  680. "\n",
  681. "__CUDA Information__\n",
  682. "Found 1 CUDA devices\n",
  683. "id 0 b'Quadro M1000M' [SUPPORTED]\n",
  684. " compute capability: 5.0\n",
  685. " pci device id: 0\n",
  686. " pci bus id: 1\n",
  687. "Summary:\n",
  688. "\t1/1 devices are supported\n",
  689. "CUDA driver version : 9020\n",
  690. "CUDA libraries:\n",
  691. "Finding cublas from Conda environment\n",
  692. "\tnamed libcublas.so.9.2.148\n",
  693. "\ttrying to open library...\tok\n",
  694. "Finding cusparse from Conda environment\n",
  695. "\tnamed libcusparse.so.9.2.148\n",
  696. "\ttrying to open library...\tok\n",
  697. "Finding cufft from Conda environment\n",
  698. "\tnamed libcufft.so.9.2.148\n",
  699. "\ttrying to open library...\tok\n",
  700. "Finding curand from Conda environment\n",
  701. "\tnamed libcurand.so.9.2.148\n",
  702. "\ttrying to open library...\tok\n",
  703. "Finding nvvm from Conda environment\n",
  704. "\tnamed libnvvm.so.3.2.0\n",
  705. "\ttrying to open library...\tok\n",
  706. "Finding libdevice from Conda environment\n",
  707. "\tsearching for compute_20...\tok\n",
  708. "\tsearching for compute_30...\tok\n",
  709. "\tsearching for compute_35...\tok\n",
  710. "\tsearching for compute_50...\tok\n",
  711. "\n",
  712. "__ROC Information__\n",
  713. "ROC available : False\n",
  714. "Error initialising ROC due to : No ROC toolchains found.\n",
  715. "No HSA Agents found, encountered exception when searching:\n",
  716. "Error at driver init: \n",
  717. "NUMBA_HSA_DRIVER /opt/rocm/lib/libhsa-runtime64.so is not a valid file path. Note it must be a filepath of the .so/.dll/.dylib or the driver:\n",
  718. "\n",
  719. "__SVML Information__\n",
  720. "SVML state, config.USING_SVML : False\n",
  721. "SVML library found and loaded : False\n",
  722. "llvmlite using SVML patched LLVM : True\n",
  723. "SVML operational : False\n",
  724. "\n",
  725. "__Threading Layer Information__\n",
  726. "TBB Threading layer available : False\n",
  727. "+--> Disabled due to : Unknown import problem.\n",
  728. "OpenMP Threading layer available : False\n",
  729. "+--> Disabled due to : Unknown import problem.\n",
  730. "Workqueue Threading layer available : True\n",
  731. "\n",
  732. "__Numba Environment Variable Information__\n",
  733. "None set.\n",
  734. "\n",
  735. "__Conda Information__\n",
  736. "conda_build_version : not installed\n",
  737. "conda_env_version : 4.7.5\n",
  738. "platform : linux-64\n",
  739. "python_version : 3.7.3.final.0\n",
  740. "root_writable : True\n",
  741. "\n",
  742. "__Current Conda Env__\n",
  743. "_libgcc_mutex 0.1 main \n",
  744. "attrs 19.1.0 py_0 conda-forge\n",
  745. "backcall 0.1.0 py_0 conda-forge\n",
  746. "bleach 3.1.0 py_0 conda-forge\n",
  747. "bokeh 1.2.0 py37_0 conda-forge\n",
  748. "bzip2 1.0.6 h14c3975_1002 conda-forge\n",
  749. "ca-certificates 2019.6.16 hecc5488_0 conda-forge\n",
  750. "certifi 2019.6.16 py37_0 conda-forge\n",
  751. "click 7.0 py_0 conda-forge\n",
  752. "cloudpickle 1.2.1 py_0 conda-forge\n",
  753. "cudatoolkit 9.2 0 \n",
  754. "cytoolz 0.9.0.1 py37h14c3975_1001 conda-forge\n",
  755. "dask 2.1.0 py_0 conda-forge\n",
  756. "dask-core 2.1.0 py_0 conda-forge\n",
  757. "decorator 4.4.0 py_0 conda-forge\n",
  758. "defusedxml 0.5.0 py_1 conda-forge\n",
  759. "distributed 2.1.0 py_0 conda-forge\n",
  760. "entrypoints 0.3 py37_1000 conda-forge\n",
  761. "freetype 2.10.0 he983fc9_0 conda-forge\n",
  762. "heapdict 1.0.0 py37_1000 conda-forge\n",
  763. "ipykernel 5.1.1 py37h24bf2e0_0 conda-forge\n",
  764. "ipython 7.6.1 py37h5ca1d4c_0 conda-forge\n",
  765. "ipython_genutils 0.2.0 py_1 conda-forge\n",
  766. "jedi 0.14.0 py37_0 conda-forge\n",
  767. "jinja2 2.10.1 py_0 conda-forge\n",
  768. "jpeg 9c h14c3975_1001 conda-forge\n",
  769. "jsonschema 3.0.1 py37_0 conda-forge\n",
  770. "jupyter_client 5.3.1 py_0 conda-forge\n",
  771. "jupyter_core 4.4.0 py_0 conda-forge\n",
  772. "libblas 3.8.0 10_openblas conda-forge\n",
  773. "libcblas 3.8.0 10_openblas conda-forge\n",
  774. "libffi 3.2.1 he1b5a44_1006 conda-forge\n",
  775. "libgcc-ng 9.1.0 hdf63c60_0 \n",
  776. "libgfortran-ng 7.3.0 hdf63c60_0 \n",
  777. "liblapack 3.8.0 10_openblas conda-forge\n",
  778. "libopenblas 0.3.6 h6e990d7_4 conda-forge\n",
  779. "libpng 1.6.37 hed695b0_0 conda-forge\n",
  780. "libsodium 1.0.16 h14c3975_1001 conda-forge\n",
  781. "libstdcxx-ng 9.1.0 hdf63c60_0 \n",
  782. "libtiff 4.0.10 h57b8799_1003 conda-forge\n",
  783. "llvmlite 0.29.0 py37hfd453ef_1 conda-forge\n",
  784. "locket 0.2.0 py_2 conda-forge\n",
  785. "lz4-c 1.8.3 he1b5a44_1001 conda-forge\n",
  786. "markupsafe 1.1.1 py37h14c3975_0 conda-forge\n",
  787. "mistune 0.8.4 py37h14c3975_1000 conda-forge\n",
  788. "msgpack-python 0.6.1 py37h6bb024c_0 conda-forge\n",
  789. "nbconvert 5.5.0 py_0 conda-forge\n",
  790. "nbformat 4.4.0 py_1 conda-forge\n",
  791. "ncurses 6.1 hf484d3e_1002 conda-forge\n",
  792. "notebook 5.7.8 py37_1 conda-forge\n",
  793. "numba 0.44.1 py37hb3f55d8_0 conda-forge\n",
  794. "numpy 1.16.4 py37h95a1406_0 conda-forge\n",
  795. "olefile 0.46 py_0 conda-forge\n",
  796. "openblas 0.3.6 h6e990d7_4 conda-forge\n",
  797. "openssl 1.1.1c h516909a_0 conda-forge\n",
  798. "packaging 19.0 py_0 conda-forge\n",
  799. "pandas 0.24.2 py37hb3f55d8_0 conda-forge\n",
  800. "pandoc 2.7.3 0 conda-forge\n",
  801. "pandocfilters 1.4.2 py_1 conda-forge\n",
  802. "parso 0.5.0 py_0 conda-forge\n",
  803. "partd 1.0.0 py_0 conda-forge\n",
  804. "pexpect 4.7.0 py37_0 conda-forge\n",
  805. "pickleshare 0.7.5 py37_1000 conda-forge\n",
  806. "pillow 6.1.0 py37he7afcd5_0 conda-forge\n",
  807. "pip 19.1.1 py37_0 conda-forge\n",
  808. "prometheus_client 0.7.1 py_0 conda-forge\n",
  809. "prompt_toolkit 2.0.9 py_0 conda-forge\n",
  810. "psutil 5.6.3 py37h516909a_0 conda-forge\n",
  811. "ptyprocess 0.6.0 py_1001 conda-forge\n",
  812. "pygments 2.4.2 py_0 conda-forge\n",
  813. "pyparsing 2.4.0 py_0 conda-forge\n",
  814. "pyrsistent 0.15.3 py37h516909a_0 conda-forge\n",
  815. "python 3.7.3 h33d41f4_1 conda-forge\n",
  816. "python-dateutil 2.8.0 py_0 conda-forge\n",
  817. "pytz 2019.1 py_0 conda-forge\n",
  818. "pyyaml 5.1.1 py37h516909a_0 conda-forge\n",
  819. "pyzmq 18.0.2 py37hc4ba49a_0 conda-forge\n",
  820. "readline 8.0 hf8c457e_0 conda-forge\n",
  821. "scipy 1.3.0 py37h921218d_0 conda-forge\n",
  822. "send2trash 1.5.0 py_0 conda-forge\n",
  823. "setuptools 41.0.1 py37_0 conda-forge\n",
  824. "six 1.12.0 py37_1000 conda-forge\n",
  825. "sortedcontainers 2.1.0 py_0 conda-forge\n",
  826. "sqlite 3.29.0 hcee41ef_0 conda-forge\n",
  827. "tblib 1.4.0 py_0 conda-forge\n",
  828. "terminado 0.8.2 py37_0 conda-forge\n",
  829. "testpath 0.4.2 py_1001 conda-forge\n",
  830. "tk 8.6.9 hed695b0_1002 conda-forge\n",
  831. "toolz 0.9.0 py_1 conda-forge\n",
  832. "tornado 6.0.3 py37h516909a_0 conda-forge\n",
  833. "traitlets 4.3.2 py37_1000 conda-forge\n",
  834. "wcwidth 0.1.7 py_1 conda-forge\n",
  835. "webencodings 0.5.1 py_1 conda-forge\n",
  836. "wheel 0.33.4 py37_0 conda-forge\n",
  837. "xz 5.2.4 h14c3975_1001 conda-forge\n",
  838. "yaml 0.1.7 h14c3975_1001 conda-forge\n",
  839. "zeromq 4.3.1 hf484d3e_1000 conda-forge\n",
  840. "zict 1.0.0 py_0 conda-forge\n",
  841. "zlib 1.2.11 h14c3975_1004 conda-forge\n",
  842. "zstd 1.4.0 h3b9ef0a_0 conda-forge\n",
  843. "--------------------------------------------------------------------------------\n",
  844. "If requested, please copy and paste the information between\n",
  845. "the dashed (----) lines, or from a given specific section as\n",
  846. "appropriate.\n",
  847. "\n",
  848. "=============================================================\n",
  849. "IMPORTANT: Please ensure that you are happy with sharing the\n",
  850. "contents of the information present, any information that you\n",
  851. "wish to keep private you should remove before sharing.\n",
  852. "=============================================================\n",
  853. "\n"
  854. ]
  855. }
  856. ],
  857. "source": [
  858. "!numba -s"
  859. ]
  860. },
  861. {
  862. "cell_type": "code",
  863. "execution_count": 26,
  864. "metadata": {
  865. "scrolled": false
  866. },
  867. "outputs": [
  868. {
  869. "name": "stdout",
  870. "output_type": "stream",
  871. "text": [
  872. "pdist_cityblock_rowm_nb (array(float32, 2d, A),)\n",
  873. "--------------------------------------------------------------------------------\n",
  874. "# File: <ipython-input-3-4b79be46c692>\n",
  875. "# --- LINE 1 --- \n",
  876. "# label 0\n",
  877. "\n",
  878. "@numba.jit(numba.float32[:, :](numba.float32[:, :]), nopython=True, parallel=True)\n",
  879. "\n",
  880. "# --- LINE 2 --- \n",
  881. "\n",
  882. "def pdist_cityblock_rowm_nb(x):\n",
  883. "\n",
  884. " # --- LINE 3 --- \n",
  885. " # x = arg(0, name=x) :: array(float32, 2d, A)\n",
  886. " # x_shape.0 = getattr(value=x, attr=shape) :: tuple(int64 x 2)\n",
  887. " # x_size0.1 = static_getitem(value=x_shape.0, index=0, index_var=None) :: int64\n",
  888. " # x_size1.2 = static_getitem(value=x_shape.0, index=1, index_var=None) :: int64\n",
  889. " # del x_shape.0\n",
  890. " # $0.2 = getattr(value=x, attr=shape) :: tuple(int64 x 2)\n",
  891. " # $const0.3 = const(int, 0) :: Literal[int](0)\n",
  892. " # $m.22 = static_getitem(value=$0.2, index=0, index_var=$const0.3) :: int64\n",
  893. " # del $const0.3\n",
  894. " # del $0.2\n",
  895. " # del $parfor__index_17.172\n",
  896. " # del $j.26\n",
  897. " # del $66.4\n",
  898. " # del $58.4\n",
  899. " # jump 30\n",
  900. "\n",
  901. " m = x.shape[0]\n",
  902. "\n",
  903. " # --- LINE 4 --- \n",
  904. " # $0.6 = getattr(value=x, attr=shape) :: tuple(int64 x 2)\n",
  905. " # $const0.7 = const(int, 1) :: Literal[int](1)\n",
  906. " # $n.21 = static_getitem(value=$0.6, index=1, index_var=$const0.7) :: int64\n",
  907. " # del $const0.7\n",
  908. " # del $0.6\n",
  909. "\n",
  910. " n = x.shape[1]\n",
  911. "\n",
  912. " # --- LINE 5 --- \n",
  913. " # id=0[LoopNest(index_variable = parfor_index.9, range = (0, x_size0.1, 1)), LoopNest(index_variable = parfor_index.10, range = (0, x_size1.2, 1))]{159: <ir.Block at <ipython-input-3-4b79be46c692> (5)>}Var($parfor_index_tuple_var.16, <ipython-input-3-4b79be46c692> (5))\n",
  914. "\n",
  915. " out = np.zeros((m, n), dtype=np.float32)\n",
  916. "\n",
  917. " # --- LINE 6 --- \n",
  918. " # id=1[LoopNest(index_variable = parfor_index.17, range = (0, $m.22, 1))]{66: <ir.Block at <ipython-input-3-4b79be46c692> (7)>, 98: <ir.Block at <ipython-input-3-4b79be46c692> (9)>, 68: <ir.Block at <ipython-input-3-4b79be46c692> (7)>, 100: <ir.Block at <ipython-input-3-4b79be46c692> (9)>, 150: <ir.Block at <ipython-input-3-4b79be46c692> (12)>, 54: <ir.Block at <ipython-input-3-4b79be46c692> (6)>, 154: <ir.Block at <ipython-input-3-4b79be46c692> (12)>}Var(parfor_index.17, <ipython-input-3-4b79be46c692> (6))\n",
  919. " # del x\n",
  920. " # del $n.21\n",
  921. " # label 52\n",
  922. " # $52.2 = iternext(value=$phi52.1) :: pair<int64, bool>\n",
  923. " # $52.3 = pair_first(value=$52.2) :: int64\n",
  924. " # $52.4 = pair_second(value=$52.2) :: bool\n",
  925. " # $phi54.1 = $52.3 :: int64\n",
  926. " # $phi158.1 = $52.3 :: int64\n",
  927. " # $phi158.2 = $phi52.1 :: range_iter_int64\n",
  928. " # branch $52.4, 54, 158\n",
  929. " # label 54\n",
  930. " # id=1[LoopNest(index_variable = parfor_index.17, range = (0, $m.22, 1))]{66: <ir.Block at <ipython-input-3-4b79be46c692> (7)>, 98: <ir.Block at <ipython-input-3-4b79be46c692> (9)>, 68: <ir.Block at <ipython-input-3-4b79be46c692> (7)>, 100: <ir.Block at <ipython-input-3-4b79be46c692> (9)>, 150: <ir.Block at <ipython-input-3-4b79be46c692> (12)>, 54: <ir.Block at <ipython-input-3-4b79be46c692> (6)>, 154: <ir.Block at <ipython-input-3-4b79be46c692> (12)>}Var(parfor_index.17, <ipython-input-3-4b79be46c692> (6))\n",
  931. "\n",
  932. " for i in numba.prange(m):\n",
  933. "\n",
  934. " # --- LINE 7 --- \n",
  935. " # $58.3 = call $58.1(_n_21, func=$58.1, args=[Var(_n_21, <ipython-input-3-4b79be46c692> (4))], kws=(), vararg=None) :: (int64,) -> range_state_int64\n",
  936. " # $58.4 = getiter(value=$58.3) :: range_iter_int64\n",
  937. " # del $58.3\n",
  938. " # jump 42\n",
  939. " # label 66\n",
  940. " # $66.2 = iternext(value=$58.4) :: pair<int64, bool>\n",
  941. " # $j.26 = pair_first(value=$66.2) :: int64\n",
  942. " # $66.4 = pair_second(value=$66.2) :: bool\n",
  943. " # del $66.2\n",
  944. " # branch $66.4, 97, 143\n",
  945. " # label 68\n",
  946. "\n",
  947. " for j in range(n):\n",
  948. "\n",
  949. " # --- LINE 8 --- \n",
  950. " # del $66.4\n",
  951. " # $68.5 = build_tuple(items=[Var($parfor__index_17.172, <string> (2)), Var($j.26, <ipython-input-3-4b79be46c692> (7))]) :: tuple(int64 x 2)\n",
  952. " # $vj.24 = getitem(value=x, index=$68.5) :: float32\n",
  953. " # del $68.5\n",
  954. "\n",
  955. " vj = x[i, j]\n",
  956. "\n",
  957. " # --- LINE 9 --- \n",
  958. " # $84.4 = $j.26 + $const84.3 :: int64\n",
  959. " # $84.1 = global(range: <class 'range'>) :: Function(<class 'range'>)\n",
  960. " # $84.6 = call $84.1($84.4, _n_21, func=$84.1, args=[Var($84.4, <ipython-input-3-4b79be46c692> (9)), Var(_n_21, <ipython-input-3-4b79be46c692> (4))], kws=(), vararg=None) :: (int64, int64) -> range_state_int64\n",
  961. " # del $84.4\n",
  962. " # del $84.1\n",
  963. " # $84.7 = getiter(value=$84.6) :: range_iter_int64\n",
  964. " # del $84.6\n",
  965. " # jump 109\n",
  966. " # label 98\n",
  967. " # $98.2 = iternext(value=$84.7) :: pair<int64, bool>\n",
  968. " # $k.27 = pair_first(value=$98.2) :: int64\n",
  969. " # $98.4 = pair_second(value=$98.2) :: bool\n",
  970. " # del $98.2\n",
  971. " # branch $98.4, 111, 141\n",
  972. " # label 100\n",
  973. "\n",
  974. " for k in range(j + 1, n):\n",
  975. "\n",
  976. " # --- LINE 10 --- \n",
  977. " # del $98.4\n",
  978. " # $100.5 = build_tuple(items=[Var($parfor__index_17.172, <string> (2)), Var($k.27, <ipython-input-3-4b79be46c692> (9))]) :: tuple(int64 x 2)\n",
  979. " # $vk.28 = getitem(value=x, index=$100.5) :: float32\n",
  980. " # del $100.5\n",
  981. "\n",
  982. " vk = x[i, k]\n",
  983. "\n",
  984. " # --- LINE 11 --- \n",
  985. " # $100.7 = global(math: <module 'math' from '/home/aliman/malariagen/binder/conda/envs/cuda/lib/python3.7/lib-dynload/math.cpython-37m-x86_64-linux-gnu.so'>) :: Module(<module 'math' from '/home/aliman/malariagen/binder/conda/envs/cuda/lib/python3.7/lib-dynload/math.cpython-37m-x86_64-linux-gnu.so'>)\n",
  986. " # $100.8 = getattr(value=$100.7, attr=fabs) :: Function(<built-in function fabs>)\n",
  987. " # del $100.7\n",
  988. " # $100.11 = $vk.28 - $vj.24 :: float32\n",
  989. " # del $vk.28\n",
  990. " # $d.29 = call $100.8($100.11, func=$100.8, args=[Var($100.11, <ipython-input-3-4b79be46c692> (11))], kws=(), vararg=None) :: (float32,) -> float32\n",
  991. " # del $100.8\n",
  992. " # del $100.11\n",
  993. "\n",
  994. " d = math.fabs(vk - vj)\n",
  995. "\n",
  996. " # --- LINE 12 --- \n",
  997. " # $100.16 = build_tuple(items=[Var($j.26, <ipython-input-3-4b79be46c692> (7)), Var($k.27, <ipython-input-3-4b79be46c692> (9))]) :: tuple(int64 x 2)\n",
  998. " # del $k.27\n",
  999. " # $100.19 = getitem(value=_out_23, index=$100.16) :: float32\n",
  1000. " # $100.21 = inplace_binop(fn=<built-in function iadd>, immutable_fn=<built-in function add>, lhs=$100.19, rhs=$d.29, static_lhs=Undefined, static_rhs=Undefined) :: float32\n",
  1001. " # del $d.29\n",
  1002. " # del $100.19\n",
  1003. " # _out_23[$100.16] = $100.21 :: (array(float32, 2d, C), tuple(int64 x 2), float32) -> none\n",
  1004. " # del $100.21\n",
  1005. " # del $100.16\n",
  1006. " # jump 109\n",
  1007. " # label 150\n",
  1008. " # del $vj.24\n",
  1009. " # del $k.27\n",
  1010. " # del $j.26\n",
  1011. " # del $98.4\n",
  1012. " # del $84.7\n",
  1013. " # jump 42\n",
  1014. " # label 154\n",
  1015. " # label 158\n",
  1016. "\n",
  1017. " out[j, k] += d\n",
  1018. "\n",
  1019. " # --- LINE 13 --- \n",
  1020. " # $160.2 = cast(value=$out.23) :: array(float32, 2d, A)\n",
  1021. " # del $out.23\n",
  1022. " # return $160.2\n",
  1023. " # $160.2 = cast(value=$out.23) :: array(float32, 2d, A)\n",
  1024. " # return $160.2\n",
  1025. "\n",
  1026. " return out\n",
  1027. "\n",
  1028. "\n",
  1029. "================================================================================\n"
  1030. ]
  1031. }
  1032. ],
  1033. "source": [
  1034. "pdist_cityblock_rowm_nb.inspect_types()"
  1035. ]
  1036. },
  1037. {
  1038. "cell_type": "code",
  1039. "execution_count": 27,
  1040. "metadata": {
  1041. "scrolled": false
  1042. },
  1043. "outputs": [
  1044. {
  1045. "name": "stdout",
  1046. "output_type": "stream",
  1047. "text": [
  1048. "_ZN6cudapy8__main__25pdist_cityblock_cuda$2414E5ArrayIfLi2E1A7mutable7alignedE5ArrayIfLi2E1A7mutable7alignedE (array(float32, 2d, A), array(float32, 2d, A))\n",
  1049. "--------------------------------------------------------------------------------\n",
  1050. "# File: <ipython-input-15-2d9f525674f9>\n",
  1051. "# --- LINE 1 --- \n",
  1052. "# label 0\n",
  1053. "\n",
  1054. "@cuda.jit(numba.void(numba.float32[:, :], numba.float32[:, :]))\n",
  1055. "\n",
  1056. "# --- LINE 2 --- \n",
  1057. "\n",
  1058. "def pdist_cityblock_cuda(x, out):\n",
  1059. "\n",
  1060. " # --- LINE 3 --- \n",
  1061. " # x = arg(0, name=x) :: array(float32, 2d, A)\n",
  1062. " # out = arg(1, name=out) :: array(float32, 2d, A)\n",
  1063. " # $0.2 = getattr(value=x, attr=shape) :: tuple(int64 x 2)\n",
  1064. " # $const0.3 = const(int, 0) :: Literal[int](0)\n",
  1065. " # $0.4 = static_getitem(value=$0.2, index=0, index_var=$const0.3) :: int64\n",
  1066. " # del $const0.3\n",
  1067. " # del $0.2\n",
  1068. " # m = $0.4 :: int64\n",
  1069. " # del $0.4\n",
  1070. "\n",
  1071. " m = x.shape[0]\n",
  1072. "\n",
  1073. " # --- LINE 4 --- \n",
  1074. " # $0.6 = getattr(value=x, attr=shape) :: tuple(int64 x 2)\n",
  1075. " # $const0.7 = const(int, 1) :: Literal[int](1)\n",
  1076. " # $0.8 = static_getitem(value=$0.6, index=1, index_var=$const0.7) :: int64\n",
  1077. " # del $const0.7\n",
  1078. " # del $0.6\n",
  1079. " # n = $0.8 :: int64\n",
  1080. " # del $0.8\n",
  1081. "\n",
  1082. " n = x.shape[1]\n",
  1083. "\n",
  1084. " # --- LINE 5 --- \n",
  1085. " # $0.9 = global(cuda: <module 'numba.cuda' from '/home/aliman/malariagen/binder/conda/envs/cuda/lib/python3.7/site-packages/numba/cuda/__init__.py'>) :: Module(<module 'numba.cuda' from '/home/aliman/malariagen/binder/conda/envs/cuda/lib/python3.7/site-packages/numba/cuda/__init__.py'>)\n",
  1086. " # $0.10 = getattr(value=$0.9, attr=grid) :: Macro(<class 'numba.cuda.cudadecl.Cuda_grid'>)\n",
  1087. " # del $0.9\n",
  1088. " # del $0.10\n",
  1089. " # $const0.11 = const(int, 2) :: Literal[int](2)\n",
  1090. " # $0.12 = call ptx.grid.2d($const0.11, func=ptx.grid.2d, args=[Var($const0.11, <ipython-input-15-2d9f525674f9> (5))], kws=(), vararg=None) :: (int64,) -> tuple(int32 x 2)\n",
  1091. " # del $const0.11\n",
  1092. " # $0.15 = exhaust_iter(value=$0.12, count=2) :: tuple(int32 x 2)\n",
  1093. " # del $0.12\n",
  1094. " # $0.13 = static_getitem(value=$0.15, index=0, index_var=None) :: int32\n",
  1095. " # $0.14 = static_getitem(value=$0.15, index=1, index_var=None) :: int32\n",
  1096. " # del $0.15\n",
  1097. " # j = $0.13 :: int32\n",
  1098. " # del $0.13\n",
  1099. " # k = $0.14 :: int32\n",
  1100. " # del $0.14\n",
  1101. "\n",
  1102. " j, k = cuda.grid(2)\n",
  1103. "\n",
  1104. " # --- LINE 6 --- \n",
  1105. " # $0.18 = j < n :: bool\n",
  1106. " # branch $0.18, 42, 144\n",
  1107. " # label 42\n",
  1108. " # del $0.18\n",
  1109. " # $42.3 = k < n :: bool\n",
  1110. " # del n\n",
  1111. " # branch $42.3, 50, 144\n",
  1112. " # label 50\n",
  1113. " # del $42.3\n",
  1114. " # $50.3 = j < k :: bool\n",
  1115. " # branch $50.3, 58, 144\n",
  1116. " # label 58\n",
  1117. "\n",
  1118. " if j < n and k < n and j < k:\n",
  1119. "\n",
  1120. " # --- LINE 7 --- \n",
  1121. " # del $50.3\n",
  1122. " # $58.1 = global(np: <module 'numpy' from '/home/aliman/malariagen/binder/conda/envs/cuda/lib/python3.7/site-packages/numpy/__init__.py'>) :: Module(<module 'numpy' from '/home/aliman/malariagen/binder/conda/envs/cuda/lib/python3.7/site-packages/numpy/__init__.py'>)\n",
  1123. " # $58.2 = getattr(value=$58.1, attr=float32) :: class(float32)\n",
  1124. " # del $58.1\n",
  1125. " # $const58.3 = const(int, 0) :: Literal[int](0)\n",
  1126. " # $58.4 = call $58.2($const58.3, func=$58.2, args=[Var($const58.3, <ipython-input-15-2d9f525674f9> (7))], kws=(), vararg=None) :: (Literal[int](0),) -> float32\n",
  1127. " # del $const58.3\n",
  1128. " # del $58.2\n",
  1129. " # dd = $58.4 :: float32\n",
  1130. " # del $58.4\n",
  1131. " # jump 68\n",
  1132. " # label 68\n",
  1133. "\n",
  1134. " dd = np.float32(0)\n",
  1135. "\n",
  1136. " # --- LINE 8 --- \n",
  1137. " # jump 70\n",
  1138. " # label 70\n",
  1139. " # $70.1 = global(range: <class 'range'>) :: Function(<class 'range'>)\n",
  1140. " # $70.3 = call $70.1(m, func=$70.1, args=[Var(m, <ipython-input-15-2d9f525674f9> (3))], kws=(), vararg=None) :: (int64,) -> range_state_int64\n",
  1141. " # del m\n",
  1142. " # del $70.1\n",
  1143. " # $70.4 = getiter(value=$70.3) :: range_iter_int64\n",
  1144. " # del $70.3\n",
  1145. " # $phi78.1 = $70.4 :: range_iter_int64\n",
  1146. " # del $70.4\n",
  1147. " # jump 78\n",
  1148. " # label 78\n",
  1149. " # $78.2 = iternext(value=$phi78.1) :: pair<int64, bool>\n",
  1150. " # $78.3 = pair_first(value=$78.2) :: int64\n",
  1151. " # $78.4 = pair_second(value=$78.2) :: bool\n",
  1152. " # del $78.2\n",
  1153. " # $phi80.1 = $78.3 :: int64\n",
  1154. " # $phi130.1 = $78.3 :: int64\n",
  1155. " # del $phi130.1\n",
  1156. " # del $78.3\n",
  1157. " # $phi130.2 = $phi78.1 :: range_iter_int64\n",
  1158. " # del $phi130.2\n",
  1159. " # branch $78.4, 80, 130\n",
  1160. " # label 80\n",
  1161. " # del $78.4\n",
  1162. " # i = $phi80.1 :: int64\n",
  1163. " # del $phi80.1\n",
  1164. "\n",
  1165. " for i in range(m):\n",
  1166. "\n",
  1167. " # --- LINE 9 --- \n",
  1168. " # $80.5 = build_tuple(items=[Var(i, <ipython-input-15-2d9f525674f9> (8)), Var(j, <ipython-input-15-2d9f525674f9> (5))]) :: (int64, int32)\n",
  1169. " # $80.6 = getitem(value=x, index=$80.5) :: float32\n",
  1170. " # del $80.5\n",
  1171. " # vj = $80.6 :: float32\n",
  1172. " # del $80.6\n",
  1173. "\n",
  1174. " vj = x[i, j]\n",
  1175. "\n",
  1176. " # --- LINE 10 --- \n",
  1177. " # $80.10 = build_tuple(items=[Var(i, <ipython-input-15-2d9f525674f9> (8)), Var(k, <ipython-input-15-2d9f525674f9> (5))]) :: (int64, int32)\n",
  1178. " # del i\n",
  1179. " # $80.11 = getitem(value=x, index=$80.10) :: float32\n",
  1180. " # del $80.10\n",
  1181. " # vk = $80.11 :: float32\n",
  1182. " # del $80.11\n",
  1183. "\n",
  1184. " vk = x[i, k]\n",
  1185. "\n",
  1186. " # --- LINE 11 --- \n",
  1187. " # $80.12 = global(math: <module 'math' from '/home/aliman/malariagen/binder/conda/envs/cuda/lib/python3.7/lib-dynload/math.cpython-37m-x86_64-linux-gnu.so'>) :: Module(<module 'math' from '/home/aliman/malariagen/binder/conda/envs/cuda/lib/python3.7/lib-dynload/math.cpython-37m-x86_64-linux-gnu.so'>)\n",
  1188. " # $80.13 = getattr(value=$80.12, attr=fabs) :: Function(<built-in function fabs>)\n",
  1189. " # del $80.12\n",
  1190. " # $80.16 = vj - vk :: float32\n",
  1191. " # del vk\n",
  1192. " # del vj\n",
  1193. " # $80.17 = call $80.13($80.16, func=$80.13, args=[Var($80.16, <ipython-input-15-2d9f525674f9> (11))], kws=(), vararg=None) :: (float32,) -> float32\n",
  1194. " # del $80.16\n",
  1195. " # del $80.13\n",
  1196. " # d = $80.17 :: float32\n",
  1197. " # del $80.17\n",
  1198. "\n",
  1199. " d = math.fabs(vj - vk)\n",
  1200. "\n",
  1201. " # --- LINE 12 --- \n",
  1202. " # $80.20 = inplace_binop(fn=<built-in function iadd>, immutable_fn=<built-in function add>, lhs=dd, rhs=d, static_lhs=Undefined, static_rhs=Undefined) :: float32\n",
  1203. " # del d\n",
  1204. " # dd = $80.20 :: float32\n",
  1205. " # del $80.20\n",
  1206. " # jump 78\n",
  1207. " # label 130\n",
  1208. " # del x\n",
  1209. " # del $phi80.1\n",
  1210. " # del $phi78.1\n",
  1211. " # del $78.4\n",
  1212. " # jump 132\n",
  1213. " # label 132\n",
  1214. "\n",
  1215. " dd += d\n",
  1216. "\n",
  1217. " # --- LINE 13 --- \n",
  1218. " # $132.5 = build_tuple(items=[Var(j, <ipython-input-15-2d9f525674f9> (5)), Var(k, <ipython-input-15-2d9f525674f9> (5))]) :: tuple(int32 x 2)\n",
  1219. " # del k\n",
  1220. " # del j\n",
  1221. " # out[$132.5] = dd :: (array(float32, 2d, A), tuple(int64 x 2), float32) -> none\n",
  1222. " # del out\n",
  1223. " # del dd\n",
  1224. " # del $132.5\n",
  1225. " # jump 144\n",
  1226. " # label 144\n",
  1227. " # del x\n",
  1228. " # del out\n",
  1229. " # del n\n",
  1230. " # del m\n",
  1231. " # del k\n",
  1232. " # del j\n",
  1233. " # del $50.3\n",
  1234. " # del $42.3\n",
  1235. " # del $0.18\n",
  1236. " # $const144.1 = const(NoneType, None) :: none\n",
  1237. " # $144.2 = cast(value=$const144.1) :: none\n",
  1238. " # del $const144.1\n",
  1239. " # return $144.2\n",
  1240. "\n",
  1241. " out[j, k] = dd\n",
  1242. "\n",
  1243. "\n",
  1244. "================================================================================\n"
  1245. ]
  1246. }
  1247. ],
  1248. "source": [
  1249. "pdist_cityblock_cuda.inspect_types()"
  1250. ]
  1251. },
  1252. {
  1253. "cell_type": "code",
  1254. "execution_count": null,
  1255. "metadata": {},
  1256. "outputs": [],
  1257. "source": []
  1258. }
  1259. ],
  1260. "metadata": {
  1261. "jupytext": {
  1262. "text_representation": {
  1263. "extension": ".py",
  1264. "format_name": "percent",
  1265. "format_version": "1.2",
  1266. "jupytext_version": "1.1.1"
  1267. }
  1268. },
  1269. "kernelspec": {
  1270. "display_name": "Python 3",
  1271. "language": "python",
  1272. "name": "python3"
  1273. },
  1274. "language_info": {
  1275. "codemirror_mode": {
  1276. "name": "ipython",
  1277. "version": 3
  1278. },
  1279. "file_extension": ".py",
  1280. "mimetype": "text/x-python",
  1281. "name": "python",
  1282. "nbconvert_exporter": "python",
  1283. "pygments_lexer": "ipython3",
  1284. "version": "3.7.3"
  1285. }
  1286. },
  1287. "nbformat": 4,
  1288. "nbformat_minor": 2
  1289. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement