[Timur@timur-zen ~]$ cd ~/Projects/Others/VK-GL-CTS/build/^C
[Timur@timur-zen ~]$ cat output.txt
Writing test log into TestResults.qpa
dEQP Core git-372a1fbf47d460b75ec9e2e1de114f82bfd0ac95 (0x372a1fbf) starting..
target implementation = 'Default'
WARNING: Experimental compiler backend enabled. Here be dragons! Incorrect rendering, GPU hangs and/or resets are likely
WARNING: radv is not a conformant vulkan implementation, testing use only.
WARNING: Experimental compiler backend enabled. Here be dragons! Incorrect rendering, GPU hangs and/or resets are likely
WARNING: radv is not a conformant vulkan implementation, testing use only.

Test case 'dEQP-VK.subgroups.shuffle.compute.subgroupshuffle_uint'..
shader: MESA_SHADER_COMPUTE
local-size: 1, 1, 1
shared-size: 1
inputs: 0
outputs: 0
uniforms: 0
shared: 0
decl_var ssbo INTERP_MODE_NONE block @0 (429, 0, 2)
decl_var ssbo INTERP_MODE_NONE block @1 (429, 0, 1)
decl_var ssbo INTERP_MODE_NONE block @2 (429, 0, 0)
decl_function main (0 params)

impl main {
block block_0:
/* preds: */
vec1 32 ssa_0 = load_const (0x00000001 /* 0.000000 */)
vec1 32 ssa_1 = load_const (0x00000000 /* 0.000000 */)
vec1 1 ssa_2 = load_const (true)
vec3 32 ssa_3 = intrinsic load_num_work_groups () ()
vec3 32 ssa_4 = intrinsic load_work_group_id () ()
vec3 32 ssa_5 = intrinsic load_local_invocation_id () ()
vec1 32 ssa_6 = iadd ssa_4.z, ssa_5.z
vec1 32 ssa_7 = imul ssa_3.y, ssa_6
vec1 32 ssa_8 = iadd ssa_4.y, ssa_5.y
vec1 32 ssa_9 = iadd ssa_7, ssa_8
vec1 32 ssa_10 = imul ssa_3.x, ssa_9
vec1 32 ssa_11 = iadd ssa_4.x, ssa_5.x
vec1 32 ssa_12 = iadd ssa_10, ssa_11
vec1 64 ssa_13 = intrinsic ballot (ssa_2) ()
vec1 32 ssa_14 = intrinsic load_subgroup_invocation () ()
vec1 32 ssa_15 = intrinsic vulkan_resource_index (ssa_1) (0, 2, 7) /* desc-set=0 */ /* binding=2 */ /* desc_type=SSBO */
vec1 32 ssa_16 = load_const (0x00000002 /* 0.000000 */)
vec1 32 ssa_17 = ishl ssa_14, ssa_16
vec1 32 ssa_18 = intrinsic load_ssbo (ssa_15, ssa_17) (16, 4, 0) /* access=16 */ /* align_mul=4 */ /* align_offset=0 */
vec1 32 ssa_19 = load_const (0x00000040 /* 0.000000 */)
vec1 32 ssa_20 = load_const (0x0000003f /* 0.000000 */)
vec1 32 ssa_21 = iand ssa_18, ssa_20
vec1 32 ssa_22 = intrinsic vulkan_resource_index (ssa_1) (0, 1, 7) /* desc-set=0 */ /* binding=1 */ /* desc_type=SSBO */
vec1 32 ssa_23 = intrinsic load_ssbo (ssa_22, ssa_17) (16, 4, 0) /* access=16 */ /* align_mul=4 */ /* align_offset=0 */
vec1 32 ssa_24 = intrinsic shuffle (ssa_23, ssa_21) ()
vec1 1 ssa_25 = ult ssa_21, ssa_19
vec1 64 ssa_26 = load_const (0x0000000000000001 /* 0.000000 */)
vec1 64 ssa_27 = ushr ssa_13, ssa_21
vec1 64 ssa_28 = iand ssa_27, ssa_26
vec1 1 ssa_29 = i2b1 ssa_28
vec1 1 ssa_30 = iand ssa_25, ssa_29
/* succs: block_1 block_2 */
if ssa_30 {
block block_1:
/* preds: block_0 */
vec1 32 ssa_31 = ishl ssa_21, ssa_16
vec1 32 ssa_32 = intrinsic load_ssbo (ssa_22, ssa_31) (16, 4, 0) /* access=16 */ /* align_mul=4 */ /* align_offset=0 */
vec1 1 ssa_33 = ieq ssa_24, ssa_32
vec1 32 ssa_34 = bcsel ssa_33, ssa_0, ssa_1
/* succs: block_3 */
} else {
block block_2:
/* preds: block_0 */
/* succs: block_3 */
}
block block_3:
/* preds: block_1 block_2 */
vec1 32 ssa_35 = phi block_1: ssa_34, block_2: ssa_0
vec1 32 ssa_36 = intrinsic vulkan_resource_index (ssa_1) (0, 0, 7) /* desc-set=0 */ /* binding=0 */ /* desc_type=SSBO */
vec1 32 ssa_37 = ishl ssa_12, ssa_16
intrinsic store_ssbo (ssa_35, ssa_36, ssa_37) (1, 0, 4, 0) /* wrmask=x */ /* access=0 */ /* align_mul=4 */ /* align_offset=0 */
/* succs: block_4 */
block block_4:
}

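In rough terms, the NIR above is the CTS subgroup shuffle check: every invocation reads a shuffle index from the SSBO at binding=2, masks it to 0..63 (iand with 0x3f), shuffles its own value from binding=1 across the subgroup, and stores a pass flag to binding=0. A lane only verifies the result when the masked index names an in-range, active source lane; the activity test is the (ballot >> src) & 1 pattern in ssa_27/ssa_28. A minimal C++ model of that logic, with invented names (simulateShuffleTest is not dEQP code), assuming a single fully active subgroup:

#include <cstdint>
#include <vector>

// Hypothetical helper, not dEQP code: models one subgroup with all lanes active.
std::vector<uint32_t> simulateShuffleTest(const std::vector<uint32_t>& data,    // binding=1
                                          const std::vector<uint32_t>& indices) // binding=2
{
    const size_t n = data.size();  // one lane per element, n <= 64
    const uint64_t ballot = (n >= 64) ? ~0ull : ((1ull << n) - 1); // one bit per active lane
    std::vector<uint32_t> results(n, 1); // binding=0; lanes with inactive sources pass

    for (size_t lane = 0; lane < n; ++lane) {
        const uint32_t src = indices[lane] & 0x3f;           // iand ssa_18, 0x3f
        const uint32_t shuffled = (src < n) ? data[src] : 0; // intrinsic shuffle
        // Verify only when the source lane is active: ult plus the ballot bit test.
        if (src < 64 && ((ballot >> src) & 1))
            results[lane] = (shuffled == data[src]) ? 1u : 0u;
    }
    return results;
}

The ishl by 2 in front of each load_ssbo/store_ssbo is just the byte offset: one 32-bit element per lane.
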
After RA:
BB0
/* logical preds: / linear preds: / kind: top-level, branch, */
s2: %40:s[0-1], s1: %41:s[2], s1: %42:s[3], s1: %43:s[4], s1: %44:s[5], s1: %45:s[6], s1: %46:s[7], s1: %47:s[8], v1: %48:v[0], v1: %49:v[1], v1: %50:v[2], s2: %51:exec = p_startpgm
p_logical_start
s1: %64:s[0], s1: %63:scc = s_add_i32 32, %41:s[2]
s2: %65:s[0-1] = p_create_vector %64:s[0], 0xffff8000
s4: %66:s[12-15] = s_load_dwordx4 %65:s[0-1], 0 reorder
v1: %62:v[3] = v_mbcnt_lo_u32_b32 -1, 0
v1: %13:v[3] = v_mbcnt_hi_u32_b32 -1, %62:v[3]
v1: %17:v[4] = v_lshlrev_b32 2, %13:v[3]
v1: %18:v[5] = buffer_load_dword %17:v[4], %66:s[12-15], 0 offen buffer
v1: %4:v[2] = v_add_u32 %47:s[8], %50:v[2]
s1: %68:s[0], s1: %67:scc = s_add_i32 16, %41:s[2]
s2: %69:s[0-1] = p_create_vector %68:s[0], 0xffff8000
s4: %70:s[8-11] = s_load_dwordx4 %69:s[0-1], 0 reorder
v1: %5:v[2] = v_mul_lo_u32 %43:s[4], %4:v[2]
v1: %7:v[1] = v_add3_u32 %5:v[2], %46:s[7], %49:v[1]
v1: %8:v[1] = v_mul_lo_u32 %42:s[3], %7:v[1]
v1: %10:v[0] = v_add3_u32 %8:v[1], %45:s[6], %48:v[0]
s1: %11:s[0] = s_mov_b32 1
s1: %98:scc = p_parallelcopy %11:s[0]
s2: %61:s[0-1] = s_cselect_b64 %0:exec, 0, %98:scc
v1: %78:v[1] = v_and_b32 32, %13:v[3]
v1: %20:v[2] = v_and_b32 63, %18:v[5]
v1: %71:v[3] = v_lshlrev_b32 2, %20:v[2]
v1: %79:v[5] = v_and_b32 32, %71:v[3]
s2: %82:vcc = v_cmp_eq_u32 %78:v[1], %79:v[5]
v2: %24:v[5-6] = v_lshrrev_b64 %20:v[2], %61:s[0-1]
v1: %22:v[1] = buffer_load_dword %17:v[4], %70:s[8-11], 0 offen buffer
v1: %25:v[5] = p_extract_vector %24:v[5-6], 0
v1: %27:v[4] = v_and_b32 1, %25:v[5]
v2: %28:v[4-5] = p_create_vector %27:v[4], 0
s2: %29:s[0-1] = v_cmp_lg_u64 0, %28:v[4-5]
s2: %31:s[4-5] = v_cmp_gt_u32 64, %20:v[2]
s2: %32:s[0-1], s1: %84:scc = s_and_b64 %31:s[4-5], %29:s[0-1]
v1: %75:v[4], s1: %74:s[3], s1: %73:s[4] = p_swap_half_waves %22:v[1]
v1: %80:v[1] = ds_bpermute_b32 %71:v[3], %22:v[1]
v1: %81:v[3] = ds_bpermute_b32 %71:v[3], %75:v[4]
v1: %72:v[1] = v_cndmask_b32 %81:v[3], %80:v[1], %82:vcc
p_logical_end
s2: %93:s[0-1], s1: %92:scc, s2: %91:exec = s_and_saveexec_b64 %32:s[0-1], %51:exec
p_cbranch_z %91:exec BB2, BB1
BB1
/* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
p_logical_start
v1: %33:v[2] = v_lshlrev_b32 2, %20:v[2]
v1: %34:v[2] = buffer_load_dword %33:v[2], %70:s[8-11], 0 offen buffer
s2: %35:vcc = v_cmp_eq_i32 %72:v[1], %34:v[2]
v1: %36:v[1] = v_cndmask_b32 0, 1, %35:vcc
p_logical_end
p_branch BB3
BB2
/* logical preds: / linear preds: BB0, / kind: uniform, */
p_branch BB3
BB3
/* logical preds: / linear preds: BB1, BB2, / kind: invert, */
s2: %94:exec = p_linear_phi %91:exec, %91:exec
s2: %96:exec, s1: %95:scc = s_andn2_b64 %93:s[0-1], %94:exec
p_cbranch_z %96:exec BB5, BB4
BB4
/* logical preds: BB0, / linear preds: BB3, / kind: uniform, */
p_logical_start
p_logical_end
p_branch BB6
BB5
/* logical preds: / linear preds: BB3, / kind: uniform, */
p_branch BB6
BB6
/* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
v1: %37:v[1] = p_phi %36:v[1], 1
s2: %97:exec = p_parallelcopy %93:s[0-1]
p_logical_start
s2: %89:s[2-3] = p_create_vector %41:s[2], 0xffff8000
s4: %90:s[0-3] = s_load_dwordx4 %89:s[2-3], 0 reorder
v1: %39:v[0] = v_lshlrev_b32 2, %10:v[0]
buffer_store_dword %39:v[0], %90:s[0-3], 0, %37:v[1] offen disable_wqm buffer
p_logical_end
s_endpgm

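The interesting part of the post-RA code is the shuffle lowering. GFX10's ds_bpermute_b32 can only read lanes within a 32-wide half of a wave64, so ACO's p_swap_half_waves pseudo-instruction (which shows up as the s_subvector_loop_begin/end sequences in the disassembly below) makes the other half's value available, two ds_bpermute_b32 ops fetch the candidate from each half, and v_cndmask_b32 keeps the right one. A C++ sketch of the intended semantics, assuming per-lane registers were addressable as an array (they are not; this is illustration only):

#include <cstdint>

// Sketch only: real lanes are not addressable like this; names are invented.
uint32_t wave64Shuffle(uint32_t lane, uint32_t src, const uint32_t val[64])
{
    const uint32_t ownHalf   = lane & 32;     // which 32-lane half this lane is in
    const uint32_t otherHalf = ownHalf ^ 32;
    // ds_bpermute_b32 #1: fetch src's slot from this lane's own half.
    const uint32_t fromOwn   = val[ownHalf   | (src & 31)];
    // p_swap_half_waves + ds_bpermute_b32 #2: the same slot from the other half.
    const uint32_t fromOther = val[otherHalf | (src & 31)];
    // v_cndmask_b32: keep the copy from whichever half contains the source lane.
    return ((src & 32) == ownHalf) ? fromOwn : fromOther;
}

The v_lshlrev_b32 2 feeding both ds_bpermute_b32 ops is there because ds_bpermute addresses lanes in bytes, so the lane index is scaled by 4 first.
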
disasm:
BB0:
s_add_i32 s0, 32, s2 ; 810002a0
s_movk_i32 s1, 0x8000 ; b0018000
s_load_dwordx4 s[12:15], s[0:1], 0x0 ; f4080300 fa000000
v_mbcnt_lo_u32_b32_e64 v3, -1, 0 ; d7650003 000100c1
v_mbcnt_hi_u32_b32_e64 v3, -1, v3 ; d7660003 000206c1
v_lshlrev_b32_e32 v4, 2, v3 ; 34080682
s_waitcnt lgkmcnt(0) ; bf8cc07f
buffer_load_dword v5, v4, s[12:15], 0 offen ; e0301000 80030504
v_add_nc_u32_e32 v2, s8, v2 ; 4a040408
s_add_i32 s0, 16, s2 ; 81000290
s_movk_i32 s1, 0x8000 ; b0018000
s_load_dwordx4 s[8:11], s[0:1], 0x0 ; f4080200 fa000000
v_mul_lo_u32 v2, s4, v2 ; d5690002 00020404
v_add3_u32 v1, v2, s7, v1 ; d76d0001 04040f02
v_mul_lo_u32 v1, s3, v1 ; d5690001 00020203
v_add3_u32 v0, v1, s6, v0 ; d76d0000 04000d01
s_mov_b32 s0, 1 ; be800381
s_cmp_lg_i32 s0, 0 ; bf018000
s_cselect_b64 s[0:1], exec, 0 ; 8580807e
v_and_b32_e32 v1, 32, v3 ; 360206a0
s_waitcnt vmcnt(0) ; bf8c3f70
v_and_b32_e32 v2, 63, v5 ; 36040abf
v_lshlrev_b32_e32 v3, 2, v2 ; 34060482
v_and_b32_e32 v5, 32, v3 ; 360a06a0
v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; 7d840b01
v_lshrrev_b64 v[5:6], v2, s[0:1] ; d7000005 00000102
s_waitcnt lgkmcnt(0) ; bf8cc07f
buffer_load_dword v1, v4, s[8:11], 0 offen ; e0301000 80020104
v_and_b32_e32 v4, 1, v5 ; 36080a81
v_mov_b32_e32 v5, 0 ; 7e0a0280
v_cmp_ne_u64_e64 s0, 0, v[4:5] ; d4e50000 00020880
v_cmp_gt_u32_e64 s4, 64, v2 ; d4c40004 000204c0
s_and_b64 s[0:1], s[4:5], s[0:1] ; 87800004
s_and_saveexec_b32 s3, 0 ; be833c80
s_subvector_loop_begin s4, 208 ; bd8400d0
s_waitcnt vmcnt(0) ; bf8c3f70
v_mov_b32_e32 v8, v1 ; 7e100301
s_subvector_loop_end s4, 192 ; be0400c0
s_mov_b32 exec_lo, 3 ; befe0383
s_subvector_loop_begin s4, 228 ; bd8400e4
v_mov_b32_e32 v4, s8 ; 7e080208
v_mov_b32_e32 v8, v1 ; 7e100301
s_subvector_loop_end s4, 212 ; be0400d4
ds_bpermute_b32 v1, v3, v1 ; dacc0000 01000103
ds_bpermute_b32 v3, v3, v4 ; dacc0000 03000403
s_waitcnt lgkmcnt(0) ; bf8cc07f
v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; 02020303
s_and_saveexec_b64 s[0:1], s[0:1] ; be802400
s_cbranch_execz BB3 ; bf880007
BB1:
v_lshlrev_b32_e32 v2, 2, v2 ; 34040482
buffer_load_dword v2, v2, s[8:11], 0 offen ; e0301000 80020202
s_waitcnt vmcnt(0) ; bf8c3f70
v_cmp_eq_i32_e32 vcc_lo, v1, v2 ; 7d040501
v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; d5010001 01a90280
BB3:
s_andn2_b64 exec, s[0:1], exec ; 8afe7e00
s_cbranch_execz BB6 ; bf880001
BB4:
v_mov_b32_e32 v1, 1 ; 7e020281
BB6:
s_mov_b64 exec, s[0:1] ; befe0400
s_movk_i32 s3, 0x8000 ; b0038000
s_load_dwordx4 s[0:3], s[2:3], 0x0 ; f4080001 fa000000
v_lshlrev_b32_e32 v0, 2, v0 ; 34000082
s_waitcnt lgkmcnt(0) ; bf8cc07f
buffer_store_dword v1, v0, s[0:3], 0 offen ; e0701000 80000100
s_endpgm ; bf810000
s_code_end ; bf9f0000
... (the line above is repeated 61 times in total: end-of-shader padding emitted after s_endpgm)

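One detail worth calling out in the disassembly: NIR's ballot(true) becomes s_mov_b32 s0, 1 / s_cmp_lg_i32 s0, 0 / s_cselect_b64 s[0:1], exec, 0, i.e. a ballot over a uniformly true condition is simply the current exec mask. As a C++ sketch of ballot's semantics (a lane loop standing in for what the hardware does in one step):

#include <cstdint>

// Sketch of ballot() semantics; not how hardware computes it.
uint64_t ballot(uint64_t exec, const bool cond[64])
{
    uint64_t result = 0;
    for (int lane = 0; lane < 64; ++lane)
        if (((exec >> lane) & 1) && cond[lane])
            result |= 1ull << lane;
    return result; // for a uniformly true cond this is exactly exec,
                   // hence "s_cselect_b64 s[0:1], exec, 0" above
}
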
shader: MESA_SHADER_COMPUTE
local-size: 32, 4, 1
shared-size: 1
inputs: 0
outputs: 0
uniforms: 0
shared: 0
decl_var ssbo INTERP_MODE_NONE block @0 (429, 0, 2)
decl_var ssbo INTERP_MODE_NONE block @1 (429, 0, 1)
decl_var ssbo INTERP_MODE_NONE block @2 (429, 0, 0)
decl_function main (0 params)

impl main {
block block_0:
/* preds: */
vec1 32 ssa_0 = load_const (0x00000001 /* 0.000000 */)
vec1 32 ssa_1 = load_const (0x00000000 /* 0.000000 */)
vec1 1 ssa_2 = load_const (true)
vec3 32 ssa_3 = intrinsic load_num_work_groups () ()
vec1 32 ssa_4 = load_const (0x00000005 /* 0.000000 */)
vec1 32 ssa_5 = ishl ssa_3.x, ssa_4
vec1 32 ssa_6 = load_const (0x00000002 /* 0.000000 */)
vec1 32 ssa_7 = ishl ssa_3.y, ssa_6
vec3 32 ssa_8 = intrinsic load_work_group_id () ()
vec3 32 ssa_9 = intrinsic load_local_invocation_id () ()
vec1 32 ssa_10 = iadd ssa_8.z, ssa_9.z
vec1 32 ssa_11 = imul ssa_7, ssa_10
vec1 32 ssa_12 = ishl ssa_8.y, ssa_6
vec1 32 ssa_13 = iadd ssa_12, ssa_9.y
vec1 32 ssa_14 = iadd ssa_11, ssa_13
vec1 32 ssa_15 = imul ssa_5, ssa_14
vec1 32 ssa_16 = ishl ssa_8.x, ssa_4
vec1 32 ssa_17 = iadd ssa_16, ssa_9.x
vec1 32 ssa_18 = iadd ssa_15, ssa_17
vec1 64 ssa_19 = intrinsic ballot (ssa_2) ()
vec1 32 ssa_20 = intrinsic load_subgroup_invocation () ()
vec1 32 ssa_21 = intrinsic vulkan_resource_index (ssa_1) (0, 2, 7) /* desc-set=0 */ /* binding=2 */ /* desc_type=SSBO */
vec1 32 ssa_22 = ishl ssa_20, ssa_6
vec1 32 ssa_23 = intrinsic load_ssbo (ssa_21, ssa_22) (16, 4, 0) /* access=16 */ /* align_mul=4 */ /* align_offset=0 */
vec1 32 ssa_24 = load_const (0x00000040 /* 0.000000 */)
vec1 32 ssa_25 = load_const (0x0000003f /* 0.000000 */)
vec1 32 ssa_26 = iand ssa_23, ssa_25
vec1 32 ssa_27 = intrinsic vulkan_resource_index (ssa_1) (0, 1, 7) /* desc-set=0 */ /* binding=1 */ /* desc_type=SSBO */
vec1 32 ssa_28 = intrinsic load_ssbo (ssa_27, ssa_22) (16, 4, 0) /* access=16 */ /* align_mul=4 */ /* align_offset=0 */
vec1 32 ssa_29 = intrinsic shuffle (ssa_28, ssa_26) ()
vec1 1 ssa_30 = ult ssa_26, ssa_24
vec1 64 ssa_31 = load_const (0x0000000000000001 /* 0.000000 */)
vec1 64 ssa_32 = ushr ssa_19, ssa_26
vec1 64 ssa_33 = iand ssa_32, ssa_31
vec1 1 ssa_34 = i2b1 ssa_33
vec1 1 ssa_35 = iand ssa_30, ssa_34
/* succs: block_1 block_2 */
if ssa_35 {
block block_1:
/* preds: block_0 */
vec1 32 ssa_36 = ishl ssa_26, ssa_6
vec1 32 ssa_37 = intrinsic load_ssbo (ssa_27, ssa_36) (16, 4, 0) /* access=16 */ /* align_mul=4 */ /* align_offset=0 */
vec1 1 ssa_38 = ieq ssa_29, ssa_37
vec1 32 ssa_39 = bcsel ssa_38, ssa_0, ssa_1
/* succs: block_3 */
} else {
block block_2:
/* preds: block_0 */
/* succs: block_3 */
}
block block_3:
/* preds: block_1 block_2 */
vec1 32 ssa_40 = phi block_1: ssa_39, block_2: ssa_0
vec1 32 ssa_41 = intrinsic vulkan_resource_index (ssa_1) (0, 0, 7) /* desc-set=0 */ /* binding=0 */ /* desc_type=SSBO */
vec1 32 ssa_42 = ishl ssa_18, ssa_6
intrinsic store_ssbo (ssa_40, ssa_41, ssa_42) (1, 0, 4, 0) /* wrmask=x */ /* access=0 */ /* align_mul=4 */ /* align_offset=0 */
/* succs: block_4 */
block block_4:
}

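The only real difference from the first dump is the index arithmetic: with local-size 32x4x1 known at compile time, the multiplications by the workgroup dimensions strength-reduce into shifts (ishl by 5 for x = 32, by 2 for y = 4). The flattened global invocation index it computes is, roughly (hypothetical helper mirroring ssa_5..ssa_18 above, not CTS code):

#include <cstdint>

uint32_t flatGlobalIndex(const uint32_t numWG[3], const uint32_t wgID[3],
                         const uint32_t localID[3])
{
    const uint32_t gx = (wgID[0] << 5) + localID[0]; // local x = 32 -> shift by 5
    const uint32_t gy = (wgID[1] << 2) + localID[1]; // local y = 4  -> shift by 2
    const uint32_t gz =  wgID[2]       + localID[2]; // local z = 1  -> no scaling
    const uint32_t sizeX = numWG[0] << 5;            // global width in invocations
    const uint32_t sizeY = numWG[1] << 2;
    return (sizeY * gz + gy) * sizeX + gx;           // row-major flattening
}
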
After RA:
BB0
/* logical preds: / linear preds: / kind: top-level, branch, */
s2: %45:s[0-1], s1: %46:s[2], s1: %47:s[3], s1: %48:s[4], s1: %49:s[5], s1: %50:s[6], s1: %51:s[7], s1: %52:s[8], v1: %53:v[0], v1: %54:v[1], v1: %55:v[2], s2: %56:exec = p_startpgm
p_logical_start
s1: %73:s[0], s1: %72:scc = s_add_i32 32, %46:s[2]
s2: %74:s[0-1] = p_create_vector %73:s[0], 0xffff8000
s4: %75:s[12-15] = s_load_dwordx4 %74:s[0-1], 0 reorder
v1: %71:v[3] = v_mbcnt_lo_u32_b32 -1, 0
v1: %19:v[3] = v_mbcnt_hi_u32_b32 -1, %71:v[3]
v1: %22:v[4] = v_lshlrev_b32 2, %19:v[3]
v1: %23:v[5] = buffer_load_dword %22:v[4], %75:s[12-15], 0 offen buffer
s1: %3:s[3], s1: %60:scc = s_lshl_b32 %47:s[3], 5
s1: %77:s[0], s1: %76:scc = s_add_i32 16, %46:s[2]
s2: %78:s[0-1] = p_create_vector %77:s[0], 0xffff8000
s4: %79:s[12-15] = s_load_dwordx4 %78:s[0-1], 0 reorder
s1: %5:s[0], s1: %61:scc = s_lshl_b32 %48:s[4], 2
v1: %8:v[2] = v_add_u32 %52:s[8], %55:v[2]
v1: %9:v[2] = v_mul_lo_u32 %5:s[0], %8:v[2]
s1: %10:s[0], s1: %68:scc = s_lshl_b32 %51:s[7], 2
v1: %12:v[1] = v_add3_u32 %9:v[2], %10:s[0], %54:v[1]
v1: %13:v[1] = v_mul_lo_u32 %3:s[3], %12:v[1]
s1: %14:s[0], s1: %69:scc = s_lshl_b32 %50:s[6], 5
v1: %16:v[0] = v_add3_u32 %13:v[1], %14:s[0], %53:v[0]
s1: %17:s[0] = s_mov_b32 1
s1: %107:scc = p_parallelcopy %17:s[0]
s2: %70:s[0-1] = s_cselect_b64 %0:exec, 0, %107:scc
v1: %87:v[1] = v_and_b32 32, %19:v[3]
v1: %25:v[2] = v_and_b32 63, %23:v[5]
v1: %80:v[3] = v_lshlrev_b32 2, %25:v[2]
v1: %88:v[5] = v_and_b32 32, %80:v[3]
s2: %91:vcc = v_cmp_eq_u32 %87:v[1], %88:v[5]
v2: %29:v[5-6] = v_lshrrev_b64 %25:v[2], %70:s[0-1]
v1: %27:v[1] = buffer_load_dword %22:v[4], %79:s[12-15], 0 offen buffer
v1: %30:v[5] = p_extract_vector %29:v[5-6], 0
v1: %32:v[4] = v_and_b32 1, %30:v[5]
v2: %33:v[4-5] = p_create_vector %32:v[4], 0
s2: %34:s[0-1] = v_cmp_lg_u64 0, %33:v[4-5]
s2: %36:s[4-5] = v_cmp_gt_u32 64, %25:v[2]
s2: %37:s[0-1], s1: %93:scc = s_and_b64 %36:s[4-5], %34:s[0-1]
v1: %84:v[4], s1: %83:s[3], s1: %82:s[4] = p_swap_half_waves %27:v[1]
v1: %89:v[1] = ds_bpermute_b32 %80:v[3], %27:v[1]
v1: %90:v[3] = ds_bpermute_b32 %80:v[3], %84:v[4]
v1: %81:v[1] = v_cndmask_b32 %90:v[3], %89:v[1], %91:vcc
p_logical_end
s2: %102:s[0-1], s1: %101:scc, s2: %100:exec = s_and_saveexec_b64 %37:s[0-1], %56:exec
p_cbranch_z %100:exec BB2, BB1
BB1
/* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
p_logical_start
v1: %38:v[2] = v_lshlrev_b32 2, %25:v[2]
v1: %39:v[2] = buffer_load_dword %38:v[2], %79:s[12-15], 0 offen buffer
s2: %40:vcc = v_cmp_eq_i32 %81:v[1], %39:v[2]
v1: %41:v[1] = v_cndmask_b32 0, 1, %40:vcc
p_logical_end
p_branch BB3
BB2
/* logical preds: / linear preds: BB0, / kind: uniform, */
p_branch BB3
BB3
/* logical preds: / linear preds: BB1, BB2, / kind: invert, */
s2: %103:exec = p_linear_phi %100:exec, %100:exec
s2: %105:exec, s1: %104:scc = s_andn2_b64 %102:s[0-1], %103:exec
p_cbranch_z %105:exec BB5, BB4
BB4
/* logical preds: BB0, / linear preds: BB3, / kind: uniform, */
p_logical_start
p_logical_end
p_branch BB6
BB5
/* logical preds: / linear preds: BB3, / kind: uniform, */
p_branch BB6
BB6
/* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
v1: %42:v[1] = p_phi %41:v[1], 1
s2: %106:exec = p_parallelcopy %102:s[0-1]
p_logical_start
s2: %98:s[2-3] = p_create_vector %46:s[2], 0xffff8000
s4: %99:s[0-3] = s_load_dwordx4 %98:s[2-3], 0 reorder
v1: %44:v[0] = v_lshlrev_b32 2, %16:v[0]
buffer_store_dword %44:v[0], %99:s[0-3], 0, %42:v[1] offen disable_wqm buffer
p_logical_end
s_endpgm

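The block structure after RA is the same in all three variants and shows how ACO linearizes the divergent if from the NIR: s_and_saveexec enters the then-side (BB1), the "invert" block BB3 flips exec to the remaining lanes with s_andn2 for the else-side (BB4), and the merge block BB6 restores the saved mask. In C++-flavoured pseudocode (a sketch of the mask bookkeeping only, not real ISA semantics):

#include <cstdint>

// thenSide/elseSide stand in for BB1/BB4; exec models the hardware lane mask.
void divergentIfElse(uint64_t& exec, uint64_t cond,
                     void (*thenSide)(uint64_t), void (*elseSide)(uint64_t))
{
    const uint64_t saved = exec;  // s_and_saveexec_b64: old exec saved to s[0:1]...
    exec = saved & cond;          // ...and only lanes entering the then-side stay on
    if (exec) thenSide(exec);     // BB1; p_cbranch_z skips it when no lane is active
    exec = saved & ~exec;         // BB3 "invert": s_andn2_b64 selects the else lanes
    if (exec) elseSide(exec);     // BB4
    exec = saved;                 // BB6 merge: the full mask is restored
}
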
disasm:
BB0:
s_add_i32 s0, 32, s2 ; 810002a0
s_movk_i32 s1, 0x8000 ; b0018000
s_load_dwordx4 s[12:15], s[0:1], 0x0 ; f4080300 fa000000
v_mbcnt_lo_u32_b32_e64 v3, -1, 0 ; d7650003 000100c1
v_mbcnt_hi_u32_b32_e64 v3, -1, v3 ; d7660003 000206c1
v_lshlrev_b32_e32 v4, 2, v3 ; 34080682
s_waitcnt lgkmcnt(0) ; bf8cc07f
buffer_load_dword v5, v4, s[12:15], 0 offen ; e0301000 80030504
v_nop ; 7e000000
s_lshl_b32 s3, s3, 5 ; 8f038503
s_add_i32 s0, 16, s2 ; 81000290
s_movk_i32 s1, 0x8000 ; b0018000
s_load_dwordx4 s[12:15], s[0:1], 0x0 ; f4080300 fa000000
s_lshl_b32 s0, s4, 2 ; 8f008204
v_add_nc_u32_e32 v2, s8, v2 ; 4a040408
v_mul_lo_u32 v2, s0, v2 ; d5690002 00020400
s_lshl_b32 s0, s7, 2 ; 8f008207
v_add3_u32 v1, v2, s0, v1 ; d76d0001 04040102
v_mul_lo_u32 v1, s3, v1 ; d5690001 00020203
s_lshl_b32 s0, s6, 5 ; 8f008506
v_add3_u32 v0, v1, s0, v0 ; d76d0000 04000101
s_mov_b32 s0, 1 ; be800381
s_cmp_lg_i32 s0, 0 ; bf018000
s_cselect_b64 s[0:1], exec, 0 ; 8580807e
v_and_b32_e32 v1, 32, v3 ; 360206a0
s_waitcnt vmcnt(0) ; bf8c3f70
v_and_b32_e32 v2, 63, v5 ; 36040abf
v_lshlrev_b32_e32 v3, 2, v2 ; 34060482
v_and_b32_e32 v5, 32, v3 ; 360a06a0
v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; 7d840b01
v_lshrrev_b64 v[5:6], v2, s[0:1] ; d7000005 00000102
s_waitcnt lgkmcnt(0) ; bf8cc07f
buffer_load_dword v1, v4, s[12:15], 0 offen ; e0301000 80030104
v_and_b32_e32 v4, 1, v5 ; 36080a81
v_mov_b32_e32 v5, 0 ; 7e0a0280
v_cmp_ne_u64_e64 s0, 0, v[4:5] ; d4e50000 00020880
v_cmp_gt_u32_e64 s4, 64, v2 ; d4c40004 000204c0
s_and_b64 s[0:1], s[4:5], s[0:1] ; 87800004
s_and_saveexec_b32 s3, 0 ; be833c80
s_subvector_loop_begin s4, 228 ; bd8400e4
s_waitcnt vmcnt(0) ; bf8c3f70
v_mov_b32_e32 v8, v1 ; 7e100301
s_subvector_loop_end s4, 212 ; be0400d4
s_mov_b32 exec_lo, 3 ; befe0383
s_subvector_loop_begin s4, 248 ; bd8400f8
v_mov_b32_e32 v4, s8 ; 7e080208
v_mov_b32_e32 v8, v1 ; 7e100301
s_subvector_loop_end s4, 232 ; be0400e8
ds_bpermute_b32 v1, v3, v1 ; dacc0000 01000103
ds_bpermute_b32 v3, v3, v4 ; dacc0000 03000403
s_waitcnt lgkmcnt(0) ; bf8cc07f
v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; 02020303
s_and_saveexec_b64 s[0:1], s[0:1] ; be802400
s_cbranch_execz BB3 ; bf880007
BB1:
v_lshlrev_b32_e32 v2, 2, v2 ; 34040482
buffer_load_dword v2, v2, s[12:15], 0 offen ; e0301000 80030202
s_waitcnt vmcnt(0) ; bf8c3f70
v_cmp_eq_i32_e32 vcc_lo, v1, v2 ; 7d040501
v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; d5010001 01a90280
BB3:
s_andn2_b64 exec, s[0:1], exec ; 8afe7e00
s_cbranch_execz BB6 ; bf880001
BB4:
v_mov_b32_e32 v1, 1 ; 7e020281
BB6:
s_mov_b64 exec, s[0:1] ; befe0400
s_movk_i32 s3, 0x8000 ; b0038000
s_load_dwordx4 s[0:3], s[2:3], 0x0 ; f4080001 fa000000
v_lshlrev_b32_e32 v0, 2, v0 ; 34000082
s_waitcnt lgkmcnt(0) ; bf8cc07f
buffer_store_dword v1, v0, s[0:3], 0 offen ; e0701000 80000100
s_endpgm ; bf810000
s_code_end ; bf9f0000
... (the line above is repeated 56 times in total: end-of-shader padding emitted after s_endpgm)

shader: MESA_SHADER_COMPUTE
local-size: 32, 1, 4
shared-size: 1
inputs: 0
outputs: 0
uniforms: 0
shared: 0
decl_var ssbo INTERP_MODE_NONE block @0 (429, 0, 2)
decl_var ssbo INTERP_MODE_NONE block @1 (429, 0, 1)
decl_var ssbo INTERP_MODE_NONE block @2 (429, 0, 0)
decl_function main (0 params)

impl main {
block block_0:
/* preds: */
vec1 32 ssa_0 = load_const (0x00000001 /* 0.000000 */)
vec1 32 ssa_1 = load_const (0x00000000 /* 0.000000 */)
vec1 1 ssa_2 = load_const (true)
vec3 32 ssa_3 = intrinsic load_num_work_groups () ()
vec1 32 ssa_4 = load_const (0x00000005 /* 0.000000 */)
vec1 32 ssa_5 = ishl ssa_3.x, ssa_4
vec3 32 ssa_6 = intrinsic load_work_group_id () ()
vec3 32 ssa_7 = intrinsic load_local_invocation_id () ()
vec1 32 ssa_8 = load_const (0x00000002 /* 0.000000 */)
vec1 32 ssa_9 = ishl ssa_6.z, ssa_8
vec1 32 ssa_10 = iadd ssa_9, ssa_7.z
vec1 32 ssa_11 = imul ssa_3.y, ssa_10
vec1 32 ssa_12 = iadd ssa_6.y, ssa_7.y
vec1 32 ssa_13 = iadd ssa_11, ssa_12
vec1 32 ssa_14 = imul ssa_5, ssa_13
vec1 32 ssa_15 = ishl ssa_6.x, ssa_4
vec1 32 ssa_16 = iadd ssa_15, ssa_7.x
vec1 32 ssa_17 = iadd ssa_14, ssa_16
vec1 64 ssa_18 = intrinsic ballot (ssa_2) ()
vec1 32 ssa_19 = intrinsic load_subgroup_invocation () ()
vec1 32 ssa_20 = intrinsic vulkan_resource_index (ssa_1) (0, 2, 7) /* desc-set=0 */ /* binding=2 */ /* desc_type=SSBO */
vec1 32 ssa_21 = ishl ssa_19, ssa_8
vec1 32 ssa_22 = intrinsic load_ssbo (ssa_20, ssa_21) (16, 4, 0) /* access=16 */ /* align_mul=4 */ /* align_offset=0 */
vec1 32 ssa_23 = load_const (0x00000040 /* 0.000000 */)
vec1 32 ssa_24 = load_const (0x0000003f /* 0.000000 */)
vec1 32 ssa_25 = iand ssa_22, ssa_24
vec1 32 ssa_26 = intrinsic vulkan_resource_index (ssa_1) (0, 1, 7) /* desc-set=0 */ /* binding=1 */ /* desc_type=SSBO */
vec1 32 ssa_27 = intrinsic load_ssbo (ssa_26, ssa_21) (16, 4, 0) /* access=16 */ /* align_mul=4 */ /* align_offset=0 */
vec1 32 ssa_28 = intrinsic shuffle (ssa_27, ssa_25) ()
vec1 1 ssa_29 = ult ssa_25, ssa_23
vec1 64 ssa_30 = load_const (0x0000000000000001 /* 0.000000 */)
vec1 64 ssa_31 = ushr ssa_18, ssa_25
vec1 64 ssa_32 = iand ssa_31, ssa_30
vec1 1 ssa_33 = i2b1 ssa_32
vec1 1 ssa_34 = iand ssa_29, ssa_33
/* succs: block_1 block_2 */
if ssa_34 {
block block_1:
/* preds: block_0 */
vec1 32 ssa_35 = ishl ssa_25, ssa_8
vec1 32 ssa_36 = intrinsic load_ssbo (ssa_26, ssa_35) (16, 4, 0) /* access=16 */ /* align_mul=4 */ /* align_offset=0 */
vec1 1 ssa_37 = ieq ssa_28, ssa_36
vec1 32 ssa_38 = bcsel ssa_37, ssa_0, ssa_1
/* succs: block_3 */
} else {
block block_2:
/* preds: block_0 */
/* succs: block_3 */
}
block block_3:
/* preds: block_1 block_2 */
vec1 32 ssa_39 = phi block_1: ssa_38, block_2: ssa_0
vec1 32 ssa_40 = intrinsic vulkan_resource_index (ssa_1) (0, 0, 7) /* desc-set=0 */ /* binding=0 */ /* desc_type=SSBO */
vec1 32 ssa_41 = ishl ssa_17, ssa_8
intrinsic store_ssbo (ssa_39, ssa_40, ssa_41) (1, 0, 4, 0) /* wrmask=x */ /* access=0 */ /* align_mul=4 */ /* align_offset=0 */
/* succs: block_4 */
block block_4:
}

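Same shader once more with local-size 32x1x4: now z (size 4) gets the shift by 2 and y (size 1) needs no scaling, which is why this NIR and the code below differ from the previous variant only in the index math. The equivalent flattening, under the same hypothetical-helper caveat as before:

#include <cstdint>

// Mirrors ssa_5..ssa_17 above; not CTS code.
uint32_t flatGlobalIndex32x1x4(const uint32_t numWG[3], const uint32_t wgID[3],
                               const uint32_t localID[3])
{
    const uint32_t gx = (wgID[0] << 5) + localID[0]; // local x = 32 -> shift by 5
    const uint32_t gy =  wgID[1]       + localID[1]; // local y = 1  -> no scaling
    const uint32_t gz = (wgID[2] << 2) + localID[2]; // local z = 4  -> shift by 2
    return (numWG[1] * gz + gy) * (numWG[0] << 5) + gx;
}
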
After RA:
BB0
/* logical preds: / linear preds: / kind: top-level, branch, */
s2: %44:s[0-1], s1: %45:s[2], s1: %46:s[3], s1: %47:s[4], s1: %48:s[5], s1: %49:s[6], s1: %50:s[7], s1: %51:s[8], v1: %52:v[0], v1: %53:v[1], v1: %54:v[2], s2: %55:exec = p_startpgm
p_logical_start
s1: %71:s[0], s1: %70:scc = s_add_i32 32, %45:s[2]
s2: %72:s[0-1] = p_create_vector %71:s[0], 0xffff8000
s4: %73:s[12-15] = s_load_dwordx4 %72:s[0-1], 0 reorder
v1: %69:v[3] = v_mbcnt_lo_u32_b32 -1, 0
v1: %18:v[3] = v_mbcnt_hi_u32_b32 -1, %69:v[3]
v1: %21:v[4] = v_lshlrev_b32 2, %18:v[3]
v1: %22:v[5] = buffer_load_dword %21:v[4], %73:s[12-15], 0 offen buffer
s1: %3:s[3], s1: %59:scc = s_lshl_b32 %46:s[3], 5
s1: %75:s[0], s1: %74:scc = s_add_i32 16, %45:s[2]
s2: %76:s[0-1] = p_create_vector %75:s[0], 0xffff8000
s4: %77:s[12-15] = s_load_dwordx4 %76:s[0-1], 0 reorder
s1: %7:s[5], s1: %66:scc = s_lshl_b32 %51:s[8], 2
v1: %8:v[2] = v_add_u32 %7:s[5], %54:v[2]
v1: %9:v[2] = v_mul_lo_u32 %47:s[4], %8:v[2]
v1: %11:v[1] = v_add3_u32 %9:v[2], %50:s[7], %53:v[1]
v1: %12:v[1] = v_mul_lo_u32 %3:s[3], %11:v[1]
s1: %13:s[0], s1: %67:scc = s_lshl_b32 %49:s[6], 5
v1: %15:v[0] = v_add3_u32 %12:v[1], %13:s[0], %52:v[0]
s1: %16:s[0] = s_mov_b32 1
s1: %105:scc = p_parallelcopy %16:s[0]
s2: %68:s[0-1] = s_cselect_b64 %0:exec, 0, %105:scc
v1: %85:v[1] = v_and_b32 32, %18:v[3]
v1: %24:v[2] = v_and_b32 63, %22:v[5]
v1: %78:v[3] = v_lshlrev_b32 2, %24:v[2]
v1: %86:v[5] = v_and_b32 32, %78:v[3]
s2: %89:vcc = v_cmp_eq_u32 %85:v[1], %86:v[5]
v2: %28:v[5-6] = v_lshrrev_b64 %24:v[2], %68:s[0-1]
v1: %26:v[1] = buffer_load_dword %21:v[4], %77:s[12-15], 0 offen buffer
v1: %29:v[5] = p_extract_vector %28:v[5-6], 0
v1: %31:v[4] = v_and_b32 1, %29:v[5]
v2: %32:v[4-5] = p_create_vector %31:v[4], 0
s2: %33:s[0-1] = v_cmp_lg_u64 0, %32:v[4-5]
s2: %35:s[4-5] = v_cmp_gt_u32 64, %24:v[2]
s2: %36:s[0-1], s1: %91:scc = s_and_b64 %35:s[4-5], %33:s[0-1]
v1: %82:v[4], s1: %81:s[3], s1: %80:s[4] = p_swap_half_waves %26:v[1]
v1: %87:v[1] = ds_bpermute_b32 %78:v[3], %26:v[1]
v1: %88:v[3] = ds_bpermute_b32 %78:v[3], %82:v[4]
v1: %79:v[1] = v_cndmask_b32 %88:v[3], %87:v[1], %89:vcc
p_logical_end
s2: %100:s[0-1], s1: %99:scc, s2: %98:exec = s_and_saveexec_b64 %36:s[0-1], %55:exec
p_cbranch_z %98:exec BB2, BB1
BB1
/* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
p_logical_start
v1: %37:v[2] = v_lshlrev_b32 2, %24:v[2]
v1: %38:v[2] = buffer_load_dword %37:v[2], %77:s[12-15], 0 offen buffer
s2: %39:vcc = v_cmp_eq_i32 %79:v[1], %38:v[2]
v1: %40:v[1] = v_cndmask_b32 0, 1, %39:vcc
p_logical_end
p_branch BB3
BB2
/* logical preds: / linear preds: BB0, / kind: uniform, */
p_branch BB3
BB3
/* logical preds: / linear preds: BB1, BB2, / kind: invert, */
s2: %101:exec = p_linear_phi %98:exec, %98:exec
s2: %103:exec, s1: %102:scc = s_andn2_b64 %100:s[0-1], %101:exec
p_cbranch_z %103:exec BB5, BB4
BB4
/* logical preds: BB0, / linear preds: BB3, / kind: uniform, */
p_logical_start
p_logical_end
p_branch BB6
BB5
/* logical preds: / linear preds: BB3, / kind: uniform, */
p_branch BB6
BB6
/* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
v1: %41:v[1] = p_phi %40:v[1], 1
s2: %104:exec = p_parallelcopy %100:s[0-1]
p_logical_start
s2: %96:s[2-3] = p_create_vector %45:s[2], 0xffff8000
s4: %97:s[0-3] = s_load_dwordx4 %96:s[2-3], 0 reorder
v1: %43:v[0] = v_lshlrev_b32 2, %15:v[0]
buffer_store_dword %43:v[0], %97:s[0-3], 0, %41:v[1] offen disable_wqm buffer
p_logical_end
s_endpgm

disasm:
BB0:
s_add_i32 s0, 32, s2 ; 810002a0
s_movk_i32 s1, 0x8000 ; b0018000
s_load_dwordx4 s[12:15], s[0:1], 0x0 ; f4080300 fa000000
v_mbcnt_lo_u32_b32_e64 v3, -1, 0 ; d7650003 000100c1
v_mbcnt_hi_u32_b32_e64 v3, -1, v3 ; d7660003 000206c1
v_lshlrev_b32_e32 v4, 2, v3 ; 34080682
s_waitcnt lgkmcnt(0) ; bf8cc07f
buffer_load_dword v5, v4, s[12:15], 0 offen ; e0301000 80030504
v_nop ; 7e000000
s_lshl_b32 s3, s3, 5 ; 8f038503
s_add_i32 s0, 16, s2 ; 81000290
s_movk_i32 s1, 0x8000 ; b0018000
s_load_dwordx4 s[12:15], s[0:1], 0x0 ; f4080300 fa000000
s_lshl_b32 s5, s8, 2 ; 8f058208
v_add_nc_u32_e32 v2, s5, v2 ; 4a040405
v_mul_lo_u32 v2, s4, v2 ; d5690002 00020404
v_add3_u32 v1, v2, s7, v1 ; d76d0001 04040f02
v_mul_lo_u32 v1, s3, v1 ; d5690001 00020203
s_lshl_b32 s0, s6, 5 ; 8f008506
v_add3_u32 v0, v1, s0, v0 ; d76d0000 04000101
s_mov_b32 s0, 1 ; be800381
s_cmp_lg_i32 s0, 0 ; bf018000
s_cselect_b64 s[0:1], exec, 0 ; 8580807e
v_and_b32_e32 v1, 32, v3 ; 360206a0
s_waitcnt vmcnt(0) ; bf8c3f70
v_and_b32_e32 v2, 63, v5 ; 36040abf
v_lshlrev_b32_e32 v3, 2, v2 ; 34060482
v_and_b32_e32 v5, 32, v3 ; 360a06a0
v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; 7d840b01
v_lshrrev_b64 v[5:6], v2, s[0:1] ; d7000005 00000102
s_waitcnt lgkmcnt(0) ; bf8cc07f
buffer_load_dword v1, v4, s[12:15], 0 offen ; e0301000 80030104
v_and_b32_e32 v4, 1, v5 ; 36080a81
v_mov_b32_e32 v5, 0 ; 7e0a0280
v_cmp_ne_u64_e64 s0, 0, v[4:5] ; d4e50000 00020880
v_cmp_gt_u32_e64 s4, 64, v2 ; d4c40004 000204c0
s_and_b64 s[0:1], s[4:5], s[0:1] ; 87800004
s_and_saveexec_b32 s3, 0 ; be833c80
s_subvector_loop_begin s4, 224 ; bd8400e0
s_waitcnt vmcnt(0) ; bf8c3f70
v_mov_b32_e32 v8, v1 ; 7e100301
s_subvector_loop_end s4, 208 ; be0400d0
s_mov_b32 exec_lo, 3 ; befe0383
s_subvector_loop_begin s4, 244 ; bd8400f4
v_mov_b32_e32 v4, s8 ; 7e080208
v_mov_b32_e32 v8, v1 ; 7e100301
s_subvector_loop_end s4, 228 ; be0400e4
ds_bpermute_b32 v1, v3, v1 ; dacc0000 01000103
ds_bpermute_b32 v3, v3, v4 ; dacc0000 03000403
s_waitcnt lgkmcnt(0) ; bf8cc07f
v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; 02020303
s_and_saveexec_b64 s[0:1], s[0:1] ; be802400
s_cbranch_execz BB3 ; bf880007
BB1:
v_lshlrev_b32_e32 v2, 2, v2 ; 34040482
buffer_load_dword v2, v2, s[12:15], 0 offen ; e0301000 80030202
s_waitcnt vmcnt(0) ; bf8c3f70
v_cmp_eq_i32_e32 vcc_lo, v1, v2 ; 7d040501
v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; d5010001 01a90280
BB3:
s_andn2_b64 exec, s[0:1], exec ; 8afe7e00
s_cbranch_execz BB6 ; bf880001
BB4:
v_mov_b32_e32 v1, 1 ; 7e020281
BB6:
s_mov_b64 exec, s[0:1] ; befe0400
s_movk_i32 s3, 0x8000 ; b0038000
s_load_dwordx4 s[0:3], s[2:3], 0x0 ; f4080001 fa000000
v_lshlrev_b32_e32 v0, 2, v0 ; 34000082
s_waitcnt lgkmcnt(0) ; bf8cc07f
buffer_store_dword v1, v0, s[0:3], 0 offen ; e0701000 80000100
s_endpgm ; bf810000
s_code_end ; bf9f0000
... (the line above is repeated 57 times in total: end-of-shader padding emitted after s_endpgm)

[Timur@timur-zen ~]$