Advertisement
tthtlc

nvidia nvcc generated PTX codes

Jun 5th, 2019
206
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 58.22 KB | None | 0 0
  1. //
  2. // Generated by NVIDIA NVVM Compiler
  3. //
  4. // Compiler Build ID: CL-24817639
  5. // Cuda compilation tools, release 10.0, V10.0.130
  6. // Based on LLVM 3.4svn
  7. //
  8.  
  9. .version 6.3 // PTX ISA version of this module
  10. .target sm_30 // target architecture for this module
  11. .address_size 64 // 64-bit addressing (the kernel's pointer params are .u64)
  12.  
  13. // .globl _Z8compressPKjS0_P5uint2i
  14. // _ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance has been demoted
  15. .const .align 4 .b8 kColorMetric[12] = {0, 0, 128, 63, 0, 0, 128, 63, 0, 0, 128, 63}; // 3 x f32, each 0x3F800000 = 1.0 (bytes are little-endian); the kernel reads them with ld.const.f32 at offsets 0, 4, 8
  16. // _ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps has been demoted
  17. .const .align 4 .b8 alphaTable4[16] = {0, 0, 16, 65, 0, 0, 0, 0, 0, 0, 192, 64, 0, 0, 64, 64}; // 4 x f32 = {9.0, 0.0, 6.0, 3.0}; the kernel indexes it with successive 2-bit fields of a 32-bit word and loads via ld.const.f32
  18. .const .align 4 .b8 alphaTable3[16] = {0, 0, 128, 64, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 64}; // decodes to {4.0, 0.0, 2.0, 2.0} if read as 4 x f32 like alphaTable4 -- NOTE(review): no load from this table is visible in this part of the listing; confirm
  19. .const .align 4 .b8 prods4[16] = {0, 0, 9, 0, 0, 9, 0, 0, 2, 1, 4, 0, 2, 4, 1, 0}; // 4 x u32 = {0x00090000, 0x00000900, 0x00040102, 0x00010402}; loaded via ld.const.u32 with the same 2-bit index as alphaTable4 and summed across fields
  20. .const .align 4 .b8 prods3[16] = {0, 0, 4, 0, 0, 4, 0, 0, 1, 1, 4, 0, 1, 4, 1, 0}; // decodes to {0x00040000, 0x00000400, 0x00040101, 0x00010401} if read as 4 x u32 like prods4 -- NOTE(review): not referenced in the visible part of the kernel; confirm
  21. // _ZZ19evalAllPermutationsPK6float3PKjRtS4_RjPfS_N18cooperative_groups4__v112thread_blockEE14s_permutations has been demoted
  22. // _ZZ12findMinErrorPfN18cooperative_groups4__v112thread_blockEE7indices has been demoted
  23. // _ZZ8compressPKjS0_P5uint2iE6colors has been demoted
  24. // _ZZ8compressPKjS0_P5uint2iE4sums has been demoted
  25. // _ZZ8compressPKjS0_P5uint2iE5xrefs has been demoted
  26. // _ZZ8compressPKjS0_P5uint2iE6errors has been demoted
  27.  
  28. .visible .entry _Z8compressPKjS0_P5uint2i(
  29. .param .u64 _Z8compressPKjS0_P5uint2i_param_0,
  30. .param .u64 _Z8compressPKjS0_P5uint2i_param_1,
  31. .param .u64 _Z8compressPKjS0_P5uint2i_param_2,
  32. .param .u32 _Z8compressPKjS0_P5uint2i_param_3
  33. )
  34. {
  35. .reg .pred %p<81>;
  36. .reg .b16 %rs<38>;
  37. .reg .f32 %f<754>;
  38. .reg .b32 %r<443>;
  39. .reg .b64 %rd<143>;
  40. // demoted variable
  41. .shared .align 4 .b8 _ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance[384];
  42. // demoted variable
  43. .shared .align 4 .b8 _ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps[64];
  44. // demoted variable
  45. .shared .align 4 .b8 _ZZ19evalAllPermutationsPK6float3PKjRtS4_RjPfS_N18cooperative_groups4__v112thread_blockEE14s_permutations[640];
  46. // demoted variable
  47. .shared .align 4 .b8 _ZZ12findMinErrorPfN18cooperative_groups4__v112thread_blockEE7indices[256];
  48. // demoted variable
  49. .shared .align 4 .b8 _ZZ8compressPKjS0_P5uint2iE6colors[192];
  50. // demoted variable
  51. .shared .align 4 .b8 _ZZ8compressPKjS0_P5uint2iE4sums[192];
  52. // demoted variable
  53. .shared .align 4 .b8 _ZZ8compressPKjS0_P5uint2iE5xrefs[64];
  54. // demoted variable
  55. .shared .align 4 .b8 _ZZ8compressPKjS0_P5uint2iE6errors[256];
  56.  
  57. ld.param.u64 %rd2, [_Z8compressPKjS0_P5uint2i_param_1];
  58. ld.param.u32 %r65, [_Z8compressPKjS0_P5uint2i_param_3];
  59. mov.u32 %r1, %tid.x;
  60. mov.u32 %r67, %ntid.y;
  61. mov.u32 %r68, %tid.z;
  62. mov.u32 %r69, %tid.y;
  63. mad.lo.s32 %r70, %r67, %r68, %r69;
  64. mov.u32 %r71, %ntid.x;
  65. mad.lo.s32 %r72, %r70, %r71, %r1;
  66. and.b32 %r73, %r72, -16;
  67. mul.lo.s32 %r74, %r67, %r71;
  68. mov.u32 %r75, %ntid.z;
  69. mul.lo.s32 %r76, %r74, %r75;
  70. sub.s32 %r77, %r76, %r73;
  71. mov.u32 %r78, 16;
  72. min.u32 %r79, %r77, %r78;
  73. mov.u32 %r80, 32;
  74. sub.s32 %r81, %r80, %r79;
  75. mov.u32 %r82, -1;
  76. shr.u32 %r83, %r82, %r81;
  77. // inline asm
  78. mov.u32 %r66, %laneid;
  79. // inline asm
  80. and.b32 %r84, %r66, -16;
  81. shl.b32 %r2, %r83, %r84;
  82. setp.gt.s32 %p1, %r1, 15;
  83. @%p1 bra BB0_55;
  84.  
  85. cvta.to.global.u64 %rd4, %rd2;
  86. mov.u32 %r85, %ctaid.x;
  87. add.s32 %r86, %r85, %r65;
  88. shl.b32 %r87, %r86, 4;
  89. add.s32 %r88, %r87, %r1;
  90. mul.wide.s32 %rd5, %r88, 4;
  91. add.s64 %rd6, %rd4, %rd5;
  92. ld.global.u32 %r89, [%rd6];
  93. and.b32 %r90, %r89, 255;
  94. cvt.rn.f32.u32 %f141, %r90;
  95. mul.f32 %f142, %f141, 0f3B808081;
  96. mov.u32 %r91, _ZZ8compressPKjS0_P5uint2iE6colors;
  97. mad.lo.s32 %r3, %r1, 12, %r91;
  98. st.shared.f32 [%r3], %f142;
  99. bfe.u32 %r92, %r89, 8, 8;
  100. cvt.rn.f32.u32 %f143, %r92;
  101. mul.f32 %f144, %f143, 0f3B808081;
  102. st.shared.f32 [%r3+4], %f144;
  103. bfe.u32 %r93, %r89, 16, 8;
  104. cvt.rn.f32.u32 %f145, %r93;
  105. mul.f32 %f146, %f145, 0f3B808081;
  106. st.shared.f32 [%r3+8], %f146;
  107. bar.warp.sync %r2;
  108. ld.shared.f32 %f147, [%r3];
  109. ld.shared.f32 %f148, [%r3+4];
  110. ld.shared.f32 %f149, [%r3+8];
  111. mov.u32 %r94, _ZZ8compressPKjS0_P5uint2iE4sums;
  112. mad.lo.s32 %r4, %r1, 12, %r94;
  113. st.shared.f32 [%r4+8], %f149;
  114. st.shared.f32 [%r4+4], %f148;
  115. st.shared.f32 [%r4], %f147;
  116. bar.warp.sync %r2;
  117. xor.b32 %r95, %r1, 8;
  118. mad.lo.s32 %r97, %r95, 12, %r94;
  119. ld.shared.f32 %f150, [%r97+8];
  120. ld.shared.f32 %f151, [%r97+4];
  121. ld.shared.f32 %f152, [%r97];
  122. ld.shared.f32 %f153, [%r4];
  123. add.f32 %f154, %f153, %f152;
  124. ld.shared.f32 %f155, [%r4+4];
  125. ld.shared.f32 %f156, [%r4+8];
  126. st.shared.f32 [%r4], %f154;
  127. add.f32 %f157, %f151, %f155;
  128. st.shared.f32 [%r4+4], %f157;
  129. add.f32 %f158, %f150, %f156;
  130. st.shared.f32 [%r4+8], %f158;
  131. bar.warp.sync %r2;
  132. xor.b32 %r98, %r1, 4;
  133. mad.lo.s32 %r100, %r98, 12, %r94;
  134. ld.shared.f32 %f159, [%r100+8];
  135. ld.shared.f32 %f160, [%r100+4];
  136. ld.shared.f32 %f161, [%r100];
  137. ld.shared.f32 %f162, [%r4];
  138. add.f32 %f163, %f162, %f161;
  139. ld.shared.f32 %f164, [%r4+4];
  140. ld.shared.f32 %f165, [%r4+8];
  141. st.shared.f32 [%r4], %f163;
  142. add.f32 %f166, %f160, %f164;
  143. st.shared.f32 [%r4+4], %f166;
  144. add.f32 %f167, %f159, %f165;
  145. st.shared.f32 [%r4+8], %f167;
  146. bar.warp.sync %r2;
  147. xor.b32 %r101, %r1, 2;
  148. mad.lo.s32 %r103, %r101, 12, %r94;
  149. ld.shared.f32 %f168, [%r103+8];
  150. ld.shared.f32 %f169, [%r103+4];
  151. ld.shared.f32 %f170, [%r103];
  152. ld.shared.f32 %f171, [%r4];
  153. add.f32 %f172, %f171, %f170;
  154. ld.shared.f32 %f173, [%r4+4];
  155. ld.shared.f32 %f174, [%r4+8];
  156. st.shared.f32 [%r4], %f172;
  157. add.f32 %f175, %f169, %f173;
  158. st.shared.f32 [%r4+4], %f175;
  159. add.f32 %f176, %f168, %f174;
  160. st.shared.f32 [%r4+8], %f176;
  161. bar.warp.sync %r2;
  162. xor.b32 %r104, %r1, 1;
  163. mad.lo.s32 %r106, %r104, 12, %r94;
  164. ld.shared.f32 %f177, [%r106+8];
  165. ld.shared.f32 %f178, [%r106+4];
  166. ld.shared.f32 %f179, [%r106];
  167. ld.shared.f32 %f180, [%r4];
  168. add.f32 %f181, %f180, %f179;
  169. ld.shared.f32 %f182, [%r4+4];
  170. ld.shared.f32 %f183, [%r4+8];
  171. st.shared.f32 [%r4], %f181;
  172. add.f32 %f184, %f178, %f182;
  173. st.shared.f32 [%r4+4], %f184;
  174. add.f32 %f185, %f177, %f183;
  175. st.shared.f32 [%r4+8], %f185;
  176. bar.warp.sync %r2;
  177. ld.shared.f32 %f186, [_ZZ8compressPKjS0_P5uint2iE4sums+8];
  178. ld.shared.f32 %f187, [_ZZ8compressPKjS0_P5uint2iE4sums+4];
  179. ld.shared.f32 %f188, [_ZZ8compressPKjS0_P5uint2iE4sums];
  180. mul.f32 %f189, %f188, 0f3D800000;
  181. mul.f32 %f190, %f187, 0f3D800000;
  182. mul.f32 %f191, %f186, 0f3D800000;
  183. ld.shared.f32 %f192, [%r3+8];
  184. ld.shared.f32 %f193, [%r3+4];
  185. ld.shared.f32 %f194, [%r3];
  186. sub.f32 %f195, %f194, %f189;
  187. sub.f32 %f196, %f193, %f190;
  188. sub.f32 %f197, %f192, %f191;
  189. mul.f32 %f198, %f195, %f195;
  190. mul.lo.s32 %r107, %r1, 6;
  191. shl.b32 %r108, %r107, 2;
  192. mov.u32 %r109, _ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance;
  193. add.s32 %r5, %r109, %r108;
  194. st.shared.f32 [%r5], %f198;
  195. mul.f32 %f199, %f195, %f196;
  196. st.shared.f32 [%r5+4], %f199;
  197. mul.f32 %f200, %f195, %f197;
  198. st.shared.f32 [%r5+8], %f200;
  199. mul.f32 %f201, %f196, %f196;
  200. st.shared.f32 [%r5+12], %f201;
  201. mul.f32 %f202, %f196, %f197;
  202. st.shared.f32 [%r5+16], %f202;
  203. mul.f32 %f203, %f197, %f197;
  204. st.shared.f32 [%r5+20], %f203;
  205. bar.warp.sync %r2;
  206. setp.gt.s32 %p2, %r1, 7;
  207. @%p2 bra BB0_3;
  208.  
  209. ld.shared.f32 %f204, [%r5];
  210. ld.shared.f32 %f205, [%r5+192];
  211. add.f32 %f206, %f205, %f204;
  212. ld.shared.f32 %f207, [%r5+4];
  213. ld.shared.f32 %f208, [%r5+196];
  214. ld.shared.f32 %f209, [%r5+8];
  215. ld.shared.f32 %f210, [%r5+200];
  216. ld.shared.f32 %f211, [%r5+12];
  217. ld.shared.f32 %f212, [%r5+204];
  218. ld.shared.f32 %f213, [%r5+16];
  219. ld.shared.f32 %f214, [%r5+208];
  220. ld.shared.f32 %f215, [%r5+20];
  221. ld.shared.f32 %f216, [%r5+212];
  222. st.shared.f32 [%r5], %f206;
  223. add.f32 %f217, %f208, %f207;
  224. st.shared.f32 [%r5+4], %f217;
  225. add.f32 %f218, %f210, %f209;
  226. st.shared.f32 [%r5+8], %f218;
  227. add.f32 %f219, %f212, %f211;
  228. st.shared.f32 [%r5+12], %f219;
  229. add.f32 %f220, %f214, %f213;
  230. st.shared.f32 [%r5+16], %f220;
  231. add.f32 %f221, %f216, %f215;
  232. st.shared.f32 [%r5+20], %f221;
  233.  
  234. BB0_3:
  235. bar.warp.sync %r2;
  236. setp.gt.s32 %p3, %r1, 3;
  237. @%p3 bra BB0_5;
  238.  
  239. ld.shared.f32 %f222, [%r5];
  240. ld.shared.f32 %f223, [%r5+96];
  241. add.f32 %f224, %f223, %f222;
  242. ld.shared.f32 %f225, [%r5+4];
  243. ld.shared.f32 %f226, [%r5+100];
  244. ld.shared.f32 %f227, [%r5+8];
  245. ld.shared.f32 %f228, [%r5+104];
  246. ld.shared.f32 %f229, [%r5+12];
  247. ld.shared.f32 %f230, [%r5+108];
  248. ld.shared.f32 %f231, [%r5+16];
  249. ld.shared.f32 %f232, [%r5+112];
  250. ld.shared.f32 %f233, [%r5+20];
  251. ld.shared.f32 %f234, [%r5+116];
  252. st.shared.f32 [%r5], %f224;
  253. add.f32 %f235, %f226, %f225;
  254. st.shared.f32 [%r5+4], %f235;
  255. add.f32 %f236, %f228, %f227;
  256. st.shared.f32 [%r5+8], %f236;
  257. add.f32 %f237, %f230, %f229;
  258. st.shared.f32 [%r5+12], %f237;
  259. add.f32 %f238, %f232, %f231;
  260. st.shared.f32 [%r5+16], %f238;
  261. add.f32 %f239, %f234, %f233;
  262. st.shared.f32 [%r5+20], %f239;
  263.  
  264. BB0_5:
  265. bar.warp.sync %r2;
  266. setp.gt.s32 %p4, %r1, 1;
  267. @%p4 bra BB0_7;
  268.  
  269. ld.shared.f32 %f240, [%r5];
  270. ld.shared.f32 %f241, [%r5+48];
  271. add.f32 %f242, %f241, %f240;
  272. ld.shared.f32 %f243, [%r5+4];
  273. ld.shared.f32 %f244, [%r5+52];
  274. ld.shared.f32 %f245, [%r5+8];
  275. ld.shared.f32 %f246, [%r5+56];
  276. ld.shared.f32 %f247, [%r5+12];
  277. ld.shared.f32 %f248, [%r5+60];
  278. ld.shared.f32 %f249, [%r5+16];
  279. ld.shared.f32 %f250, [%r5+64];
  280. ld.shared.f32 %f251, [%r5+20];
  281. ld.shared.f32 %f252, [%r5+68];
  282. st.shared.f32 [%r5], %f242;
  283. add.f32 %f253, %f244, %f243;
  284. st.shared.f32 [%r5+4], %f253;
  285. add.f32 %f254, %f246, %f245;
  286. st.shared.f32 [%r5+8], %f254;
  287. add.f32 %f255, %f248, %f247;
  288. st.shared.f32 [%r5+12], %f255;
  289. add.f32 %f256, %f250, %f249;
  290. st.shared.f32 [%r5+16], %f256;
  291. add.f32 %f257, %f252, %f251;
  292. st.shared.f32 [%r5+20], %f257;
  293.  
  294. BB0_7:
  295. bar.warp.sync %r2;
  296. setp.gt.s32 %p5, %r1, 0;
  297. @%p5 bra BB0_9;
  298.  
  299. ld.shared.f32 %f258, [%r5];
  300. ld.shared.f32 %f259, [%r5+24];
  301. add.f32 %f260, %f259, %f258;
  302. ld.shared.f32 %f261, [%r5+4];
  303. ld.shared.f32 %f262, [%r5+28];
  304. ld.shared.f32 %f263, [%r5+8];
  305. ld.shared.f32 %f264, [%r5+32];
  306. ld.shared.f32 %f265, [%r5+12];
  307. ld.shared.f32 %f266, [%r5+36];
  308. ld.shared.f32 %f267, [%r5+16];
  309. ld.shared.f32 %f268, [%r5+40];
  310. ld.shared.f32 %f269, [%r5+20];
  311. ld.shared.f32 %f270, [%r5+44];
  312. st.shared.f32 [%r5], %f260;
  313. add.f32 %f271, %f262, %f261;
  314. st.shared.f32 [%r5+4], %f271;
  315. add.f32 %f272, %f264, %f263;
  316. st.shared.f32 [%r5+8], %f272;
  317. add.f32 %f273, %f266, %f265;
  318. st.shared.f32 [%r5+12], %f273;
  319. add.f32 %f274, %f268, %f267;
  320. st.shared.f32 [%r5+16], %f274;
  321. add.f32 %f275, %f270, %f269;
  322. st.shared.f32 [%r5+20], %f275;
  323.  
  324. BB0_9:
  325. bar.warp.sync %r2;
  326. ld.shared.f32 %f276, [_ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance+4];
  327. ld.shared.f32 %f277, [_ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance];
  328. add.f32 %f278, %f277, %f276;
  329. ld.shared.f32 %f279, [_ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance+8];
  330. add.f32 %f280, %f278, %f279;
  331. ld.shared.f32 %f281, [_ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance+12];
  332. add.f32 %f282, %f276, %f281;
  333. ld.shared.f32 %f283, [_ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance+16];
  334. add.f32 %f284, %f282, %f283;
  335. add.f32 %f285, %f283, %f279;
  336. ld.shared.f32 %f286, [_ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance+20];
  337. add.f32 %f287, %f285, %f286;
  338. max.f32 %f288, %f280, %f284;
  339. max.f32 %f289, %f288, %f287;
  340. rcp.rn.f32 %f290, %f289;
  341. mul.f32 %f291, %f280, %f290;
  342. mul.f32 %f292, %f284, %f290;
  343. mul.f32 %f293, %f287, %f290;
  344. mul.f32 %f294, %f292, %f276;
  345. fma.rn.f32 %f295, %f291, %f277, %f294;
  346. fma.rn.f32 %f296, %f293, %f279, %f295;
  347. mul.f32 %f297, %f292, %f281;
  348. fma.rn.f32 %f298, %f291, %f276, %f297;
  349. fma.rn.f32 %f299, %f293, %f283, %f298;
  350. mul.f32 %f300, %f291, %f279;
  351. fma.rn.f32 %f301, %f292, %f283, %f300;
  352. fma.rn.f32 %f302, %f293, %f286, %f301;
  353. max.f32 %f303, %f296, %f299;
  354. max.f32 %f304, %f303, %f302;
  355. rcp.rn.f32 %f305, %f304;
  356. mul.f32 %f306, %f296, %f305;
  357. mul.f32 %f307, %f299, %f305;
  358. mul.f32 %f308, %f302, %f305;
  359. mul.f32 %f309, %f307, %f276;
  360. fma.rn.f32 %f310, %f306, %f277, %f309;
  361. fma.rn.f32 %f311, %f308, %f279, %f310;
  362. mul.f32 %f312, %f307, %f281;
  363. fma.rn.f32 %f313, %f306, %f276, %f312;
  364. fma.rn.f32 %f314, %f308, %f283, %f313;
  365. mul.f32 %f315, %f306, %f279;
  366. fma.rn.f32 %f316, %f307, %f283, %f315;
  367. fma.rn.f32 %f317, %f308, %f286, %f316;
  368. max.f32 %f318, %f311, %f314;
  369. max.f32 %f319, %f318, %f317;
  370. rcp.rn.f32 %f320, %f319;
  371. mul.f32 %f321, %f311, %f320;
  372. mul.f32 %f322, %f314, %f320;
  373. mul.f32 %f323, %f317, %f320;
  374. mul.f32 %f324, %f322, %f276;
  375. fma.rn.f32 %f325, %f321, %f277, %f324;
  376. fma.rn.f32 %f326, %f323, %f279, %f325;
  377. mul.f32 %f327, %f322, %f281;
  378. fma.rn.f32 %f328, %f321, %f276, %f327;
  379. fma.rn.f32 %f329, %f323, %f283, %f328;
  380. mul.f32 %f330, %f321, %f279;
  381. fma.rn.f32 %f331, %f322, %f283, %f330;
  382. fma.rn.f32 %f332, %f323, %f286, %f331;
  383. max.f32 %f333, %f326, %f329;
  384. max.f32 %f334, %f333, %f332;
  385. rcp.rn.f32 %f335, %f334;
  386. mul.f32 %f336, %f326, %f335;
  387. mul.f32 %f337, %f329, %f335;
  388. mul.f32 %f338, %f332, %f335;
  389. mul.f32 %f339, %f337, %f276;
  390. fma.rn.f32 %f340, %f336, %f277, %f339;
  391. fma.rn.f32 %f341, %f338, %f279, %f340;
  392. mul.f32 %f342, %f337, %f281;
  393. fma.rn.f32 %f343, %f336, %f276, %f342;
  394. fma.rn.f32 %f344, %f338, %f283, %f343;
  395. mul.f32 %f345, %f336, %f279;
  396. fma.rn.f32 %f346, %f337, %f283, %f345;
  397. fma.rn.f32 %f347, %f338, %f286, %f346;
  398. max.f32 %f348, %f341, %f344;
  399. max.f32 %f349, %f348, %f347;
  400. rcp.rn.f32 %f350, %f349;
  401. mul.f32 %f351, %f341, %f350;
  402. mul.f32 %f352, %f344, %f350;
  403. mul.f32 %f353, %f347, %f350;
  404. mul.f32 %f354, %f352, %f276;
  405. fma.rn.f32 %f355, %f351, %f277, %f354;
  406. fma.rn.f32 %f356, %f353, %f279, %f355;
  407. mul.f32 %f357, %f352, %f281;
  408. fma.rn.f32 %f358, %f351, %f276, %f357;
  409. fma.rn.f32 %f359, %f353, %f283, %f358;
  410. mul.f32 %f360, %f351, %f279;
  411. fma.rn.f32 %f361, %f352, %f283, %f360;
  412. fma.rn.f32 %f362, %f353, %f286, %f361;
  413. max.f32 %f363, %f356, %f359;
  414. max.f32 %f364, %f363, %f362;
  415. rcp.rn.f32 %f365, %f364;
  416. mul.f32 %f366, %f356, %f365;
  417. mul.f32 %f367, %f359, %f365;
  418. mul.f32 %f368, %f362, %f365;
  419. mul.f32 %f369, %f367, %f276;
  420. fma.rn.f32 %f370, %f366, %f277, %f369;
  421. fma.rn.f32 %f371, %f368, %f279, %f370;
  422. mul.f32 %f372, %f367, %f281;
  423. fma.rn.f32 %f373, %f366, %f276, %f372;
  424. fma.rn.f32 %f374, %f368, %f283, %f373;
  425. mul.f32 %f375, %f366, %f279;
  426. fma.rn.f32 %f376, %f367, %f283, %f375;
  427. fma.rn.f32 %f377, %f368, %f286, %f376;
  428. max.f32 %f378, %f371, %f374;
  429. max.f32 %f379, %f378, %f377;
  430. rcp.rn.f32 %f380, %f379;
  431. mul.f32 %f381, %f371, %f380;
  432. mul.f32 %f382, %f374, %f380;
  433. mul.f32 %f383, %f377, %f380;
  434. mul.f32 %f384, %f382, %f276;
  435. fma.rn.f32 %f385, %f381, %f277, %f384;
  436. fma.rn.f32 %f386, %f383, %f279, %f385;
  437. mul.f32 %f387, %f382, %f281;
  438. fma.rn.f32 %f388, %f381, %f276, %f387;
  439. fma.rn.f32 %f389, %f383, %f283, %f388;
  440. mul.f32 %f390, %f381, %f279;
  441. fma.rn.f32 %f391, %f382, %f283, %f390;
  442. fma.rn.f32 %f392, %f383, %f286, %f391;
  443. max.f32 %f393, %f386, %f389;
  444. max.f32 %f394, %f393, %f392;
  445. rcp.rn.f32 %f395, %f394;
  446. mul.f32 %f1, %f386, %f395;
  447. mul.f32 %f2, %f389, %f395;
  448. mul.f32 %f3, %f392, %f395;
  449. bar.warp.sync %r2;
  450. mov.u32 %r426, _ZZ8compressPKjS0_P5uint2iE6colors;
  451. mad.lo.s32 %r425, %r1, 12, %r426;
  452. ld.shared.f32 %f396, [%r425+8];
  453. ld.shared.f32 %f397, [%r425];
  454. ld.shared.f32 %f398, [%r425+4];
  455. mul.f32 %f399, %f2, %f398;
  456. fma.rn.f32 %f400, %f1, %f397, %f399;
  457. fma.rn.f32 %f401, %f3, %f396, %f400;
  458. shl.b32 %r110, %r1, 2;
  459. mov.u32 %r111, _ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps;
  460. add.s32 %r6, %r111, %r110;
  461. st.shared.f32 [%r6], %f401;
  462. bar.warp.sync %r2;
  463. ld.shared.f32 %f402, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps];
  464. ld.shared.f32 %f403, [%r6];
  465. setp.lt.f32 %p6, %f402, %f403;
  466. selp.u32 %r112, 1, 0, %p6;
  467. ld.shared.f32 %f404, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+4];
  468. setp.lt.f32 %p7, %f404, %f403;
  469. selp.u32 %r113, 1, 0, %p7;
  470. add.s32 %r114, %r113, %r112;
  471. ld.shared.f32 %f405, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+8];
  472. setp.lt.f32 %p8, %f405, %f403;
  473. selp.u32 %r115, 1, 0, %p8;
  474. add.s32 %r116, %r115, %r114;
  475. ld.shared.f32 %f406, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+12];
  476. setp.lt.f32 %p9, %f406, %f403;
  477. selp.u32 %r117, 1, 0, %p9;
  478. add.s32 %r118, %r117, %r116;
  479. ld.shared.f32 %f407, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+16];
  480. setp.lt.f32 %p10, %f407, %f403;
  481. selp.u32 %r119, 1, 0, %p10;
  482. add.s32 %r120, %r119, %r118;
  483. ld.shared.f32 %f408, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+20];
  484. setp.lt.f32 %p11, %f408, %f403;
  485. selp.u32 %r121, 1, 0, %p11;
  486. add.s32 %r122, %r121, %r120;
  487. ld.shared.f32 %f409, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+24];
  488. setp.lt.f32 %p12, %f409, %f403;
  489. selp.u32 %r123, 1, 0, %p12;
  490. add.s32 %r124, %r123, %r122;
  491. ld.shared.f32 %f410, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+28];
  492. setp.lt.f32 %p13, %f410, %f403;
  493. selp.u32 %r125, 1, 0, %p13;
  494. add.s32 %r126, %r125, %r124;
  495. ld.shared.f32 %f411, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+32];
  496. setp.lt.f32 %p14, %f411, %f403;
  497. selp.u32 %r127, 1, 0, %p14;
  498. add.s32 %r128, %r127, %r126;
  499. ld.shared.f32 %f412, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+36];
  500. setp.lt.f32 %p15, %f412, %f403;
  501. selp.u32 %r129, 1, 0, %p15;
  502. add.s32 %r130, %r129, %r128;
  503. ld.shared.f32 %f413, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+40];
  504. setp.lt.f32 %p16, %f413, %f403;
  505. selp.u32 %r131, 1, 0, %p16;
  506. add.s32 %r132, %r131, %r130;
  507. ld.shared.f32 %f414, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+44];
  508. setp.lt.f32 %p17, %f414, %f403;
  509. selp.u32 %r133, 1, 0, %p17;
  510. add.s32 %r134, %r133, %r132;
  511. ld.shared.f32 %f415, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+48];
  512. setp.lt.f32 %p18, %f415, %f403;
  513. selp.u32 %r135, 1, 0, %p18;
  514. add.s32 %r136, %r135, %r134;
  515. ld.shared.f32 %f416, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+52];
  516. setp.lt.f32 %p19, %f416, %f403;
  517. selp.u32 %r137, 1, 0, %p19;
  518. add.s32 %r138, %r137, %r136;
  519. ld.shared.f32 %f417, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+56];
  520. setp.lt.f32 %p20, %f417, %f403;
  521. selp.u32 %r139, 1, 0, %p20;
  522. add.s32 %r140, %r139, %r138;
  523. ld.shared.f32 %f418, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+60];
  524. setp.lt.f32 %p21, %f418, %f403;
  525. selp.u32 %r141, 1, 0, %p21;
  526. add.s32 %r142, %r141, %r140;
  527. mov.u32 %r144, _ZZ8compressPKjS0_P5uint2iE5xrefs;
  528. add.s32 %r7, %r144, %r110;
  529. st.shared.u32 [%r7], %r142;
  530. bar.warp.sync %r2;
  531. setp.lt.s32 %p22, %r1, 1;
  532. @%p22 bra BB0_12;
  533.  
  534. ld.shared.u32 %r8, [%r7];
  535. ld.shared.u32 %r145, [_ZZ8compressPKjS0_P5uint2iE5xrefs];
  536. setp.ne.s32 %p23, %r8, %r145;
  537. @%p23 bra BB0_12;
  538.  
  539. add.s32 %r146, %r8, 1;
  540. st.shared.u32 [%r7], %r146;
  541.  
  542. BB0_12:
  543. bar.warp.sync %r2;
  544. setp.lt.s32 %p24, %r1, 2;
  545. @%p24 bra BB0_15;
  546.  
  547. ld.shared.u32 %r9, [%r7];
  548. ld.shared.u32 %r147, [_ZZ8compressPKjS0_P5uint2iE5xrefs+4];
  549. setp.ne.s32 %p25, %r9, %r147;
  550. @%p25 bra BB0_15;
  551.  
  552. add.s32 %r148, %r9, 1;
  553. st.shared.u32 [%r7], %r148;
  554.  
  555. BB0_15:
  556. bar.warp.sync %r2;
  557. setp.lt.s32 %p26, %r1, 3;
  558. @%p26 bra BB0_18;
  559.  
  560. ld.shared.u32 %r10, [%r7];
  561. ld.shared.u32 %r149, [_ZZ8compressPKjS0_P5uint2iE5xrefs+8];
  562. setp.ne.s32 %p27, %r10, %r149;
  563. @%p27 bra BB0_18;
  564.  
  565. add.s32 %r150, %r10, 1;
  566. st.shared.u32 [%r7], %r150;
  567.  
  568. BB0_18:
  569. bar.warp.sync %r2;
  570. setp.lt.s32 %p28, %r1, 4;
  571. @%p28 bra BB0_21;
  572.  
  573. ld.shared.u32 %r11, [%r7];
  574. ld.shared.u32 %r151, [_ZZ8compressPKjS0_P5uint2iE5xrefs+12];
  575. setp.ne.s32 %p29, %r11, %r151;
  576. @%p29 bra BB0_21;
  577.  
  578. add.s32 %r152, %r11, 1;
  579. st.shared.u32 [%r7], %r152;
  580.  
  581. BB0_21:
  582. bar.warp.sync %r2;
  583. setp.lt.s32 %p30, %r1, 5;
  584. @%p30 bra BB0_24;
  585.  
  586. ld.shared.u32 %r12, [%r7];
  587. ld.shared.u32 %r153, [_ZZ8compressPKjS0_P5uint2iE5xrefs+16];
  588. setp.ne.s32 %p31, %r12, %r153;
  589. @%p31 bra BB0_24;
  590.  
  591. add.s32 %r154, %r12, 1;
  592. st.shared.u32 [%r7], %r154;
  593.  
  594. BB0_24:
  595. bar.warp.sync %r2;
  596. setp.lt.s32 %p32, %r1, 6;
  597. @%p32 bra BB0_27;
  598.  
  599. ld.shared.u32 %r13, [%r7];
  600. ld.shared.u32 %r155, [_ZZ8compressPKjS0_P5uint2iE5xrefs+20];
  601. setp.ne.s32 %p33, %r13, %r155;
  602. @%p33 bra BB0_27;
  603.  
  604. add.s32 %r156, %r13, 1;
  605. st.shared.u32 [%r7], %r156;
  606.  
  607. BB0_27:
  608. bar.warp.sync %r2;
  609. setp.lt.s32 %p34, %r1, 7;
  610. @%p34 bra BB0_30;
  611.  
  612. ld.shared.u32 %r14, [%r7];
  613. ld.shared.u32 %r157, [_ZZ8compressPKjS0_P5uint2iE5xrefs+24];
  614. setp.ne.s32 %p35, %r14, %r157;
  615. @%p35 bra BB0_30;
  616.  
  617. add.s32 %r158, %r14, 1;
  618. st.shared.u32 [%r7], %r158;
  619.  
  620. BB0_30:
  621. bar.warp.sync %r2;
  622. setp.lt.s32 %p36, %r1, 8;
  623. @%p36 bra BB0_33;
  624.  
  625. ld.shared.u32 %r15, [%r7];
  626. ld.shared.u32 %r159, [_ZZ8compressPKjS0_P5uint2iE5xrefs+28];
  627. setp.ne.s32 %p37, %r15, %r159;
  628. @%p37 bra BB0_33;
  629.  
  630. add.s32 %r160, %r15, 1;
  631. st.shared.u32 [%r7], %r160;
  632.  
  633. BB0_33:
  634. bar.warp.sync %r2;
  635. setp.lt.s32 %p38, %r1, 9;
  636. @%p38 bra BB0_36;
  637.  
  638. ld.shared.u32 %r16, [%r7];
  639. ld.shared.u32 %r161, [_ZZ8compressPKjS0_P5uint2iE5xrefs+32];
  640. setp.ne.s32 %p39, %r16, %r161;
  641. @%p39 bra BB0_36;
  642.  
  643. add.s32 %r162, %r16, 1;
  644. st.shared.u32 [%r7], %r162;
  645.  
  646. BB0_36:
  647. bar.warp.sync %r2;
  648. setp.lt.s32 %p40, %r1, 10;
  649. @%p40 bra BB0_39;
  650.  
  651. ld.shared.u32 %r17, [%r7];
  652. ld.shared.u32 %r163, [_ZZ8compressPKjS0_P5uint2iE5xrefs+36];
  653. setp.ne.s32 %p41, %r17, %r163;
  654. @%p41 bra BB0_39;
  655.  
  656. add.s32 %r164, %r17, 1;
  657. st.shared.u32 [%r7], %r164;
  658.  
  659. BB0_39:
  660. bar.warp.sync %r2;
  661. setp.lt.s32 %p42, %r1, 11;
  662. @%p42 bra BB0_42;
  663.  
  664. ld.shared.u32 %r18, [%r7];
  665. ld.shared.u32 %r165, [_ZZ8compressPKjS0_P5uint2iE5xrefs+40];
  666. setp.ne.s32 %p43, %r18, %r165;
  667. @%p43 bra BB0_42;
  668.  
  669. add.s32 %r166, %r18, 1;
  670. st.shared.u32 [%r7], %r166;
  671.  
  672. BB0_42:
  673. bar.warp.sync %r2;
  674. setp.lt.s32 %p44, %r1, 12;
  675. @%p44 bra BB0_45;
  676.  
  677. ld.shared.u32 %r19, [%r7];
  678. ld.shared.u32 %r167, [_ZZ8compressPKjS0_P5uint2iE5xrefs+44];
  679. setp.ne.s32 %p45, %r19, %r167;
  680. @%p45 bra BB0_45;
  681.  
  682. add.s32 %r168, %r19, 1;
  683. st.shared.u32 [%r7], %r168;
  684.  
  685. BB0_45:
  686. bar.warp.sync %r2;
  687. setp.lt.s32 %p46, %r1, 13;
  688. @%p46 bra BB0_48;
  689.  
  690. ld.shared.u32 %r20, [%r7];
  691. ld.shared.u32 %r169, [_ZZ8compressPKjS0_P5uint2iE5xrefs+48];
  692. setp.ne.s32 %p47, %r20, %r169;
  693. @%p47 bra BB0_48;
  694.  
  695. add.s32 %r170, %r20, 1;
  696. st.shared.u32 [%r7], %r170;
  697.  
  698. BB0_48:
  699. bar.warp.sync %r2;
  700. setp.lt.s32 %p48, %r1, 14;
  701. @%p48 bra BB0_51;
  702.  
  703. ld.shared.u32 %r21, [%r7];
  704. ld.shared.u32 %r171, [_ZZ8compressPKjS0_P5uint2iE5xrefs+52];
  705. setp.ne.s32 %p49, %r21, %r171;
  706. @%p49 bra BB0_51;
  707.  
  708. add.s32 %r172, %r21, 1;
  709. st.shared.u32 [%r7], %r172;
  710.  
  711. BB0_51:
  712. bar.warp.sync %r2;
  713. setp.lt.s32 %p50, %r1, 15;
  714. @%p50 bra BB0_54;
  715.  
  716. ld.shared.u32 %r22, [%r7];
  717. ld.shared.u32 %r173, [_ZZ8compressPKjS0_P5uint2iE5xrefs+56];
  718. setp.ne.s32 %p51, %r22, %r173;
  719. @%p51 bra BB0_54;
  720.  
  721. add.s32 %r174, %r22, 1;
  722. st.shared.u32 [%r7], %r174;
  723.  
  724. BB0_54:
  725. bar.warp.sync %r2;
  726. bar.warp.sync %r2;
  727. mov.u32 %r428, _ZZ8compressPKjS0_P5uint2iE6colors;
  728. mad.lo.s32 %r427, %r1, 12, %r428;
  729. ld.shared.f32 %f6, [%r427+8];
  730. ld.shared.f32 %f5, [%r427+4];
  731. ld.shared.f32 %f4, [%r427];
  732. bar.warp.sync %r2;
  733. mov.u32 %r423, _ZZ8compressPKjS0_P5uint2iE6colors;
  734. ld.shared.u32 %r175, [%r7];
  735. mad.lo.s32 %r177, %r175, 12, %r423;
  736. st.shared.f32 [%r177+8], %f6;
  737. st.shared.f32 [%r177+4], %f5;
  738. st.shared.f32 [%r177], %f4;
  739.  
  740. BB0_55:
  741. barrier.sync 0;
  742. ld.param.u64 %rd141, [_Z8compressPKjS0_P5uint2i_param_0];
  743. ld.shared.f32 %f11, [_ZZ8compressPKjS0_P5uint2iE4sums+8];
  744. ld.shared.f32 %f9, [_ZZ8compressPKjS0_P5uint2iE4sums+4];
  745. ld.shared.f32 %f7, [_ZZ8compressPKjS0_P5uint2iE4sums];
  746. mul.f32 %f8, %f7, 0f41100000;
  747. mul.f32 %f10, %f9, 0f41100000;
  748. mul.f32 %f12, %f11, 0f41100000;
  749. ld.const.f32 %f13, [kColorMetric];
  750. ld.const.f32 %f14, [kColorMetric+4];
  751. ld.const.f32 %f15, [kColorMetric+8];
  752. ld.shared.f32 %f16, [_ZZ8compressPKjS0_P5uint2iE6colors];
  753. ld.shared.f32 %f17, [_ZZ8compressPKjS0_P5uint2iE6colors+4];
  754. ld.shared.f32 %f18, [_ZZ8compressPKjS0_P5uint2iE6colors+8];
  755. ld.shared.f32 %f19, [_ZZ8compressPKjS0_P5uint2iE6colors+12];
  756. ld.shared.f32 %f20, [_ZZ8compressPKjS0_P5uint2iE6colors+16];
  757. ld.shared.f32 %f21, [_ZZ8compressPKjS0_P5uint2iE6colors+20];
  758. ld.shared.f32 %f22, [_ZZ8compressPKjS0_P5uint2iE6colors+24];
  759. ld.shared.f32 %f23, [_ZZ8compressPKjS0_P5uint2iE6colors+28];
  760. ld.shared.f32 %f24, [_ZZ8compressPKjS0_P5uint2iE6colors+32];
  761. ld.shared.f32 %f25, [_ZZ8compressPKjS0_P5uint2iE6colors+36];
  762. ld.shared.f32 %f26, [_ZZ8compressPKjS0_P5uint2iE6colors+40];
  763. ld.shared.f32 %f27, [_ZZ8compressPKjS0_P5uint2iE6colors+44];
  764. ld.shared.f32 %f28, [_ZZ8compressPKjS0_P5uint2iE6colors+48];
  765. ld.shared.f32 %f29, [_ZZ8compressPKjS0_P5uint2iE6colors+52];
  766. ld.shared.f32 %f30, [_ZZ8compressPKjS0_P5uint2iE6colors+56];
  767. ld.shared.f32 %f31, [_ZZ8compressPKjS0_P5uint2iE6colors+60];
  768. ld.shared.f32 %f32, [_ZZ8compressPKjS0_P5uint2iE6colors+64];
  769. ld.shared.f32 %f33, [_ZZ8compressPKjS0_P5uint2iE6colors+68];
  770. ld.shared.f32 %f34, [_ZZ8compressPKjS0_P5uint2iE6colors+72];
  771. ld.shared.f32 %f35, [_ZZ8compressPKjS0_P5uint2iE6colors+76];
  772. ld.shared.f32 %f36, [_ZZ8compressPKjS0_P5uint2iE6colors+80];
  773. ld.shared.f32 %f37, [_ZZ8compressPKjS0_P5uint2iE6colors+84];
  774. ld.shared.f32 %f38, [_ZZ8compressPKjS0_P5uint2iE6colors+88];
  775. ld.shared.f32 %f39, [_ZZ8compressPKjS0_P5uint2iE6colors+92];
  776. ld.shared.f32 %f40, [_ZZ8compressPKjS0_P5uint2iE6colors+96];
  777. ld.shared.f32 %f41, [_ZZ8compressPKjS0_P5uint2iE6colors+100];
  778. ld.shared.f32 %f42, [_ZZ8compressPKjS0_P5uint2iE6colors+104];
  779. ld.shared.f32 %f43, [_ZZ8compressPKjS0_P5uint2iE6colors+108];
  780. ld.shared.f32 %f44, [_ZZ8compressPKjS0_P5uint2iE6colors+112];
  781. ld.shared.f32 %f45, [_ZZ8compressPKjS0_P5uint2iE6colors+116];
  782. ld.shared.f32 %f46, [_ZZ8compressPKjS0_P5uint2iE6colors+120];
  783. ld.shared.f32 %f47, [_ZZ8compressPKjS0_P5uint2iE6colors+124];
  784. ld.shared.f32 %f48, [_ZZ8compressPKjS0_P5uint2iE6colors+128];
  785. ld.shared.f32 %f49, [_ZZ8compressPKjS0_P5uint2iE6colors+132];
  786. ld.shared.f32 %f50, [_ZZ8compressPKjS0_P5uint2iE6colors+136];
  787. ld.shared.f32 %f51, [_ZZ8compressPKjS0_P5uint2iE6colors+140];
  788. ld.shared.f32 %f52, [_ZZ8compressPKjS0_P5uint2iE6colors+144];
  789. ld.shared.f32 %f53, [_ZZ8compressPKjS0_P5uint2iE6colors+148];
  790. ld.shared.f32 %f54, [_ZZ8compressPKjS0_P5uint2iE6colors+152];
  791. ld.shared.f32 %f55, [_ZZ8compressPKjS0_P5uint2iE6colors+156];
  792. ld.shared.f32 %f56, [_ZZ8compressPKjS0_P5uint2iE6colors+160];
  793. ld.shared.f32 %f57, [_ZZ8compressPKjS0_P5uint2iE6colors+164];
  794. ld.shared.f32 %f58, [_ZZ8compressPKjS0_P5uint2iE6colors+168];
  795. ld.shared.f32 %f59, [_ZZ8compressPKjS0_P5uint2iE6colors+172];
  796. ld.shared.f32 %f60, [_ZZ8compressPKjS0_P5uint2iE6colors+176];
  797. ld.shared.f32 %f61, [_ZZ8compressPKjS0_P5uint2iE6colors+180];
  798. ld.shared.f32 %f62, [_ZZ8compressPKjS0_P5uint2iE6colors+184];
  799. ld.shared.f32 %f63, [_ZZ8compressPKjS0_P5uint2iE6colors+188];
  // Setup and head of the first candidate loop. Each pass handles the 32-bit
  // permutation word at word index %r1 + 64 * %r429, read from global memory.
  // NOTE(review): %r1 and %rd141 are defined before this excerpt; %r1 looks
  // like the thread index and %rd141 like a kernel pointer argument - confirm.
  // Best error so far (%f745) starts at 0f7F7FFFFF, the largest finite f32.
  800. mov.f32 %f745, 0f7F7FFFFF;
  // Pass counter.
  801. mov.u32 %r429, 0;
  802. cvta.to.global.u64 %rd7, %rd141;
  803.  
  804. BB0_56:
  // %r25 = %r429 * 64 + %r1; an index above 991 ends the loop.
  805. shl.b32 %r181, %r429, 6;
  806. add.s32 %r25, %r181, %r1;
  807. setp.gt.s32 %p52, %r25, 991;
  808. @%p52 bra BB0_62;
  809.  
  // %r26 = 4-byte word at %rd7 + %r25 * 4.
  810. mul.wide.s32 %rd8, %r25, 4;
  811. add.s64 %rd9, %rd7, %rd8;
  812. ld.global.u32 %r26, [%rd9];
  // Only indices 0..159 are also copied into the shared s_permutations array.
  813. setp.gt.s32 %p53, %r25, 159;
  814. @%p53 bra BB0_59;
  815.  
  816. shl.b32 %r182, %r25, 2;
  817. mov.u32 %r183, _ZZ19evalAllPermutationsPK6float3PKjRtS4_RjPfS_N18cooperative_groups4__v112thread_blockEE14s_permutations;
  818. add.s32 %r184, %r183, %r182;
  819. st.shared.u32 [%r184], %r26;
  820.  
  // Fully unrolled walk over the sixteen 2-bit fields of the permutation word
  // %r26 (field k = bits 2k+1:2k). Each field selects one 4-byte entry of
  // alphaTable4 (read as f32) and one of prods4 (read as u32):
  //   - the alphaTable4 value multiplies one (x, y, z) triple out of
  //     %f16..%f63 and is accumulated with fma into three running sums, which
  //     end up in %f481, %f482, %f483;
  //   - the prods4 words are added as integers, ending up in %r231.
  // Read as f32, the alphaTable4 initializer is {9.0, 0.0, 6.0, 3.0}.
  // NOTE(review): %f16..%f38 are loaded before this excerpt; presumably they
  // come from the shared `colors` array like %f39..%f63 - confirm.
  821. BB0_59:
  // Field 0: byte offset = (%r26 & 3) * 4; the sums start from 0.0.
  822. and.b32 %r185, %r26, 3;
  823. mul.wide.u32 %rd10, %r185, 4;
  824. mov.u64 %rd11, alphaTable4;
  825. add.s64 %rd12, %rd11, %rd10;
  826. ld.const.f32 %f420, [%rd12];
  827. fma.rn.f32 %f421, %f420, %f16, 0f00000000;
  828. fma.rn.f32 %f422, %f420, %f17, 0f00000000;
  829. fma.rn.f32 %f423, %f420, %f18, 0f00000000;
  830. mov.u64 %rd13, prods4;
  831. add.s64 %rd14, %rd13, %rd10;
  // Field 1: shift, scale by 4, then mask with 12 to keep a table offset of
  // 0, 4, 8 or 12. The same pattern repeats for fields 2..14.
  832. shr.u32 %r186, %r26, 2;
  833. mul.wide.u32 %rd15, %r186, 4;
  834. and.b64 %rd16, %rd15, 12;
  835. add.s64 %rd17, %rd11, %rd16;
  836. ld.const.f32 %f424, [%rd17];
  837. fma.rn.f32 %f425, %f424, %f19, %f421;
  838. fma.rn.f32 %f426, %f424, %f20, %f422;
  839. fma.rn.f32 %f427, %f424, %f21, %f423;
  840. add.s64 %rd18, %rd13, %rd16;
  // prods4 entries for field 1 and field 0 are loaded here and added.
  841. ld.const.u32 %r187, [%rd18];
  842. ld.const.u32 %r188, [%rd14];
  843. add.s32 %r189, %r187, %r188;
  844. shr.u32 %r190, %r26, 4;
  845. mul.wide.u32 %rd19, %r190, 4;
  846. and.b64 %rd20, %rd19, 12;
  847. add.s64 %rd21, %rd11, %rd20;
  848. ld.const.f32 %f428, [%rd21];
  849. fma.rn.f32 %f429, %f428, %f22, %f425;
  850. fma.rn.f32 %f430, %f428, %f23, %f426;
  851. fma.rn.f32 %f431, %f428, %f24, %f427;
  852. add.s64 %rd22, %rd13, %rd20;
  853. ld.const.u32 %r191, [%rd22];
  854. add.s32 %r192, %r191, %r189;
  855. shr.u32 %r193, %r26, 6;
  856. mul.wide.u32 %rd23, %r193, 4;
  857. and.b64 %rd24, %rd23, 12;
  858. add.s64 %rd25, %rd11, %rd24;
  859. ld.const.f32 %f432, [%rd25];
  860. fma.rn.f32 %f433, %f432, %f25, %f429;
  861. fma.rn.f32 %f434, %f432, %f26, %f430;
  862. fma.rn.f32 %f435, %f432, %f27, %f431;
  863. add.s64 %rd26, %rd13, %rd24;
  864. ld.const.u32 %r194, [%rd26];
  865. add.s32 %r195, %r194, %r192;
  866. shr.u32 %r196, %r26, 8;
  867. mul.wide.u32 %rd27, %r196, 4;
  868. and.b64 %rd28, %rd27, 12;
  869. add.s64 %rd29, %rd11, %rd28;
  870. ld.const.f32 %f436, [%rd29];
  871. fma.rn.f32 %f437, %f436, %f28, %f433;
  872. fma.rn.f32 %f438, %f436, %f29, %f434;
  873. fma.rn.f32 %f439, %f436, %f30, %f435;
  874. add.s64 %rd30, %rd13, %rd28;
  875. ld.const.u32 %r197, [%rd30];
  876. add.s32 %r198, %r197, %r195;
  877. shr.u32 %r199, %r26, 10;
  878. mul.wide.u32 %rd31, %r199, 4;
  879. and.b64 %rd32, %rd31, 12;
  880. add.s64 %rd33, %rd11, %rd32;
  881. ld.const.f32 %f440, [%rd33];
  882. fma.rn.f32 %f441, %f440, %f31, %f437;
  883. fma.rn.f32 %f442, %f440, %f32, %f438;
  884. fma.rn.f32 %f443, %f440, %f33, %f439;
  885. add.s64 %rd34, %rd13, %rd32;
  886. ld.const.u32 %r200, [%rd34];
  887. add.s32 %r201, %r200, %r198;
  888. shr.u32 %r202, %r26, 12;
  889. mul.wide.u32 %rd35, %r202, 4;
  890. and.b64 %rd36, %rd35, 12;
  891. add.s64 %rd37, %rd11, %rd36;
  892. ld.const.f32 %f444, [%rd37];
  893. fma.rn.f32 %f445, %f444, %f34, %f441;
  894. fma.rn.f32 %f446, %f444, %f35, %f442;
  895. fma.rn.f32 %f447, %f444, %f36, %f443;
  896. add.s64 %rd38, %rd13, %rd36;
  897. ld.const.u32 %r203, [%rd38];
  898. add.s32 %r204, %r203, %r201;
  899. shr.u32 %r205, %r26, 14;
  900. mul.wide.u32 %rd39, %r205, 4;
  901. and.b64 %rd40, %rd39, 12;
  902. add.s64 %rd41, %rd11, %rd40;
  903. ld.const.f32 %f448, [%rd41];
  904. fma.rn.f32 %f449, %f448, %f37, %f445;
  905. fma.rn.f32 %f450, %f448, %f38, %f446;
  906. fma.rn.f32 %f451, %f448, %f39, %f447;
  907. add.s64 %rd42, %rd13, %rd40;
  908. ld.const.u32 %r206, [%rd42];
  909. add.s32 %r207, %r206, %r204;
  910. shr.u32 %r208, %r26, 16;
  911. mul.wide.u32 %rd43, %r208, 4;
  912. and.b64 %rd44, %rd43, 12;
  913. add.s64 %rd45, %rd11, %rd44;
  914. ld.const.f32 %f452, [%rd45];
  915. fma.rn.f32 %f453, %f452, %f40, %f449;
  916. fma.rn.f32 %f454, %f452, %f41, %f450;
  917. fma.rn.f32 %f455, %f452, %f42, %f451;
  918. add.s64 %rd46, %rd13, %rd44;
  919. ld.const.u32 %r209, [%rd46];
  920. add.s32 %r210, %r209, %r207;
  921. shr.u32 %r211, %r26, 18;
  922. mul.wide.u32 %rd47, %r211, 4;
  923. and.b64 %rd48, %rd47, 12;
  924. add.s64 %rd49, %rd11, %rd48;
  925. ld.const.f32 %f456, [%rd49];
  926. fma.rn.f32 %f457, %f456, %f43, %f453;
  927. fma.rn.f32 %f458, %f456, %f44, %f454;
  928. fma.rn.f32 %f459, %f456, %f45, %f455;
  929. add.s64 %rd50, %rd13, %rd48;
  930. ld.const.u32 %r212, [%rd50];
  931. add.s32 %r213, %r212, %r210;
  932. shr.u32 %r214, %r26, 20;
  933. mul.wide.u32 %rd51, %r214, 4;
  934. and.b64 %rd52, %rd51, 12;
  935. add.s64 %rd53, %rd11, %rd52;
  936. ld.const.f32 %f460, [%rd53];
  937. fma.rn.f32 %f461, %f460, %f46, %f457;
  938. fma.rn.f32 %f462, %f460, %f47, %f458;
  939. fma.rn.f32 %f463, %f460, %f48, %f459;
  940. add.s64 %rd54, %rd13, %rd52;
  941. ld.const.u32 %r215, [%rd54];
  942. add.s32 %r216, %r215, %r213;
  943. shr.u32 %r217, %r26, 22;
  944. mul.wide.u32 %rd55, %r217, 4;
  945. and.b64 %rd56, %rd55, 12;
  946. add.s64 %rd57, %rd11, %rd56;
  947. ld.const.f32 %f464, [%rd57];
  948. fma.rn.f32 %f465, %f464, %f49, %f461;
  949. fma.rn.f32 %f466, %f464, %f50, %f462;
  950. fma.rn.f32 %f467, %f464, %f51, %f463;
  951. add.s64 %rd58, %rd13, %rd56;
  952. ld.const.u32 %r218, [%rd58];
  953. add.s32 %r219, %r218, %r216;
  954. shr.u32 %r220, %r26, 24;
  955. mul.wide.u32 %rd59, %r220, 4;
  956. and.b64 %rd60, %rd59, 12;
  957. add.s64 %rd61, %rd11, %rd60;
  958. ld.const.f32 %f468, [%rd61];
  959. fma.rn.f32 %f469, %f468, %f52, %f465;
  960. fma.rn.f32 %f470, %f468, %f53, %f466;
  961. fma.rn.f32 %f471, %f468, %f54, %f467;
  962. add.s64 %rd62, %rd13, %rd60;
  963. ld.const.u32 %r221, [%rd62];
  964. add.s32 %r222, %r221, %r219;
  965. shr.u32 %r223, %r26, 26;
  966. mul.wide.u32 %rd63, %r223, 4;
  967. and.b64 %rd64, %rd63, 12;
  968. add.s64 %rd65, %rd11, %rd64;
  969. ld.const.f32 %f472, [%rd65];
  970. fma.rn.f32 %f473, %f472, %f55, %f469;
  971. fma.rn.f32 %f474, %f472, %f56, %f470;
  972. fma.rn.f32 %f475, %f472, %f57, %f471;
  973. add.s64 %rd66, %rd13, %rd64;
  974. ld.const.u32 %r224, [%rd66];
  975. add.s32 %r225, %r224, %r222;
  976. shr.u32 %r226, %r26, 28;
  977. mul.wide.u32 %rd67, %r226, 4;
  978. and.b64 %rd68, %rd67, 12;
  979. add.s64 %rd69, %rd11, %rd68;
  980. ld.const.f32 %f476, [%rd69];
  981. fma.rn.f32 %f477, %f476, %f58, %f473;
  982. fma.rn.f32 %f478, %f476, %f59, %f474;
  983. fma.rn.f32 %f479, %f476, %f60, %f475;
  984. add.s64 %rd70, %rd13, %rd68;
  985. ld.const.u32 %r227, [%rd70];
  986. add.s32 %r228, %r227, %r225;
  // Field 15: a shift by 30 leaves only two bits, so no mask is applied.
  987. shr.u32 %r229, %r26, 30;
  988. mul.wide.u32 %rd71, %r229, 4;
  989. add.s64 %rd72, %rd11, %rd71;
  990. ld.const.f32 %f480, [%rd72];
  991. fma.rn.f32 %f481, %f480, %f61, %f477;
  992. fma.rn.f32 %f482, %f480, %f62, %f478;
  993. fma.rn.f32 %f483, %f480, %f63, %f479;
  994. add.s64 %rd73, %rd13, %rd71;
  995. ld.const.u32 %r230, [%rd73];
  // %r231 = integer sum of all sixteen selected prods4 words.
  996. add.s32 %r231, %r230, %r228;
  // Split the packed integer sum %r231 into three values and compute two
  // candidate endpoints, one channel at a time:
  //   A = %f484 (bits 31:16), B = %f485 (bits 15:8), C = %f486 (bits 7:0)
  //   X = accumulated sum (%f481 / %f482 / %f483)
  //   Y = %f8 / %f10 / %f12 minus X          (%f487 / %f488 / %f489)
  //   det = A*B - C*C, inverted once with rcp.rn (%f493)
  //   first  endpoint = (X*B - Y*C) / det    -> %f503, %f504, %f505
  //   second endpoint = (Y*A - X*C) / det    -> %f515, %f516, %f517
  // This has the form of Cramer's rule for the symmetric system
  // [A C; C B] * [p; q] = [X; Y].
  // NOTE(review): %f8, %f10, %f12 are set before this excerpt; they look like
  // per-channel totals - confirm. det is not tested for zero before rcp.rn.
  997. shr.s32 %r232, %r231, 16;
  998. cvt.rn.f32.s32 %f484, %r232;
  999. bfe.u32 %r233, %r231, 8, 8;
  1000. cvt.rn.f32.s32 %f485, %r233;
  1001. and.b32 %r234, %r231, 255;
  1002. cvt.rn.f32.s32 %f486, %r234;
  1003. sub.f32 %f487, %f8, %f481;
  1004. sub.f32 %f488, %f10, %f482;
  1005. sub.f32 %f489, %f12, %f483;
  1006. mul.f32 %f490, %f484, %f485;
  1007. mul.f32 %f491, %f486, %f486;
  1008. sub.f32 %f492, %f490, %f491;
  1009. rcp.rn.f32 %f493, %f492;
  1010. mul.f32 %f494, %f481, %f485;
  1011. mul.f32 %f495, %f482, %f485;
  1012. mul.f32 %f496, %f483, %f485;
  1013. mul.f32 %f497, %f487, %f486;
  1014. mul.f32 %f498, %f488, %f486;
  1015. mul.f32 %f499, %f489, %f486;
  1016. sub.f32 %f500, %f494, %f497;
  1017. sub.f32 %f501, %f495, %f498;
  1018. sub.f32 %f502, %f496, %f499;
  1019. mul.f32 %f503, %f500, %f493;
  1020. mul.f32 %f504, %f501, %f493;
  1021. mul.f32 %f505, %f502, %f493;
  1022. mul.f32 %f506, %f487, %f484;
  1023. mul.f32 %f507, %f488, %f484;
  1024. mul.f32 %f508, %f489, %f484;
  1025. mul.f32 %f509, %f481, %f486;
  1026. mul.f32 %f510, %f482, %f486;
  1027. mul.f32 %f511, %f483, %f486;
  1028. sub.f32 %f512, %f506, %f509;
  1029. sub.f32 %f513, %f507, %f510;
  1030. sub.f32 %f514, %f508, %f511;
  1031. mul.f32 %f515, %f512, %f493;
  1032. mul.f32 %f516, %f513, %f493;
  1033. mul.f32 %f517, %f514, %f493;
  // Quantize both endpoints. Each channel is clamped to [0, 1] (cvt.sat),
  // scaled by 31.0 (0f41F80000) for x and z or 63.0 (0f427C0000) for y, and
  // rounded to the nearest integer (cvt.rni). The integers are packed as
  //   ((x << 11) & 0xF800) | ((y << 5) & 0xFFE0) | z
  // giving %r27 for the first endpoint and %r28 for the second.
  // The rounded channels are also rescaled by 0f3D043571 (~0.03228) for x/z
  // and 0f3C81B119 (~0.01583) for y; those floats (%f527..%f529 and
  // %f539..%f541) feed the error computation later in the loop body.
  1034. cvt.sat.f32.f32 %f518, %f503;
  1035. mul.f32 %f519, %f518, 0f41F80000;
  1036. cvt.rni.f32.f32 %f520, %f519;
  1037. cvt.sat.f32.f32 %f521, %f504;
  1038. mul.f32 %f522, %f521, 0f427C0000;
  1039. cvt.rni.f32.f32 %f523, %f522;
  1040. cvt.sat.f32.f32 %f524, %f505;
  1041. mul.f32 %f525, %f524, 0f41F80000;
  1042. cvt.rni.f32.f32 %f526, %f525;
  // Pack the first endpoint (63488 = 0xF800, 65504 = 0xFFE0).
  1043. cvt.rzi.u32.f32 %r235, %f520;
  1044. shl.b32 %r236, %r235, 11;
  1045. and.b32 %r237, %r236, 63488;
  1046. cvt.rzi.u32.f32 %r238, %f523;
  1047. shl.b32 %r239, %r238, 5;
  1048. and.b32 %r240, %r239, 65504;
  1049. cvt.rzi.u32.f32 %r241, %f526;
  1050. or.b32 %r242, %r241, %r237;
  1051. or.b32 %r27, %r242, %r240;
  1052. mul.f32 %f527, %f520, 0f3D043571;
  1053. mul.f32 %f528, %f523, 0f3C81B119;
  1054. mul.f32 %f529, %f526, 0f3D043571;
  // Same sequence for the second endpoint.
  1055. cvt.sat.f32.f32 %f530, %f515;
  1056. mul.f32 %f531, %f530, 0f41F80000;
  1057. cvt.rni.f32.f32 %f532, %f531;
  1058. cvt.sat.f32.f32 %f533, %f516;
  1059. mul.f32 %f534, %f533, 0f427C0000;
  1060. cvt.rni.f32.f32 %f535, %f534;
  1061. cvt.sat.f32.f32 %f536, %f517;
  1062. mul.f32 %f537, %f536, 0f41F80000;
  1063. cvt.rni.f32.f32 %f538, %f537;
  1064. cvt.rzi.u32.f32 %r243, %f532;
  1065. shl.b32 %r244, %r243, 11;
  1066. and.b32 %r245, %r244, 63488;
  1067. cvt.rzi.u32.f32 %r246, %f535;
  1068. shl.b32 %r247, %r246, 5;
  1069. and.b32 %r248, %r247, 65504;
  1070. cvt.rzi.u32.f32 %r249, %f538;
  1071. or.b32 %r250, %r249, %r245;
  1072. or.b32 %r28, %r250, %r248;
  1073. mul.f32 %f539, %f532, 0f3D043571;
  1074. mul.f32 %f540, %f535, 0f3C81B119;
  1075. mul.f32 %f541, %f538, 0f3D043571;
  // Error estimate for this candidate. Per channel, with a and b the rescaled
  // first/second endpoint (%f527..%f529 / %f539..%f541), A/B/C =
  // %f484/%f485/%f486, X the accumulated sum (%f481..%f483) and Y =
  // %f487..%f489:
  //   e = A*a*a + B*b*b + 2 * (C*a*b - X*a - Y*b)
  // The three channel values are combined as e_x*%f13 + e_y*%f14 + e_z*%f15
  // and scaled by 0f3DE38E39 (~1/9) to give %f65.
  // NOTE(review): %f13..%f15 are set before this excerpt; presumably
  // per-channel weights - confirm.
  1076. mul.f32 %f542, %f527, %f527;
  1077. mul.f32 %f543, %f528, %f528;
  1078. mul.f32 %f544, %f529, %f529;
  1079. mul.f32 %f545, %f539, %f539;
  1080. mul.f32 %f546, %f540, %f540;
  1081. mul.f32 %f547, %f541, %f541;
  1082. mul.f32 %f548, %f485, %f545;
  1083. mul.f32 %f549, %f485, %f546;
  1084. mul.f32 %f550, %f485, %f547;
  1085. fma.rn.f32 %f551, %f484, %f542, %f548;
  1086. fma.rn.f32 %f552, %f484, %f543, %f549;
  1087. fma.rn.f32 %f553, %f484, %f544, %f550;
  1088. mul.f32 %f554, %f527, %f539;
  1089. mul.f32 %f555, %f528, %f540;
  1090. mul.f32 %f556, %f529, %f541;
  1091. mul.f32 %f557, %f486, %f554;
  1092. mul.f32 %f558, %f486, %f555;
  1093. mul.f32 %f559, %f486, %f556;
  1094. mul.f32 %f560, %f481, %f527;
  1095. mul.f32 %f561, %f482, %f528;
  1096. mul.f32 %f562, %f483, %f529;
  1097. sub.f32 %f563, %f557, %f560;
  1098. sub.f32 %f564, %f558, %f561;
  1099. sub.f32 %f565, %f559, %f562;
  1100. mul.f32 %f566, %f487, %f539;
  1101. mul.f32 %f567, %f488, %f540;
  1102. mul.f32 %f568, %f489, %f541;
  1103. sub.f32 %f569, %f563, %f566;
  1104. sub.f32 %f570, %f564, %f567;
  1105. sub.f32 %f571, %f565, %f568;
  // 0f40000000 is 2.0.
  1106. fma.rn.f32 %f572, %f569, 0f40000000, %f551;
  1107. fma.rn.f32 %f573, %f570, 0f40000000, %f552;
  1108. fma.rn.f32 %f574, %f571, 0f40000000, %f553;
  1109. mul.f32 %f575, %f573, %f14;
  1110. fma.rn.f32 %f576, %f572, %f13, %f575;
  1111. fma.rn.f32 %f577, %f15, %f574, %f576;
  1112. mul.f32 %f65, %f577, 0f3DE38E39;
  // geu is true when %f65 >= %f745 or either value is NaN; in that case the
  // current best is kept.
  1113. setp.geu.f32 %p54, %f65, %f745;
  1114. @%p54 bra BB0_61;
  1115.  
  // New best: keep the low 16 bits of both packed endpoints (%rs27 = first,
  // %rs26 = second), the permutation word and the error.
  1116. cvt.u16.u32 %rs26, %r28;
  1117. cvt.u16.u32 %rs27, %r27;
  1118. mov.u32 %r430, %r26;
  1119. mov.f32 %f745, %f65;
  1120.  
  1121. BB0_61:
  // Next pass; the loop runs at most 16 times.
  1122. add.s32 %r429, %r429, 1;
  1123. setp.lt.s32 %p55, %r429, 16;
  1124. @%p55 bra BB0_56;
  1125.  
  // Reached when the first loop ends. Orders the two saved 16-bit endpoints
  // with an unsigned compare (%rs33 = larger, %rs32 = smaller). When
  // %rs27 < %rs26 the saved permutation word is XORed with 1431655765
  // (0x55555555), which flips the low bit of every 2-bit field; the result is
  // %r434. After the barrier, %f7/%f9/%f11 are scaled by 4.0 (0f40800000), the
  // 48 floats at offsets 0..188 of the shared `colors` array are loaded into
  // %f71..%f118, and the second loop's counter %r433 is cleared.
  // NOTE(review): %f7, %f9, %f11 are set before this excerpt - confirm their
  // meaning against the earlier part of the kernel.
  1126. BB0_62:
  1127. setp.lt.u16 %p56, %rs27, %rs26;
  1128. selp.b16 %rs33, %rs26, %rs27, %p56;
  1129. selp.b16 %rs32, %rs27, %rs26, %p56;
  1130. xor.b32 %r251, %r430, 1431655765;
  1131. selp.b32 %r434, %r251, %r430, %p56;
  1132. barrier.sync 0;
  1133. mul.f32 %f68, %f7, 0f40800000;
  1134. mul.f32 %f69, %f9, 0f40800000;
  1135. mul.f32 %f70, %f11, 0f40800000;
  1136. ld.shared.f32 %f71, [_ZZ8compressPKjS0_P5uint2iE6colors];
  1137. ld.shared.f32 %f72, [_ZZ8compressPKjS0_P5uint2iE6colors+4];
  1138. ld.shared.f32 %f73, [_ZZ8compressPKjS0_P5uint2iE6colors+8];
  1139. ld.shared.f32 %f74, [_ZZ8compressPKjS0_P5uint2iE6colors+12];
  1140. ld.shared.f32 %f75, [_ZZ8compressPKjS0_P5uint2iE6colors+16];
  1141. ld.shared.f32 %f76, [_ZZ8compressPKjS0_P5uint2iE6colors+20];
  1142. ld.shared.f32 %f77, [_ZZ8compressPKjS0_P5uint2iE6colors+24];
  1143. ld.shared.f32 %f78, [_ZZ8compressPKjS0_P5uint2iE6colors+28];
  1144. ld.shared.f32 %f79, [_ZZ8compressPKjS0_P5uint2iE6colors+32];
  1145. ld.shared.f32 %f80, [_ZZ8compressPKjS0_P5uint2iE6colors+36];
  1146. ld.shared.f32 %f81, [_ZZ8compressPKjS0_P5uint2iE6colors+40];
  1147. ld.shared.f32 %f82, [_ZZ8compressPKjS0_P5uint2iE6colors+44];
  1148. ld.shared.f32 %f83, [_ZZ8compressPKjS0_P5uint2iE6colors+48];
  1149. ld.shared.f32 %f84, [_ZZ8compressPKjS0_P5uint2iE6colors+52];
  1150. ld.shared.f32 %f85, [_ZZ8compressPKjS0_P5uint2iE6colors+56];
  1151. ld.shared.f32 %f86, [_ZZ8compressPKjS0_P5uint2iE6colors+60];
  1152. ld.shared.f32 %f87, [_ZZ8compressPKjS0_P5uint2iE6colors+64];
  1153. ld.shared.f32 %f88, [_ZZ8compressPKjS0_P5uint2iE6colors+68];
  1154. ld.shared.f32 %f89, [_ZZ8compressPKjS0_P5uint2iE6colors+72];
  1155. ld.shared.f32 %f90, [_ZZ8compressPKjS0_P5uint2iE6colors+76];
  1156. ld.shared.f32 %f91, [_ZZ8compressPKjS0_P5uint2iE6colors+80];
  1157. ld.shared.f32 %f92, [_ZZ8compressPKjS0_P5uint2iE6colors+84];
  1158. ld.shared.f32 %f93, [_ZZ8compressPKjS0_P5uint2iE6colors+88];
  1159. ld.shared.f32 %f94, [_ZZ8compressPKjS0_P5uint2iE6colors+92];
  1160. ld.shared.f32 %f95, [_ZZ8compressPKjS0_P5uint2iE6colors+96];
  1161. ld.shared.f32 %f96, [_ZZ8compressPKjS0_P5uint2iE6colors+100];
  1162. ld.shared.f32 %f97, [_ZZ8compressPKjS0_P5uint2iE6colors+104];
  1163. ld.shared.f32 %f98, [_ZZ8compressPKjS0_P5uint2iE6colors+108];
  1164. ld.shared.f32 %f99, [_ZZ8compressPKjS0_P5uint2iE6colors+112];
  1165. ld.shared.f32 %f100, [_ZZ8compressPKjS0_P5uint2iE6colors+116];
  1166. ld.shared.f32 %f101, [_ZZ8compressPKjS0_P5uint2iE6colors+120];
  1167. ld.shared.f32 %f102, [_ZZ8compressPKjS0_P5uint2iE6colors+124];
  1168. ld.shared.f32 %f103, [_ZZ8compressPKjS0_P5uint2iE6colors+128];
  1169. ld.shared.f32 %f104, [_ZZ8compressPKjS0_P5uint2iE6colors+132];
  1170. ld.shared.f32 %f105, [_ZZ8compressPKjS0_P5uint2iE6colors+136];
  1171. ld.shared.f32 %f106, [_ZZ8compressPKjS0_P5uint2iE6colors+140];
  1172. ld.shared.f32 %f107, [_ZZ8compressPKjS0_P5uint2iE6colors+144];
  1173. ld.shared.f32 %f108, [_ZZ8compressPKjS0_P5uint2iE6colors+148];
  1174. ld.shared.f32 %f109, [_ZZ8compressPKjS0_P5uint2iE6colors+152];
  1175. ld.shared.f32 %f110, [_ZZ8compressPKjS0_P5uint2iE6colors+156];
  1176. ld.shared.f32 %f111, [_ZZ8compressPKjS0_P5uint2iE6colors+160];
  1177. ld.shared.f32 %f112, [_ZZ8compressPKjS0_P5uint2iE6colors+164];
  1178. ld.shared.f32 %f113, [_ZZ8compressPKjS0_P5uint2iE6colors+168];
  1179. ld.shared.f32 %f114, [_ZZ8compressPKjS0_P5uint2iE6colors+172];
  1180. ld.shared.f32 %f115, [_ZZ8compressPKjS0_P5uint2iE6colors+176];
  1181. ld.shared.f32 %f116, [_ZZ8compressPKjS0_P5uint2iE6colors+180];
  1182. ld.shared.f32 %f117, [_ZZ8compressPKjS0_P5uint2iE6colors+184];
  1183. ld.shared.f32 %f118, [_ZZ8compressPKjS0_P5uint2iE6colors+188];
  1184. mov.u32 %r433, 0;
  1185.  
  // Head of the second candidate loop. Word index %r36 = %r433 * 64 + %r1; an
  // index above 159 ends the loop. The permutation word %r37 is read from the
  // shared s_permutations array at byte offset %r36 * 4.
  // NOTE(review): %r1 is defined before this excerpt; it looks like the
  // thread index - confirm.
  1186. BB0_63:
  1187. shl.b32 %r253, %r433, 6;
  1188. add.s32 %r36, %r253, %r1;
  1189. setp.gt.s32 %p57, %r36, 159;
  1190. @%p57 bra BB0_67;
  1191.  
  1192. shl.b32 %r254, %r36, 2;
  1193. mov.u32 %r255, _ZZ19evalAllPermutationsPK6float3PKjRtS4_RjPfS_N18cooperative_groups4__v112thread_blockEE14s_permutations;
  1194. add.s32 %r256, %r255, %r254;
  1195. ld.shared.u32 %r37, [%r256];
  // Fully unrolled walk over the sixteen 2-bit fields of the permutation word
  // %r37 (field k = bits 2k+1:2k). Each field selects one 4-byte entry of
  // alphaTable3 (read as f32) and one of prods3 (read as u32):
  //   - the alphaTable3 value multiplies one (x, y, z) triple out of
  //     %f71..%f118 (the 48 floats of the shared `colors` array) and is
  //     accumulated with fma into three running sums, which end up in
  //     %f639, %f640, %f641;
  //   - the prods3 words are added as integers, ending up in %r303.
  // Read as f32, the alphaTable3 initializer is {4.0, 0.0, 2.0, 2.0}.
  // Field 0: byte offset = (%r37 & 3) * 4; the sums start from 0.0.
  1196. and.b32 %r257, %r37, 3;
  1197. mul.wide.u32 %rd74, %r257, 4;
  1198. mov.u64 %rd75, alphaTable3;
  1199. add.s64 %rd76, %rd75, %rd74;
  1200. ld.const.f32 %f578, [%rd76];
  1201. fma.rn.f32 %f579, %f578, %f71, 0f00000000;
  1202. fma.rn.f32 %f580, %f578, %f72, 0f00000000;
  1203. fma.rn.f32 %f581, %f578, %f73, 0f00000000;
  1204. mov.u64 %rd77, prods3;
  1205. add.s64 %rd78, %rd77, %rd74;
  // Field 1: shift, scale by 4, then mask with 12 to keep a table offset of
  // 0, 4, 8 or 12. The same pattern repeats for fields 2..14.
  1206. shr.u32 %r258, %r37, 2;
  1207. mul.wide.u32 %rd79, %r258, 4;
  1208. and.b64 %rd80, %rd79, 12;
  1209. add.s64 %rd81, %rd75, %rd80;
  1210. ld.const.f32 %f582, [%rd81];
  1211. fma.rn.f32 %f583, %f582, %f74, %f579;
  1212. fma.rn.f32 %f584, %f582, %f75, %f580;
  1213. fma.rn.f32 %f585, %f582, %f76, %f581;
  1214. add.s64 %rd82, %rd77, %rd80;
  1215. ld.const.u32 %r259, [%rd82];
  1216. ld.const.u32 %r260, [%rd78];
  1217. add.s32 %r261, %r259, %r260;
  1218. shr.u32 %r262, %r37, 4;
  1219. mul.wide.u32 %rd83, %r262, 4;
  1220. and.b64 %rd84, %rd83, 12;
  1221. add.s64 %rd85, %rd75, %rd84;
  1222. ld.const.f32 %f586, [%rd85];
  1223. fma.rn.f32 %f587, %f586, %f77, %f583;
  1224. fma.rn.f32 %f588, %f586, %f78, %f584;
  1225. fma.rn.f32 %f589, %f586, %f79, %f585;
  1226. add.s64 %rd86, %rd77, %rd84;
  1227. ld.const.u32 %r263, [%rd86];
  1228. add.s32 %r264, %r263, %r261;
  1229. shr.u32 %r265, %r37, 6;
  1230. mul.wide.u32 %rd87, %r265, 4;
  1231. and.b64 %rd88, %rd87, 12;
  1232. add.s64 %rd89, %rd75, %rd88;
  1233. ld.const.f32 %f590, [%rd89];
  1234. fma.rn.f32 %f591, %f590, %f80, %f587;
  1235. fma.rn.f32 %f592, %f590, %f81, %f588;
  1236. fma.rn.f32 %f593, %f590, %f82, %f589;
  1237. add.s64 %rd90, %rd77, %rd88;
  1238. ld.const.u32 %r266, [%rd90];
  1239. add.s32 %r267, %r266, %r264;
  1240. shr.u32 %r268, %r37, 8;
  1241. mul.wide.u32 %rd91, %r268, 4;
  1242. and.b64 %rd92, %rd91, 12;
  1243. add.s64 %rd93, %rd75, %rd92;
  1244. ld.const.f32 %f594, [%rd93];
  1245. fma.rn.f32 %f595, %f594, %f83, %f591;
  1246. fma.rn.f32 %f596, %f594, %f84, %f592;
  1247. fma.rn.f32 %f597, %f594, %f85, %f593;
  1248. add.s64 %rd94, %rd77, %rd92;
  1249. ld.const.u32 %r269, [%rd94];
  1250. add.s32 %r270, %r269, %r267;
  1251. shr.u32 %r271, %r37, 10;
  1252. mul.wide.u32 %rd95, %r271, 4;
  1253. and.b64 %rd96, %rd95, 12;
  1254. add.s64 %rd97, %rd75, %rd96;
  1255. ld.const.f32 %f598, [%rd97];
  1256. fma.rn.f32 %f599, %f598, %f86, %f595;
  1257. fma.rn.f32 %f600, %f598, %f87, %f596;
  1258. fma.rn.f32 %f601, %f598, %f88, %f597;
  1259. add.s64 %rd98, %rd77, %rd96;
  1260. ld.const.u32 %r272, [%rd98];
  1261. add.s32 %r273, %r272, %r270;
  1262. shr.u32 %r274, %r37, 12;
  1263. mul.wide.u32 %rd99, %r274, 4;
  1264. and.b64 %rd100, %rd99, 12;
  1265. add.s64 %rd101, %rd75, %rd100;
  1266. ld.const.f32 %f602, [%rd101];
  1267. fma.rn.f32 %f603, %f602, %f89, %f599;
  1268. fma.rn.f32 %f604, %f602, %f90, %f600;
  1269. fma.rn.f32 %f605, %f602, %f91, %f601;
  1270. add.s64 %rd102, %rd77, %rd100;
  1271. ld.const.u32 %r275, [%rd102];
  1272. add.s32 %r276, %r275, %r273;
  1273. shr.u32 %r277, %r37, 14;
  1274. mul.wide.u32 %rd103, %r277, 4;
  1275. and.b64 %rd104, %rd103, 12;
  1276. add.s64 %rd105, %rd75, %rd104;
  1277. ld.const.f32 %f606, [%rd105];
  1278. fma.rn.f32 %f607, %f606, %f92, %f603;
  1279. fma.rn.f32 %f608, %f606, %f93, %f604;
  1280. fma.rn.f32 %f609, %f606, %f94, %f605;
  1281. add.s64 %rd106, %rd77, %rd104;
  1282. ld.const.u32 %r278, [%rd106];
  1283. add.s32 %r279, %r278, %r276;
  1284. shr.u32 %r280, %r37, 16;
  1285. mul.wide.u32 %rd107, %r280, 4;
  1286. and.b64 %rd108, %rd107, 12;
  1287. add.s64 %rd109, %rd75, %rd108;
  1288. ld.const.f32 %f610, [%rd109];
  1289. fma.rn.f32 %f611, %f610, %f95, %f607;
  1290. fma.rn.f32 %f612, %f610, %f96, %f608;
  1291. fma.rn.f32 %f613, %f610, %f97, %f609;
  1292. add.s64 %rd110, %rd77, %rd108;
  1293. ld.const.u32 %r281, [%rd110];
  1294. add.s32 %r282, %r281, %r279;
  1295. shr.u32 %r283, %r37, 18;
  1296. mul.wide.u32 %rd111, %r283, 4;
  1297. and.b64 %rd112, %rd111, 12;
  1298. add.s64 %rd113, %rd75, %rd112;
  1299. ld.const.f32 %f614, [%rd113];
  1300. fma.rn.f32 %f615, %f614, %f98, %f611;
  1301. fma.rn.f32 %f616, %f614, %f99, %f612;
  1302. fma.rn.f32 %f617, %f614, %f100, %f613;
  1303. add.s64 %rd114, %rd77, %rd112;
  1304. ld.const.u32 %r284, [%rd114];
  1305. add.s32 %r285, %r284, %r282;
  1306. shr.u32 %r286, %r37, 20;
  1307. mul.wide.u32 %rd115, %r286, 4;
  1308. and.b64 %rd116, %rd115, 12;
  1309. add.s64 %rd117, %rd75, %rd116;
  1310. ld.const.f32 %f618, [%rd117];
  1311. fma.rn.f32 %f619, %f618, %f101, %f615;
  1312. fma.rn.f32 %f620, %f618, %f102, %f616;
  1313. fma.rn.f32 %f621, %f618, %f103, %f617;
  1314. add.s64 %rd118, %rd77, %rd116;
  1315. ld.const.u32 %r287, [%rd118];
  1316. add.s32 %r288, %r287, %r285;
  1317. shr.u32 %r289, %r37, 22;
  1318. mul.wide.u32 %rd119, %r289, 4;
  1319. and.b64 %rd120, %rd119, 12;
  1320. add.s64 %rd121, %rd75, %rd120;
  1321. ld.const.f32 %f622, [%rd121];
  1322. fma.rn.f32 %f623, %f622, %f104, %f619;
  1323. fma.rn.f32 %f624, %f622, %f105, %f620;
  1324. fma.rn.f32 %f625, %f622, %f106, %f621;
  1325. add.s64 %rd122, %rd77, %rd120;
  1326. ld.const.u32 %r290, [%rd122];
  1327. add.s32 %r291, %r290, %r288;
  1328. shr.u32 %r292, %r37, 24;
  1329. mul.wide.u32 %rd123, %r292, 4;
  1330. and.b64 %rd124, %rd123, 12;
  1331. add.s64 %rd125, %rd75, %rd124;
  1332. ld.const.f32 %f626, [%rd125];
  1333. fma.rn.f32 %f627, %f626, %f107, %f623;
  1334. fma.rn.f32 %f628, %f626, %f108, %f624;
  1335. fma.rn.f32 %f629, %f626, %f109, %f625;
  1336. add.s64 %rd126, %rd77, %rd124;
  1337. ld.const.u32 %r293, [%rd126];
  1338. add.s32 %r294, %r293, %r291;
  1339. shr.u32 %r295, %r37, 26;
  1340. mul.wide.u32 %rd127, %r295, 4;
  1341. and.b64 %rd128, %rd127, 12;
  1342. add.s64 %rd129, %rd75, %rd128;
  1343. ld.const.f32 %f630, [%rd129];
  1344. fma.rn.f32 %f631, %f630, %f110, %f627;
  1345. fma.rn.f32 %f632, %f630, %f111, %f628;
  1346. fma.rn.f32 %f633, %f630, %f112, %f629;
  1347. add.s64 %rd130, %rd77, %rd128;
  1348. ld.const.u32 %r296, [%rd130];
  1349. add.s32 %r297, %r296, %r294;
  1350. shr.u32 %r298, %r37, 28;
  1351. mul.wide.u32 %rd131, %r298, 4;
  1352. and.b64 %rd132, %rd131, 12;
  1353. add.s64 %rd133, %rd75, %rd132;
  1354. ld.const.f32 %f634, [%rd133];
  1355. fma.rn.f32 %f635, %f634, %f113, %f631;
  1356. fma.rn.f32 %f636, %f634, %f114, %f632;
  1357. fma.rn.f32 %f637, %f634, %f115, %f633;
  1358. add.s64 %rd134, %rd77, %rd132;
  1359. ld.const.u32 %r299, [%rd134];
  1360. add.s32 %r300, %r299, %r297;
  // Field 15: a shift by 30 leaves only two bits, so no mask is applied.
  1361. shr.u32 %r301, %r37, 30;
  1362. mul.wide.u32 %rd135, %r301, 4;
  1363. add.s64 %rd136, %rd75, %rd135;
  1364. ld.const.f32 %f638, [%rd136];
  1365. fma.rn.f32 %f639, %f638, %f116, %f635;
  1366. fma.rn.f32 %f640, %f638, %f117, %f636;
  1367. fma.rn.f32 %f641, %f638, %f118, %f637;
  1368. add.s64 %rd137, %rd77, %rd135;
  1369. ld.const.u32 %r302, [%rd137];
  // %r303 = integer sum of all sixteen selected prods3 words.
  1370. add.s32 %r303, %r302, %r300;
  // Split the packed integer sum %r303 into three values and compute two
  // candidate endpoints, one channel at a time:
  //   A = %f642 (bits 31:16), B = %f643 (bits 15:8), C = %f644 (bits 7:0)
  //   X = accumulated sum (%f639 / %f640 / %f641)
  //   Y = %f68 / %f69 / %f70 minus X         (%f645 / %f646 / %f647)
  //   det = A*B - C*C, inverted once with rcp.rn (%f651)
  //   first  endpoint = (X*B - Y*C) / det    -> %f661, %f662, %f663
  //   second endpoint = (Y*A - X*C) / det    -> %f673, %f674, %f675
  // This has the form of Cramer's rule for the symmetric system
  // [A C; C B] * [p; q] = [X; Y]. %f68..%f70 are 4.0 * %f7/%f9/%f11, computed
  // at BB0_62. det is not tested for zero before rcp.rn.
  1371. shr.s32 %r304, %r303, 16;
  1372. cvt.rn.f32.s32 %f642, %r304;
  1373. bfe.u32 %r305, %r303, 8, 8;
  1374. cvt.rn.f32.s32 %f643, %r305;
  1375. and.b32 %r306, %r303, 255;
  1376. cvt.rn.f32.s32 %f644, %r306;
  1377. sub.f32 %f645, %f68, %f639;
  1378. sub.f32 %f646, %f69, %f640;
  1379. sub.f32 %f647, %f70, %f641;
  1380. mul.f32 %f648, %f642, %f643;
  1381. mul.f32 %f649, %f644, %f644;
  1382. sub.f32 %f650, %f648, %f649;
  1383. rcp.rn.f32 %f651, %f650;
  1384. mul.f32 %f652, %f639, %f643;
  1385. mul.f32 %f653, %f640, %f643;
  1386. mul.f32 %f654, %f641, %f643;
  1387. mul.f32 %f655, %f645, %f644;
  1388. mul.f32 %f656, %f646, %f644;
  1389. mul.f32 %f657, %f647, %f644;
  1390. sub.f32 %f658, %f652, %f655;
  1391. sub.f32 %f659, %f653, %f656;
  1392. sub.f32 %f660, %f654, %f657;
  1393. mul.f32 %f661, %f658, %f651;
  1394. mul.f32 %f662, %f659, %f651;
  1395. mul.f32 %f663, %f660, %f651;
  1396. mul.f32 %f664, %f645, %f642;
  1397. mul.f32 %f665, %f646, %f642;
  1398. mul.f32 %f666, %f647, %f642;
  1399. mul.f32 %f667, %f639, %f644;
  1400. mul.f32 %f668, %f640, %f644;
  1401. mul.f32 %f669, %f641, %f644;
  1402. sub.f32 %f670, %f664, %f667;
  1403. sub.f32 %f671, %f665, %f668;
  1404. sub.f32 %f672, %f666, %f669;
  1405. mul.f32 %f673, %f670, %f651;
  1406. mul.f32 %f674, %f671, %f651;
  1407. mul.f32 %f675, %f672, %f651;
  // Quantize both endpoints. Each channel is clamped to [0, 1] (cvt.sat),
  // scaled by 31.0 (0f41F80000) for x and z or 63.0 (0f427C0000) for y, and
  // rounded to the nearest integer (cvt.rni).
  // The first endpoint is packed with 16-bit operations into %rs13 as
  //   low16(z) | low16(x << 11) | low16(y << 5).
  // The second is packed with 32-bit operations into %r38 as
  //   ((x << 11) & 0xF800) | ((y << 5) & 0xFFE0) | z.
  // The rounded channels are also rescaled by 0f3D043571 (~0.03228) for x/z
  // and 0f3C81B119 (~0.01583) for y; those floats (%f685..%f687 and
  // %f697..%f699) feed the error computation later in the loop body.
  1408. cvt.sat.f32.f32 %f676, %f661;
  1409. mul.f32 %f677, %f676, 0f41F80000;
  1410. cvt.rni.f32.f32 %f678, %f677;
  1411. cvt.sat.f32.f32 %f679, %f662;
  1412. mul.f32 %f680, %f679, 0f427C0000;
  1413. cvt.rni.f32.f32 %f681, %f680;
  1414. cvt.sat.f32.f32 %f682, %f663;
  1415. mul.f32 %f683, %f682, 0f41F80000;
  1416. cvt.rni.f32.f32 %f684, %f683;
  1417. cvt.rzi.u32.f32 %r307, %f678;
  1418. shl.b32 %r308, %r307, 11;
  1419. cvt.rzi.u32.f32 %r309, %f681;
  1420. shl.b32 %r310, %r309, 5;
  1421. cvt.rzi.u32.f32 %r311, %f684;
  1422. cvt.u16.u32 %rs21, %r311;
  1423. cvt.u16.u32 %rs22, %r308;
  1424. or.b16 %rs23, %rs21, %rs22;
  1425. cvt.u16.u32 %rs24, %r310;
  1426. or.b16 %rs13, %rs23, %rs24;
  1427. mul.f32 %f685, %f678, 0f3D043571;
  1428. mul.f32 %f686, %f681, 0f3C81B119;
  1429. mul.f32 %f687, %f684, 0f3D043571;
  // Second endpoint (63488 = 0xF800, 65504 = 0xFFE0).
  1430. cvt.sat.f32.f32 %f688, %f673;
  1431. mul.f32 %f689, %f688, 0f41F80000;
  1432. cvt.rni.f32.f32 %f690, %f689;
  1433. cvt.sat.f32.f32 %f691, %f674;
  1434. mul.f32 %f692, %f691, 0f427C0000;
  1435. cvt.rni.f32.f32 %f693, %f692;
  1436. cvt.sat.f32.f32 %f694, %f675;
  1437. mul.f32 %f695, %f694, 0f41F80000;
  1438. cvt.rni.f32.f32 %f696, %f695;
  1439. cvt.rzi.u32.f32 %r312, %f690;
  1440. shl.b32 %r313, %r312, 11;
  1441. and.b32 %r314, %r313, 63488;
  1442. cvt.rzi.u32.f32 %r315, %f693;
  1443. shl.b32 %r316, %r315, 5;
  1444. and.b32 %r317, %r316, 65504;
  1445. cvt.rzi.u32.f32 %r318, %f696;
  1446. or.b32 %r319, %r318, %r314;
  1447. or.b32 %r38, %r319, %r317;
  1448. mul.f32 %f697, %f690, 0f3D043571;
  1449. mul.f32 %f698, %f693, 0f3C81B119;
  1450. mul.f32 %f699, %f696, 0f3D043571;
  // Error estimate for this candidate. Per channel, with a and b the rescaled
  // first/second endpoint (%f685..%f687 / %f697..%f699), A/B/C =
  // %f642/%f643/%f644, X the accumulated sum (%f639..%f641) and Y =
  // %f645..%f647:
  //   e = A*a*a + B*b*b + 2 * (C*a*b - X*a - Y*b)
  // The three channel values are combined as e_x*%f13 + e_y*%f14 + e_z*%f15
  // and scaled by 0f3E800000 (0.25) to give %f120.
  // NOTE(review): %f13..%f15 are set before this excerpt; presumably
  // per-channel weights - confirm.
  1451. mul.f32 %f700, %f685, %f685;
  1452. mul.f32 %f701, %f686, %f686;
  1453. mul.f32 %f702, %f687, %f687;
  1454. mul.f32 %f703, %f697, %f697;
  1455. mul.f32 %f704, %f698, %f698;
  1456. mul.f32 %f705, %f699, %f699;
  1457. mul.f32 %f706, %f643, %f703;
  1458. mul.f32 %f707, %f643, %f704;
  1459. mul.f32 %f708, %f643, %f705;
  1460. fma.rn.f32 %f709, %f642, %f700, %f706;
  1461. fma.rn.f32 %f710, %f642, %f701, %f707;
  1462. fma.rn.f32 %f711, %f642, %f702, %f708;
  1463. mul.f32 %f712, %f685, %f697;
  1464. mul.f32 %f713, %f686, %f698;
  1465. mul.f32 %f714, %f687, %f699;
  1466. mul.f32 %f715, %f644, %f712;
  1467. mul.f32 %f716, %f644, %f713;
  1468. mul.f32 %f717, %f644, %f714;
  1469. mul.f32 %f718, %f639, %f685;
  1470. mul.f32 %f719, %f640, %f686;
  1471. mul.f32 %f720, %f641, %f687;
  1472. sub.f32 %f721, %f715, %f718;
  1473. sub.f32 %f722, %f716, %f719;
  1474. sub.f32 %f723, %f717, %f720;
  1475. mul.f32 %f724, %f645, %f697;
  1476. mul.f32 %f725, %f646, %f698;
  1477. mul.f32 %f726, %f647, %f699;
  1478. sub.f32 %f727, %f721, %f724;
  1479. sub.f32 %f728, %f722, %f725;
  1480. sub.f32 %f729, %f723, %f726;
  // 0f40000000 is 2.0.
  1481. fma.rn.f32 %f730, %f727, 0f40000000, %f709;
  1482. fma.rn.f32 %f731, %f728, 0f40000000, %f710;
  1483. fma.rn.f32 %f732, %f729, 0f40000000, %f711;
  1484. mul.f32 %f733, %f731, %f14;
  1485. fma.rn.f32 %f734, %f730, %f13, %f733;
  1486. fma.rn.f32 %f735, %f15, %f732, %f734;
  1487. mul.f32 %f120, %f735, 0f3E800000;
  // geu is true when %f120 >= %f745 or either value is NaN; in that case the
  // current best is kept.
  1488. setp.geu.f32 %p58, %f120, %f745;
  1489. @%p58 bra BB0_66;
  1490.  
  // New best. %p59 = first endpoint (%rs13) > second endpoint (%rs25),
  // unsigned. When %p59 holds: %rs33 = second, %rs32 = first, and the
  // permutation word is XORed with (~%r37 >> 1) & 0x55555555. Otherwise
  // %rs33 = first, %rs32 = second, and the word is stored unchanged.
  1491. cvt.u16.u32 %rs25, %r38;
  1492. setp.gt.u16 %p59, %rs13, %rs25;
  1493. not.b32 %r320, %r37;
  1494. shr.u32 %r321, %r320, 1;
  1495. and.b32 %r322, %r321, 1431655765;
  1496. selp.b16 %rs33, %rs25, %rs13, %p59;
  1497. selp.b16 %rs32, %rs13, %rs25, %p59;
  1498. selp.b32 %r323, %r322, 0, %p59;
  1499. xor.b32 %r434, %r323, %r37;
  1500. mov.f32 %f745, %f120;
  1501.  
  1502. BB0_66:
  // Next pass; the loop runs at most 3 times.
  1503. add.s32 %r433, %r433, 1;
  1504. setp.lt.s32 %p60, %r433, 3;
  1505. @%p60 bra BB0_63;
  1506.  
  // Publish this thread's best error and its own index to shared memory, then
  // run the first compare-and-copy step (stride 32).
  //   %r44 = address of the shared `errors` slot at byte offset %r1 * 4
  //   %r45 = address of the shared `indices` slot at byte offset %r1 * 4
  // Step: read the own error; if %r1 + 32 <= 63, read the partner's error and
  // index 128 bytes (32 slots) further on, otherwise use 0f7F7FFFFF (largest
  // finite f32) and index 0. After a barrier, a strictly smaller partner
  // error overwrites the own slot together with the partner's index.
  // NOTE(review): %r1 is defined before this excerpt; it looks like the
  // thread index - confirm.
  1507. BB0_67:
  1508. shl.b32 %r324, %r1, 2;
  1509. mov.u32 %r325, _ZZ8compressPKjS0_P5uint2iE6errors;
  1510. add.s32 %r44, %r325, %r324;
  1511. st.shared.f32 [%r44], %f745;
  1512. mov.u32 %r326, _ZZ12findMinErrorPfN18cooperative_groups4__v112thread_blockEE7indices;
  1513. add.s32 %r45, %r326, %r324;
  1514. st.shared.u32 [%r45], %r1;
  1515. barrier.sync 0;
  1516. ld.shared.f32 %f123, [%r44];
  1517. add.s32 %r46, %r1, 32;
  1518. mov.f32 %f748, 0f7F7FFFFF;
  1519. setp.gt.s32 %p61, %r46, 63;
  1520. @%p61 bra BB0_69;
  1521.  
  1522. ld.shared.f32 %f748, [%r44+128];
  1523.  
  1524. BB0_69:
  1525. mov.u32 %r437, 0;
  1526. @%p61 bra BB0_71;
  1527.  
  1528. ld.shared.u32 %r437, [%r45+128];
  1529.  
  1530. BB0_71:
  // All reads finish before any slot is overwritten.
  1531. barrier.sync 0;
  1532. setp.geu.f32 %p63, %f748, %f123;
  1533. @%p63 bra BB0_73;
  1534.  
  1535. st.shared.f32 [%r44], %f748;
  1536. st.shared.u32 [%r45], %r437;
  // Remaining compare-and-copy steps over the shared error/index slots, with
  // strides 16, 8, 4, 2 and 1 (byte offsets 64, 32, 16, 8, 4). Each step:
  //   1. barrier, then read the own error from [%r44];
  //   2. if %r1 + stride <= 63, read the partner's error and index at
  //      +stride*4 bytes, otherwise use 0f7F7FFFFF (largest finite f32) and
  //      index 0;
  //   3. barrier, then overwrite the own error/index slot when the partner's
  //      error is strictly smaller (setp.geu skips the stores on >= or NaN).
  // After the last step every thread reads the index stored in slot 0 of
  // `indices` and branches to BB0_105 unless it equals its own %r1.
  // NOTE(review): with 64 participating threads slot 0 ends up holding the
  // smallest error and the index that came with it - confirm the block size.
  1538. BB0_73:
  // Stride 16.
  1539. barrier.sync 0;
  1540. ld.shared.f32 %f126, [%r44];
  1541. add.s32 %r49, %r1, 16;
  1542. mov.f32 %f749, 0f7F7FFFFF;
  1543. setp.gt.s32 %p64, %r49, 63;
  1544. @%p64 bra BB0_75;
  1545.  
  1546. ld.shared.f32 %f749, [%r44+64];
  1547.  
  1548. BB0_75:
  1549. mov.u32 %r438, 0;
  1550. @%p64 bra BB0_77;
  1551.  
  1552. ld.shared.u32 %r438, [%r45+64];
  1553.  
  1554. BB0_77:
  1555. barrier.sync 0;
  1556. setp.geu.f32 %p66, %f749, %f126;
  1557. @%p66 bra BB0_79;
  1558.  
  1559. st.shared.f32 [%r44], %f749;
  1560. st.shared.u32 [%r45], %r438;
  1561.  
  1562. BB0_79:
  // Stride 8.
  1563. barrier.sync 0;
  1564. ld.shared.f32 %f129, [%r44];
  1565. add.s32 %r52, %r1, 8;
  1566. mov.f32 %f750, 0f7F7FFFFF;
  1567. setp.gt.s32 %p67, %r52, 63;
  1568. @%p67 bra BB0_81;
  1569.  
  1570. ld.shared.f32 %f750, [%r44+32];
  1571.  
  1572. BB0_81:
  1573. mov.u32 %r439, 0;
  1574. @%p67 bra BB0_83;
  1575.  
  1576. ld.shared.u32 %r439, [%r45+32];
  1577.  
  1578. BB0_83:
  1579. barrier.sync 0;
  1580. setp.geu.f32 %p69, %f750, %f129;
  1581. @%p69 bra BB0_85;
  1582.  
  1583. st.shared.f32 [%r44], %f750;
  1584. st.shared.u32 [%r45], %r439;
  1585.  
  1586. BB0_85:
  // Stride 4.
  1587. barrier.sync 0;
  1588. ld.shared.f32 %f132, [%r44];
  1589. add.s32 %r55, %r1, 4;
  1590. mov.f32 %f751, 0f7F7FFFFF;
  1591. setp.gt.s32 %p70, %r55, 63;
  1592. @%p70 bra BB0_87;
  1593.  
  1594. ld.shared.f32 %f751, [%r44+16];
  1595.  
  1596. BB0_87:
  1597. mov.u32 %r440, 0;
  1598. @%p70 bra BB0_89;
  1599.  
  1600. ld.shared.u32 %r440, [%r45+16];
  1601.  
  1602. BB0_89:
  1603. barrier.sync 0;
  1604. setp.geu.f32 %p72, %f751, %f132;
  1605. @%p72 bra BB0_91;
  1606.  
  1607. st.shared.f32 [%r44], %f751;
  1608. st.shared.u32 [%r45], %r440;
  1609.  
  1610. BB0_91:
  // Stride 2.
  1611. barrier.sync 0;
  1612. ld.shared.f32 %f135, [%r44];
  1613. add.s32 %r58, %r1, 2;
  1614. mov.f32 %f752, 0f7F7FFFFF;
  1615. setp.gt.s32 %p73, %r58, 63;
  1616. @%p73 bra BB0_93;
  1617.  
  1618. ld.shared.f32 %f752, [%r44+8];
  1619.  
  1620. BB0_93:
  1621. mov.u32 %r441, 0;
  1622. @%p73 bra BB0_95;
  1623.  
  1624. ld.shared.u32 %r441, [%r45+8];
  1625.  
  1626. BB0_95:
  1627. barrier.sync 0;
  1628. setp.geu.f32 %p75, %f752, %f135;
  1629. @%p75 bra BB0_97;
  1630.  
  1631. st.shared.f32 [%r44], %f752;
  1632. st.shared.u32 [%r45], %r441;
  1633.  
  1634. BB0_97:
  // Stride 1.
  1635. barrier.sync 0;
  1636. ld.shared.f32 %f138, [%r44];
  1637. add.s32 %r61, %r1, 1;
  1638. mov.f32 %f753, 0f7F7FFFFF;
  1639. setp.gt.s32 %p76, %r61, 63;
  1640. @%p76 bra BB0_99;
  1641.  
  1642. ld.shared.f32 %f753, [%r44+4];
  1643.  
  1644. BB0_99:
  1645. mov.u32 %r442, 0;
  1646. @%p76 bra BB0_101;
  1647.  
  1648. ld.shared.u32 %r442, [%r45+4];
  1649.  
  1650. BB0_101:
  1651. barrier.sync 0;
  1652. setp.geu.f32 %p78, %f753, %f138;
  1653. @%p78 bra BB0_103;
  1654.  
  1655. st.shared.f32 [%r44], %f753;
  1656. st.shared.u32 [%r45], %r442;
  1657.  
  1658. BB0_103:
  // Winner selection: %r64 = index stored in slot 0 of `indices`.
  1659. barrier.sync 0;
  1660. ld.shared.u32 %r64, [_ZZ12findMinErrorPfN18cooperative_groups4__v112thread_blockEE7indices];
  1661. barrier.sync 0;
  1662. setp.ne.s32 %p79, %r1, %r64;
  1663. @%p79 bra BB0_105;
  1664.  
  1665. ld.param.u64 %rd142, [_Z8compressPKjS0_P5uint2i_param_2];
  1666. ld.param.u32 %r424, [_Z8compressPKjS0_P5uint2i_param_3];
  1667. setp.eq.s16 %p80, %rs33, %rs32;
  1668. selp.b32 %r339, 0, %r434, %p80;
  1669. ld.shared.u32 %r340, [_ZZ8compressPKjS0_P5uint2iE5xrefs];
  1670. shl.b32 %r341, %r340, 1;
  1671. shr.u32 %r342, %r339, %r341;
  1672. and.b32 %r343, %r342, 3;
  1673. ld.shared.u32 %r344, [_ZZ8compressPKjS0_P5uint2iE5xrefs+4];
  1674. shl.b32 %r345, %r344, 1;
  1675. shr.u32 %r346, %r339, %r345;
  1676. and.b32 %r347, %r346, 3;
  1677. bfi.b32 %r348, %r347, %r343, 2, 2;
  1678. ld.shared.u32 %r349, [_ZZ8compressPKjS0_P5uint2iE5xrefs+8];
  1679. shl.b32 %r350, %r349, 1;
  1680. shr.u32 %r351, %r339, %r350;
  1681. and.b32 %r352, %r351, 3;
  1682. bfi.b32 %r353, %r352, %r348, 4, 2;
  1683. ld.shared.u32 %r354, [_ZZ8compressPKjS0_P5uint2iE5xrefs+12];
  1684. shl.b32 %r355, %r354, 1;
  1685. shr.u32 %r356, %r339, %r355;
  1686. and.b32 %r357, %r356, 3;
  1687. bfi.b32 %r358, %r357, %r353, 6, 2;
  1688. ld.shared.u32 %r359, [_ZZ8compressPKjS0_P5uint2iE5xrefs+16];
  1689. shl.b32 %r360, %r359, 1;
  1690. shr.u32 %r361, %r339, %r360;
  1691. and.b32 %r362, %r361, 3;
  1692. bfi.b32 %r363, %r362, %r358, 8, 2;
  1693. ld.shared.u32 %r364, [_ZZ8compressPKjS0_P5uint2iE5xrefs+20];
  1694. shl.b32 %r365, %r364, 1;
  1695. shr.u32 %r366, %r339, %r365;
  1696. and.b32 %r367, %r366, 3;
  1697. bfi.b32 %r368, %r367, %r363, 10, 2;
  1698. ld.shared.u32 %r369, [_ZZ8compressPKjS0_P5uint2iE5xrefs+24];
  1699. shl.b32 %r370, %r369, 1;
  1700. shr.u32 %r371, %r339, %r370;
  1701. and.b32 %r372, %r371, 3;
  1702. bfi.b32 %r373, %r372, %r368, 12, 2;
  1703. ld.shared.u32 %r374, [_ZZ8compressPKjS0_P5uint2iE5xrefs+28];
  1704. shl.b32 %r375, %r374, 1;
  1705. shr.u32 %r376, %r339, %r375;
  1706. and.b32 %r377, %r376, 3;
  1707. bfi.b32 %r378, %r377, %r373, 14, 2;
  1708. ld.shared.u32 %r379, [_ZZ8compressPKjS0_P5uint2iE5xrefs+32];
  1709. shl.b32 %r380, %r379, 1;
  1710. shr.u32 %r381, %r339, %r380;
  1711. and.b32 %r382, %r381, 3;
  1712. bfi.b32 %r383, %r382, %r378, 16, 2;
  1713. ld.shared.u32 %r384, [_ZZ8compressPKjS0_P5uint2iE5xrefs+36];
  1714. shl.b32 %r385, %r384, 1;
  1715. shr.u32 %r386, %r339, %r385;
  1716. and.b32 %r387, %r386, 3;
  1717. bfi.b32 %r388, %r387, %r383, 18, 2;
  1718. ld.shared.u32 %r389, [_ZZ8compressPKjS0_P5uint2iE5xrefs+40];
  1719. shl.b32 %r390, %r389, 1;
  1720. shr.u32 %r391, %r339, %r390;
  1721. and.b32 %r392, %r391, 3;
  1722. bfi.b32 %r393, %r392, %r388, 20, 2;
  1723. ld.shared.u32 %r394, [_ZZ8compressPKjS0_P5uint2iE5xrefs+44];
  1724. shl.b32 %r395, %r394, 1;
  1725. shr.u32 %r396, %r339, %r395;
  1726. and.b32 %r397, %r396, 3;
  1727. bfi.b32 %r398, %r397, %r393, 22, 2;
  1728. ld.shared.u32 %r399, [_ZZ8compressPKjS0_P5uint2iE5xrefs+48];
  1729. shl.b32 %r400, %r399, 1;
  1730. shr.u32 %r401, %r339, %r400;
  1731. and.b32 %r402, %r401, 3;
  1732. bfi.b32 %r403, %r402, %r398, 24, 2;
  1733. ld.shared.u32 %r404, [_ZZ8compressPKjS0_P5uint2iE5xrefs+52];
  1734. shl.b32 %r405, %r404, 1;
  1735. shr.u32 %r406, %r339, %r405;
  1736. and.b32 %r407, %r406, 3;
  1737. bfi.b32 %r408, %r407, %r403, 26, 2;
  1738. ld.shared.u32 %r409, [_ZZ8compressPKjS0_P5uint2iE5xrefs+56];
  1739. shl.b32 %r410, %r409, 1;
  1740. shr.u32 %r411, %r339, %r410;
  1741. and.b32 %r412, %r411, 3;
  1742. bfi.b32 %r413, %r412, %r408, 28, 2;
  1743. ld.shared.u32 %r414, [_ZZ8compressPKjS0_P5uint2iE5xrefs+60];
  1744. shl.b32 %r415, %r414, 1;
  1745. shr.u32 %r416, %r339, %r415;
  1746. cvt.u32.u16 %r417, %rs32;
  1747. cvt.u32.u16 %r418, %rs33;
  1748. mov.u32 %r419, %ctaid.x;
  1749. add.s32 %r420, %r419, %r424;
  1750. cvta.to.global.u64 %rd138, %rd142;
  1751. mul.wide.s32 %rd139, %r420, 8;
  1752. add.s64 %rd140, %rd138, %rd139;
  1753. bfi.b32 %r421, %r416, %r413, 30, 2;
  1754. prmt.b32 %r422, %r417, %r418, 4180;
  1755. st.global.v2.u32 [%rd140], {%r422, %r421};
  1756.  
  1757. BB0_105:
  1758. ret;
  1759. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement