Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- //
- // Generated by NVIDIA NVVM Compiler
- //
- // Compiler Build ID: CL-24817639
- // Cuda compilation tools, release 10.0, V10.0.130
- // Based on LLVM 3.4svn
- //
- .version 6.3 // PTX ISA version this module is written against
- .target sm_30 // target architecture for this module
- .address_size 64 // 64-bit addressing
- // .globl _Z8compressPKjS0_P5uint2i
- // _ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance has been demoted
- .const .align 4 .b8 kColorMetric[12] = {0, 0, 128, 63, 0, 0, 128, 63, 0, 0, 128, 63}; // three little-endian words of 0x3F800000; the kernel loads them as f32 (1.0, 1.0, 1.0) at offsets 0, 4, 8
- // _ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps has been demoted
- .const .align 4 .b8 alphaTable4[16] = {0, 0, 16, 65, 0, 0, 0, 0, 0, 0, 192, 64, 0, 0, 64, 64}; // words 0x41100000, 0x00000000, 0x40C00000, 0x40400000; loaded as f32 (9.0, 0.0, 6.0, 3.0), indexed by a 2-bit field of the permutation word
- .const .align 4 .b8 alphaTable3[16] = {0, 0, 128, 64, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 64}; // words 0x40800000, 0x00000000, 0x40000000, 0x40000000; presumably f32 (4.0, 0.0, 2.0, 2.0) like alphaTable4 -- not referenced in the portion reviewed, TODO confirm
- .const .align 4 .b8 prods4[16] = {0, 0, 9, 0, 0, 9, 0, 0, 2, 1, 4, 0, 2, 4, 1, 0}; // words 0x00090000, 0x00000900, 0x00040102, 0x00010402; loaded as u32 with the same 2-bit index as alphaTable4, summed, then split into bits 16+, 8..15 and 0..7
- .const .align 4 .b8 prods3[16] = {0, 0, 4, 0, 0, 4, 0, 0, 1, 1, 4, 0, 1, 4, 1, 0}; // words 0x00040000, 0x00000400, 0x00040101, 0x00010401; presumably the counterpart of prods4 for alphaTable3 -- not referenced in the portion reviewed, TODO confirm
- // _ZZ19evalAllPermutationsPK6float3PKjRtS4_RjPfS_N18cooperative_groups4__v112thread_blockEE14s_permutations has been demoted
- // _ZZ12findMinErrorPfN18cooperative_groups4__v112thread_blockEE7indices has been demoted
- // _ZZ8compressPKjS0_P5uint2iE6colors has been demoted
- // _ZZ8compressPKjS0_P5uint2iE4sums has been demoted
- // _ZZ8compressPKjS0_P5uint2iE5xrefs has been demoted
- // _ZZ8compressPKjS0_P5uint2iE6errors has been demoted
- .visible .entry _Z8compressPKjS0_P5uint2i(
- .param .u64 _Z8compressPKjS0_P5uint2i_param_0,
- .param .u64 _Z8compressPKjS0_P5uint2i_param_1,
- .param .u64 _Z8compressPKjS0_P5uint2i_param_2,
- .param .u32 _Z8compressPKjS0_P5uint2i_param_3
- )
- {
- .reg .pred %p<81>;
- .reg .b16 %rs<38>;
- .reg .f32 %f<754>;
- .reg .b32 %r<443>;
- .reg .b64 %rd<143>;
- // demoted variable
- .shared .align 4 .b8 _ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance[384];
- // demoted variable
- .shared .align 4 .b8 _ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps[64];
- // demoted variable
- .shared .align 4 .b8 _ZZ19evalAllPermutationsPK6float3PKjRtS4_RjPfS_N18cooperative_groups4__v112thread_blockEE14s_permutations[640];
- // demoted variable
- .shared .align 4 .b8 _ZZ12findMinErrorPfN18cooperative_groups4__v112thread_blockEE7indices[256];
- // demoted variable
- .shared .align 4 .b8 _ZZ8compressPKjS0_P5uint2iE6colors[192];
- // demoted variable
- .shared .align 4 .b8 _ZZ8compressPKjS0_P5uint2iE4sums[192];
- // demoted variable
- .shared .align 4 .b8 _ZZ8compressPKjS0_P5uint2iE5xrefs[64];
- // demoted variable
- .shared .align 4 .b8 _ZZ8compressPKjS0_P5uint2iE6errors[256];
- ld.param.u64 %rd2, [_Z8compressPKjS0_P5uint2i_param_1];
- ld.param.u32 %r65, [_Z8compressPKjS0_P5uint2i_param_3];
- mov.u32 %r1, %tid.x;
- mov.u32 %r67, %ntid.y;
- mov.u32 %r68, %tid.z;
- mov.u32 %r69, %tid.y;
- mad.lo.s32 %r70, %r67, %r68, %r69;
- mov.u32 %r71, %ntid.x;
- mad.lo.s32 %r72, %r70, %r71, %r1;
- and.b32 %r73, %r72, -16;
- mul.lo.s32 %r74, %r67, %r71;
- mov.u32 %r75, %ntid.z;
- mul.lo.s32 %r76, %r74, %r75;
- sub.s32 %r77, %r76, %r73;
- mov.u32 %r78, 16;
- min.u32 %r79, %r77, %r78;
- mov.u32 %r80, 32;
- sub.s32 %r81, %r80, %r79;
- mov.u32 %r82, -1;
- shr.u32 %r83, %r82, %r81;
- // inline asm
- mov.u32 %r66, %laneid;
- // inline asm
- and.b32 %r84, %r66, -16;
- shl.b32 %r2, %r83, %r84;
- setp.gt.s32 %p1, %r1, 15;
- @%p1 bra BB0_55;
- cvta.to.global.u64 %rd4, %rd2;
- mov.u32 %r85, %ctaid.x;
- add.s32 %r86, %r85, %r65;
- shl.b32 %r87, %r86, 4;
- add.s32 %r88, %r87, %r1;
- mul.wide.s32 %rd5, %r88, 4;
- add.s64 %rd6, %rd4, %rd5;
- ld.global.u32 %r89, [%rd6];
- and.b32 %r90, %r89, 255;
- cvt.rn.f32.u32 %f141, %r90;
- mul.f32 %f142, %f141, 0f3B808081;
- mov.u32 %r91, _ZZ8compressPKjS0_P5uint2iE6colors;
- mad.lo.s32 %r3, %r1, 12, %r91;
- st.shared.f32 [%r3], %f142;
- bfe.u32 %r92, %r89, 8, 8;
- cvt.rn.f32.u32 %f143, %r92;
- mul.f32 %f144, %f143, 0f3B808081;
- st.shared.f32 [%r3+4], %f144;
- bfe.u32 %r93, %r89, 16, 8;
- cvt.rn.f32.u32 %f145, %r93;
- mul.f32 %f146, %f145, 0f3B808081;
- st.shared.f32 [%r3+8], %f146;
- bar.warp.sync %r2;
- ld.shared.f32 %f147, [%r3];
- ld.shared.f32 %f148, [%r3+4];
- ld.shared.f32 %f149, [%r3+8];
- mov.u32 %r94, _ZZ8compressPKjS0_P5uint2iE4sums;
- mad.lo.s32 %r4, %r1, 12, %r94;
- st.shared.f32 [%r4+8], %f149;
- st.shared.f32 [%r4+4], %f148;
- st.shared.f32 [%r4], %f147;
- bar.warp.sync %r2;
- xor.b32 %r95, %r1, 8;
- mad.lo.s32 %r97, %r95, 12, %r94;
- ld.shared.f32 %f150, [%r97+8];
- ld.shared.f32 %f151, [%r97+4];
- ld.shared.f32 %f152, [%r97];
- ld.shared.f32 %f153, [%r4];
- add.f32 %f154, %f153, %f152;
- ld.shared.f32 %f155, [%r4+4];
- ld.shared.f32 %f156, [%r4+8];
- st.shared.f32 [%r4], %f154;
- add.f32 %f157, %f151, %f155;
- st.shared.f32 [%r4+4], %f157;
- add.f32 %f158, %f150, %f156;
- st.shared.f32 [%r4+8], %f158;
- bar.warp.sync %r2;
- xor.b32 %r98, %r1, 4;
- mad.lo.s32 %r100, %r98, 12, %r94;
- ld.shared.f32 %f159, [%r100+8];
- ld.shared.f32 %f160, [%r100+4];
- ld.shared.f32 %f161, [%r100];
- ld.shared.f32 %f162, [%r4];
- add.f32 %f163, %f162, %f161;
- ld.shared.f32 %f164, [%r4+4];
- ld.shared.f32 %f165, [%r4+8];
- st.shared.f32 [%r4], %f163;
- add.f32 %f166, %f160, %f164;
- st.shared.f32 [%r4+4], %f166;
- add.f32 %f167, %f159, %f165;
- st.shared.f32 [%r4+8], %f167;
- bar.warp.sync %r2;
- xor.b32 %r101, %r1, 2;
- mad.lo.s32 %r103, %r101, 12, %r94;
- ld.shared.f32 %f168, [%r103+8];
- ld.shared.f32 %f169, [%r103+4];
- ld.shared.f32 %f170, [%r103];
- ld.shared.f32 %f171, [%r4];
- add.f32 %f172, %f171, %f170;
- ld.shared.f32 %f173, [%r4+4];
- ld.shared.f32 %f174, [%r4+8];
- st.shared.f32 [%r4], %f172;
- add.f32 %f175, %f169, %f173;
- st.shared.f32 [%r4+4], %f175;
- add.f32 %f176, %f168, %f174;
- st.shared.f32 [%r4+8], %f176;
- bar.warp.sync %r2;
- xor.b32 %r104, %r1, 1;
- mad.lo.s32 %r106, %r104, 12, %r94;
- ld.shared.f32 %f177, [%r106+8];
- ld.shared.f32 %f178, [%r106+4];
- ld.shared.f32 %f179, [%r106];
- ld.shared.f32 %f180, [%r4];
- add.f32 %f181, %f180, %f179;
- ld.shared.f32 %f182, [%r4+4];
- ld.shared.f32 %f183, [%r4+8];
- st.shared.f32 [%r4], %f181;
- add.f32 %f184, %f178, %f182;
- st.shared.f32 [%r4+4], %f184;
- add.f32 %f185, %f177, %f183;
- st.shared.f32 [%r4+8], %f185;
- bar.warp.sync %r2;
- ld.shared.f32 %f186, [_ZZ8compressPKjS0_P5uint2iE4sums+8];
- ld.shared.f32 %f187, [_ZZ8compressPKjS0_P5uint2iE4sums+4];
- ld.shared.f32 %f188, [_ZZ8compressPKjS0_P5uint2iE4sums];
- mul.f32 %f189, %f188, 0f3D800000;
- mul.f32 %f190, %f187, 0f3D800000;
- mul.f32 %f191, %f186, 0f3D800000;
- ld.shared.f32 %f192, [%r3+8];
- ld.shared.f32 %f193, [%r3+4];
- ld.shared.f32 %f194, [%r3];
- sub.f32 %f195, %f194, %f189;
- sub.f32 %f196, %f193, %f190;
- sub.f32 %f197, %f192, %f191;
- mul.f32 %f198, %f195, %f195;
- mul.lo.s32 %r107, %r1, 6;
- shl.b32 %r108, %r107, 2;
- mov.u32 %r109, _ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance;
- add.s32 %r5, %r109, %r108;
- st.shared.f32 [%r5], %f198;
- mul.f32 %f199, %f195, %f196;
- st.shared.f32 [%r5+4], %f199;
- mul.f32 %f200, %f195, %f197;
- st.shared.f32 [%r5+8], %f200;
- mul.f32 %f201, %f196, %f196;
- st.shared.f32 [%r5+12], %f201;
- mul.f32 %f202, %f196, %f197;
- st.shared.f32 [%r5+16], %f202;
- mul.f32 %f203, %f197, %f197;
- st.shared.f32 [%r5+20], %f203;
- bar.warp.sync %r2;
- setp.gt.s32 %p2, %r1, 7;
- @%p2 bra BB0_3;
- ld.shared.f32 %f204, [%r5];
- ld.shared.f32 %f205, [%r5+192];
- add.f32 %f206, %f205, %f204;
- ld.shared.f32 %f207, [%r5+4];
- ld.shared.f32 %f208, [%r5+196];
- ld.shared.f32 %f209, [%r5+8];
- ld.shared.f32 %f210, [%r5+200];
- ld.shared.f32 %f211, [%r5+12];
- ld.shared.f32 %f212, [%r5+204];
- ld.shared.f32 %f213, [%r5+16];
- ld.shared.f32 %f214, [%r5+208];
- ld.shared.f32 %f215, [%r5+20];
- ld.shared.f32 %f216, [%r5+212];
- st.shared.f32 [%r5], %f206;
- add.f32 %f217, %f208, %f207;
- st.shared.f32 [%r5+4], %f217;
- add.f32 %f218, %f210, %f209;
- st.shared.f32 [%r5+8], %f218;
- add.f32 %f219, %f212, %f211;
- st.shared.f32 [%r5+12], %f219;
- add.f32 %f220, %f214, %f213;
- st.shared.f32 [%r5+16], %f220;
- add.f32 %f221, %f216, %f215;
- st.shared.f32 [%r5+20], %f221;
- BB0_3:
- bar.warp.sync %r2;
- setp.gt.s32 %p3, %r1, 3;
- @%p3 bra BB0_5;
- ld.shared.f32 %f222, [%r5];
- ld.shared.f32 %f223, [%r5+96];
- add.f32 %f224, %f223, %f222;
- ld.shared.f32 %f225, [%r5+4];
- ld.shared.f32 %f226, [%r5+100];
- ld.shared.f32 %f227, [%r5+8];
- ld.shared.f32 %f228, [%r5+104];
- ld.shared.f32 %f229, [%r5+12];
- ld.shared.f32 %f230, [%r5+108];
- ld.shared.f32 %f231, [%r5+16];
- ld.shared.f32 %f232, [%r5+112];
- ld.shared.f32 %f233, [%r5+20];
- ld.shared.f32 %f234, [%r5+116];
- st.shared.f32 [%r5], %f224;
- add.f32 %f235, %f226, %f225;
- st.shared.f32 [%r5+4], %f235;
- add.f32 %f236, %f228, %f227;
- st.shared.f32 [%r5+8], %f236;
- add.f32 %f237, %f230, %f229;
- st.shared.f32 [%r5+12], %f237;
- add.f32 %f238, %f232, %f231;
- st.shared.f32 [%r5+16], %f238;
- add.f32 %f239, %f234, %f233;
- st.shared.f32 [%r5+20], %f239;
- BB0_5:
- bar.warp.sync %r2;
- setp.gt.s32 %p4, %r1, 1;
- @%p4 bra BB0_7;
- ld.shared.f32 %f240, [%r5];
- ld.shared.f32 %f241, [%r5+48];
- add.f32 %f242, %f241, %f240;
- ld.shared.f32 %f243, [%r5+4];
- ld.shared.f32 %f244, [%r5+52];
- ld.shared.f32 %f245, [%r5+8];
- ld.shared.f32 %f246, [%r5+56];
- ld.shared.f32 %f247, [%r5+12];
- ld.shared.f32 %f248, [%r5+60];
- ld.shared.f32 %f249, [%r5+16];
- ld.shared.f32 %f250, [%r5+64];
- ld.shared.f32 %f251, [%r5+20];
- ld.shared.f32 %f252, [%r5+68];
- st.shared.f32 [%r5], %f242;
- add.f32 %f253, %f244, %f243;
- st.shared.f32 [%r5+4], %f253;
- add.f32 %f254, %f246, %f245;
- st.shared.f32 [%r5+8], %f254;
- add.f32 %f255, %f248, %f247;
- st.shared.f32 [%r5+12], %f255;
- add.f32 %f256, %f250, %f249;
- st.shared.f32 [%r5+16], %f256;
- add.f32 %f257, %f252, %f251;
- st.shared.f32 [%r5+20], %f257;
- BB0_7:
- bar.warp.sync %r2;
- setp.gt.s32 %p5, %r1, 0;
- @%p5 bra BB0_9;
- ld.shared.f32 %f258, [%r5];
- ld.shared.f32 %f259, [%r5+24];
- add.f32 %f260, %f259, %f258;
- ld.shared.f32 %f261, [%r5+4];
- ld.shared.f32 %f262, [%r5+28];
- ld.shared.f32 %f263, [%r5+8];
- ld.shared.f32 %f264, [%r5+32];
- ld.shared.f32 %f265, [%r5+12];
- ld.shared.f32 %f266, [%r5+36];
- ld.shared.f32 %f267, [%r5+16];
- ld.shared.f32 %f268, [%r5+40];
- ld.shared.f32 %f269, [%r5+20];
- ld.shared.f32 %f270, [%r5+44];
- st.shared.f32 [%r5], %f260;
- add.f32 %f271, %f262, %f261;
- st.shared.f32 [%r5+4], %f271;
- add.f32 %f272, %f264, %f263;
- st.shared.f32 [%r5+8], %f272;
- add.f32 %f273, %f266, %f265;
- st.shared.f32 [%r5+12], %f273;
- add.f32 %f274, %f268, %f267;
- st.shared.f32 [%r5+16], %f274;
- add.f32 %f275, %f270, %f269;
- st.shared.f32 [%r5+20], %f275;
- BB0_9:
- bar.warp.sync %r2;
- ld.shared.f32 %f276, [_ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance+4];
- ld.shared.f32 %f277, [_ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance];
- add.f32 %f278, %f277, %f276;
- ld.shared.f32 %f279, [_ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance+8];
- add.f32 %f280, %f278, %f279;
- ld.shared.f32 %f281, [_ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance+12];
- add.f32 %f282, %f276, %f281;
- ld.shared.f32 %f283, [_ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance+16];
- add.f32 %f284, %f282, %f283;
- add.f32 %f285, %f283, %f279;
- ld.shared.f32 %f286, [_ZZ11bestFitLinePK6float3S_N18cooperative_groups4__v112thread_groupEE10covariance+20];
- add.f32 %f287, %f285, %f286;
- max.f32 %f288, %f280, %f284;
- max.f32 %f289, %f288, %f287;
- rcp.rn.f32 %f290, %f289;
- mul.f32 %f291, %f280, %f290;
- mul.f32 %f292, %f284, %f290;
- mul.f32 %f293, %f287, %f290;
- mul.f32 %f294, %f292, %f276;
- fma.rn.f32 %f295, %f291, %f277, %f294;
- fma.rn.f32 %f296, %f293, %f279, %f295;
- mul.f32 %f297, %f292, %f281;
- fma.rn.f32 %f298, %f291, %f276, %f297;
- fma.rn.f32 %f299, %f293, %f283, %f298;
- mul.f32 %f300, %f291, %f279;
- fma.rn.f32 %f301, %f292, %f283, %f300;
- fma.rn.f32 %f302, %f293, %f286, %f301;
- max.f32 %f303, %f296, %f299;
- max.f32 %f304, %f303, %f302;
- rcp.rn.f32 %f305, %f304;
- mul.f32 %f306, %f296, %f305;
- mul.f32 %f307, %f299, %f305;
- mul.f32 %f308, %f302, %f305;
- mul.f32 %f309, %f307, %f276;
- fma.rn.f32 %f310, %f306, %f277, %f309;
- fma.rn.f32 %f311, %f308, %f279, %f310;
- mul.f32 %f312, %f307, %f281;
- fma.rn.f32 %f313, %f306, %f276, %f312;
- fma.rn.f32 %f314, %f308, %f283, %f313;
- mul.f32 %f315, %f306, %f279;
- fma.rn.f32 %f316, %f307, %f283, %f315;
- fma.rn.f32 %f317, %f308, %f286, %f316;
- max.f32 %f318, %f311, %f314;
- max.f32 %f319, %f318, %f317;
- rcp.rn.f32 %f320, %f319;
- mul.f32 %f321, %f311, %f320;
- mul.f32 %f322, %f314, %f320;
- mul.f32 %f323, %f317, %f320;
- mul.f32 %f324, %f322, %f276;
- fma.rn.f32 %f325, %f321, %f277, %f324;
- fma.rn.f32 %f326, %f323, %f279, %f325;
- mul.f32 %f327, %f322, %f281;
- fma.rn.f32 %f328, %f321, %f276, %f327;
- fma.rn.f32 %f329, %f323, %f283, %f328;
- mul.f32 %f330, %f321, %f279;
- fma.rn.f32 %f331, %f322, %f283, %f330;
- fma.rn.f32 %f332, %f323, %f286, %f331;
- max.f32 %f333, %f326, %f329;
- max.f32 %f334, %f333, %f332;
- rcp.rn.f32 %f335, %f334;
- mul.f32 %f336, %f326, %f335;
- mul.f32 %f337, %f329, %f335;
- mul.f32 %f338, %f332, %f335;
- mul.f32 %f339, %f337, %f276;
- fma.rn.f32 %f340, %f336, %f277, %f339;
- fma.rn.f32 %f341, %f338, %f279, %f340;
- mul.f32 %f342, %f337, %f281;
- fma.rn.f32 %f343, %f336, %f276, %f342;
- fma.rn.f32 %f344, %f338, %f283, %f343;
- mul.f32 %f345, %f336, %f279;
- fma.rn.f32 %f346, %f337, %f283, %f345;
- fma.rn.f32 %f347, %f338, %f286, %f346;
- max.f32 %f348, %f341, %f344;
- max.f32 %f349, %f348, %f347;
- rcp.rn.f32 %f350, %f349;
- mul.f32 %f351, %f341, %f350;
- mul.f32 %f352, %f344, %f350;
- mul.f32 %f353, %f347, %f350;
- mul.f32 %f354, %f352, %f276;
- fma.rn.f32 %f355, %f351, %f277, %f354;
- fma.rn.f32 %f356, %f353, %f279, %f355;
- mul.f32 %f357, %f352, %f281;
- fma.rn.f32 %f358, %f351, %f276, %f357;
- fma.rn.f32 %f359, %f353, %f283, %f358;
- mul.f32 %f360, %f351, %f279;
- fma.rn.f32 %f361, %f352, %f283, %f360;
- fma.rn.f32 %f362, %f353, %f286, %f361;
- max.f32 %f363, %f356, %f359;
- max.f32 %f364, %f363, %f362;
- rcp.rn.f32 %f365, %f364;
- mul.f32 %f366, %f356, %f365;
- mul.f32 %f367, %f359, %f365;
- mul.f32 %f368, %f362, %f365;
- mul.f32 %f369, %f367, %f276;
- fma.rn.f32 %f370, %f366, %f277, %f369;
- fma.rn.f32 %f371, %f368, %f279, %f370;
- mul.f32 %f372, %f367, %f281;
- fma.rn.f32 %f373, %f366, %f276, %f372;
- fma.rn.f32 %f374, %f368, %f283, %f373;
- mul.f32 %f375, %f366, %f279;
- fma.rn.f32 %f376, %f367, %f283, %f375;
- fma.rn.f32 %f377, %f368, %f286, %f376;
- max.f32 %f378, %f371, %f374;
- max.f32 %f379, %f378, %f377;
- rcp.rn.f32 %f380, %f379;
- mul.f32 %f381, %f371, %f380;
- mul.f32 %f382, %f374, %f380;
- mul.f32 %f383, %f377, %f380;
- mul.f32 %f384, %f382, %f276;
- fma.rn.f32 %f385, %f381, %f277, %f384;
- fma.rn.f32 %f386, %f383, %f279, %f385;
- mul.f32 %f387, %f382, %f281;
- fma.rn.f32 %f388, %f381, %f276, %f387;
- fma.rn.f32 %f389, %f383, %f283, %f388;
- mul.f32 %f390, %f381, %f279;
- fma.rn.f32 %f391, %f382, %f283, %f390;
- fma.rn.f32 %f392, %f383, %f286, %f391;
- max.f32 %f393, %f386, %f389;
- max.f32 %f394, %f393, %f392;
- rcp.rn.f32 %f395, %f394;
- mul.f32 %f1, %f386, %f395;
- mul.f32 %f2, %f389, %f395;
- mul.f32 %f3, %f392, %f395;
- bar.warp.sync %r2;
- mov.u32 %r426, _ZZ8compressPKjS0_P5uint2iE6colors;
- mad.lo.s32 %r425, %r1, 12, %r426;
- ld.shared.f32 %f396, [%r425+8];
- ld.shared.f32 %f397, [%r425];
- ld.shared.f32 %f398, [%r425+4];
- mul.f32 %f399, %f2, %f398;
- fma.rn.f32 %f400, %f1, %f397, %f399;
- fma.rn.f32 %f401, %f3, %f396, %f400;
- shl.b32 %r110, %r1, 2;
- mov.u32 %r111, _ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps;
- add.s32 %r6, %r111, %r110;
- st.shared.f32 [%r6], %f401;
- bar.warp.sync %r2;
- ld.shared.f32 %f402, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps];
- ld.shared.f32 %f403, [%r6];
- setp.lt.f32 %p6, %f402, %f403;
- selp.u32 %r112, 1, 0, %p6;
- ld.shared.f32 %f404, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+4];
- setp.lt.f32 %p7, %f404, %f403;
- selp.u32 %r113, 1, 0, %p7;
- add.s32 %r114, %r113, %r112;
- ld.shared.f32 %f405, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+8];
- setp.lt.f32 %p8, %f405, %f403;
- selp.u32 %r115, 1, 0, %p8;
- add.s32 %r116, %r115, %r114;
- ld.shared.f32 %f406, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+12];
- setp.lt.f32 %p9, %f406, %f403;
- selp.u32 %r117, 1, 0, %p9;
- add.s32 %r118, %r117, %r116;
- ld.shared.f32 %f407, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+16];
- setp.lt.f32 %p10, %f407, %f403;
- selp.u32 %r119, 1, 0, %p10;
- add.s32 %r120, %r119, %r118;
- ld.shared.f32 %f408, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+20];
- setp.lt.f32 %p11, %f408, %f403;
- selp.u32 %r121, 1, 0, %p11;
- add.s32 %r122, %r121, %r120;
- ld.shared.f32 %f409, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+24];
- setp.lt.f32 %p12, %f409, %f403;
- selp.u32 %r123, 1, 0, %p12;
- add.s32 %r124, %r123, %r122;
- ld.shared.f32 %f410, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+28];
- setp.lt.f32 %p13, %f410, %f403;
- selp.u32 %r125, 1, 0, %p13;
- add.s32 %r126, %r125, %r124;
- ld.shared.f32 %f411, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+32];
- setp.lt.f32 %p14, %f411, %f403;
- selp.u32 %r127, 1, 0, %p14;
- add.s32 %r128, %r127, %r126;
- ld.shared.f32 %f412, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+36];
- setp.lt.f32 %p15, %f412, %f403;
- selp.u32 %r129, 1, 0, %p15;
- add.s32 %r130, %r129, %r128;
- ld.shared.f32 %f413, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+40];
- setp.lt.f32 %p16, %f413, %f403;
- selp.u32 %r131, 1, 0, %p16;
- add.s32 %r132, %r131, %r130;
- ld.shared.f32 %f414, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+44];
- setp.lt.f32 %p17, %f414, %f403;
- selp.u32 %r133, 1, 0, %p17;
- add.s32 %r134, %r133, %r132;
- ld.shared.f32 %f415, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+48];
- setp.lt.f32 %p18, %f415, %f403;
- selp.u32 %r135, 1, 0, %p18;
- add.s32 %r136, %r135, %r134;
- ld.shared.f32 %f416, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+52];
- setp.lt.f32 %p19, %f416, %f403;
- selp.u32 %r137, 1, 0, %p19;
- add.s32 %r138, %r137, %r136;
- ld.shared.f32 %f417, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+56];
- setp.lt.f32 %p20, %f417, %f403;
- selp.u32 %r139, 1, 0, %p20;
- add.s32 %r140, %r139, %r138;
- ld.shared.f32 %f418, [_ZZ14loadColorBlockPKjP6float3S2_PiiN18cooperative_groups4__v112thread_blockEE3dps+60];
- setp.lt.f32 %p21, %f418, %f403;
- selp.u32 %r141, 1, 0, %p21;
- add.s32 %r142, %r141, %r140;
- mov.u32 %r144, _ZZ8compressPKjS0_P5uint2iE5xrefs;
- add.s32 %r7, %r144, %r110;
- st.shared.u32 [%r7], %r142;
- bar.warp.sync %r2;
- setp.lt.s32 %p22, %r1, 1;
- @%p22 bra BB0_12;
- ld.shared.u32 %r8, [%r7];
- ld.shared.u32 %r145, [_ZZ8compressPKjS0_P5uint2iE5xrefs];
- setp.ne.s32 %p23, %r8, %r145;
- @%p23 bra BB0_12;
- add.s32 %r146, %r8, 1;
- st.shared.u32 [%r7], %r146;
- BB0_12:
- bar.warp.sync %r2;
- setp.lt.s32 %p24, %r1, 2;
- @%p24 bra BB0_15;
- ld.shared.u32 %r9, [%r7];
- ld.shared.u32 %r147, [_ZZ8compressPKjS0_P5uint2iE5xrefs+4];
- setp.ne.s32 %p25, %r9, %r147;
- @%p25 bra BB0_15;
- add.s32 %r148, %r9, 1;
- st.shared.u32 [%r7], %r148;
- BB0_15:
- bar.warp.sync %r2;
- setp.lt.s32 %p26, %r1, 3;
- @%p26 bra BB0_18;
- ld.shared.u32 %r10, [%r7];
- ld.shared.u32 %r149, [_ZZ8compressPKjS0_P5uint2iE5xrefs+8];
- setp.ne.s32 %p27, %r10, %r149;
- @%p27 bra BB0_18;
- add.s32 %r150, %r10, 1;
- st.shared.u32 [%r7], %r150;
- BB0_18:
- bar.warp.sync %r2;
- setp.lt.s32 %p28, %r1, 4;
- @%p28 bra BB0_21;
- ld.shared.u32 %r11, [%r7];
- ld.shared.u32 %r151, [_ZZ8compressPKjS0_P5uint2iE5xrefs+12];
- setp.ne.s32 %p29, %r11, %r151;
- @%p29 bra BB0_21;
- add.s32 %r152, %r11, 1;
- st.shared.u32 [%r7], %r152;
- BB0_21:
- bar.warp.sync %r2;
- setp.lt.s32 %p30, %r1, 5;
- @%p30 bra BB0_24;
- ld.shared.u32 %r12, [%r7];
- ld.shared.u32 %r153, [_ZZ8compressPKjS0_P5uint2iE5xrefs+16];
- setp.ne.s32 %p31, %r12, %r153;
- @%p31 bra BB0_24;
- add.s32 %r154, %r12, 1;
- st.shared.u32 [%r7], %r154;
- BB0_24:
- bar.warp.sync %r2;
- setp.lt.s32 %p32, %r1, 6;
- @%p32 bra BB0_27;
- ld.shared.u32 %r13, [%r7];
- ld.shared.u32 %r155, [_ZZ8compressPKjS0_P5uint2iE5xrefs+20];
- setp.ne.s32 %p33, %r13, %r155;
- @%p33 bra BB0_27;
- add.s32 %r156, %r13, 1;
- st.shared.u32 [%r7], %r156;
- BB0_27:
- bar.warp.sync %r2;
- setp.lt.s32 %p34, %r1, 7;
- @%p34 bra BB0_30;
- ld.shared.u32 %r14, [%r7];
- ld.shared.u32 %r157, [_ZZ8compressPKjS0_P5uint2iE5xrefs+24];
- setp.ne.s32 %p35, %r14, %r157;
- @%p35 bra BB0_30;
- add.s32 %r158, %r14, 1;
- st.shared.u32 [%r7], %r158;
- BB0_30:
- bar.warp.sync %r2;
- setp.lt.s32 %p36, %r1, 8;
- @%p36 bra BB0_33;
- ld.shared.u32 %r15, [%r7];
- ld.shared.u32 %r159, [_ZZ8compressPKjS0_P5uint2iE5xrefs+28];
- setp.ne.s32 %p37, %r15, %r159;
- @%p37 bra BB0_33;
- add.s32 %r160, %r15, 1;
- st.shared.u32 [%r7], %r160;
- BB0_33:
- bar.warp.sync %r2;
- setp.lt.s32 %p38, %r1, 9;
- @%p38 bra BB0_36;
- ld.shared.u32 %r16, [%r7];
- ld.shared.u32 %r161, [_ZZ8compressPKjS0_P5uint2iE5xrefs+32];
- setp.ne.s32 %p39, %r16, %r161;
- @%p39 bra BB0_36;
- add.s32 %r162, %r16, 1;
- st.shared.u32 [%r7], %r162;
- BB0_36:
- bar.warp.sync %r2;
- setp.lt.s32 %p40, %r1, 10;
- @%p40 bra BB0_39;
- ld.shared.u32 %r17, [%r7];
- ld.shared.u32 %r163, [_ZZ8compressPKjS0_P5uint2iE5xrefs+36];
- setp.ne.s32 %p41, %r17, %r163;
- @%p41 bra BB0_39;
- add.s32 %r164, %r17, 1;
- st.shared.u32 [%r7], %r164;
- BB0_39:
- bar.warp.sync %r2;
- setp.lt.s32 %p42, %r1, 11;
- @%p42 bra BB0_42;
- ld.shared.u32 %r18, [%r7];
- ld.shared.u32 %r165, [_ZZ8compressPKjS0_P5uint2iE5xrefs+40];
- setp.ne.s32 %p43, %r18, %r165;
- @%p43 bra BB0_42;
- add.s32 %r166, %r18, 1;
- st.shared.u32 [%r7], %r166;
- BB0_42:
- bar.warp.sync %r2;
- setp.lt.s32 %p44, %r1, 12;
- @%p44 bra BB0_45;
- ld.shared.u32 %r19, [%r7];
- ld.shared.u32 %r167, [_ZZ8compressPKjS0_P5uint2iE5xrefs+44];
- setp.ne.s32 %p45, %r19, %r167;
- @%p45 bra BB0_45;
- add.s32 %r168, %r19, 1;
- st.shared.u32 [%r7], %r168;
- BB0_45:
- bar.warp.sync %r2;
- setp.lt.s32 %p46, %r1, 13;
- @%p46 bra BB0_48;
- ld.shared.u32 %r20, [%r7];
- ld.shared.u32 %r169, [_ZZ8compressPKjS0_P5uint2iE5xrefs+48];
- setp.ne.s32 %p47, %r20, %r169;
- @%p47 bra BB0_48;
- add.s32 %r170, %r20, 1;
- st.shared.u32 [%r7], %r170;
- BB0_48:
- bar.warp.sync %r2;
- setp.lt.s32 %p48, %r1, 14;
- @%p48 bra BB0_51;
- ld.shared.u32 %r21, [%r7];
- ld.shared.u32 %r171, [_ZZ8compressPKjS0_P5uint2iE5xrefs+52];
- setp.ne.s32 %p49, %r21, %r171;
- @%p49 bra BB0_51;
- add.s32 %r172, %r21, 1;
- st.shared.u32 [%r7], %r172;
- BB0_51:
- bar.warp.sync %r2;
- setp.lt.s32 %p50, %r1, 15;
- @%p50 bra BB0_54;
- ld.shared.u32 %r22, [%r7];
- ld.shared.u32 %r173, [_ZZ8compressPKjS0_P5uint2iE5xrefs+56];
- setp.ne.s32 %p51, %r22, %r173;
- @%p51 bra BB0_54;
- add.s32 %r174, %r22, 1;
- st.shared.u32 [%r7], %r174;
- BB0_54:
- bar.warp.sync %r2;
- bar.warp.sync %r2;
- mov.u32 %r428, _ZZ8compressPKjS0_P5uint2iE6colors;
- mad.lo.s32 %r427, %r1, 12, %r428;
- ld.shared.f32 %f6, [%r427+8];
- ld.shared.f32 %f5, [%r427+4];
- ld.shared.f32 %f4, [%r427];
- bar.warp.sync %r2;
- mov.u32 %r423, _ZZ8compressPKjS0_P5uint2iE6colors;
- ld.shared.u32 %r175, [%r7];
- mad.lo.s32 %r177, %r175, 12, %r423;
- st.shared.f32 [%r177+8], %f6;
- st.shared.f32 [%r177+4], %f5;
- st.shared.f32 [%r177], %f4;
- BB0_55:
- barrier.sync 0;
- ld.param.u64 %rd141, [_Z8compressPKjS0_P5uint2i_param_0];
- ld.shared.f32 %f11, [_ZZ8compressPKjS0_P5uint2iE4sums+8];
- ld.shared.f32 %f9, [_ZZ8compressPKjS0_P5uint2iE4sums+4];
- ld.shared.f32 %f7, [_ZZ8compressPKjS0_P5uint2iE4sums];
- mul.f32 %f8, %f7, 0f41100000;
- mul.f32 %f10, %f9, 0f41100000;
- mul.f32 %f12, %f11, 0f41100000;
- ld.const.f32 %f13, [kColorMetric];
- ld.const.f32 %f14, [kColorMetric+4];
- ld.const.f32 %f15, [kColorMetric+8];
- ld.shared.f32 %f16, [_ZZ8compressPKjS0_P5uint2iE6colors];
- ld.shared.f32 %f17, [_ZZ8compressPKjS0_P5uint2iE6colors+4];
- ld.shared.f32 %f18, [_ZZ8compressPKjS0_P5uint2iE6colors+8];
- ld.shared.f32 %f19, [_ZZ8compressPKjS0_P5uint2iE6colors+12];
- ld.shared.f32 %f20, [_ZZ8compressPKjS0_P5uint2iE6colors+16];
- ld.shared.f32 %f21, [_ZZ8compressPKjS0_P5uint2iE6colors+20];
- ld.shared.f32 %f22, [_ZZ8compressPKjS0_P5uint2iE6colors+24];
- ld.shared.f32 %f23, [_ZZ8compressPKjS0_P5uint2iE6colors+28];
- ld.shared.f32 %f24, [_ZZ8compressPKjS0_P5uint2iE6colors+32];
- ld.shared.f32 %f25, [_ZZ8compressPKjS0_P5uint2iE6colors+36];
- ld.shared.f32 %f26, [_ZZ8compressPKjS0_P5uint2iE6colors+40];
- ld.shared.f32 %f27, [_ZZ8compressPKjS0_P5uint2iE6colors+44];
- ld.shared.f32 %f28, [_ZZ8compressPKjS0_P5uint2iE6colors+48];
- ld.shared.f32 %f29, [_ZZ8compressPKjS0_P5uint2iE6colors+52];
- ld.shared.f32 %f30, [_ZZ8compressPKjS0_P5uint2iE6colors+56];
- ld.shared.f32 %f31, [_ZZ8compressPKjS0_P5uint2iE6colors+60];
- ld.shared.f32 %f32, [_ZZ8compressPKjS0_P5uint2iE6colors+64];
- ld.shared.f32 %f33, [_ZZ8compressPKjS0_P5uint2iE6colors+68];
- ld.shared.f32 %f34, [_ZZ8compressPKjS0_P5uint2iE6colors+72];
- ld.shared.f32 %f35, [_ZZ8compressPKjS0_P5uint2iE6colors+76];
- ld.shared.f32 %f36, [_ZZ8compressPKjS0_P5uint2iE6colors+80];
- ld.shared.f32 %f37, [_ZZ8compressPKjS0_P5uint2iE6colors+84];
- ld.shared.f32 %f38, [_ZZ8compressPKjS0_P5uint2iE6colors+88];
- ld.shared.f32 %f39, [_ZZ8compressPKjS0_P5uint2iE6colors+92];
- ld.shared.f32 %f40, [_ZZ8compressPKjS0_P5uint2iE6colors+96];
- ld.shared.f32 %f41, [_ZZ8compressPKjS0_P5uint2iE6colors+100];
- ld.shared.f32 %f42, [_ZZ8compressPKjS0_P5uint2iE6colors+104];
- ld.shared.f32 %f43, [_ZZ8compressPKjS0_P5uint2iE6colors+108];
- ld.shared.f32 %f44, [_ZZ8compressPKjS0_P5uint2iE6colors+112];
- ld.shared.f32 %f45, [_ZZ8compressPKjS0_P5uint2iE6colors+116];
- ld.shared.f32 %f46, [_ZZ8compressPKjS0_P5uint2iE6colors+120];
- ld.shared.f32 %f47, [_ZZ8compressPKjS0_P5uint2iE6colors+124];
- ld.shared.f32 %f48, [_ZZ8compressPKjS0_P5uint2iE6colors+128];
- ld.shared.f32 %f49, [_ZZ8compressPKjS0_P5uint2iE6colors+132];
- ld.shared.f32 %f50, [_ZZ8compressPKjS0_P5uint2iE6colors+136];
- ld.shared.f32 %f51, [_ZZ8compressPKjS0_P5uint2iE6colors+140];
- ld.shared.f32 %f52, [_ZZ8compressPKjS0_P5uint2iE6colors+144];
- ld.shared.f32 %f53, [_ZZ8compressPKjS0_P5uint2iE6colors+148];
- ld.shared.f32 %f54, [_ZZ8compressPKjS0_P5uint2iE6colors+152];
- ld.shared.f32 %f55, [_ZZ8compressPKjS0_P5uint2iE6colors+156];
- ld.shared.f32 %f56, [_ZZ8compressPKjS0_P5uint2iE6colors+160];
- ld.shared.f32 %f57, [_ZZ8compressPKjS0_P5uint2iE6colors+164];
- ld.shared.f32 %f58, [_ZZ8compressPKjS0_P5uint2iE6colors+168];
- ld.shared.f32 %f59, [_ZZ8compressPKjS0_P5uint2iE6colors+172];
- ld.shared.f32 %f60, [_ZZ8compressPKjS0_P5uint2iE6colors+176];
- ld.shared.f32 %f61, [_ZZ8compressPKjS0_P5uint2iE6colors+180];
- ld.shared.f32 %f62, [_ZZ8compressPKjS0_P5uint2iE6colors+184];
- ld.shared.f32 %f63, [_ZZ8compressPKjS0_P5uint2iE6colors+188];
- mov.f32 %f745, 0f7F7FFFFF;
- mov.u32 %r429, 0;
- cvta.to.global.u64 %rd7, %rd141;
- BB0_56:
- shl.b32 %r181, %r429, 6;
- add.s32 %r25, %r181, %r1;
- setp.gt.s32 %p52, %r25, 991;
- @%p52 bra BB0_62;
- mul.wide.s32 %rd8, %r25, 4;
- add.s64 %rd9, %rd7, %rd8;
- ld.global.u32 %r26, [%rd9];
- setp.gt.s32 %p53, %r25, 159;
- @%p53 bra BB0_59;
- shl.b32 %r182, %r25, 2;
- mov.u32 %r183, _ZZ19evalAllPermutationsPK6float3PKjRtS4_RjPfS_N18cooperative_groups4__v112thread_blockEE14s_permutations;
- add.s32 %r184, %r183, %r182;
- st.shared.u32 [%r184], %r26;
- BB0_59:
- and.b32 %r185, %r26, 3;
- mul.wide.u32 %rd10, %r185, 4;
- mov.u64 %rd11, alphaTable4;
- add.s64 %rd12, %rd11, %rd10;
- ld.const.f32 %f420, [%rd12];
- fma.rn.f32 %f421, %f420, %f16, 0f00000000;
- fma.rn.f32 %f422, %f420, %f17, 0f00000000;
- fma.rn.f32 %f423, %f420, %f18, 0f00000000;
- mov.u64 %rd13, prods4;
- add.s64 %rd14, %rd13, %rd10;
- shr.u32 %r186, %r26, 2;
- mul.wide.u32 %rd15, %r186, 4;
- and.b64 %rd16, %rd15, 12;
- add.s64 %rd17, %rd11, %rd16;
- ld.const.f32 %f424, [%rd17];
- fma.rn.f32 %f425, %f424, %f19, %f421;
- fma.rn.f32 %f426, %f424, %f20, %f422;
- fma.rn.f32 %f427, %f424, %f21, %f423;
- add.s64 %rd18, %rd13, %rd16;
- ld.const.u32 %r187, [%rd18];
- ld.const.u32 %r188, [%rd14];
- add.s32 %r189, %r187, %r188;
- shr.u32 %r190, %r26, 4;
- mul.wide.u32 %rd19, %r190, 4;
- and.b64 %rd20, %rd19, 12;
- add.s64 %rd21, %rd11, %rd20;
- ld.const.f32 %f428, [%rd21];
- fma.rn.f32 %f429, %f428, %f22, %f425;
- fma.rn.f32 %f430, %f428, %f23, %f426;
- fma.rn.f32 %f431, %f428, %f24, %f427;
- add.s64 %rd22, %rd13, %rd20;
- ld.const.u32 %r191, [%rd22];
- add.s32 %r192, %r191, %r189;
- shr.u32 %r193, %r26, 6;
- mul.wide.u32 %rd23, %r193, 4;
- and.b64 %rd24, %rd23, 12;
- add.s64 %rd25, %rd11, %rd24;
- ld.const.f32 %f432, [%rd25];
- fma.rn.f32 %f433, %f432, %f25, %f429;
- fma.rn.f32 %f434, %f432, %f26, %f430;
- fma.rn.f32 %f435, %f432, %f27, %f431;
- add.s64 %rd26, %rd13, %rd24;
- ld.const.u32 %r194, [%rd26];
- add.s32 %r195, %r194, %r192;
- shr.u32 %r196, %r26, 8;
- mul.wide.u32 %rd27, %r196, 4;
- and.b64 %rd28, %rd27, 12;
- add.s64 %rd29, %rd11, %rd28;
- ld.const.f32 %f436, [%rd29];
- fma.rn.f32 %f437, %f436, %f28, %f433;
- fma.rn.f32 %f438, %f436, %f29, %f434;
- fma.rn.f32 %f439, %f436, %f30, %f435;
- add.s64 %rd30, %rd13, %rd28;
- ld.const.u32 %r197, [%rd30];
- add.s32 %r198, %r197, %r195;
- shr.u32 %r199, %r26, 10;
- mul.wide.u32 %rd31, %r199, 4;
- and.b64 %rd32, %rd31, 12;
- add.s64 %rd33, %rd11, %rd32;
- ld.const.f32 %f440, [%rd33];
- fma.rn.f32 %f441, %f440, %f31, %f437;
- fma.rn.f32 %f442, %f440, %f32, %f438;
- fma.rn.f32 %f443, %f440, %f33, %f439;
- add.s64 %rd34, %rd13, %rd32;
- ld.const.u32 %r200, [%rd34];
- add.s32 %r201, %r200, %r198;
- shr.u32 %r202, %r26, 12;
- mul.wide.u32 %rd35, %r202, 4;
- and.b64 %rd36, %rd35, 12;
- add.s64 %rd37, %rd11, %rd36;
- ld.const.f32 %f444, [%rd37];
- fma.rn.f32 %f445, %f444, %f34, %f441;
- fma.rn.f32 %f446, %f444, %f35, %f442;
- fma.rn.f32 %f447, %f444, %f36, %f443;
- add.s64 %rd38, %rd13, %rd36;
- ld.const.u32 %r203, [%rd38];
- add.s32 %r204, %r203, %r201;
- shr.u32 %r205, %r26, 14;
- mul.wide.u32 %rd39, %r205, 4;
- and.b64 %rd40, %rd39, 12;
- add.s64 %rd41, %rd11, %rd40;
- ld.const.f32 %f448, [%rd41];
- fma.rn.f32 %f449, %f448, %f37, %f445;
- fma.rn.f32 %f450, %f448, %f38, %f446;
- fma.rn.f32 %f451, %f448, %f39, %f447;
- add.s64 %rd42, %rd13, %rd40;
- ld.const.u32 %r206, [%rd42];
- add.s32 %r207, %r206, %r204;
- shr.u32 %r208, %r26, 16;
- mul.wide.u32 %rd43, %r208, 4;
- and.b64 %rd44, %rd43, 12;
- add.s64 %rd45, %rd11, %rd44;
- ld.const.f32 %f452, [%rd45];
- fma.rn.f32 %f453, %f452, %f40, %f449;
- fma.rn.f32 %f454, %f452, %f41, %f450;
- fma.rn.f32 %f455, %f452, %f42, %f451;
- add.s64 %rd46, %rd13, %rd44;
- ld.const.u32 %r209, [%rd46];
- add.s32 %r210, %r209, %r207;
- shr.u32 %r211, %r26, 18;
- mul.wide.u32 %rd47, %r211, 4;
- and.b64 %rd48, %rd47, 12;
- add.s64 %rd49, %rd11, %rd48;
- ld.const.f32 %f456, [%rd49];
- fma.rn.f32 %f457, %f456, %f43, %f453;
- fma.rn.f32 %f458, %f456, %f44, %f454;
- fma.rn.f32 %f459, %f456, %f45, %f455;
- add.s64 %rd50, %rd13, %rd48;
- ld.const.u32 %r212, [%rd50];
- add.s32 %r213, %r212, %r210;
- shr.u32 %r214, %r26, 20;
- mul.wide.u32 %rd51, %r214, 4;
- and.b64 %rd52, %rd51, 12;
- add.s64 %rd53, %rd11, %rd52;
- ld.const.f32 %f460, [%rd53];
- fma.rn.f32 %f461, %f460, %f46, %f457;
- fma.rn.f32 %f462, %f460, %f47, %f458;
- fma.rn.f32 %f463, %f460, %f48, %f459;
- add.s64 %rd54, %rd13, %rd52;
- ld.const.u32 %r215, [%rd54];
- add.s32 %r216, %r215, %r213;
- shr.u32 %r217, %r26, 22;
- mul.wide.u32 %rd55, %r217, 4;
- and.b64 %rd56, %rd55, 12;
- add.s64 %rd57, %rd11, %rd56;
- ld.const.f32 %f464, [%rd57];
- fma.rn.f32 %f465, %f464, %f49, %f461;
- fma.rn.f32 %f466, %f464, %f50, %f462;
- fma.rn.f32 %f467, %f464, %f51, %f463;
- add.s64 %rd58, %rd13, %rd56;
- ld.const.u32 %r218, [%rd58];
- add.s32 %r219, %r218, %r216;
- shr.u32 %r220, %r26, 24;
- mul.wide.u32 %rd59, %r220, 4;
- and.b64 %rd60, %rd59, 12;
- add.s64 %rd61, %rd11, %rd60;
- ld.const.f32 %f468, [%rd61];
- fma.rn.f32 %f469, %f468, %f52, %f465;
- fma.rn.f32 %f470, %f468, %f53, %f466;
- fma.rn.f32 %f471, %f468, %f54, %f467;
- add.s64 %rd62, %rd13, %rd60;
- ld.const.u32 %r221, [%rd62];
- add.s32 %r222, %r221, %r219;
- shr.u32 %r223, %r26, 26;
- mul.wide.u32 %rd63, %r223, 4;
- and.b64 %rd64, %rd63, 12;
- add.s64 %rd65, %rd11, %rd64;
- ld.const.f32 %f472, [%rd65];
- fma.rn.f32 %f473, %f472, %f55, %f469;
- fma.rn.f32 %f474, %f472, %f56, %f470;
- fma.rn.f32 %f475, %f472, %f57, %f471;
- add.s64 %rd66, %rd13, %rd64;
- ld.const.u32 %r224, [%rd66];
- add.s32 %r225, %r224, %r222;
- shr.u32 %r226, %r26, 28;
- mul.wide.u32 %rd67, %r226, 4;
- and.b64 %rd68, %rd67, 12;
- add.s64 %rd69, %rd11, %rd68;
- ld.const.f32 %f476, [%rd69];
- fma.rn.f32 %f477, %f476, %f58, %f473;
- fma.rn.f32 %f478, %f476, %f59, %f474;
- fma.rn.f32 %f479, %f476, %f60, %f475;
- add.s64 %rd70, %rd13, %rd68;
- ld.const.u32 %r227, [%rd70];
- add.s32 %r228, %r227, %r225;
- shr.u32 %r229, %r26, 30;
- mul.wide.u32 %rd71, %r229, 4;
- add.s64 %rd72, %rd11, %rd71;
- ld.const.f32 %f480, [%rd72];
- fma.rn.f32 %f481, %f480, %f61, %f477;
- fma.rn.f32 %f482, %f480, %f62, %f478;
- fma.rn.f32 %f483, %f480, %f63, %f479;
- add.s64 %rd73, %rd13, %rd71;
- ld.const.u32 %r230, [%rd73];
- add.s32 %r231, %r230, %r228;
- shr.s32 %r232, %r231, 16;
- cvt.rn.f32.s32 %f484, %r232;
- bfe.u32 %r233, %r231, 8, 8;
- cvt.rn.f32.s32 %f485, %r233;
- and.b32 %r234, %r231, 255;
- cvt.rn.f32.s32 %f486, %r234;
- sub.f32 %f487, %f8, %f481;
- sub.f32 %f488, %f10, %f482;
- sub.f32 %f489, %f12, %f483;
- mul.f32 %f490, %f484, %f485;
- mul.f32 %f491, %f486, %f486;
- sub.f32 %f492, %f490, %f491;
- rcp.rn.f32 %f493, %f492;
- mul.f32 %f494, %f481, %f485;
- mul.f32 %f495, %f482, %f485;
- mul.f32 %f496, %f483, %f485;
- mul.f32 %f497, %f487, %f486;
- mul.f32 %f498, %f488, %f486;
- mul.f32 %f499, %f489, %f486;
- sub.f32 %f500, %f494, %f497;
- sub.f32 %f501, %f495, %f498;
- sub.f32 %f502, %f496, %f499;
- mul.f32 %f503, %f500, %f493;
- mul.f32 %f504, %f501, %f493;
- mul.f32 %f505, %f502, %f493;
- mul.f32 %f506, %f487, %f484;
- mul.f32 %f507, %f488, %f484;
- mul.f32 %f508, %f489, %f484;
- mul.f32 %f509, %f481, %f486;
- mul.f32 %f510, %f482, %f486;
- mul.f32 %f511, %f483, %f486;
- sub.f32 %f512, %f506, %f509;
- sub.f32 %f513, %f507, %f510;
- sub.f32 %f514, %f508, %f511;
- mul.f32 %f515, %f512, %f493;
- mul.f32 %f516, %f513, %f493;
- mul.f32 %f517, %f514, %f493;
- cvt.sat.f32.f32 %f518, %f503;
- mul.f32 %f519, %f518, 0f41F80000;
- cvt.rni.f32.f32 %f520, %f519;
- cvt.sat.f32.f32 %f521, %f504;
- mul.f32 %f522, %f521, 0f427C0000;
- cvt.rni.f32.f32 %f523, %f522;
- cvt.sat.f32.f32 %f524, %f505;
- mul.f32 %f525, %f524, 0f41F80000;
- cvt.rni.f32.f32 %f526, %f525;
- cvt.rzi.u32.f32 %r235, %f520;
- shl.b32 %r236, %r235, 11;
- and.b32 %r237, %r236, 63488;
- cvt.rzi.u32.f32 %r238, %f523;
- shl.b32 %r239, %r238, 5;
- and.b32 %r240, %r239, 65504;
- cvt.rzi.u32.f32 %r241, %f526;
- or.b32 %r242, %r241, %r237;
- or.b32 %r27, %r242, %r240;
- mul.f32 %f527, %f520, 0f3D043571;
- mul.f32 %f528, %f523, 0f3C81B119;
- mul.f32 %f529, %f526, 0f3D043571;
- cvt.sat.f32.f32 %f530, %f515;
- mul.f32 %f531, %f530, 0f41F80000;
- cvt.rni.f32.f32 %f532, %f531;
- cvt.sat.f32.f32 %f533, %f516;
- mul.f32 %f534, %f533, 0f427C0000;
- cvt.rni.f32.f32 %f535, %f534;
- cvt.sat.f32.f32 %f536, %f517;
- mul.f32 %f537, %f536, 0f41F80000;
- cvt.rni.f32.f32 %f538, %f537;
- cvt.rzi.u32.f32 %r243, %f532;
- shl.b32 %r244, %r243, 11;
- and.b32 %r245, %r244, 63488;
- cvt.rzi.u32.f32 %r246, %f535;
- shl.b32 %r247, %r246, 5;
- and.b32 %r248, %r247, 65504;
- cvt.rzi.u32.f32 %r249, %f538;
- or.b32 %r250, %r249, %r245;
- or.b32 %r28, %r250, %r248;
- mul.f32 %f539, %f532, 0f3D043571;
- mul.f32 %f540, %f535, 0f3C81B119;
- mul.f32 %f541, %f538, 0f3D043571;
- mul.f32 %f542, %f527, %f527;
- mul.f32 %f543, %f528, %f528;
- mul.f32 %f544, %f529, %f529;
- mul.f32 %f545, %f539, %f539;
- mul.f32 %f546, %f540, %f540;
- mul.f32 %f547, %f541, %f541;
- mul.f32 %f548, %f485, %f545;
- mul.f32 %f549, %f485, %f546;
- mul.f32 %f550, %f485, %f547;
- fma.rn.f32 %f551, %f484, %f542, %f548;
- fma.rn.f32 %f552, %f484, %f543, %f549;
- fma.rn.f32 %f553, %f484, %f544, %f550;
- mul.f32 %f554, %f527, %f539;
- mul.f32 %f555, %f528, %f540;
- mul.f32 %f556, %f529, %f541;
- mul.f32 %f557, %f486, %f554;
- mul.f32 %f558, %f486, %f555;
- mul.f32 %f559, %f486, %f556;
- mul.f32 %f560, %f481, %f527;
- mul.f32 %f561, %f482, %f528;
- mul.f32 %f562, %f483, %f529;
- sub.f32 %f563, %f557, %f560;
- sub.f32 %f564, %f558, %f561;
- sub.f32 %f565, %f559, %f562;
- mul.f32 %f566, %f487, %f539;
- mul.f32 %f567, %f488, %f540;
- mul.f32 %f568, %f489, %f541;
- sub.f32 %f569, %f563, %f566;
- sub.f32 %f570, %f564, %f567;
- sub.f32 %f571, %f565, %f568;
- fma.rn.f32 %f572, %f569, 0f40000000, %f551;
- fma.rn.f32 %f573, %f570, 0f40000000, %f552;
- fma.rn.f32 %f574, %f571, 0f40000000, %f553;
- mul.f32 %f575, %f573, %f14;
- fma.rn.f32 %f576, %f572, %f13, %f575;
- fma.rn.f32 %f577, %f15, %f574, %f576;
- mul.f32 %f65, %f577, 0f3DE38E39;
- setp.geu.f32 %p54, %f65, %f745;
- @%p54 bra BB0_61;
- cvt.u16.u32 %rs26, %r28;
- cvt.u16.u32 %rs27, %r27;
- mov.u32 %r430, %r26;
- mov.f32 %f745, %f65;
- BB0_61:
- add.s32 %r429, %r429, 1;
- setp.lt.s32 %p55, %r429, 16;
- @%p55 bra BB0_56;
- BB0_62:
- setp.lt.u16 %p56, %rs27, %rs26;
- selp.b16 %rs33, %rs26, %rs27, %p56;
- selp.b16 %rs32, %rs27, %rs26, %p56;
- xor.b32 %r251, %r430, 1431655765;
- selp.b32 %r434, %r251, %r430, %p56;
- barrier.sync 0;
- mul.f32 %f68, %f7, 0f40800000;
- mul.f32 %f69, %f9, 0f40800000;
- mul.f32 %f70, %f11, 0f40800000;
- ld.shared.f32 %f71, [_ZZ8compressPKjS0_P5uint2iE6colors];
- ld.shared.f32 %f72, [_ZZ8compressPKjS0_P5uint2iE6colors+4];
- ld.shared.f32 %f73, [_ZZ8compressPKjS0_P5uint2iE6colors+8];
- ld.shared.f32 %f74, [_ZZ8compressPKjS0_P5uint2iE6colors+12];
- ld.shared.f32 %f75, [_ZZ8compressPKjS0_P5uint2iE6colors+16];
- ld.shared.f32 %f76, [_ZZ8compressPKjS0_P5uint2iE6colors+20];
- ld.shared.f32 %f77, [_ZZ8compressPKjS0_P5uint2iE6colors+24];
- ld.shared.f32 %f78, [_ZZ8compressPKjS0_P5uint2iE6colors+28];
- ld.shared.f32 %f79, [_ZZ8compressPKjS0_P5uint2iE6colors+32];
- ld.shared.f32 %f80, [_ZZ8compressPKjS0_P5uint2iE6colors+36];
- ld.shared.f32 %f81, [_ZZ8compressPKjS0_P5uint2iE6colors+40];
- ld.shared.f32 %f82, [_ZZ8compressPKjS0_P5uint2iE6colors+44];
- ld.shared.f32 %f83, [_ZZ8compressPKjS0_P5uint2iE6colors+48];
- ld.shared.f32 %f84, [_ZZ8compressPKjS0_P5uint2iE6colors+52];
- ld.shared.f32 %f85, [_ZZ8compressPKjS0_P5uint2iE6colors+56];
- ld.shared.f32 %f86, [_ZZ8compressPKjS0_P5uint2iE6colors+60];
- ld.shared.f32 %f87, [_ZZ8compressPKjS0_P5uint2iE6colors+64];
- ld.shared.f32 %f88, [_ZZ8compressPKjS0_P5uint2iE6colors+68];
- ld.shared.f32 %f89, [_ZZ8compressPKjS0_P5uint2iE6colors+72];
- ld.shared.f32 %f90, [_ZZ8compressPKjS0_P5uint2iE6colors+76];
- ld.shared.f32 %f91, [_ZZ8compressPKjS0_P5uint2iE6colors+80];
- ld.shared.f32 %f92, [_ZZ8compressPKjS0_P5uint2iE6colors+84];
- ld.shared.f32 %f93, [_ZZ8compressPKjS0_P5uint2iE6colors+88];
- ld.shared.f32 %f94, [_ZZ8compressPKjS0_P5uint2iE6colors+92];
- ld.shared.f32 %f95, [_ZZ8compressPKjS0_P5uint2iE6colors+96];
- ld.shared.f32 %f96, [_ZZ8compressPKjS0_P5uint2iE6colors+100];
- ld.shared.f32 %f97, [_ZZ8compressPKjS0_P5uint2iE6colors+104];
- ld.shared.f32 %f98, [_ZZ8compressPKjS0_P5uint2iE6colors+108];
- ld.shared.f32 %f99, [_ZZ8compressPKjS0_P5uint2iE6colors+112];
- ld.shared.f32 %f100, [_ZZ8compressPKjS0_P5uint2iE6colors+116];
- ld.shared.f32 %f101, [_ZZ8compressPKjS0_P5uint2iE6colors+120];
- ld.shared.f32 %f102, [_ZZ8compressPKjS0_P5uint2iE6colors+124];
- ld.shared.f32 %f103, [_ZZ8compressPKjS0_P5uint2iE6colors+128];
- ld.shared.f32 %f104, [_ZZ8compressPKjS0_P5uint2iE6colors+132];
- ld.shared.f32 %f105, [_ZZ8compressPKjS0_P5uint2iE6colors+136];
- ld.shared.f32 %f106, [_ZZ8compressPKjS0_P5uint2iE6colors+140];
- ld.shared.f32 %f107, [_ZZ8compressPKjS0_P5uint2iE6colors+144];
- ld.shared.f32 %f108, [_ZZ8compressPKjS0_P5uint2iE6colors+148];
- ld.shared.f32 %f109, [_ZZ8compressPKjS0_P5uint2iE6colors+152];
- ld.shared.f32 %f110, [_ZZ8compressPKjS0_P5uint2iE6colors+156];
- ld.shared.f32 %f111, [_ZZ8compressPKjS0_P5uint2iE6colors+160];
- ld.shared.f32 %f112, [_ZZ8compressPKjS0_P5uint2iE6colors+164];
- ld.shared.f32 %f113, [_ZZ8compressPKjS0_P5uint2iE6colors+168];
- ld.shared.f32 %f114, [_ZZ8compressPKjS0_P5uint2iE6colors+172];
- ld.shared.f32 %f115, [_ZZ8compressPKjS0_P5uint2iE6colors+176];
- ld.shared.f32 %f116, [_ZZ8compressPKjS0_P5uint2iE6colors+180];
- ld.shared.f32 %f117, [_ZZ8compressPKjS0_P5uint2iE6colors+184];
- ld.shared.f32 %f118, [_ZZ8compressPKjS0_P5uint2iE6colors+188];
- mov.u32 %r433, 0;
- BB0_63:
- shl.b32 %r253, %r433, 6;
- add.s32 %r36, %r253, %r1;
- setp.gt.s32 %p57, %r36, 159;
- @%p57 bra BB0_67;
- shl.b32 %r254, %r36, 2;
- mov.u32 %r255, _ZZ19evalAllPermutationsPK6float3PKjRtS4_RjPfS_N18cooperative_groups4__v112thread_blockEE14s_permutations;
- add.s32 %r256, %r255, %r254;
- ld.shared.u32 %r37, [%r256];
- and.b32 %r257, %r37, 3;
- mul.wide.u32 %rd74, %r257, 4;
- mov.u64 %rd75, alphaTable3;
- add.s64 %rd76, %rd75, %rd74;
- ld.const.f32 %f578, [%rd76];
- fma.rn.f32 %f579, %f578, %f71, 0f00000000;
- fma.rn.f32 %f580, %f578, %f72, 0f00000000;
- fma.rn.f32 %f581, %f578, %f73, 0f00000000;
- mov.u64 %rd77, prods3;
- add.s64 %rd78, %rd77, %rd74;
- shr.u32 %r258, %r37, 2;
- mul.wide.u32 %rd79, %r258, 4;
- and.b64 %rd80, %rd79, 12;
- add.s64 %rd81, %rd75, %rd80;
- ld.const.f32 %f582, [%rd81];
- fma.rn.f32 %f583, %f582, %f74, %f579;
- fma.rn.f32 %f584, %f582, %f75, %f580;
- fma.rn.f32 %f585, %f582, %f76, %f581;
- add.s64 %rd82, %rd77, %rd80;
- ld.const.u32 %r259, [%rd82];
- ld.const.u32 %r260, [%rd78];
- add.s32 %r261, %r259, %r260;
- shr.u32 %r262, %r37, 4;
- mul.wide.u32 %rd83, %r262, 4;
- and.b64 %rd84, %rd83, 12;
- add.s64 %rd85, %rd75, %rd84;
- ld.const.f32 %f586, [%rd85];
- fma.rn.f32 %f587, %f586, %f77, %f583;
- fma.rn.f32 %f588, %f586, %f78, %f584;
- fma.rn.f32 %f589, %f586, %f79, %f585;
- add.s64 %rd86, %rd77, %rd84;
- ld.const.u32 %r263, [%rd86];
- add.s32 %r264, %r263, %r261;
- shr.u32 %r265, %r37, 6;
- mul.wide.u32 %rd87, %r265, 4;
- and.b64 %rd88, %rd87, 12;
- add.s64 %rd89, %rd75, %rd88;
- ld.const.f32 %f590, [%rd89];
- fma.rn.f32 %f591, %f590, %f80, %f587;
- fma.rn.f32 %f592, %f590, %f81, %f588;
- fma.rn.f32 %f593, %f590, %f82, %f589;
- add.s64 %rd90, %rd77, %rd88;
- ld.const.u32 %r266, [%rd90];
- add.s32 %r267, %r266, %r264;
- shr.u32 %r268, %r37, 8;
- mul.wide.u32 %rd91, %r268, 4;
- and.b64 %rd92, %rd91, 12;
- add.s64 %rd93, %rd75, %rd92;
- ld.const.f32 %f594, [%rd93];
- fma.rn.f32 %f595, %f594, %f83, %f591;
- fma.rn.f32 %f596, %f594, %f84, %f592;
- fma.rn.f32 %f597, %f594, %f85, %f593;
- add.s64 %rd94, %rd77, %rd92;
- ld.const.u32 %r269, [%rd94];
- add.s32 %r270, %r269, %r267;
- shr.u32 %r271, %r37, 10;
- mul.wide.u32 %rd95, %r271, 4;
- and.b64 %rd96, %rd95, 12;
- add.s64 %rd97, %rd75, %rd96;
- ld.const.f32 %f598, [%rd97];
- fma.rn.f32 %f599, %f598, %f86, %f595;
- fma.rn.f32 %f600, %f598, %f87, %f596;
- fma.rn.f32 %f601, %f598, %f88, %f597;
- add.s64 %rd98, %rd77, %rd96;
- ld.const.u32 %r272, [%rd98];
- add.s32 %r273, %r272, %r270;
- shr.u32 %r274, %r37, 12;
- mul.wide.u32 %rd99, %r274, 4;
- and.b64 %rd100, %rd99, 12;
- add.s64 %rd101, %rd75, %rd100;
- ld.const.f32 %f602, [%rd101];
- fma.rn.f32 %f603, %f602, %f89, %f599;
- fma.rn.f32 %f604, %f602, %f90, %f600;
- fma.rn.f32 %f605, %f602, %f91, %f601;
- add.s64 %rd102, %rd77, %rd100;
- ld.const.u32 %r275, [%rd102];
- add.s32 %r276, %r275, %r273;
- shr.u32 %r277, %r37, 14;
- mul.wide.u32 %rd103, %r277, 4;
- and.b64 %rd104, %rd103, 12;
- add.s64 %rd105, %rd75, %rd104;
- ld.const.f32 %f606, [%rd105];
- fma.rn.f32 %f607, %f606, %f92, %f603;
- fma.rn.f32 %f608, %f606, %f93, %f604;
- fma.rn.f32 %f609, %f606, %f94, %f605;
- add.s64 %rd106, %rd77, %rd104;
- ld.const.u32 %r278, [%rd106];
- add.s32 %r279, %r278, %r276;
- shr.u32 %r280, %r37, 16;
- mul.wide.u32 %rd107, %r280, 4;
- and.b64 %rd108, %rd107, 12;
- add.s64 %rd109, %rd75, %rd108;
- ld.const.f32 %f610, [%rd109];
- fma.rn.f32 %f611, %f610, %f95, %f607;
- fma.rn.f32 %f612, %f610, %f96, %f608;
- fma.rn.f32 %f613, %f610, %f97, %f609;
- add.s64 %rd110, %rd77, %rd108;
- ld.const.u32 %r281, [%rd110];
- add.s32 %r282, %r281, %r279;
- shr.u32 %r283, %r37, 18;
- mul.wide.u32 %rd111, %r283, 4;
- and.b64 %rd112, %rd111, 12;
- add.s64 %rd113, %rd75, %rd112;
- ld.const.f32 %f614, [%rd113];
- fma.rn.f32 %f615, %f614, %f98, %f611;
- fma.rn.f32 %f616, %f614, %f99, %f612;
- fma.rn.f32 %f617, %f614, %f100, %f613;
- add.s64 %rd114, %rd77, %rd112;
- ld.const.u32 %r284, [%rd114];
- add.s32 %r285, %r284, %r282;
- shr.u32 %r286, %r37, 20;
- mul.wide.u32 %rd115, %r286, 4;
- and.b64 %rd116, %rd115, 12;
- add.s64 %rd117, %rd75, %rd116;
- ld.const.f32 %f618, [%rd117];
- fma.rn.f32 %f619, %f618, %f101, %f615;
- fma.rn.f32 %f620, %f618, %f102, %f616;
- fma.rn.f32 %f621, %f618, %f103, %f617;
- add.s64 %rd118, %rd77, %rd116;
- ld.const.u32 %r287, [%rd118];
- add.s32 %r288, %r287, %r285;
- shr.u32 %r289, %r37, 22;
- mul.wide.u32 %rd119, %r289, 4;
- and.b64 %rd120, %rd119, 12;
- add.s64 %rd121, %rd75, %rd120;
- ld.const.f32 %f622, [%rd121];
- fma.rn.f32 %f623, %f622, %f104, %f619;
- fma.rn.f32 %f624, %f622, %f105, %f620;
- fma.rn.f32 %f625, %f622, %f106, %f621;
- add.s64 %rd122, %rd77, %rd120;
- ld.const.u32 %r290, [%rd122];
- add.s32 %r291, %r290, %r288;
- shr.u32 %r292, %r37, 24;
- mul.wide.u32 %rd123, %r292, 4;
- and.b64 %rd124, %rd123, 12;
- add.s64 %rd125, %rd75, %rd124;
- ld.const.f32 %f626, [%rd125];
- fma.rn.f32 %f627, %f626, %f107, %f623;
- fma.rn.f32 %f628, %f626, %f108, %f624;
- fma.rn.f32 %f629, %f626, %f109, %f625;
- add.s64 %rd126, %rd77, %rd124;
- ld.const.u32 %r293, [%rd126];
- add.s32 %r294, %r293, %r291;
- shr.u32 %r295, %r37, 26;
- mul.wide.u32 %rd127, %r295, 4;
- and.b64 %rd128, %rd127, 12;
- add.s64 %rd129, %rd75, %rd128;
- ld.const.f32 %f630, [%rd129];
- fma.rn.f32 %f631, %f630, %f110, %f627;
- fma.rn.f32 %f632, %f630, %f111, %f628;
- fma.rn.f32 %f633, %f630, %f112, %f629;
- add.s64 %rd130, %rd77, %rd128;
- ld.const.u32 %r296, [%rd130];
- add.s32 %r297, %r296, %r294;
- shr.u32 %r298, %r37, 28;
- mul.wide.u32 %rd131, %r298, 4;
- and.b64 %rd132, %rd131, 12;
- add.s64 %rd133, %rd75, %rd132;
- ld.const.f32 %f634, [%rd133];
- fma.rn.f32 %f635, %f634, %f113, %f631;
- fma.rn.f32 %f636, %f634, %f114, %f632;
- fma.rn.f32 %f637, %f634, %f115, %f633;
- add.s64 %rd134, %rd77, %rd132;
- ld.const.u32 %r299, [%rd134];
- add.s32 %r300, %r299, %r297;
- shr.u32 %r301, %r37, 30;
- mul.wide.u32 %rd135, %r301, 4;
- add.s64 %rd136, %rd75, %rd135;
- ld.const.f32 %f638, [%rd136];
- fma.rn.f32 %f639, %f638, %f116, %f635;
- fma.rn.f32 %f640, %f638, %f117, %f636;
- fma.rn.f32 %f641, %f638, %f118, %f637;
- add.s64 %rd137, %rd77, %rd135;
- ld.const.u32 %r302, [%rd137];
- add.s32 %r303, %r302, %r300;
- shr.s32 %r304, %r303, 16;
- cvt.rn.f32.s32 %f642, %r304;
- bfe.u32 %r305, %r303, 8, 8;
- cvt.rn.f32.s32 %f643, %r305;
- and.b32 %r306, %r303, 255;
- cvt.rn.f32.s32 %f644, %r306;
- sub.f32 %f645, %f68, %f639;
- sub.f32 %f646, %f69, %f640;
- sub.f32 %f647, %f70, %f641;
- mul.f32 %f648, %f642, %f643;
- mul.f32 %f649, %f644, %f644;
- sub.f32 %f650, %f648, %f649;
- rcp.rn.f32 %f651, %f650;
- mul.f32 %f652, %f639, %f643;
- mul.f32 %f653, %f640, %f643;
- mul.f32 %f654, %f641, %f643;
- mul.f32 %f655, %f645, %f644;
- mul.f32 %f656, %f646, %f644;
- mul.f32 %f657, %f647, %f644;
- sub.f32 %f658, %f652, %f655;
- sub.f32 %f659, %f653, %f656;
- sub.f32 %f660, %f654, %f657;
- mul.f32 %f661, %f658, %f651;
- mul.f32 %f662, %f659, %f651;
- mul.f32 %f663, %f660, %f651;
- mul.f32 %f664, %f645, %f642;
- mul.f32 %f665, %f646, %f642;
- mul.f32 %f666, %f647, %f642;
- mul.f32 %f667, %f639, %f644;
- mul.f32 %f668, %f640, %f644;
- mul.f32 %f669, %f641, %f644;
- sub.f32 %f670, %f664, %f667;
- sub.f32 %f671, %f665, %f668;
- sub.f32 %f672, %f666, %f669;
- mul.f32 %f673, %f670, %f651;
- mul.f32 %f674, %f671, %f651;
- mul.f32 %f675, %f672, %f651;
- cvt.sat.f32.f32 %f676, %f661;
- mul.f32 %f677, %f676, 0f41F80000;
- cvt.rni.f32.f32 %f678, %f677;
- cvt.sat.f32.f32 %f679, %f662;
- mul.f32 %f680, %f679, 0f427C0000;
- cvt.rni.f32.f32 %f681, %f680;
- cvt.sat.f32.f32 %f682, %f663;
- mul.f32 %f683, %f682, 0f41F80000;
- cvt.rni.f32.f32 %f684, %f683;
- cvt.rzi.u32.f32 %r307, %f678;
- shl.b32 %r308, %r307, 11;
- cvt.rzi.u32.f32 %r309, %f681;
- shl.b32 %r310, %r309, 5;
- cvt.rzi.u32.f32 %r311, %f684;
- cvt.u16.u32 %rs21, %r311;
- cvt.u16.u32 %rs22, %r308;
- or.b16 %rs23, %rs21, %rs22;
- cvt.u16.u32 %rs24, %r310;
- or.b16 %rs13, %rs23, %rs24;
- mul.f32 %f685, %f678, 0f3D043571;
- mul.f32 %f686, %f681, 0f3C81B119;
- mul.f32 %f687, %f684, 0f3D043571;
- cvt.sat.f32.f32 %f688, %f673;
- mul.f32 %f689, %f688, 0f41F80000;
- cvt.rni.f32.f32 %f690, %f689;
- cvt.sat.f32.f32 %f691, %f674;
- mul.f32 %f692, %f691, 0f427C0000;
- cvt.rni.f32.f32 %f693, %f692;
- cvt.sat.f32.f32 %f694, %f675;
- mul.f32 %f695, %f694, 0f41F80000;
- cvt.rni.f32.f32 %f696, %f695;
- cvt.rzi.u32.f32 %r312, %f690;
- shl.b32 %r313, %r312, 11;
- and.b32 %r314, %r313, 63488;
- cvt.rzi.u32.f32 %r315, %f693;
- shl.b32 %r316, %r315, 5;
- and.b32 %r317, %r316, 65504;
- cvt.rzi.u32.f32 %r318, %f696;
- or.b32 %r319, %r318, %r314;
- or.b32 %r38, %r319, %r317;
- mul.f32 %f697, %f690, 0f3D043571;
- mul.f32 %f698, %f693, 0f3C81B119;
- mul.f32 %f699, %f696, 0f3D043571;
- mul.f32 %f700, %f685, %f685;
- mul.f32 %f701, %f686, %f686;
- mul.f32 %f702, %f687, %f687;
- mul.f32 %f703, %f697, %f697;
- mul.f32 %f704, %f698, %f698;
- mul.f32 %f705, %f699, %f699;
- mul.f32 %f706, %f643, %f703;
- mul.f32 %f707, %f643, %f704;
- mul.f32 %f708, %f643, %f705;
- fma.rn.f32 %f709, %f642, %f700, %f706;
- fma.rn.f32 %f710, %f642, %f701, %f707;
- fma.rn.f32 %f711, %f642, %f702, %f708;
- mul.f32 %f712, %f685, %f697;
- mul.f32 %f713, %f686, %f698;
- mul.f32 %f714, %f687, %f699;
- mul.f32 %f715, %f644, %f712;
- mul.f32 %f716, %f644, %f713;
- mul.f32 %f717, %f644, %f714;
- mul.f32 %f718, %f639, %f685;
- mul.f32 %f719, %f640, %f686;
- mul.f32 %f720, %f641, %f687;
- sub.f32 %f721, %f715, %f718;
- sub.f32 %f722, %f716, %f719;
- sub.f32 %f723, %f717, %f720;
- mul.f32 %f724, %f645, %f697;
- mul.f32 %f725, %f646, %f698;
- mul.f32 %f726, %f647, %f699;
- sub.f32 %f727, %f721, %f724;
- sub.f32 %f728, %f722, %f725;
- sub.f32 %f729, %f723, %f726;
- fma.rn.f32 %f730, %f727, 0f40000000, %f709;
- fma.rn.f32 %f731, %f728, 0f40000000, %f710;
- fma.rn.f32 %f732, %f729, 0f40000000, %f711;
- mul.f32 %f733, %f731, %f14;
- fma.rn.f32 %f734, %f730, %f13, %f733;
- fma.rn.f32 %f735, %f15, %f732, %f734;
- mul.f32 %f120, %f735, 0f3E800000;
- setp.geu.f32 %p58, %f120, %f745;
- @%p58 bra BB0_66;
- cvt.u16.u32 %rs25, %r38;
- setp.gt.u16 %p59, %rs13, %rs25;
- not.b32 %r320, %r37;
- shr.u32 %r321, %r320, 1;
- and.b32 %r322, %r321, 1431655765;
- selp.b16 %rs33, %rs25, %rs13, %p59;
- selp.b16 %rs32, %rs13, %rs25, %p59;
- selp.b32 %r323, %r322, 0, %p59;
- xor.b32 %r434, %r323, %r37;
- mov.f32 %f745, %f120;
- BB0_66:
- add.s32 %r433, %r433, 1;
- setp.lt.s32 %p60, %r433, 3;
- @%p60 bra BB0_63;
- BB0_67:
- shl.b32 %r324, %r1, 2;
- mov.u32 %r325, _ZZ8compressPKjS0_P5uint2iE6errors;
- add.s32 %r44, %r325, %r324;
- st.shared.f32 [%r44], %f745;
- mov.u32 %r326, _ZZ12findMinErrorPfN18cooperative_groups4__v112thread_blockEE7indices;
- add.s32 %r45, %r326, %r324;
- st.shared.u32 [%r45], %r1;
- barrier.sync 0;
- ld.shared.f32 %f123, [%r44];
- add.s32 %r46, %r1, 32;
- mov.f32 %f748, 0f7F7FFFFF;
- setp.gt.s32 %p61, %r46, 63;
- @%p61 bra BB0_69;
- ld.shared.f32 %f748, [%r44+128];
- BB0_69:
- mov.u32 %r437, 0;
- @%p61 bra BB0_71;
- ld.shared.u32 %r437, [%r45+128];
- BB0_71:
- barrier.sync 0;
- setp.geu.f32 %p63, %f748, %f123;
- @%p63 bra BB0_73;
- st.shared.f32 [%r44], %f748;
- st.shared.u32 [%r45], %r437;
- BB0_73:
- barrier.sync 0;
- ld.shared.f32 %f126, [%r44];
- add.s32 %r49, %r1, 16;
- mov.f32 %f749, 0f7F7FFFFF;
- setp.gt.s32 %p64, %r49, 63;
- @%p64 bra BB0_75;
- ld.shared.f32 %f749, [%r44+64];
- BB0_75:
- mov.u32 %r438, 0;
- @%p64 bra BB0_77;
- ld.shared.u32 %r438, [%r45+64];
- BB0_77:
- barrier.sync 0;
- setp.geu.f32 %p66, %f749, %f126;
- @%p66 bra BB0_79;
- st.shared.f32 [%r44], %f749;
- st.shared.u32 [%r45], %r438;
- BB0_79:
- barrier.sync 0;
- ld.shared.f32 %f129, [%r44];
- add.s32 %r52, %r1, 8;
- mov.f32 %f750, 0f7F7FFFFF;
- setp.gt.s32 %p67, %r52, 63;
- @%p67 bra BB0_81;
- ld.shared.f32 %f750, [%r44+32];
- BB0_81:
- mov.u32 %r439, 0;
- @%p67 bra BB0_83;
- ld.shared.u32 %r439, [%r45+32];
- BB0_83:
- barrier.sync 0;
- setp.geu.f32 %p69, %f750, %f129;
- @%p69 bra BB0_85;
- st.shared.f32 [%r44], %f750;
- st.shared.u32 [%r45], %r439;
- BB0_85:
- barrier.sync 0;
- ld.shared.f32 %f132, [%r44];
- add.s32 %r55, %r1, 4;
- mov.f32 %f751, 0f7F7FFFFF;
- setp.gt.s32 %p70, %r55, 63;
- @%p70 bra BB0_87;
- ld.shared.f32 %f751, [%r44+16];
- BB0_87:
- mov.u32 %r440, 0;
- @%p70 bra BB0_89;
- ld.shared.u32 %r440, [%r45+16];
- BB0_89:
- barrier.sync 0;
- setp.geu.f32 %p72, %f751, %f132;
- @%p72 bra BB0_91;
- st.shared.f32 [%r44], %f751;
- st.shared.u32 [%r45], %r440;
- BB0_91:
- barrier.sync 0;
- ld.shared.f32 %f135, [%r44];
- add.s32 %r58, %r1, 2;
- mov.f32 %f752, 0f7F7FFFFF;
- setp.gt.s32 %p73, %r58, 63;
- @%p73 bra BB0_93;
- ld.shared.f32 %f752, [%r44+8];
- BB0_93:
- mov.u32 %r441, 0;
- @%p73 bra BB0_95;
- ld.shared.u32 %r441, [%r45+8];
- BB0_95:
- barrier.sync 0;
- setp.geu.f32 %p75, %f752, %f135;
- @%p75 bra BB0_97;
- st.shared.f32 [%r44], %f752;
- st.shared.u32 [%r45], %r441;
- BB0_97:
- barrier.sync 0;
- ld.shared.f32 %f138, [%r44];
- add.s32 %r61, %r1, 1;
- mov.f32 %f753, 0f7F7FFFFF;
- setp.gt.s32 %p76, %r61, 63;
- @%p76 bra BB0_99;
- ld.shared.f32 %f753, [%r44+4];
- BB0_99:
- mov.u32 %r442, 0;
- @%p76 bra BB0_101;
- ld.shared.u32 %r442, [%r45+4];
- BB0_101:
- barrier.sync 0;
- setp.geu.f32 %p78, %f753, %f138;
- @%p78 bra BB0_103;
- st.shared.f32 [%r44], %f753;
- st.shared.u32 [%r45], %r442;
- BB0_103:
- barrier.sync 0;
- ld.shared.u32 %r64, [_ZZ12findMinErrorPfN18cooperative_groups4__v112thread_blockEE7indices];
- barrier.sync 0;
- setp.ne.s32 %p79, %r1, %r64;
- @%p79 bra BB0_105;
- ld.param.u64 %rd142, [_Z8compressPKjS0_P5uint2i_param_2];
- ld.param.u32 %r424, [_Z8compressPKjS0_P5uint2i_param_3];
- setp.eq.s16 %p80, %rs33, %rs32;
- selp.b32 %r339, 0, %r434, %p80;
- ld.shared.u32 %r340, [_ZZ8compressPKjS0_P5uint2iE5xrefs];
- shl.b32 %r341, %r340, 1;
- shr.u32 %r342, %r339, %r341;
- and.b32 %r343, %r342, 3;
- ld.shared.u32 %r344, [_ZZ8compressPKjS0_P5uint2iE5xrefs+4];
- shl.b32 %r345, %r344, 1;
- shr.u32 %r346, %r339, %r345;
- and.b32 %r347, %r346, 3;
- bfi.b32 %r348, %r347, %r343, 2, 2;
- ld.shared.u32 %r349, [_ZZ8compressPKjS0_P5uint2iE5xrefs+8];
- shl.b32 %r350, %r349, 1;
- shr.u32 %r351, %r339, %r350;
- and.b32 %r352, %r351, 3;
- bfi.b32 %r353, %r352, %r348, 4, 2;
- ld.shared.u32 %r354, [_ZZ8compressPKjS0_P5uint2iE5xrefs+12];
- shl.b32 %r355, %r354, 1;
- shr.u32 %r356, %r339, %r355;
- and.b32 %r357, %r356, 3;
- bfi.b32 %r358, %r357, %r353, 6, 2;
- ld.shared.u32 %r359, [_ZZ8compressPKjS0_P5uint2iE5xrefs+16];
- shl.b32 %r360, %r359, 1;
- shr.u32 %r361, %r339, %r360;
- and.b32 %r362, %r361, 3;
- bfi.b32 %r363, %r362, %r358, 8, 2;
- ld.shared.u32 %r364, [_ZZ8compressPKjS0_P5uint2iE5xrefs+20];
- shl.b32 %r365, %r364, 1;
- shr.u32 %r366, %r339, %r365;
- and.b32 %r367, %r366, 3;
- bfi.b32 %r368, %r367, %r363, 10, 2;
- ld.shared.u32 %r369, [_ZZ8compressPKjS0_P5uint2iE5xrefs+24];
- shl.b32 %r370, %r369, 1;
- shr.u32 %r371, %r339, %r370;
- and.b32 %r372, %r371, 3;
- bfi.b32 %r373, %r372, %r368, 12, 2;
- ld.shared.u32 %r374, [_ZZ8compressPKjS0_P5uint2iE5xrefs+28];
- shl.b32 %r375, %r374, 1;
- shr.u32 %r376, %r339, %r375;
- and.b32 %r377, %r376, 3;
- bfi.b32 %r378, %r377, %r373, 14, 2;
- ld.shared.u32 %r379, [_ZZ8compressPKjS0_P5uint2iE5xrefs+32];
- shl.b32 %r380, %r379, 1;
- shr.u32 %r381, %r339, %r380;
- and.b32 %r382, %r381, 3;
- bfi.b32 %r383, %r382, %r378, 16, 2;
- ld.shared.u32 %r384, [_ZZ8compressPKjS0_P5uint2iE5xrefs+36];
- shl.b32 %r385, %r384, 1;
- shr.u32 %r386, %r339, %r385;
- and.b32 %r387, %r386, 3;
- bfi.b32 %r388, %r387, %r383, 18, 2;
- ld.shared.u32 %r389, [_ZZ8compressPKjS0_P5uint2iE5xrefs+40];
- shl.b32 %r390, %r389, 1;
- shr.u32 %r391, %r339, %r390;
- and.b32 %r392, %r391, 3;
- bfi.b32 %r393, %r392, %r388, 20, 2;
- ld.shared.u32 %r394, [_ZZ8compressPKjS0_P5uint2iE5xrefs+44];
- shl.b32 %r395, %r394, 1;
- shr.u32 %r396, %r339, %r395;
- and.b32 %r397, %r396, 3;
- bfi.b32 %r398, %r397, %r393, 22, 2;
- ld.shared.u32 %r399, [_ZZ8compressPKjS0_P5uint2iE5xrefs+48];
- shl.b32 %r400, %r399, 1;
- shr.u32 %r401, %r339, %r400;
- and.b32 %r402, %r401, 3;
- bfi.b32 %r403, %r402, %r398, 24, 2;
- ld.shared.u32 %r404, [_ZZ8compressPKjS0_P5uint2iE5xrefs+52];
- shl.b32 %r405, %r404, 1;
- shr.u32 %r406, %r339, %r405;
- and.b32 %r407, %r406, 3;
- bfi.b32 %r408, %r407, %r403, 26, 2;
- ld.shared.u32 %r409, [_ZZ8compressPKjS0_P5uint2iE5xrefs+56];
- shl.b32 %r410, %r409, 1;
- shr.u32 %r411, %r339, %r410;
- and.b32 %r412, %r411, 3;
- bfi.b32 %r413, %r412, %r408, 28, 2;
- ld.shared.u32 %r414, [_ZZ8compressPKjS0_P5uint2iE5xrefs+60];
- shl.b32 %r415, %r414, 1;
- shr.u32 %r416, %r339, %r415;
- cvt.u32.u16 %r417, %rs32;
- cvt.u32.u16 %r418, %rs33;
- mov.u32 %r419, %ctaid.x;
- add.s32 %r420, %r419, %r424;
- cvta.to.global.u64 %rd138, %rd142;
- mul.wide.s32 %rd139, %r420, 8;
- add.s64 %rd140, %rd138, %rd139;
- bfi.b32 %r421, %r416, %r413, 30, 2;
- prmt.b32 %r422, %r417, %r418, 4180;
- st.global.v2.u32 [%rd140], {%r422, %r421};
- BB0_105:
- ret;
- }
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement