Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- .version 1.4
- .target sm_10, map_f64_to_f32
- // compiled with /usr/local/gpu/cuda-toolkit-4.1.28/cuda/open64/lib//be
- // nvopencc 4.1 built on 2012-01-12
- //-----------------------------------------------------------
- // Compiling /tmp/tmpxft_00002172_00000000-9_d2q9.cpp3.i (/tmp/ccBI#.tyRWSL)
- //-----------------------------------------------------------
- //-----------------------------------------------------------
- // Options:
- //-----------------------------------------------------------
- // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64
- // -O3 (Optimization level)
- // -g0 (Debug level)
- // -m2 (Report advisories)
- //-----------------------------------------------------------
- .file 1 "<command-line>"
- .file 2 "/tmp/tmpxft_00002172_00000000-8_d2q9.cudafe2.gpu"
- .file 3 "d2q9.cu"
- .file 4 "/cm/shared/apps/gcc/4.4.6/lib/gcc/x86_64-unknown-linux-gnu/4.4.6/include/stddef.h"
- .file 5 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/crt/device_runtime.h"
- .file 6 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/host_defines.h"
- .file 7 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/builtin_types.h"
- .file 8 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/device_types.h"
- .file 9 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/driver_types.h"
- .file 10 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/surface_types.h"
- .file 11 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/texture_types.h"
- .file 12 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/vector_types.h"
- .file 13 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/device_launch_parameters.h"
- .file 14 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/crt/storage_class.h"
- .file 15 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/common_functions.h"
- .file 16 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/math_functions.h"
- .file 17 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/math_constants.h"
- .file 18 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/device_functions.h"
- .file 19 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/sm_11_atomic_functions.h"
- .file 20 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/sm_12_atomic_functions.h"
- .file 21 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/sm_13_double_functions.h"
- .file 22 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/sm_20_atomic_functions.h"
- .file 23 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/sm_20_intrinsics.h"
- .file 24 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/surface_functions.h"
- .file 25 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/texture_fetch_functions.h"
- .file 26 "/usr/local/gpu/cuda-toolkit-4.1.28/cuda/bin/../include/math_functions_dbl_ptx1.h"
- .entry _Z9collisionPK7t_speedPKcPS_Pfi ( // collision(const t_speed*, const char*, t_speed*, float*, int): each thread reads one 36-byte record (nine f32) from cells, writes an updated record to tmpy_cells, and thread 0 of each block stores the block-wide sum of a per-thread value into av_vels
- .param .u64 __cudaparm__Z9collisionPK7t_speedPKcPS_Pfi_cells, // cells: base address of the input records (read only, via ld.global)
- .param .u64 __cudaparm__Z9collisionPK7t_speedPKcPS_Pfi_obstacles, // obstacles: one signed byte per record; zero selects the relaxation path, nonzero the permuted-copy path
- .param .u64 __cudaparm__Z9collisionPK7t_speedPKcPS_Pfi_tmpy_cells, // tmpy_cells: base address of the output records (written via st.global)
- .param .u64 __cudaparm__Z9collisionPK7t_speedPKcPS_Pfi_av_vels, // av_vels: f32 array; receives one value per block at element ctaid.x + ii*1875
- .param .s32 __cudaparm__Z9collisionPK7t_speedPKcPS_Pfi_ii) // ii: scaled by 1875 to offset the av_vels index (presumably an iteration counter -- confirm against the caller)
- {
- .reg .u32 %r<30>;
- .reg .u64 %rd<21>;
- .reg .f32 %f<124>;
- .reg .f64 %fd<127>;
- .reg .pred %p<11>;
- .shared .align 4 .b8 __cuda___cuda_local_var_15345_35_non_const_sdata36[128]; // 128 bytes = 32 f32 slots used as scratch for the block sum below
- .loc 3 36 0
- $LDWbegin__Z9collisionPK7t_speedPKcPS_Pfi:
- .loc 3 49 0
- cvt.u32.u16 %r1, %ctaid.x; // r1 = block index (reused at the end for the av_vels index)
- cvt.u32.u16 %r2, %ntid.x;
- mul.lo.u32 %r3, %r2, %r1;
- cvt.u32.u16 %r4, %tid.x;
- add.u32 %r5, %r4, %r3; // r5 = ctaid.x * ntid.x + tid.x: this thread's record index
- cvt.s64.s32 %rd1, %r5; // rd1 = index as 64-bit, used as the byte offset into obstacles
- mul.wide.s32 %rd2, %r5, 36; // rd2 = index * 36: byte offset of the record (nine 4-byte values)
- ld.param.u64 %rd3, [__cudaparm__Z9collisionPK7t_speedPKcPS_Pfi_cells];
- add.u64 %rd4, %rd3, %rd2;
- ld.global.f32 %f1, [%rd4+0]; // in the comments below, vN is the value loaded from byte offset 4*N (v0 here), after the optional adjustment
- mov.f32 %f2, %f1;
- ld.global.f32 %f3, [%rd4+4]; // v1
- mov.f32 %f4, %f3;
- ld.global.f32 %f5, [%rd4+8]; // v2
- mov.f32 %f6, %f5;
- ld.global.f32 %f7, [%rd4+12]; // v3
- mov.f32 %f8, %f7;
- ld.global.f32 %f9, [%rd4+16]; // v4
- mov.f32 %f10, %f9;
- ld.global.f32 %f11, [%rd4+20]; // v5
- mov.f32 %f12, %f11;
- ld.global.f32 %f13, [%rd4+24]; // v6
- mov.f32 %f14, %f13;
- ld.global.f32 %f15, [%rd4+28]; // v7
- mov.f32 %f16, %f15;
- ld.global.f32 %f17, [%rd4+32]; // v8
- mov.f32 %f18, %f17;
- .loc 3 50 0
- ld.param.u64 %rd5, [__cudaparm__Z9collisionPK7t_speedPKcPS_Pfi_obstacles];
- add.u64 %rd6, %rd1, %rd5;
- ld.global.s8 %r6, [%rd6+0]; // r6 = obstacles[index], sign-extended byte
- mov.s32 %r7, 0;
- setp.eq.s32 %p1, %r6, %r7; // p1 = (obstacle flag == 0); reused for both branches below
- mov.s32 %r8, 458129845; // multiplier for division by 300: mul.hi (>>32) then >>5 gives index/300
- mov.s32 %r9, 0;
- setp.lt.s32 %p2, %r5, %r9;
- abs.s32 %r10, %r5;
- mul.hi.u32 %r11, %r10, %r8;
- shr.s32 %r12, %r11, 5;
- @%p2 sub.s32 %r12, %r9, %r12; // negate the quotient when the index is negative (signed division)
- mov.s32 %r13, %r12;
- mul.lo.s32 %r14, %r13, 300;
- sub.s32 %r15, %r5, %r14; // r15 = index % 300
- mov.u32 %r16, 0;
- setp.ne.s32 %p3, %r15, %r16;
- @%p3 bra $Lt_0_11522; // adjustment applies only where index % 300 == 0
- .loc 3 57 0
- @!%p1 bra $Lt_0_11522; // ... and only where the obstacle flag is zero
- mov.f32 %f19, 0fb8690454; // -5.55556e-05
- add.f32 %f20, %f7, %f19; // f20 = v3 - 5.55556e-05
- mov.f32 %f21, 0f00000000; // 0
- setp.gt.f32 %p4, %f20, %f21;
- @!%p4 bra $Lt_0_11522; // skip unless the decremented v3 stays positive
- mov.f32 %f22, 0fb7690454; // -1.38889e-05
- add.f32 %f23, %f13, %f22; // f23 = v6 - 1.38889e-05
- mov.f32 %f24, 0f00000000; // 0
- setp.gt.f32 %p5, %f23, %f24;
- @!%p5 bra $Lt_0_11522; // skip unless the decremented v6 stays positive
- mov.f32 %f25, 0fb7690454; // -1.38889e-05
- add.f32 %f26, %f15, %f25; // f26 = v7 - 1.38889e-05
- mov.f32 %f27, 0f00000000; // 0
- setp.gt.f32 %p6, %f26, %f27;
- @!%p6 bra $Lt_0_11522; // skip unless the decremented v7 stays positive
- .loc 3 62 0
- mov.f32 %f28, 0f38690454; // 5.55556e-05
- add.f32 %f3, %f3, %f28; // v1 += 5.55556e-05
- mov.f32 %f4, %f3;
- .loc 3 63 0
- mov.f32 %f29, 0f37690454; // 1.38889e-05
- add.f32 %f11, %f11, %f29; // v5 += 1.38889e-05
- mov.f32 %f12, %f11;
- .loc 3 64 0
- mov.f32 %f30, 0f37690454; // 1.38889e-05
- add.f32 %f17, %f17, %f30; // v8 += 1.38889e-05
- mov.f32 %f18, %f17;
- .loc 3 65 0
- mov.f32 %f7, %f20; // v3 takes the decremented value
- mov.f32 %f8, %f7;
- .loc 3 66 0
- mov.f32 %f13, %f23; // v6 takes the decremented value
- mov.f32 %f14, %f13;
- .loc 3 67 0
- mov.f32 %f15, %f26; // v7 takes the decremented value
- mov.f32 %f16, %f15;
- $Lt_0_11522:
- $L_0_6914:
- $Lt_0_7938:
- @!%p1 bra $Lt_0_8706; // nonzero obstacle flag: take the permuted-copy path instead of the relaxation
- .loc 3 97 0
- add.f32 %f31, %f3, %f1;
- add.f32 %f32, %f11, %f3;
- add.f32 %f33, %f13, %f7;
- add.f32 %f34, %f11, %f5;
- add.f32 %f35, %f15, %f9;
- add.f32 %f36, %f31, %f5;
- add.f32 %f37, %f32, %f17;
- add.f32 %f38, %f33, %f15;
- add.f32 %f39, %f34, %f13;
- add.f32 %f40, %f35, %f17;
- add.f32 %f41, %f36, %f7;
- sub.f32 %f42, %f37, %f38; // f42 = (v5 + v1 + v8) - (v6 + v3 + v7)
- sub.f32 %f43, %f39, %f40; // f43 = (v5 + v2 + v6) - (v7 + v4 + v8)
- add.f32 %f44, %f41, %f9;
- mul.f32 %f45, %f42, %f42; // f45 = f42 squared
- mul.f32 %f46, %f43, %f43; // f46 = f43 squared
- add.f32 %f47, %f44, %f11;
- cvt.f64.f32 %fd1, %f45; // widened to .f64 for the mixed-precision expressions below (NOTE(review): the module declares map_f64_to_f32 -- confirm the effective precision on the target)
- cvt.f64.f32 %fd2, %f46;
- add.f32 %f48, %f47, %f13;
- add.f32 %f49, %f48, %f15;
- add.f32 %f50, %f49, %f17; // f50 = v0 + v1 + ... + v8 (sum of all nine values)
- mul.f32 %f51, %f50, %f50; // f51 = f50 squared; used as the divisor in every term below
- mov.f32 %f52, 0fbf59999a; // -0.85
- mul.f32 %f53, %f1, %f52;
- cvt.f64.f32 %fd3, %f53;
- mov.f32 %f54, 0f3f9dddde; // 1.23333
- div.full.f32 %f55, %f54, %f50; // f55 = 1.23333 / f50
- cvt.f64.f32 %fd4, %f55;
- cvt.f64.f32 %fd5, %f51;
- mov.f64 %fd6, 0d3fe5555555555555; // 0.666667
- mul.f64 %fd7, %fd5, %fd6;
- sub.f64 %fd8, %fd7, %fd1;
- sub.f64 %fd9, %fd8, %fd2;
- mad.rn.f64 %fd10, %fd4, %fd9, %fd3; // fd10 = f55 * (0.666667*f51 - f45 - f46) + (-0.85 * v0)
- cvt.rn.f32.f64 %f56, %fd10;
- mov.f32 %f57, %f56; // f57 = new value for output slot 0
- .loc 3 98 0
- cvt.f64.f32 %fd11, %f42;
- cvt.f64.f32 %fd12, %f50;
- mov.f32 %f58, 0fbf59999a; // -0.85
- mul.f32 %f59, %f3, %f58;
- cvt.f64.f32 %fd13, %f59;
- add.f32 %f60, %f45, %f45;
- sub.f32 %f61, %f60, %f46;
- cvt.f64.f32 %fd14, %f61;
- add.f64 %fd15, %fd11, %fd11;
- mad.rn.f64 %fd16, %fd12, %fd15, %fd14; // fd16 = f50 * 2*f42 + (2*f45 - f46)
- cvt.rn.f32.f64 %f62, %fd16;
- div.full.f32 %f63, %f62, %f51;
- cvt.f64.f32 %fd17, %f63;
- mov.f64 %fd18, 0d3fe5555555555555; // 0.666667
- add.f64 %fd19, %fd17, %fd18;
- mov.f64 %fd20, 0d3fd3bbbbc2777778; // 0.308333
- mul.f64 %fd21, %fd12, %fd20;
- mad.rn.f64 %fd22, %fd19, %fd21, %fd13; // fd22 = (f63 + 0.666667) * (0.308333*f50) + (-0.85 * v1)
- cvt.rn.f32.f64 %f64, %fd22; // f64 = new value for output slot 1
- .loc 3 99 0
- cvt.f64.f32 %fd23, %f5;
- mul.f32 %f65, %f43, %f50;
- mov.f32 %f66, 0f3de38e39; // 0.111111
- mul.f32 %f67, %f50, %f66;
- cvt.f64.f32 %fd24, %f67; // fd24 = 0.111111 * f50: shared weight for slots 2, 3 and 4
- add.f32 %f68, %f46, %f65;
- cvt.f64.f32 %fd25, %f68;
- mov.f64 %fd26, 0d3fe0000000000000; // 0.5
- mul.f64 %fd27, %fd1, %fd26;
- sub.f64 %fd28, %fd25, %fd27;
- mov.f64 %fd29, 0d4008000000000000; // 3
- mul.f64 %fd30, %fd28, %fd29;
- cvt.rn.f32.f64 %f69, %fd30;
- div.full.f32 %f70, %f69, %f51;
- cvt.f64.f32 %fd31, %f70;
- mov.f64 %fd32, 0d3ff0000000000000; // 1
- add.f64 %fd33, %fd31, %fd32;
- mul.f64 %fd34, %fd24, %fd33;
- sub.f64 %fd35, %fd34, %fd23;
- mov.f64 %fd36, 0d3ffd9999a0000000; // 1.85
- mad.rn.f64 %fd37, %fd35, %fd36, %fd23; // fd37 = v2 + 1.85 * (fd24*(1 + f70) - v2)
- cvt.rn.f32.f64 %f71, %fd37; // f71 = new value for output slot 2
- .loc 3 100 0
- cvt.f64.f32 %fd38, %f7;
- mul.f32 %f72, %f42, %f50;
- sub.f32 %f73, %f45, %f72;
- cvt.f64.f32 %fd39, %f73;
- mov.f64 %fd40, 0d3fe0000000000000; // 0.5
- mul.f64 %fd41, %fd2, %fd40;
- sub.f64 %fd42, %fd39, %fd41;
- mov.f64 %fd43, 0d4008000000000000; // 3
- mul.f64 %fd44, %fd42, %fd43;
- cvt.rn.f32.f64 %f74, %fd44;
- div.full.f32 %f75, %f74, %f51;
- cvt.f64.f32 %fd45, %f75;
- mov.f64 %fd46, 0d3ff0000000000000; // 1
- add.f64 %fd47, %fd45, %fd46;
- mul.f64 %fd48, %fd24, %fd47;
- sub.f64 %fd49, %fd48, %fd38;
- mov.f64 %fd50, 0d3ffd9999a0000000; // 1.85
- mad.rn.f64 %fd51, %fd49, %fd50, %fd38; // fd51 = v3 + 1.85 * (fd24*(1 + f75) - v3)
- cvt.rn.f32.f64 %f76, %fd51; // f76 = new value for output slot 3
- add.f32 %f77, %f56, %f64; // start of the running sum of the nine new values (finishes in f112)
- add.f32 %f78, %f71, %f77;
- add.f32 %f79, %f76, %f78;
- .loc 3 101 0
- cvt.f64.f32 %fd52, %f9;
- sub.f32 %f80, %f46, %f65;
- cvt.f64.f32 %fd53, %f80;
- mov.f64 %fd54, 0d3fe0000000000000; // 0.5
- mul.f64 %fd55, %fd1, %fd54;
- sub.f64 %fd56, %fd53, %fd55;
- mov.f64 %fd57, 0d4008000000000000; // 3
- mul.f64 %fd58, %fd56, %fd57;
- cvt.rn.f32.f64 %f81, %fd58;
- div.full.f32 %f82, %f81, %f51;
- cvt.f64.f32 %fd59, %f82;
- mov.f64 %fd60, 0d3ff0000000000000; // 1
- add.f64 %fd61, %fd59, %fd60;
- mul.f64 %fd62, %fd24, %fd61;
- sub.f64 %fd63, %fd62, %fd52;
- mov.f64 %fd64, 0d3ffd9999a0000000; // 1.85
- mad.rn.f64 %fd65, %fd63, %fd64, %fd52; // fd65 = v4 + 1.85 * (fd24*(1 + f82) - v4)
- cvt.rn.f32.f64 %f83, %fd65; // f83 = new value for output slot 4
- add.f32 %f84, %f79, %f83;
- .loc 18 1319 0
- add.f32 %f85, %f42, %f43;
- mul.f32 %f86, %f42, %f43;
- cvt.f64.f32 %fd66, %f86;
- mov.f64 %fd67, 0d4008000000000000; // 3
- mul.f64 %fd68, %fd12, %fd67; // fd68 = 3 * f50; reused by the slot 5-8 terms
- add.f32 %f87, %f45, %f46;
- cvt.f64.f32 %fd69, %f87;
- mov.f64 %fd70, 0d4008000000000000; // 3
- mad.rn.f64 %fd71, %fd66, %fd70, %fd69;
- mov.f64 %fd72, 0d4008000000000000; // 3
- mul.f64 %fd73, %fd71, %fd72;
- cvt.f64.f32 %fd74, %f85;
- mad.rn.f64 %fd75, %fd74, %fd68, %fd73;
- cvt.rn.f32.f64 %f88, %fd75;
- div.full.f32 %f89, %f88, %f51;
- cvt.f64.f32 %fd76, %f89;
- .loc 3 102 0
- cvt.f64.f32 %fd77, %f11;
- mov.f32 %f90, 0f3ce38e39; // 0.0277778
- mul.f32 %f91, %f50, %f90;
- cvt.f64.f32 %fd78, %f91; // fd78 = 0.0277778 * f50: shared weight for slots 5, 6, 7 and 8
- mov.f64 %fd79, 0d3ff0000000000000; // 1
- add.f64 %fd80, %fd76, %fd79;
- mul.f64 %fd81, %fd78, %fd80;
- sub.f64 %fd82, %fd81, %fd77;
- mov.f64 %fd83, 0d3ffd9999a0000000; // 1.85
- mad.rn.f64 %fd84, %fd82, %fd83, %fd77; // fd84 = v5 + 1.85 * (fd78*(1 + f89) - v5)
- cvt.rn.f32.f64 %f92, %fd84;
- mov.f32 %f93, %f92; // f93 = new value for output slot 5
- add.f32 %f94, %f92, %f84;
- .loc 18 1319 0
- mov.f64 %fd85, 0d4022000000000000; // 9
- mul.f64 %fd86, %fd66, %fd85;
- mov.f64 %fd87, 0d4008000000000000; // 3
- mul.f64 %fd88, %fd1, %fd87;
- sub.f64 %fd89, %fd88, %fd86;
- mov.f64 %fd90, 0d4008000000000000; // 3
- mad.rn.f64 %fd91, %fd2, %fd90, %fd89; // fd91 = 3*f45 - 9*f42*f43 + 3*f46; shared by the slot 6 and slot 8 terms
- sub.f32 %f95, %f43, %f42;
- cvt.f64.f32 %fd92, %f95;
- mad.rn.f64 %fd93, %fd92, %fd68, %fd91;
- cvt.rn.f32.f64 %f96, %fd93;
- div.full.f32 %f97, %f96, %f51;
- cvt.f64.f32 %fd94, %f97;
- .loc 3 103 0
- cvt.f64.f32 %fd95, %f13;
- mov.f64 %fd96, 0d3ff0000000000000; // 1
- add.f64 %fd97, %fd94, %fd96;
- mul.f64 %fd98, %fd78, %fd97;
- sub.f64 %fd99, %fd98, %fd95;
- mov.f64 %fd100, 0d3ffd9999a0000000; // 1.85
- mad.rn.f64 %fd101, %fd99, %fd100, %fd95; // fd101 = v6 + 1.85 * (fd78*(1 + f97) - v6)
- cvt.rn.f32.f64 %f98, %fd101;
- mov.f32 %f99, %f98; // f99 = new value for output slot 6
- add.f32 %f100, %f98, %f94;
- .loc 18 1319 0
- mov.f64 %fd102, 0d4008000000000000; // 3
- mad.rn.f64 %fd103, %fd1, %fd102, %fd86;
- mov.f64 %fd104, 0d4008000000000000; // 3
- mad.rn.f64 %fd105, %fd2, %fd104, %fd103;
- neg.f32 %f101, %f85;
- cvt.f64.f32 %fd106, %f101;
- mad.rn.f64 %fd107, %fd106, %fd68, %fd105;
- cvt.rn.f32.f64 %f102, %fd107;
- div.full.f32 %f103, %f102, %f51;
- cvt.f64.f32 %fd108, %f103;
- .loc 3 104 0
- cvt.f64.f32 %fd109, %f15;
- mov.f64 %fd110, 0d3ff0000000000000; // 1
- add.f64 %fd111, %fd108, %fd110;
- mul.f64 %fd112, %fd78, %fd111;
- sub.f64 %fd113, %fd112, %fd109;
- mov.f64 %fd114, 0d3ffd9999a0000000; // 1.85
- mad.rn.f64 %fd115, %fd113, %fd114, %fd109; // fd115 = v7 + 1.85 * (fd78*(1 + f103) - v7)
- cvt.rn.f32.f64 %f104, %fd115;
- mov.f32 %f105, %f104; // f105 = new value for output slot 7
- add.f32 %f106, %f104, %f100;
- .loc 18 1319 0
- sub.f32 %f107, %f42, %f43;
- cvt.f64.f32 %fd116, %f107;
- mad.rn.f64 %fd117, %fd116, %fd68, %fd91;
- cvt.rn.f32.f64 %f108, %fd117;
- div.full.f32 %f109, %f108, %f51;
- cvt.f64.f32 %fd118, %f109;
- .loc 3 105 0
- cvt.f64.f32 %fd119, %f17;
- mov.f64 %fd120, 0d3ff0000000000000; // 1
- add.f64 %fd121, %fd118, %fd120;
- mul.f64 %fd122, %fd78, %fd121;
- sub.f64 %fd123, %fd122, %fd119;
- mov.f64 %fd124, 0d3ffd9999a0000000; // 1.85
- mad.rn.f64 %fd125, %fd123, %fd124, %fd119; // fd125 = v8 + 1.85 * (fd78*(1 + f109) - v8)
- cvt.rn.f32.f64 %f110, %fd125;
- mov.f32 %f111, %f110; // f111 = new value for output slot 8
- add.f32 %f112, %f110, %f106; // f112 = sum of all nine new values
- .loc 3 107 0
- add.f32 %f113, %f92, %f64;
- add.f32 %f114, %f110, %f113;
- add.f32 %f115, %f98, %f76;
- add.f32 %f116, %f104, %f115;
- sub.f32 %f117, %f114, %f116;
- div.full.f32 %f118, %f117, %f112; // f118 = ((new5 + new1 + new8) - (new6 + new3 + new7)) / f112: this thread's contribution to the block sum
- bra.uni $Lt_0_8450;
- $Lt_0_8706: // nonzero obstacle flag: copy the inputs to the outputs with slots swapped pairwise (1<->3, 2<->4, 5<->7, 6<->8)
- .loc 3 117 0
- mov.f32 %f57, %f1; // output slot 0 <- v0
- .loc 3 118 0
- mov.f32 %f76, %f3; // output slot 3 <- v1
- .loc 3 119 0
- mov.f32 %f83, %f5; // output slot 4 <- v2
- .loc 3 120 0
- mov.f32 %f64, %f7; // output slot 1 <- v3
- .loc 3 121 0
- mov.f32 %f71, %f9; // output slot 2 <- v4
- .loc 3 122 0
- mov.f32 %f105, %f11; // output slot 7 <- v5
- .loc 3 123 0
- mov.f32 %f111, %f13; // output slot 8 <- v6
- .loc 3 124 0
- mov.f32 %f93, %f15; // output slot 5 <- v7
- .loc 3 125 0
- mov.f32 %f99, %f17; // output slot 6 <- v8
- .loc 3 126 0
- mov.f32 %f118, 0f00000000; // 0
- $Lt_0_8450: // both paths join here with the nine output values and f118 set
- mov.u64 %rd7, __cuda___cuda_local_var_15345_35_non_const_sdata36; // rd7 = base address of the shared scratch array
- .loc 3 133 0
- ld.param.u64 %rd8, [__cudaparm__Z9collisionPK7t_speedPKcPS_Pfi_tmpy_cells];
- add.u64 %rd9, %rd8, %rd2; // output record sits at the same byte offset as the input record
- st.global.f32 [%rd9+0], %f57;
- st.global.f32 [%rd9+4], %f64;
- st.global.f32 [%rd9+8], %f71;
- st.global.f32 [%rd9+12], %f76;
- st.global.f32 [%rd9+16], %f83;
- st.global.f32 [%rd9+20], %f93;
- st.global.f32 [%rd9+24], %f99;
- st.global.f32 [%rd9+28], %f105;
- st.global.f32 [%rd9+32], %f111;
- .loc 3 137 0
- cvt.s32.u16 %r17, %tid.x; // r17 = thread index within the block
- cvt.s64.s32 %rd10, %r17;
- mul.wide.s32 %rd11, %r17, 4;
- add.u64 %rd12, %rd7, %rd11; // rd12 = address of this thread's shared slot (4 bytes per thread)
- st.shared.f32 [%rd12+0], %f118; // NOTE(review): the array holds 32 slots, so this assumes ntid.x <= 32 -- confirm the launch configuration
- .loc 3 141 0
- bar.sync 0; // every slot must be written before any thread reads a neighbour's
- mov.s32 %r18, 16; // r18 = stride, starting at 16 and halved each pass
- $Lt_0_9474:
- //<loop> Loop body line 141, nesting depth: 1, estimated iterations: unknown
- setp.le.s32 %p7, %r18, %r17;
- @%p7 bra $Lt_0_9730; // only threads with tid.x < stride add on this pass
- .loc 3 149 0
- ld.shared.f32 %f119, [%rd12+0];
- add.s32 %r19, %r18, %r17;
- cvt.s64.s32 %rd13, %r19;
- mul.wide.s32 %rd14, %r19, 4;
- add.u64 %rd15, %rd7, %rd14;
- ld.shared.f32 %f120, [%rd15+0];
- add.f32 %f121, %f119, %f120;
- st.shared.f32 [%rd12+0], %f121; // slot[tid] += slot[tid + stride]
- $Lt_0_9730:
- .loc 3 151 0
- bar.sync 0; // reached by all threads, including those that skipped the add
- .loc 3 143 0
- shr.s32 %r20, %r18, 31;
- mov.s32 %r21, 1;
- and.b32 %r22, %r20, %r21;
- add.s32 %r23, %r22, %r18;
- shr.s32 %r18, %r23, 1; // stride = stride / 2 as signed division (sign bit added before the shift)
- mov.u32 %r24, 0;
- setp.gt.s32 %p8, %r18, %r24;
- @%p8 bra $Lt_0_9474; // repeat while stride > 0
- mov.u32 %r25, 0;
- setp.ne.s32 %p9, %r17, %r25;
- @%p9 bra $Lt_0_10498; // only tid.x == 0 publishes the block result
- .loc 3 154 0
- ld.shared.f32 %f122, [__cuda___cuda_local_var_15345_35_non_const_sdata36+0]; // slot 0 now holds the sum over the first 32 slots
- ld.param.u64 %rd16, [__cudaparm__Z9collisionPK7t_speedPKcPS_Pfi_av_vels];
- ld.param.s32 %r26, [__cudaparm__Z9collisionPK7t_speedPKcPS_Pfi_ii];
- mul.lo.s32 %r27, %r26, 1875; // NOTE(review): 1875 is presumably the number of blocks in the grid -- confirm against the launch
- add.u32 %r28, %r1, %r27; // element index = ctaid.x + ii * 1875
- cvt.u64.u32 %rd17, %r28;
- mul.wide.u32 %rd18, %r28, 4;
- add.u64 %rd19, %rd16, %rd18;
- st.global.f32 [%rd19+0], %f122;
- $Lt_0_10498:
- .loc 3 156 0
- exit;
- $LDWend__Z9collisionPK7t_speedPKcPS_Pfi:
- } // _Z9collisionPK7t_speedPKcPS_Pfi
- .entry _Z9propagateP7t_speedPKS_ ( // propagate(t_speed*, const t_speed*): each thread reads one nine-value record from tmpy_cells and scatters each value into the same slot of a neighbouring record in cells, wrapping on a 300 x 200 index grid
- .param .u64 __cudaparm__Z9propagateP7t_speedPKS__cells, // cells: base address of the destination records (written via st.global)
- .param .u64 __cudaparm__Z9propagateP7t_speedPKS__tmpy_cells) // tmpy_cells: base address of the source records (read via ld.global)
- {
- .reg .u16 %rh<4>;
- .reg .u32 %r<52>;
- .reg .u64 %rd<32>;
- .reg .f32 %f<15>;
- .reg .pred %p<7>;
- .loc 3 161 0
- $LDWbegin__Z9propagateP7t_speedPKS_:
- .loc 3 204 0
- mov.u16 %rh1, %ctaid.x;
- mov.u16 %rh2, %ntid.x;
- mul.wide.u16 %r1, %rh1, %rh2;
- cvt.u32.u16 %r2, %tid.x;
- add.u32 %r3, %r2, %r1; // r3 = ctaid.x * ntid.x + tid.x: this thread's linear index
- cvt.rn.f32.s32 %f1, %r3; // float copy of the index, used for the row computation below
- mov.s32 %r4, 458129845; // multiplier for division by 300: mul.hi (>>32) then >>5
- mov.s32 %r5, 0;
- setp.lt.s32 %p1, %r3, %r5;
- abs.s32 %r6, %r3;
- mul.hi.u32 %r7, %r6, %r4;
- shr.s32 %r8, %r7, 5;
- @%p1 sub.s32 %r8, %r5, %r8; // negate the quotient when the index is negative (signed division)
- mov.s32 %r9, %r8;
- mul.lo.s32 %r10, %r9, 300;
- sub.s32 %r11, %r3, %r10; // r11 = index % 300 (column, called x below)
- mov.f32 %f2, 0f43960000; // 300
- div.full.f32 %f3, %f1, %f2;
- cvt.rmi.f32.f32 %f4, %f3; // round toward minus infinity
- cvt.rzi.s32.f32 %r12, %f4; // r12 = (int) floor(index / 300.0f) (row, called y below)
- mul.lo.s32 %r13, %r12, 300; // r13 = y * 300: start of the current row
- add.s32 %r14, %r11, %r13; // r14 = x + y*300: record index rebuilt from column and row
- cvt.s64.s32 %rd1, %r14;
- mul.wide.s32 %rd2, %r14, 36; // 36-byte records (nine 4-byte values)
- ld.param.u64 %rd3, [__cudaparm__Z9propagateP7t_speedPKS__tmpy_cells];
- add.u64 %rd4, %rd3, %rd2;
- ld.global.f32 %f5, [%rd4+0]; // all nine source values are loaded up front
- ld.global.f32 %f6, [%rd4+4];
- ld.global.f32 %f7, [%rd4+8];
- ld.global.f32 %f8, [%rd4+12];
- ld.global.f32 %f9, [%rd4+16];
- ld.global.f32 %f10, [%rd4+20];
- ld.global.f32 %f11, [%rd4+24];
- ld.global.f32 %f12, [%rd4+28];
- ld.global.f32 %f13, [%rd4+32];
- .loc 3 210 0
- ld.param.u64 %rd5, [__cudaparm__Z9propagateP7t_speedPKS__cells];
- add.u64 %rd6, %rd2, %rd5;
- st.global.f32 [%rd6+0], %f5; // slot 0 stays at (x, y)
- .loc 3 211 0
- add.s32 %r15, %r11, 1;
- mov.s32 %r16, 458129845;
- mov.s32 %r17, 0;
- setp.lt.s32 %p2, %r15, %r17;
- abs.s32 %r18, %r15;
- mul.hi.u32 %r19, %r18, %r16;
- shr.s32 %r20, %r19, 5;
- @%p2 sub.s32 %r20, %r17, %r20;
- mov.s32 %r21, %r20;
- mul.lo.s32 %r22, %r21, 300;
- sub.s32 %r23, %r15, %r22; // r23 = (x + 1) % 300: next column with wrap-around
- add.s32 %r24, %r23, %r13;
- cvt.s64.s32 %rd7, %r24;
- mul.wide.s32 %rd8, %r24, 36;
- add.u64 %rd9, %rd5, %rd8;
- st.global.f32 [%rd9+4], %f6; // slot 1 goes to (x+1, y)
- .loc 3 212 0
- add.s32 %r25, %r12, 1;
- mov.s32 %r26, 1374389535; // multiplier for division by 200: mul.hi (>>32) then >>6
- mov.s32 %r27, 0;
- setp.lt.s32 %p3, %r25, %r27;
- abs.s32 %r28, %r25;
- mul.hi.u32 %r29, %r28, %r26;
- shr.s32 %r30, %r29, 6;
- @%p3 sub.s32 %r30, %r27, %r30;
- mov.s32 %r31, %r30;
- mul.lo.s32 %r32, %r31, 200;
- sub.s32 %r33, %r25, %r32; // r33 = (y + 1) % 200: next row with wrap-around
- mul.lo.s32 %r34, %r33, 300; // r34 = start of the next row
- add.s32 %r35, %r11, %r34;
- cvt.s64.s32 %rd10, %r35;
- mul.wide.s32 %rd11, %r35, 36;
- add.u64 %rd12, %rd5, %rd11;
- st.global.f32 [%rd12+8], %f7; // slot 2 goes to (x, y+1)
- .loc 3 213 0
- mov.s32 %r36, 0;
- setp.ne.s32 %p4, %r11, %r36;
- sub.s32 %r37, %r11, 1;
- mov.s32 %r38, 299;
- selp.s32 %r39, %r37, %r38, %p4; // r39 = (x != 0) ? x - 1 : 299: previous column with wrap-around
- add.s32 %r40, %r39, %r13;
- cvt.s64.s32 %rd13, %r40;
- mul.wide.s32 %rd14, %r40, 36;
- add.u64 %rd15, %rd5, %rd14;
- st.global.f32 [%rd15+12], %f8; // slot 3 goes to (x-1, y)
- .loc 3 214 0
- mov.s32 %r41, 0;
- setp.ne.s32 %p5, %r12, %r41;
- sub.s32 %r42, %r12, 1;
- mov.s32 %r43, 199;
- selp.s32 %r44, %r42, %r43, %p5; // r44 = (y != 0) ? y - 1 : 199: previous row with wrap-around
- mul.lo.s32 %r45, %r44, 300; // r45 = start of the previous row
- add.s32 %r46, %r11, %r45;
- cvt.s64.s32 %rd16, %r46;
- mul.wide.s32 %rd17, %r46, 36;
- add.u64 %rd18, %rd5, %rd17;
- st.global.f32 [%rd18+16], %f9; // slot 4 goes to (x, y-1)
- .loc 3 215 0
- add.s32 %r47, %r23, %r34;
- cvt.s64.s32 %rd19, %r47;
- mul.wide.s32 %rd20, %r47, 36;
- add.u64 %rd21, %rd5, %rd20;
- st.global.f32 [%rd21+20], %f10; // slot 5 goes to (x+1, y+1)
- .loc 3 216 0
- add.s32 %r48, %r39, %r34;
- cvt.s64.s32 %rd22, %r48;
- mul.wide.s32 %rd23, %r48, 36;
- add.u64 %rd24, %rd5, %rd23;
- st.global.f32 [%rd24+24], %f11; // slot 6 goes to (x-1, y+1)
- .loc 3 217 0
- add.s32 %r49, %r39, %r45;
- cvt.s64.s32 %rd25, %r49;
- mul.wide.s32 %rd26, %r49, 36;
- add.u64 %rd27, %rd5, %rd26;
- st.global.f32 [%rd27+28], %f12; // slot 7 goes to (x-1, y-1)
- .loc 3 218 0
- add.s32 %r50, %r23, %r45;
- cvt.s64.s32 %rd28, %r50;
- mul.wide.s32 %rd29, %r50, 36;
- add.u64 %rd30, %rd5, %rd29;
- st.global.f32 [%rd30+32], %f13; // slot 8 goes to (x+1, y-1)
- .loc 3 221 0
- exit;
- $LDWend__Z9propagateP7t_speedPKS_:
- } // _Z9propagateP7t_speedPKS_
Advertisement
Add Comment
Please, Sign In to add comment