- How can I force a 4-component vector variable (such as ushort4) to be kept in registers?
- __global__ void func() // Kernel from the question: builds a ushort4 from four __float2half_rn results and does nothing else with it.
- {
- register ushort4 result = make_ushort4(__float2half_rn(0.5), __float2half_rn(0.5), __float2half_rn(0.5), __float2half_rn(1.0)); // `result` is never read or stored anywhere in this kernel. NOTE(review): `register` is presumably only a hint to the compiler - confirm against the nvcc documentation.
- }
- // Writes one constant ushort4 per thread into `out`.
- // The four components are the results of __float2half_rn for 0.5f, 0.5f, 0.5f and 1.0f.
- // The element index is built only from the x components of threadIdx, blockDim and
- // blockIdx, and there is no bounds check, so `out` needs one element for every
- // thread that is launched.
- __global__ void func(ushort4 *out)
- {
- // Float literals (0.5f rather than 0.5) because __float2half_rn takes a float argument.
- ushort4 result = make_ushort4(__float2half_rn(0.5f), __float2half_rn(0.5f),
- __float2half_rn(0.5f), __float2half_rn(1.0f));
- out[threadIdx.x+blockDim.x*blockIdx.x] = result;
- }
- >nvcc -cubin -arch=sm_20 -Xptxas="-v" ushort4.cu
- ushort4.cu
- ushort4.cu
- tmpxft_000010b8_00000000-3_ushort4.cudafe1.gpu
- tmpxft_000010b8_00000000-10_ushort4.cudafe2.gpu
- ptxas info : Compiling entry function '_Z4funcP7ushort4' for 'sm_20'
- ptxas info : Function properties for _Z4funcP7ushort4
- 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
- ptxas info : Used 8 registers, 36 bytes cmem[0]
- >cuobjdump --dump-sass ushort4.cubin
- code for sm_20
- Function : _Z4funcP7ushort4
- /*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
- /*0008*/ /*0x01101c041000cfc0*/ F2F.F16.F32 R0, 0x3f000;
- /*0010*/ /*0x94009c042c000000*/ S2R R2, SR_CTAid_X;
- /*0018*/ /*0x8400dc042c000000*/ S2R R3, SR_Tid_X;
- /*0020*/ /*0x01111c041000cfe0*/ F2F.F16.F32 R4, 0x3f800;
- /*0028*/ /*0x00915c041c000000*/ I2I.U16.U16 R5, R0;
- /*0030*/ /*0x20209c0320064000*/ IMAD.U32.U32 R2, R2, c [0x0] [0x8], R3;
- /*0038*/ /*0x40019c03280ac040*/ BFI R6, R0, 0x1010, R5;
- /*0040*/ /*0x4041dc03280ac040*/ BFI R7, R4, 0x1010, R5;
- /*0048*/ /*0x80201c6340004000*/ ISCADD R0, R2, c [0x0] [0x20], 0x3;
- /*0050*/ /*0x00019ca590000000*/ ST.64 [R0], R6;
- /*0058*/ /*0x00001de780000000*/ EXIT;
- .................................
- >nvcc --version
- nvcc: NVIDIA (R) Cuda compiler driver
- Copyright (c) 2005-2011 NVIDIA Corporation
- Built on Fri_May_13_02:42:40_PDT_2011
- Cuda compilation tools, release 4.0, V0.2.1221
- >cl.exe
- Microsoft (R) 32-bit C/C++ Optimizing Compiler Version 15.00.30729.01 for 80x86
- Copyright (C) Microsoft Corporation. All rights reserved.
- usage: cl [ option... ] filename... [ /link linkoption... ]
- >nvcc -cubin -arch=sm_11 -Xptxas=-v ushort4.cu
- ushort4.cu
- ushort4.cu
- tmpxft_00001788_00000000-3_ushort4.cudafe1.gpu
- tmpxft_00001788_00000000-10_ushort4.cudafe2.gpu
- ptxas info : Compiling entry function '_Z4funcP7ushort4' for 'sm_11'
- ptxas info : Used 4 registers, 4+16 bytes smem