Don't like ads? PRO users don't see any ads ;-)
Guest

Untitled

By: a guest on May 5th, 2012  |  syntax: None  |  size: 2.49 KB  |  hits: 13  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. How to force a 4-component vector variable (e.g. ushort4) to be kept in registers?
  2. __global__ void func()  // Question's example: builds a ushort4 from four float->half conversions; the value is never stored.
  3. {  // `register` below is only a storage-class hint; the compiler decides the actual placement.
  4.     register ushort4 result = make_ushort4(__float2half_rn(0.5f), __float2half_rn(0.5f), __float2half_rn(0.5f), __float2half_rn(1.0f));  // float literals: no double constants passed to the float intrinsic
  5. }
  6.        
  7. __global__ void func(ushort4 *out)  // Writes one ushort4 of half-precision bit patterns (0.5, 0.5, 0.5, 1.0) per thread into out.
  8. {
  9.     ushort4 result = make_ushort4(__float2half_rn(0.5f), __float2half_rn(0.5f),  // float literals: no double constants passed to the float intrinsic
  10.             __float2half_rn(0.5f), __float2half_rn(1.0f));
  11.  
  12.     out[threadIdx.x+blockDim.x*blockIdx.x] = result;  // flat 1D global index; no bounds guard, so out must hold at least gridDim.x*blockDim.x elements
  13. }
  14.        
  15. >nvcc -cubin -arch=sm_20 -Xptxas="-v" ushort4.cu
  16. ushort4.cu
  17. ushort4.cu
  18. tmpxft_000010b8_00000000-3_ushort4.cudafe1.gpu
  19. tmpxft_000010b8_00000000-10_ushort4.cudafe2.gpu
  20. ptxas info    : Compiling entry function '_Z4funcP7ushort4' for 'sm_20'
  21. ptxas info    : Function properties for _Z4funcP7ushort4
  22.     0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
  23. ptxas info    : Used 8 registers, 36 bytes cmem[0]
  24.        
  25. >cuobjdump --dump-sass ushort4.cubin
  26.  
  27.         code for sm_20
  28.                 Function : _Z4funcP7ushort4
  29.         /*0000*/     /*0x00005de428004404*/     MOV R1, c [0x1] [0x100];
  30.         /*0008*/     /*0x01101c041000cfc0*/     F2F.F16.F32 R0, 0x3f000;
  31.         /*0010*/     /*0x94009c042c000000*/     S2R R2, SR_CTAid_X;
  32.         /*0018*/     /*0x8400dc042c000000*/     S2R R3, SR_Tid_X;
  33.         /*0020*/     /*0x01111c041000cfe0*/     F2F.F16.F32 R4, 0x3f800;
  34.         /*0028*/     /*0x00915c041c000000*/     I2I.U16.U16 R5, R0;
  35.         /*0030*/     /*0x20209c0320064000*/     IMAD.U32.U32 R2, R2, c [0x0] [0x8], R3;
  36.         /*0038*/     /*0x40019c03280ac040*/     BFI R6, R0, 0x1010, R5;
  37.         /*0040*/     /*0x4041dc03280ac040*/     BFI R7, R4, 0x1010, R5;
  38.         /*0048*/     /*0x80201c6340004000*/     ISCADD R0, R2, c [0x0] [0x20], 0x3;
  39.         /*0050*/     /*0x00019ca590000000*/     ST.64 [R0], R6;
  40.         /*0058*/     /*0x00001de780000000*/     EXIT;
  41.                 .................................
  42.        
  43. >nvcc --version
  44. nvcc: NVIDIA (R) Cuda compiler driver
  45. Copyright (c) 2005-2011 NVIDIA Corporation
  46. Built on Fri_May_13_02:42:40_PDT_2011
  47. Cuda compilation tools, release 4.0, V0.2.1221
  48.  
  49. >cl.exe
  50. Microsoft (R) 32-bit C/C++ Optimizing Compiler Version 15.00.30729.01 for 80x86
  51. Copyright (C) Microsoft Corporation.  All rights reserved.
  52.  
  53. usage: cl [ option... ] filename... [ /link linkoption... ]
  54.  
  55.  
  56. >nvcc -cubin -arch=sm_11 -Xptxas=-v ushort4.cu
  57. ushort4.cu
  58. ushort4.cu
  59. tmpxft_00001788_00000000-3_ushort4.cudafe1.gpu
  60. tmpxft_00001788_00000000-10_ushort4.cudafe2.gpu
  61. ptxas info    : Compiling entry function '_Z4funcP7ushort4' for 'sm_11'
  62. ptxas info    : Used 4 registers, 4+16 bytes smem