Advertisement
Maratyszcza

Convert.asm (4-component output)

Sep 13th, 2011
268
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  ; Convert.asm -- 4-component output.
  ;
  ; Syntax: NASM (Intel operand order).  Target: x86-64 with AVX.
  ; ABI:    Microsoft x64 (rcx, rdx, r8 carry the three arguments).
  ; Build:  nasm -f win64 -o Convert.obj Convert.asm
  ;
  ; Processing speed reported by the original author on a Core i7 2630QM
  ; (2 GHz, single-channel DDR3-1333 (PC3-10700)):
  ;   * Data in memory:  8700 MB/s (270 MPix/s)
  ;   * Data in L3:     17700 MB/s (550 MPix/s)
  ;   * Data in L2:     19000 MB/s (590 MPix/s)
  ;
  ; Use (both buffers must be 32-byte aligned; plain malloc is not enough):
  ;   extern "C" void convert_pixels(const float* source, float* destination, size_t length);
  ;
  ;   int main(int argc, char** argv) {
  ;       size_t pixels = 64*1024*1024;
  ;       float* inputBuffer  = (float*)_aligned_malloc(pixels*sizeof(float)*4, 32);
  ;       float* outputBuffer = (float*)_aligned_malloc(pixels*sizeof(float)*4, 32);
  ;       convert_pixels(inputBuffer, outputBuffer, pixels);
  ;   }

  default rel                             ; RIP-relative data access (no ADDR32 relocations)

  PIXEL_BYTES        equ 16               ; one pixel = 4 packed single-precision floats
  BLOCK_BYTES        equ 64               ; bytes consumed per stream per main-loop pass (4 pixels)
  PREFETCH_DISTANCE  equ 1408             ; how far ahead of the read pointer to prefetch, in bytes
  XMM_SAVE_COUNT     equ 10               ; xmm6..xmm15 are callee-saved on Win64
  FRAME_BYTES        equ XMM_SAVE_COUNT*16 + 8   ; +8 re-aligns rsp to 16 (it is 8 mod 16 on entry)

  SECTION .rdata align=32
  align 32
          ; Each row holds one set of four per-component weights, stored twice so
          ; that both 128-bit lanes of a ymm register see the same coefficients.
  c0:     dd 0.44, 0.33, 0.22, 0.11, 0.44, 0.33, 0.22, 0.11
  c1:     dd 0.88, 0.77, 0.66, 0.55, 0.88, 0.77, 0.66, 0.55
  c2:     dd 1.04, 1.03, 1.02, 1.01, 1.04, 1.03, 1.02, 1.01

  SECTION .text

  global convert_pixels

  ;-----------------------------------------------------------------------
  ; extern "C" void convert_pixels(const float* source, float* destination, size_t length)
  ;
  ; For every input pixel p = (p0, p1, p2, p3) writes the output pixel
  ;     ( dot(p, c0), dot(p, c1), dot(p, c2), 0.0 )
  ;
  ; In:    rcx = source       (length pixels, 32-byte aligned)
  ;        rdx = destination  (length pixels, 32-byte aligned)
  ;        r8  = length       (pixel count; any value, including 0)
  ; Out:   nothing
  ; Clobb: rax, rcx, rdx, r8, r9, ymm0-ymm5, flags.
  ;        xmm6-xmm15 are saved and restored (their upper ymm halves, which
  ;        the ABI treats as volatile, are left zeroed).
  ; Stack: FRAME_BYTES of local storage; no calls are made.
  ;        NOTE: no unwind data (.pdata/.xdata) is emitted for this frame.
  ;
  ; The bulk of the buffer is split into two equal streams that are walked in
  ; lock-step, 4 pixels per stream per pass. Pixels left over when length is
  ; not a multiple of 8 are converted one at a time afterwards.
  ; Output is written with non-temporal stores.
  ;-----------------------------------------------------------------------
  convert_pixels:
          sub     rsp, FRAME_BYTES
          vzeroupper

          ; Preserve the callee-saved vector registers this routine overwrites.
          vmovaps [rsp + 0*16], xmm6
          vmovaps [rsp + 1*16], xmm7
          vmovaps [rsp + 2*16], xmm8
          vmovaps [rsp + 3*16], xmm9
          vmovaps [rsp + 4*16], xmm10
          vmovaps [rsp + 5*16], xmm11
          vmovaps [rsp + 6*16], xmm12
          vmovaps [rsp + 7*16], xmm13
          vmovaps [rsp + 8*16], xmm14
          vmovaps [rsp + 9*16], xmm15

          vmovaps ymm12, [c0]
          vmovaps ymm13, [c1]
          vmovaps ymm14, [c2]
          vxorps  ymm15, ymm15, ymm15     ; zero source for the 4th output component

          mov     r9, r8
          shr     r9, 3                   ; r9  = main-loop passes (8 pixels each)
          and     r8d, 7                  ; r8  = leftover pixels for the tail loop
          mov     rax, r9
          shl     rax, 6                  ; rax = bytes per stream = offset of stream 2
          test    r9, r9
          jz      .tail                   ; fewer than 8 pixels: nothing for the main loop

          align 32
  .main_processing_loop:
          ; Invariant: r9 > 0 passes remain; rcx/rdx point into stream 1,
          ; rcx+rax / rdx+rax at the matching position in stream 2.
          prefetchnta [rcx + PREFETCH_DISTANCE]
          prefetchnta [rcx + rax + PREFETCH_DISTANCE]

          vmovaps ymm2,  [rcx]
          vmovaps ymm5,  [rcx + 32]
          vmovaps ymm8,  [rcx + rax]
          vmovaps ymm11, [rcx + rax + 32]

          ; Per input vector: three element-wise products, one per weight row.
          vmulps  ymm0,  ymm2,  ymm12
          vmulps  ymm1,  ymm2,  ymm13
          vmulps  ymm2,  ymm2,  ymm14

          vmulps  ymm3,  ymm5,  ymm12
          vmulps  ymm4,  ymm5,  ymm13
          vmulps  ymm5,  ymm5,  ymm14

          vmulps  ymm6,  ymm8,  ymm12
          vmulps  ymm7,  ymm8,  ymm13
          vmulps  ymm8,  ymm8,  ymm14

          vmulps  ymm9,  ymm11, ymm12
          vmulps  ymm10, ymm11, ymm13
          vmulps  ymm11, ymm11, ymm14

          ; First reduction level: pairwise sums; the c2 row is paired with zeros.
          vhaddps ymm0,  ymm0,  ymm1
          vhaddps ymm2,  ymm2,  ymm15

          vhaddps ymm3,  ymm3,  ymm4
          vhaddps ymm5,  ymm5,  ymm15

          vhaddps ymm6,  ymm6,  ymm7
          vhaddps ymm8,  ymm8,  ymm15

          vhaddps ymm9,  ymm9,  ymm10
          vhaddps ymm11, ymm11, ymm15

          ; Second level: each lane becomes (dot c0, dot c1, dot c2, 0).
          vhaddps ymm0,  ymm0,  ymm2
          vhaddps ymm3,  ymm3,  ymm5
          vhaddps ymm6,  ymm6,  ymm8
          vhaddps ymm9,  ymm9,  ymm11

          vmovntps [rdx], ymm0
          vmovntps [rdx + 32], ymm3
          vmovntps [rdx + rax], ymm6
          vmovntps [rdx + rax + 32], ymm9

          add     rcx, BLOCK_BYTES
          add     rdx, BLOCK_BYTES
          sub     r9, 1
          jnz     .main_processing_loop

          ; rcx/rdx now sit at the start of stream 2, which is already done;
          ; step over it to reach the first unconverted pixel.
          add     rcx, rax
          add     rdx, rax

  .tail:
          test    r8, r8
          jz      .done

  .tail_loop:
          ; Same arithmetic as the main loop, on one pixel (128 bits) at a time.
          vmovaps xmm2, [rcx]
          vmulps  xmm0, xmm2, xmm12
          vmulps  xmm1, xmm2, xmm13
          vmulps  xmm2, xmm2, xmm14
          vhaddps xmm0, xmm0, xmm1
          vhaddps xmm2, xmm2, xmm15
          vhaddps xmm0, xmm0, xmm2
          vmovntps [rdx], xmm0

          add     rcx, PIXEL_BYTES
          add     rdx, PIXEL_BYTES
          sub     r8, 1
          jnz     .tail_loop

  .done:
          sfence                          ; order the non-temporal stores before returning

          vmovaps xmm6,  [rsp + 0*16]
          vmovaps xmm7,  [rsp + 1*16]
          vmovaps xmm8,  [rsp + 2*16]
          vmovaps xmm9,  [rsp + 3*16]
          vmovaps xmm10, [rsp + 4*16]
          vmovaps xmm11, [rsp + 5*16]
          vmovaps xmm12, [rsp + 6*16]
          vmovaps xmm13, [rsp + 7*16]
          vmovaps xmm14, [rsp + 8*16]
          vmovaps xmm15, [rsp + 9*16]

          add     rsp, FRAME_BYTES
          vzeroupper                      ; leave clean upper halves for SSE code in the caller
          ret
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement