SHARE
TWEET

Convert.asm

Maratyszcza Sep 12th, 2011 79 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. ; Processing speed on Core i7 2630QM (2GHz, Single-Channel DDR3-1333 (PC3-10700))
  2. ; * Data in memory: 8400 MB/s (300 MPix/s)
  3. ; * Data in L3: 11300 MB/s (400 MPix/s)
  4. ; * Data in L2: 11500 MB/s (410 MPix/s)
  5. ; Compile: nasm -f win64 -o Convert.obj Convert.asm
  6. ; Use:
  7. ; extern "C" void convert_pixels(const float* source, float* destination, size_t length)
  8. ;
  9. ; int main(int argc, char** argv) {
  10. ;     size_t pixels = 64*1024*1024;
  11. ;     float* inputBuffer = (float*)_aligned_malloc(pixels*sizeof(float)*4, 32);
  12. ;     float* outputBuffer = (float*)_aligned_malloc(pixels*sizeof(float)*3, 32);
  13. ;     convert_pixels(inputBuffer, outputBuffer, pixels);
  14. ; }
  15.  
  16. SECTION .rdata
  17. align 32
  18.         c1c0 dd 0.88, 0.77, 0.66, 0.55, 0.44, 0.33, 0.22, 0.11
  19.         c2c1 dd 1.04, 1.03, 1.02, 1.01, 0.88, 0.77, 0.66, 0.55
  20.         c0c2 dd 0.44, 0.33, 0.22, 0.11, 1.04, 1.03, 1.02, 1.01
  21.  
  22. SECTION .text
  23.  
  24. global convert_pixels
  25.  
  26. ; extern "C" void convert_pixels(const float* source, float* destination, size_t length)
  27. convert_pixels:
  28.         ; rcx - source
  29.         ; rdx - destination
  30.         ; r8 - length
  31.  
  32.         vzeroupper
  33.         vmovaps ymm13, [c0c2]
  34.         vmovaps ymm14, [c1c0]
  35.         vmovaps ymm15, [c2c1]
  36.         align 32
  37. .main_processing_loop:
  38.         prefetchnta [rcx + 1408]
  39.         ; ymm1  = ( i07, i06, i05, i04, i03, i02, i01, i00 )
  40.         vmovaps ymm1, [rcx]
  41.         ; ymm12 = ( i15, i14, i13, i12, i11, i10, i09, i08 )
  42.         vmovaps ymm12, [rcx + 32]
  43.         ; ymm10 = ( i23, i22, i21, i20, i19, i18, i17, i16 )
  44.         vmovaps ymm10, [rcx + 64]
  45.         ; ymm11 = ( i31, i30, i29, i28, i27, i26, i25, i24 )
  46.         vmovaps ymm11, [rcx + 96]
  47.        
  48.         ; ymm2 = ( i11, i10, i09, i08, i03, i02, i01, i00 )
  49.         vperm2f128 ymm2, ymm1,  ymm12, 00100000b
  50.         ; ymm3 = ( i11, i10, i09, i08, i07, i06, i05, i04 )
  51.         vperm2f128 ymm3, ymm1,  ymm12, 00100001b
  52.         ; ymm4 = ( i19, i18, i17, i16, i11, i10, i09, i08 )
  53.         vperm2f128 ymm4, ymm12, ymm10, 00100000b
  54.         ; ymm6 = ( i19, i18, i17, i16, i15, i14, i13, i12 )
  55.         vperm2f128 ymm6, ymm12, ymm10, 00100001b
  56.         ; ymm7 = ( i23, i22, i21, i20, i15, i14, i13, i12 )
  57.         vperm2f128 ymm7, ymm12, ymm10, 00110001b
  58.         ; ymm8 = ( i27, i26, i25, i24, i23, i22, i21, i20 )
  59.         vperm2f128 ymm8, ymm10, ymm11, 00100001b
  60.         ; ymm9 = ( i31, i30, i29, i28, i23, i22, i21, i20 )
  61.         vperm2f128 ymm9, ymm10, ymm11, 00110001b
  62.  
  63.         ; ymm0  = ( i07*c13, i06*c12, i05*c11, i04*c10, i03*c03, i02*c02, i01*c01, i00*c00 )
  64.         vmulps ymm0,  ymm1,  ymm14
  65.         ; ymm1  = ( i07*c23, i06*c22, i05*c21, i04*c20, i03*c13, i02*c12, i01*c11, i00*c10 )
  66.         vmulps ymm1,  ymm1,  ymm15
  67.         ; ymm2  = ( i11*c03, i10*c02, i09*c01, i08*c00, i03*c23, i02*c22, i01*c21, i00*c20 )
  68.         vmulps ymm2,  ymm2,  ymm13
  69.         ; ymm3  = ( i11*c13, i10*c12, i09*c11, i08*c10, i07*c03, i06*c02, i05*c01, i04*c00 )
  70.         vmulps ymm3,  ymm3,  ymm14
  71.         ; ymm4  = ( i19*c03, i18*c02, i17*c01, i16*c00, i11*c23, i10*c22, i09*c21, i08*c20 )
  72.         vmulps ymm4,  ymm4,  ymm13
  73.         ; ymm5  = ( i19*c13, i18*c12, i17*c11, i16*c10, i15*c03, i14*c02, i13*c01, i12*c00 )
  74.         vmulps ymm5,  ymm6,  ymm14
  75.         ; ymm6  = ( i19*c23, i18*c22, i17*c21, i16*c20, i15*c13, i14*c12, i13*c11, i12*c10 )
  76.         vmulps ymm6,  ymm6,  ymm15
  77.         ; ymm7  = ( i23*c03, i22*c02, i21*c01, i20*c00, i15*c23, i14*c22, i13*c21, i12*c20 )
  78.         vmulps ymm7,  ymm7,  ymm13
  79.         ; ymm8  = ( i27*c23, i26*c22, i25*c21, i24*c20, i23*c13, i22*c12, i21*c11, i20*c10 )
  80.         vmulps ymm8,  ymm8,  ymm15
  81.         ; ymm9  = ( i31*c03, i30*c02, i29*c01, i28*c00, i23*c23, i22*c22, i21*c21, i20*c20 )
  82.         vmulps ymm9,  ymm9,  ymm13
  83.         ; ymm10 = ( i31*c13, i30*c12, i29*c11, i28*c10, i27*c03, i26*c02, i25*c01, i24*c00 )
  84.         vmulps ymm10, ymm11, ymm14
  85.         ; ymm11 = ( i31*c23, i30*c22, i29*c21, i28*c20, i27*c13, i26*c12, i25*c11, i24*c10 )
  86.         vmulps ymm11, ymm11, ymm15
  87.  
  88.         ; ymm0  = ( i06*c22+i07*c23, i04*c20+i05*c21, i06*c12+i07*c13, i04*c10+i05*c11, i02*c12+i03*c13, i00*c10+i01*c11, i02*c02+i03*c03, i00*c00+i01*c01 )
  89.         vhaddps ymm0,  ymm0,  ymm1
  90.         ; ymm2  = ( i10*c12+i11*c13, i08*c10+i09*c11, i10*c02+i11*c03, i08*c00+i09*c01, i06*c02+i07*c03, i04*c00+i05*c01, i02*c22+i03*c23, i00*c20+i01*c21 )
  91.         vhaddps ymm2,  ymm2,  ymm3
  92.         ; ymm4  = ( i18*c12+i19*c13, i16*c10+i17*c11, i18*c02+i19*c03, i16*c00+i17*c01, i14*c02+i15*c03, i12*c00+i13*c01, i10*c22+i11*c23, i08*c20+i09*c21 )
  93.         vhaddps ymm4,  ymm4,  ymm5
  94.         ; ymm6  = ( i22*c02+i23*c03, i20*c00+i21*c01, i18*c22+i19*c23, i16*c20+i17*c21, i14*c22+i15*c23, i12*c20+i13*c21, i14*c12+i15*c13, i12*c10+i13*c11 )
  95.         vhaddps ymm6,  ymm6,  ymm7
  96.         ; ymm8  = ( i30*c02+i31*c03, i28*c00+i29*c01, i26*c22+i27*c23, i24*c20+i25*c21, i22*c22+i23*c23, i20*c20+i21*c21, i22*c12+i23*c13, i20*c10+i21*c11 )
  97.         vhaddps ymm8,  ymm8,  ymm9
  98.         ; ymm10 = ( i30*c22+i31*c23, i28*c20+i29*c21, i30*c12+i31*c13, i28*c10+i29*c11, i26*c12+i27*c13, i24*c10+i25*c11, i26*c02+i27*c03, i24*c00+i25*c01 )
  99.         vhaddps ymm10, ymm10, ymm11
  100.        
  101.         ; o00 = i00*c00+i01*c01+i02*c02+i03*c03
  102.         ; o01 = i00*c10+i01*c11+i02*c12+i03*c13
  103.         ; o02 = i00*c20+i01*c21+i02*c22+i03*c23
  104.         ; o03 = i04*c00+i05*c01+i06*c02+i07*c03
  105.         ; o04 = i04*c10+i05*c11+i06*c12+i07*c13
  106.         ; o05 = i04*c20+i05*c21+i06*c22+i07*c23
  107.         ; o06 = i08*c00+i09*c01+i10*c02+i11*c03
  108.         ; o07 = i08*c10+i09*c11+i10*c12+i11*c13
  109.         ; o08 = i08*c20+i09*c21+i10*c22+i11*c23
  110.         ; o09 = i12*c00+i13*c01+i14*c02+i15*c03
  111.         ; o10 = i12*c10+i13*c11+i14*c12+i15*c13
  112.         ; o11 = i12*c20+i13*c21+i14*c22+i15*c23
  113.         ; o12 = i16*c00+i17*c01+i18*c02+i19*c03
  114.         ; o13 = i16*c10+i17*c11+i18*c12+i19*c13
  115.         ; o14 = i16*c20+i17*c21+i18*c22+i19*c23
  116.         ; o15 = i20*c00+i21*c01+i22*c02+i23*c03
  117.         ; o16 = i20*c10+i21*c11+i22*c12+i23*c13
  118.         ; o17 = i20*c20+i21*c21+i22*c22+i23*c23
  119.         ; o18 = i24*c00+i25*c01+i26*c02+i27*c03
  120.         ; o19 = i24*c10+i25*c11+i26*c12+i27*c13
  121.         ; o20 = i24*c20+i25*c21+i26*c22+i27*c23
  122.         ; o21 = i28*c00+i29*c01+i30*c02+i31*c03
  123.         ; o22 = i28*c10+i29*c11+i30*c12+i31*c13
  124.         ; o23 = i28*c20+i29*c21+i30*c22+i31*c23
  125.        
  126.         ; ymm0  = ( i08*c10+i09*c11+i10*c12+i11*c13, i08*c00+i09*c01+i10*c02+i11*c03, i04*c20+i05*c21+i06*c22+i07*c23, i04*c10+i05*c11+i06*c12+i07*c13,
  127.         ;           i04*c00+i05*c01+i06*c02+i07*c03, i00*c20+i01*c21+i02*c22+i03*c23, i00*c10+i01*c11+i02*c12+i03*c13, i00*c00+i01*c01+i02*c02+i03*c03 )
  128.         ; ymm0  = ( o07, o06, o05, o04, o03, o02, o01, o00 )
  129.         vhaddps ymm0,  ymm0,  ymm2
  130.         ; ymm4  = ( i20*c00+i21*c01+i22*c02+i23*c03, i16*c20+i17*c21+i18*c22+i19*c23, i16*c10+i17*c11+i18*c12+i19*c13, i16*c00+i17*c01+i18*c02+i19*c03,
  131.         ;           i12*c20+i13*c21+i14*c22+i15*c23, i12*c10+i13*c11+i14*c12+i15*c13, i12*c00+i13*c01+i14*c02+i15*c03, i08*c20+i09*c21+i10*c22+i11*c23 )
  132.         ; ymm0  = ( o15, o14, o13, o12, o11, o10, o09, o08 )
  133.         vhaddps ymm4,  ymm4,  ymm6
  134.         ; ymm8  = ( i28*c20+i29*c21+i30*c22+i31*c23, i28*c10+i29*c11+i30*c12+i31*c13, i28*c00+i29*c01+i30*c02+i31*c03, i24*c20+i25*c21+i26*c22+i27*c23,
  135.         ;           i24*c10+i25*c11+i26*c12+i27*c13, i24*c00+i25*c01+i26*c02+i27*c03, i20*c20+i21*c21+i22*c22+i23*c23, i20*c10+i21*c11+i22*c12+i23*c13 )
  136.         ; ymm8  = ( o23, o22, o21, o20, o19, o18, o17, o16 )
  137.         vhaddps ymm8,  ymm8,  ymm10
  138.  
  139.  
  140.         vmovntps [rdx], ymm0
  141.         vmovntps [rdx + 32], ymm4
  142.         vmovntps [rdx + 64], ymm8
  143.         sub rcx, -128
  144.         add rdx, 96
  145.         sub r8, 8
  146.         jnz .main_processing_loop
  147.         vzeroupper
  148.  
  149.         ret
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Top