Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
; Processing speed on Core i7 2630QM (2GHz, Single-Channel DDR3-1333 (PC3-10700))
;   * Data in memory: 8400 MB/s (300 MPix/s)
;   * Data in L3: 11300 MB/s (400 MPix/s)
;   * Data in L2: 11500 MB/s (410 MPix/s)
; Compile: nasm -f win64 -o Convert.obj Convert.asm
; Use:
;   extern "C" void convert_pixels(const float* source, float* destination, size_t length);
;
;   int main(int argc, char** argv) {
;       size_t pixels = 64*1024*1024;
;       // Both buffers must be 32-byte aligned: the routine accesses them with
;       // vmovaps / vmovntps, which fault on unaligned addresses. Plain malloc()
;       // does not guarantee that alignment.
;       float* inputBuffer = (float*)_aligned_malloc(pixels*sizeof(float)*4, 32);
;       float* outputBuffer = (float*)_aligned_malloc(pixels*sizeof(float)*3, 32);
;       convert_pixels(inputBuffer, outputBuffer, pixels);
;   }
SECTION .rdata align=32

;-----------------------------------------------------------------------
; Coefficient rows of the 3x4 pixel transform applied by convert_pixels:
;
;     out[k] = in[0]*c<k>0 + in[1]*c<k>1 + in[2]*c<k>2 + in[3]*c<k>3,  k = 0..2
;
;     c0 = ( 0.11, 0.22, 0.33, 0.44 )
;     c1 = ( 0.55, 0.66, 0.77, 0.88 )
;     c2 = ( 1.01, 1.02, 1.03, 1.04 )
;
; Each table packs two rows into one 256-bit vector and is named in
; register notation, (high lane)(low lane): c1c0 = ( c1 : c0 ).
;
; `dd` emits values in ascending address order, so the FIRST value of a
; table lands in the LOWEST lane -- the reverse of the ( high, ..., low )
; order used in register comments. The values were previously listed in
; high-to-low order, which swapped the two lanes of every table and made
; the tables disagree with each other (the row paired with c0 in c1c0 was
; not the row paired with c2 in c2c1), so neighbouring pixels of one batch
; were transformed with different coefficients.
;
; All three tables are loaded with vmovaps and must stay 32-byte aligned.
;-----------------------------------------------------------------------
        align   32
;               low 128-bit lane            high 128-bit lane
c1c0:   dd      0.11, 0.22, 0.33, 0.44,     0.55, 0.66, 0.77, 0.88     ; ( c1 : c0 )
c2c1:   dd      0.55, 0.66, 0.77, 0.88,     1.01, 1.02, 1.03, 1.04     ; ( c2 : c1 )
c0c2:   dd      1.01, 1.02, 1.03, 1.04,     0.11, 0.22, 0.33, 0.44     ; ( c0 : c2 )
SECTION .text

global convert_pixels

;-----------------------------------------------------------------------
; extern "C" void convert_pixels(const float* source, float* destination, size_t length)
;
; Converts 4-float input pixels into 3-float output pixels. For pixel p
; and output channel k = 0..2:
;
;     destination[3*p + k] = sum over j = 0..3 of source[4*p + j] * c<k>j
;
; where c0, c1, c2 are the coefficient rows held in the tables
; c1c0 / c2c1 / c0c2 (each expected as ( high lane : low lane )).
;
; ABI:    Microsoft x64
; In:     rcx = source       -- must be 32-byte aligned (read with vmovaps)
;         rdx = destination  -- must be 32-byte aligned (written with vmovntps)
;         r8  = length       -- number of pixels
; Out:    nothing
; Clobb:  rcx, rdx, r8, flags, ymm0-ymm5, upper 128 bits of ymm6-ymm15
; Saved:  xmm6-xmm15 (callee-saved on Win64) are spilled to the stack and
;         restored before returning.
; Stack:  STACK_BYTES below the return address. No .pdata/.xdata unwind
;         records are emitted, so a fault raised inside this routine
;         cannot be unwound through it.
;
; Pixels are processed in batches of 8 (128 bytes in, 96 bytes out).
; Only whole batches are converted: a trailing length % 8 pixels are left
; untouched and length < 8 returns immediately. (The previous loop counted
; down by 8 with `jnz` and therefore ran past both buffers for any length
; that was zero or not a multiple of 8.)
;
; Output is written with non-temporal stores, which are weakly ordered;
; a caller that publishes the buffer to another thread should issue its
; own store fence first.
;
; Notation in comments: pN is input pixel N of the current batch
; (4 floats), ( X : Y ) is a ymm register with X in the high and Y in the
; low 128-bit lane, pN*cK is the element-wise product and pN.cK the dot
; product, i.e. output float 3*N + K.
;-----------------------------------------------------------------------

BATCH_SHIFT         equ 3                       ; log2(pixels per batch)
IN_BATCH_BYTES      equ 8 * 4 * 4               ; 8 pixels * 4 floats = 128
OUT_BATCH_BYTES     equ 8 * 3 * 4               ; 8 pixels * 3 floats = 96
PREFETCH_AHEAD      equ 11 * IN_BATCH_BYTES     ; 1408 bytes = 11 batches ahead
XMM_SAVE_BYTES      equ 10 * 16                 ; xmm6..xmm15
STACK_BYTES         equ XMM_SAVE_BYTES + 8      ; +8 re-aligns rsp to 16 after the call

convert_pixels:
        shr     r8, BATCH_SHIFT                 ; r8 = number of whole 8-pixel batches
        jz      .return                         ; nothing to do; no state touched yet

        ; Preserve the Win64 callee-saved vector registers used below.
        sub     rsp, STACK_BYTES                ; rsp is now 16-byte aligned
        vmovaps [rsp + 0*16], xmm6
        vmovaps [rsp + 1*16], xmm7
        vmovaps [rsp + 2*16], xmm8
        vmovaps [rsp + 3*16], xmm9
        vmovaps [rsp + 4*16], xmm10
        vmovaps [rsp + 5*16], xmm11
        vmovaps [rsp + 6*16], xmm12
        vmovaps [rsp + 7*16], xmm13
        vmovaps [rsp + 8*16], xmm14
        vmovaps [rsp + 9*16], xmm15

        vzeroupper

        ; Loop-invariant coefficients. RIP-relative addressing avoids the
        ; 32-bit absolute relocations that plain [label] produces on win64.
        vmovaps ymm13, [rel c0c2]               ; ymm13 = ( c0 : c2 )
        vmovaps ymm14, [rel c1c0]               ; ymm14 = ( c1 : c0 )
        vmovaps ymm15, [rel c2c1]               ; ymm15 = ( c2 : c1 )

        align   32
.main_processing_loop:
        ; Invariant: r8 = batches remaining (> 0), rcx/rdx -> current batch.
        prefetchnta [rcx + PREFETCH_AHEAD]

        ; ---- load 8 pixels -------------------------------------------
        vmovaps ymm1,  [rcx]                    ; ymm1  = ( p1 : p0 )
        vmovaps ymm12, [rcx + 32]               ; ymm12 = ( p3 : p2 )
        vmovaps ymm10, [rcx + 64]               ; ymm10 = ( p5 : p4 )
        vmovaps ymm11, [rcx + 96]               ; ymm11 = ( p7 : p6 )

        ; ---- regroup lanes so each pixel meets all three rows --------
        vperm2f128 ymm2, ymm1,  ymm12, 00100000b    ; ymm2 = ( p2 : p0 )
        vperm2f128 ymm3, ymm1,  ymm12, 00100001b    ; ymm3 = ( p2 : p1 )
        vperm2f128 ymm4, ymm12, ymm10, 00100000b    ; ymm4 = ( p4 : p2 )
        vperm2f128 ymm6, ymm12, ymm10, 00100001b    ; ymm6 = ( p4 : p3 )
        vperm2f128 ymm7, ymm12, ymm10, 00110001b    ; ymm7 = ( p5 : p3 )
        vperm2f128 ymm8, ymm10, ymm11, 00100001b    ; ymm8 = ( p6 : p5 )
        vperm2f128 ymm9, ymm10, ymm11, 00110001b    ; ymm9 = ( p7 : p5 )

        ; ---- element-wise products -----------------------------------
        vmulps  ymm0,  ymm1,  ymm14             ; ymm0  = ( p1*c1 : p0*c0 )
        vmulps  ymm1,  ymm1,  ymm15             ; ymm1  = ( p1*c2 : p0*c1 )
        vmulps  ymm2,  ymm2,  ymm13             ; ymm2  = ( p2*c0 : p0*c2 )
        vmulps  ymm3,  ymm3,  ymm14             ; ymm3  = ( p2*c1 : p1*c0 )
        vmulps  ymm4,  ymm4,  ymm13             ; ymm4  = ( p4*c0 : p2*c2 )
        vmulps  ymm5,  ymm6,  ymm14             ; ymm5  = ( p4*c1 : p3*c0 )
        vmulps  ymm6,  ymm6,  ymm15             ; ymm6  = ( p4*c2 : p3*c1 )
        vmulps  ymm7,  ymm7,  ymm13             ; ymm7  = ( p5*c0 : p3*c2 )
        vmulps  ymm8,  ymm8,  ymm15             ; ymm8  = ( p6*c2 : p5*c1 )
        vmulps  ymm9,  ymm9,  ymm13             ; ymm9  = ( p7*c0 : p5*c2 )
        vmulps  ymm10, ymm11, ymm14             ; ymm10 = ( p7*c1 : p6*c0 )
        vmulps  ymm11, ymm11, ymm15             ; ymm11 = ( p7*c2 : p6*c1 )

        ; ---- first horizontal add: pairwise partial sums -------------
        ; Per lane, vhaddps d, a, b gives ( b2+b3, b0+b1, a2+a3, a0+a1 ),
        ; so each lane now holds two half-sums of a's product followed by
        ; two half-sums of b's product.
        vhaddps ymm0,  ymm0,  ymm1              ; halves of ( p1*c2, p1*c1 : p0*c1, p0*c0 )
        vhaddps ymm2,  ymm2,  ymm3              ; halves of ( p2*c1, p2*c0 : p1*c0, p0*c2 )
        vhaddps ymm4,  ymm4,  ymm5              ; halves of ( p4*c1, p4*c0 : p3*c0, p2*c2 )
        vhaddps ymm6,  ymm6,  ymm7              ; halves of ( p5*c0, p4*c2 : p3*c2, p3*c1 )
        vhaddps ymm8,  ymm8,  ymm9              ; halves of ( p7*c0, p6*c2 : p5*c2, p5*c1 )
        vhaddps ymm10, ymm10, ymm11             ; halves of ( p7*c2, p7*c1 : p6*c1, p6*c0 )

        ; ---- second horizontal add: complete dot products ------------
        ; ymm0 = ( p2.c1, p2.c0, p1.c2, p1.c1 : p1.c0, p0.c2, p0.c1, p0.c0 )
        ;      = ( o07, o06, o05, o04, o03, o02, o01, o00 )
        vhaddps ymm0, ymm0, ymm2
        ; ymm4 = ( p5.c0, p4.c2, p4.c1, p4.c0 : p3.c2, p3.c1, p3.c0, p2.c2 )
        ;      = ( o15, o14, o13, o12, o11, o10, o09, o08 )
        vhaddps ymm4, ymm4, ymm6
        ; ymm8 = ( p7.c2, p7.c1, p7.c0, p6.c2 : p6.c1, p6.c0, p5.c2, p5.c1 )
        ;      = ( o23, o22, o21, o20, o19, o18, o17, o16 )
        vhaddps ymm8, ymm8, ymm10

        ; ---- store 24 output floats, bypassing the cache -------------
        vmovntps [rdx],      ymm0
        vmovntps [rdx + 32], ymm4
        vmovntps [rdx + 64], ymm8

        sub     rcx, -IN_BATCH_BYTES            ; rcx += 128; -128 fits in imm8, +128 does not
        add     rdx, OUT_BATCH_BYTES
        sub     r8, 1
        jnz     .main_processing_loop

        ; Restore callee-saved registers and release the save area.
        vmovaps xmm6,  [rsp + 0*16]
        vmovaps xmm7,  [rsp + 1*16]
        vmovaps xmm8,  [rsp + 2*16]
        vmovaps xmm9,  [rsp + 3*16]
        vmovaps xmm10, [rsp + 4*16]
        vmovaps xmm11, [rsp + 5*16]
        vmovaps xmm12, [rsp + 6*16]
        vmovaps xmm13, [rsp + 7*16]
        vmovaps xmm14, [rsp + 8*16]
        vmovaps xmm15, [rsp + 9*16]
        add     rsp, STACK_BYTES

        vzeroupper                              ; leave clean upper halves for SSE callers
.return:
        ret
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement