Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
;-----------------------------------------------------------------------
; Convert.asm -- AVX pixel conversion kernel
; Syntax: NASM (Intel operand order).  Target: x86-64 with AVX.
; ABI:    Microsoft x64 (Windows).
;
; Processing speed on Core i7 2630QM (2GHz, Single-Channel DDR3-1333 (PC3-10700))
; (figures as reported for the original version of this routine)
; * Data in memory: 8700 MB/s (270 MPix/s)
; * Data in L3: 17700 MB/s (550 MPix/s)
; * Data in L2: 19000 MB/s (590 MPix/s)
; Compile: nasm -f win64 -o Convert.obj Convert.asm
; Use:
; extern "C" void convert_pixels(const float* source, float* destination, size_t length)
;
; int main(int argc, char** argv) {
;     size_t pixels = 64*1024*1024;
;     // Both buffers must be 32-byte aligned: the kernel uses aligned
;     // 256-bit loads (vmovaps) and aligned streaming stores (vmovntps).
;     float* inputBuffer  = (float*)_aligned_malloc(pixels*sizeof(float)*4, 32);
;     float* outputBuffer = (float*)_aligned_malloc(pixels*sizeof(float)*4, 32);
;     convert_pixels(inputBuffer, outputBuffer, pixels);
;     _aligned_free(inputBuffer);
;     _aligned_free(outputBuffer);
; }
;-----------------------------------------------------------------------

default rel                             ; RIP-relative access to c0/c1/c2 (no ADDR32 relocations)

PIXEL_BYTES         equ 16              ; one pixel = 4 packed single-precision floats
PIXELS_PER_BLOCK    equ 8               ; main loop: 4 pixels from each half per iteration
HALF_STEP_BYTES     equ 64              ; bytes consumed from each half per iteration
PREFETCH_DISTANCE   equ 1408            ; bytes ahead of the read cursor to prefetch
XMM_SAVE_BYTES      equ 10*16           ; xmm6..xmm15
FRAME_BYTES         equ XMM_SAVE_BYTES + 8  ; +8: entry rsp is 8 mod 16 -> frame is 16-aligned

; Conversion matrix, one row per output component.  Each row is stored
; twice so that a single ymm register covers two pixels.
SECTION .rdata align=32
        align   32
c0:     dd      0.44, 0.33, 0.22, 0.11, 0.44, 0.33, 0.22, 0.11
c1:     dd      0.88, 0.77, 0.66, 0.55, 0.88, 0.77, 0.66, 0.55
c2:     dd      1.04, 1.03, 1.02, 1.01, 1.04, 1.03, 1.02, 1.01

SECTION .text align=32
        global  convert_pixels

;-----------------------------------------------------------------------
; extern "C" void convert_pixels(const float* source, float* destination, size_t length)
;
; For every 4-float input pixel p writes the output pixel
;     ( dot(p, c0), dot(p, c1), dot(p, c2), 0.0 )
; where each dot product is evaluated as (p0*k0 + p1*k1) + (p2*k2 + p3*k3).
;
; ABI:   Microsoft x64
; In:    rcx = source       (32-byte aligned, length*16 bytes readable)
;        rdx = destination  (32-byte aligned, length*16 bytes writable)
;        r8  = length       (number of pixels; any value, including 0)
; Out:   none
; Clobb: rax, rcx, rdx, r8, r9, ymm0-ymm5, flags.
;        xmm6-xmm15 are callee-saved on Win64 and are saved/restored here
;        (their upper ymm halves are volatile and are left zeroed).
; Stack: FRAME_BYTES below the return address.
; Notes: * The first (length/8)*8 pixels are processed as two equal halves
;          walked in parallel (two read streams, two write streams); the
;          remaining length%8 pixels are handled one at a time.
;        * Output is written with non-temporal stores; an sfence is issued
;          before returning so the data is ordered ahead of later stores.
;        * No SEH unwind data is emitted for the stack frame, so a fault
;          raised inside this routine cannot be unwound through it.
;-----------------------------------------------------------------------
convert_pixels:
        vzeroupper

        ; --- prologue: preserve callee-saved xmm6..xmm15 -----------------
        sub     rsp, FRAME_BYTES
        vmovaps [rsp + 0*16], xmm6
        vmovaps [rsp + 1*16], xmm7
        vmovaps [rsp + 2*16], xmm8
        vmovaps [rsp + 3*16], xmm9
        vmovaps [rsp + 4*16], xmm10
        vmovaps [rsp + 5*16], xmm11
        vmovaps [rsp + 6*16], xmm12
        vmovaps [rsp + 7*16], xmm13
        vmovaps [rsp + 8*16], xmm14
        vmovaps [rsp + 9*16], xmm15

        ; --- loop-invariant registers ------------------------------------
        vmovaps ymm12, [c0]             ; row 0 of the matrix (x2 pixels)
        vmovaps ymm13, [c1]             ; row 1
        vmovaps ymm14, [c2]             ; row 2
        vxorps  ymm15, ymm15, ymm15     ; zeros: pads the 4th output component

        mov     r9, r8
        shr     r9, 3                   ; r9 = number of 8-pixel blocks
        and     r8d, PIXELS_PER_BLOCK-1 ; r8 = leftover pixels (0..7)
        mov     rax, r9
        shl     rax, 6                  ; rax = r9 * HALF_STEP_BYTES = byte offset of 2nd half
        test    r9, r9
        jz      .tail                   ; fewer than 8 pixels: nothing for the wide loop

        ; Invariant: r9 > 0 blocks remain; rcx/rdx point into the first
        ; half, rcx+rax / rdx+rax at the matching spot in the second half.
        align   32
.main_processing_loop:
        prefetchnta [rcx + PREFETCH_DISTANCE]
        prefetchnta [rcx + rax + PREFETCH_DISTANCE]

        vmovaps ymm2,  [rcx]            ; first half,  pixels 0-1
        vmovaps ymm5,  [rcx + 32]       ; first half,  pixels 2-3
        vmovaps ymm8,  [rcx + rax]      ; second half, pixels 0-1
        vmovaps ymm11, [rcx + rax + 32] ; second half, pixels 2-3

        ; Element-wise products with each matrix row.
        vmulps  ymm0,  ymm2,  ymm12
        vmulps  ymm1,  ymm2,  ymm13
        vmulps  ymm2,  ymm2,  ymm14
        vmulps  ymm3,  ymm5,  ymm12
        vmulps  ymm4,  ymm5,  ymm13
        vmulps  ymm5,  ymm5,  ymm14
        vmulps  ymm6,  ymm8,  ymm12
        vmulps  ymm7,  ymm8,  ymm13
        vmulps  ymm8,  ymm8,  ymm14
        vmulps  ymm9,  ymm11, ymm12
        vmulps  ymm10, ymm11, ymm13
        vmulps  ymm11, ymm11, ymm14

        ; First reduction step: pairwise sums, rows 0/1 packed together,
        ; row 2 packed with zeros.
        vhaddps ymm0,  ymm0,  ymm1
        vhaddps ymm2,  ymm2,  ymm15
        vhaddps ymm3,  ymm3,  ymm4
        vhaddps ymm5,  ymm5,  ymm15
        vhaddps ymm6,  ymm6,  ymm7
        vhaddps ymm8,  ymm8,  ymm15
        vhaddps ymm9,  ymm9,  ymm10
        vhaddps ymm11, ymm11, ymm15

        ; Second reduction step: per pixel (dot0, dot1, dot2, 0).
        vhaddps ymm0,  ymm0,  ymm2
        vhaddps ymm3,  ymm3,  ymm5
        vhaddps ymm6,  ymm6,  ymm8
        vhaddps ymm9,  ymm9,  ymm11

        vmovntps [rdx],            ymm0
        vmovntps [rdx + 32],       ymm3
        vmovntps [rdx + rax],      ymm6
        vmovntps [rdx + rax + 32], ymm9

        add     rcx, HALF_STEP_BYTES
        add     rdx, HALF_STEP_BYTES
        sub     r9, 1
        jnz     .main_processing_loop

.tail:
        test    r8, r8
        jz      .done
        ; rcx/rdx have advanced by rax bytes (start of the second half);
        ; the leftover pixels begin right after the second half.
        add     rcx, rax
        add     rdx, rax

        ; Invariant: r8 > 0 pixels remain; same arithmetic as above on a
        ; single pixel using the low 128 bits of the constant registers.
.tail_loop:
        vmovaps xmm2, [rcx]
        vmulps  xmm0, xmm2, xmm12
        vmulps  xmm1, xmm2, xmm13
        vmulps  xmm2, xmm2, xmm14
        vhaddps xmm0, xmm0, xmm1
        vhaddps xmm2, xmm2, xmm15
        vhaddps xmm0, xmm0, xmm2
        vmovntps [rdx], xmm0
        add     rcx, PIXEL_BYTES
        add     rdx, PIXEL_BYTES
        sub     r8, 1
        jnz     .tail_loop

.done:
        sfence                          ; order the non-temporal stores before later stores

        ; --- epilogue: restore callee-saved xmm6..xmm15 ------------------
        vmovaps xmm6,  [rsp + 0*16]
        vmovaps xmm7,  [rsp + 1*16]
        vmovaps xmm8,  [rsp + 2*16]
        vmovaps xmm9,  [rsp + 3*16]
        vmovaps xmm10, [rsp + 4*16]
        vmovaps xmm11, [rsp + 5*16]
        vmovaps xmm12, [rsp + 6*16]
        vmovaps xmm13, [rsp + 7*16]
        vmovaps xmm14, [rsp + 8*16]
        vmovaps xmm15, [rsp + 9*16]
        add     rsp, FRAME_BYTES

        vzeroupper                      ; avoid SSE/AVX transition penalties in the caller
        ret
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement