; Convert.asm (4-component output)

; Processing speed on Core i7 2630QM (2GHz, Single-Channel DDR3-1333 (PC3-10700))
; * Data in memory: 8700 MB/s (270 MPix/s)
; * Data in L3: 17700 MB/s (550 MPix/s)
; * Data in L2: 19000 MB/s (590 MPix/s)
; Compile: nasm -f win64 -o Convert.obj Convert.asm
; Use:
; extern "C" void convert_pixels(const float* source, float* destination, size_t length);
;
; int main(int argc, char** argv) {
;     size_t pixels = 64*1024*1024;
;     // Both buffers must be 32-byte aligned: the routine uses aligned 256-bit loads and stores.
;     float* inputBuffer = (float*)_aligned_malloc(pixels*sizeof(float)*4, 32);
;     float* outputBuffer = (float*)_aligned_malloc(pixels*sizeof(float)*4, 32);
;     convert_pixels(inputBuffer, outputBuffer, pixels);
; }

SECTION .rdata

; Coefficient rows used by convert_pixels. Each row holds four floats and is
; stored twice so that one 256-bit load places the same four coefficients in
; both 128-bit lanes of a ymm register.
align 32
c0: times 2 dd 0.44, 0.33, 0.22, 0.11
c1: times 2 dd 0.88, 0.77, 0.66, 0.55
c2: times 2 dd 1.04, 1.03, 1.02, 1.01

SECTION .text

global convert_pixels

;-----------------------------------------------------------------------
; extern "C" void convert_pixels(const float* source, float* destination, size_t length)
;
; Converts `length` pixels of four floats each. For every input pixel p:
;     out = { dot(p, c0), dot(p, c1), dot(p, c2), 0.0 }
; where c0/c1/c2 are the coefficient rows defined in .rdata.
;
; ABI:   Microsoft x64
; In:    rcx = source       (length * 16 bytes)
;        rdx = destination  (length * 16 bytes)
;        r8  = length       (pixel count; 0 returns immediately)
; Out:   nothing
; Clobb: rax, rcx, rdx, r8, r9, flags, ymm0-ymm5, upper halves of ymm6-ymm15
;        (xmm6-xmm15 are callee-saved on Win64 and are saved/restored here)
; Stack: CONVERT_FRAME_BYTES below the return address
;
; Alignment: the main loop uses vmovaps/vmovntps on ymm registers, so both
; buffers must be 32-byte aligned. Results are written with non-temporal
; stores and no sfence is issued here.
;
; Layout of the work: the first (length & ~7) pixels are split into two equal
; halves that are walked in parallel, 4 pixels from each half per iteration.
; The remaining 0..7 pixels are converted one at a time afterwards.
;
; NOTE(review): this routine adjusts rsp but emits no .pdata/.xdata unwind
; records; add them if a fault inside it must be unwindable.
;-----------------------------------------------------------------------
CONVERT_PREFETCH_AHEAD  equ 1408                ; bytes prefetched ahead of the read position
CONVERT_XMM_SAVED       equ 10                  ; xmm6..xmm15
CONVERT_FRAME_BYTES     equ CONVERT_XMM_SAVED*16 + 8    ; +8 brings rsp back to 16-byte alignment

convert_pixels:
    test    r8, r8
    jz      .return                             ; empty input: touch nothing

    sub     rsp, CONVERT_FRAME_BYTES            ; entry rsp % 16 == 8, so rsp is now 16-aligned
    vzeroupper
    vmovaps [rsp +   0], xmm6
    vmovaps [rsp +  16], xmm7
    vmovaps [rsp +  32], xmm8
    vmovaps [rsp +  48], xmm9
    vmovaps [rsp +  64], xmm10
    vmovaps [rsp +  80], xmm11
    vmovaps [rsp +  96], xmm12
    vmovaps [rsp + 112], xmm13
    vmovaps [rsp + 128], xmm14
    vmovaps [rsp + 144], xmm15

    ; RIP-relative loads: absolute 32-bit addresses do not link into a
    ; large-address-aware Win64 image.
    vmovaps ymm12, [rel c0]
    vmovaps ymm13, [rel c1]
    vmovaps ymm14, [rel c2]
    vxorps  ymm15, ymm15, ymm15                 ; zero operand for the padding hadd

    mov     r9d, r8d
    and     r9d, 7                              ; r9  = leftover pixels (0..7)
    mov     rax, r8
    and     rax, -8                             ; rax = pixels handled by the main loop
    jz      .tail                               ; fewer than 8 pixels in total
    mov     r8, rax
    shr     r8, 3                               ; r8  = main-loop iterations (8 pixels each)

    ; Inside the loop rcx/rdx walk the first half; [reg + rax*8] addresses the
    ; same position in the second half (rax pixels * 16 bytes / 2).
    align 32
.main_processing_loop:
    prefetchnta [rcx + CONVERT_PREFETCH_AHEAD]
    prefetchnta [rcx + rax*8 + CONVERT_PREFETCH_AHEAD]

    vmovaps ymm2,  [rcx]
    vmovaps ymm5,  [rcx + 32]
    vmovaps ymm8,  [rcx + rax*8]
    vmovaps ymm11, [rcx + rax*8 + 32]

    ; Element-wise products of each pixel pair with the three coefficient rows.
    vmulps  ymm0,  ymm2,  ymm12
    vmulps  ymm1,  ymm2,  ymm13
    vmulps  ymm2,  ymm2,  ymm14

    vmulps  ymm3,  ymm5,  ymm12
    vmulps  ymm4,  ymm5,  ymm13
    vmulps  ymm5,  ymm5,  ymm14

    vmulps  ymm6,  ymm8,  ymm12
    vmulps  ymm7,  ymm8,  ymm13
    vmulps  ymm8,  ymm8,  ymm14

    vmulps  ymm9,  ymm11, ymm12
    vmulps  ymm10, ymm11, ymm13
    vmulps  ymm11, ymm11, ymm14

    ; First reduction step: pairwise sums; the c2 products are paired with zero.
    vhaddps ymm0,  ymm0,  ymm1
    vhaddps ymm2,  ymm2,  ymm15

    vhaddps ymm3,  ymm3,  ymm4
    vhaddps ymm5,  ymm5,  ymm15

    vhaddps ymm6,  ymm6,  ymm7
    vhaddps ymm8,  ymm8,  ymm15

    vhaddps ymm9,  ymm9,  ymm10
    vhaddps ymm11, ymm11, ymm15

    ; Second reduction step: each lane becomes { dot c0, dot c1, dot c2, 0 }.
    vhaddps ymm0,  ymm0,  ymm2
    vhaddps ymm3,  ymm3,  ymm5
    vhaddps ymm6,  ymm6,  ymm8
    vhaddps ymm9,  ymm9,  ymm11

    vmovntps [rdx], ymm0
    vmovntps [rdx + 32], ymm3
    vmovntps [rdx + rax*8], ymm6
    vmovntps [rdx + rax*8 + 32], ymm9

    add     rcx, 64
    add     rdx, 64
    sub     r8, 1
    jnz     .main_processing_loop

.tail:
    ; rcx/rdx sit at the end of the first half (or at the buffer start when the
    ; main loop was skipped and rax == 0); step over the second half.
    lea     rcx, [rcx + rax*8]
    lea     rdx, [rdx + rax*8]
    test    r9d, r9d
    jz      .restore

.tail_loop:
    ; Same arithmetic as the main loop, one pixel per 128-bit register.
    vmovaps xmm2, [rcx]
    vmulps  xmm0, xmm2, xmm12
    vmulps  xmm1, xmm2, xmm13
    vmulps  xmm2, xmm2, xmm14
    vhaddps xmm0, xmm0, xmm1
    vhaddps xmm2, xmm2, xmm15
    vhaddps xmm0, xmm0, xmm2
    vmovntps [rdx], xmm0

    add     rcx, 16
    add     rdx, 16
    sub     r9d, 1
    jnz     .tail_loop

.restore:
    vmovaps xmm6,  [rsp +   0]
    vmovaps xmm7,  [rsp +  16]
    vmovaps xmm8,  [rsp +  32]
    vmovaps xmm9,  [rsp +  48]
    vmovaps xmm10, [rsp +  64]
    vmovaps xmm11, [rsp +  80]
    vmovaps xmm12, [rsp +  96]
    vmovaps xmm13, [rsp + 112]
    vmovaps xmm14, [rsp + 128]
    vmovaps xmm15, [rsp + 144]
    vzeroupper                                  ; leave no dirty upper halves for SSE code in the caller
    add     rsp, CONVERT_FRAME_BYTES

.return:
    ret