Untitled

   0x0000000000bcec8b:  mov    rdi,rbx
   0x0000000000bcec8e:  push   rbx
   0x0000000000bcec8f:  mov    ecx,edi
   0x0000000000bcec91:  xor    ebx,ebx
   0x0000000000bcec93:  add    ecx,0x7
   0x0000000000bcec96:  xor    eax,eax
   0x0000000000bcec98:  and    ecx,0xfffffff8
   0x0000000000bcec9b:  sub    ecx,edi
   0x0000000000bcec9d:  je     0xbcecb2
=> 0x0000000000bcec9f:  mov    al,BYTE PTR [edi+ebx*1]
   0x0000000000bceca3:  add    al,BYTE PTR [esi+ebx*1]
   0x0000000000bceca7:  inc    ebx
   0x0000000000bceca9:  cmp    ebx,ecx
   0x0000000000bcecab:  mov    BYTE PTR [edi+ebx*1-0x1],al
   0x0000000000bcecb0:  jb     0xbcec9f
   0x0000000000bcecb2:  mov    ecx,edx
   0x0000000000bcecb4:  sub    edx,ebx
   0x0000000000bcecb6:  and    edx,0x3f
   0x0000000000bcecb9:  sub    ecx,edx
   0x0000000000bcecbb:  movq   mm1,QWORD PTR [esi+ebx*1]
   0x0000000000bcecc0:  movq   mm0,QWORD PTR [edi+ebx*1]
   0x0000000000bcecc5:  movq   mm3,QWORD PTR [esi+ebx*1+0x8]
   0x0000000000bceccb:  paddb  mm0,mm1
   0x0000000000bcecce:  movq   mm2,QWORD PTR [edi+ebx*1+0x8]
   0x0000000000bcecd4:  movq   QWORD PTR [edi+ebx*1],mm0
   0x0000000000bcecd9:  paddb  mm2,mm3
   0x0000000000bcecdc:  movq   mm5,QWORD PTR [esi+ebx*1+0x10]


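// MMX-optimized decoder for the PNG "Up" filter: each byte of prev_row is
// added to the corresponding byte of row, and the sum is stored back into row.
// NOTE(review): the assembly below addresses memory through 32-bit registers
// (edi/esi/ebx). The disassembly at the top of this file shows those same
// 32-bit address forms inside code that uses 64-bit registers (rdi/rbx), with
// the `=>` marker on the first such access. In a 64-bit build this addressing
// form keeps only the low 32 bits of each pointer -- confirm that the intended
// target is 32-bit x86 before building this routine for x86-64.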
void /* PRIVATE */
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
   png_bytep prev_row)
{
   png_uint_32 len;
   len  = row_info->rowbytes;       // # of bytes to filter
   _asm {
      mov edi, row
      // get # of bytes to alignment
      mov ecx, edi
      xor ebx, ebx
      add ecx, 0x7
      xor eax, eax
      and ecx, 0xfffffff8
      mov esi, prev_row
      sub ecx, edi
      jz dupgo
      // fix alignment
duplp1:
      mov al, [edi+ebx]
      add al, [esi+ebx]
      inc ebx
      cmp ebx, ecx
      mov [edi + ebx-1], al  // mov does not affect flags; -1 to offset inc ebx
      jb duplp1
dupgo:
      mov ecx, len
      mov edx, ecx
      sub edx, ebx                  // subtract alignment fix
      and edx, 0x0000003f           // calc bytes over mult of 64
      sub ecx, edx                  // drop over bytes from length
      // Unrolled loop - use all MMX registers and interleave to reduce
      // number of branch instructions (loops) and reduce partial stalls
duploop:
      movq mm1, [esi+ebx]
      movq mm0, [edi+ebx]
      movq mm3, [esi+ebx+8]
      paddb mm0, mm1
      movq mm2, [edi+ebx+8]
      movq [edi+ebx], mm0
      paddb mm2, mm3
      movq mm5, [esi+ebx+16]