Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // http://www.intel.com/content/dam/doc/manual/64-ia-32-architectures-optimization-manual.pdf
- // return 0 if strings are equal, 1 if greater, -1if less
- int strcmp_sse4_2(const char *src1, const char *src2) {
- int val;
- __asm{
- mov esi, src1 ;
- mov edi, src2
- mov edx, -16 ; common index relative to base of either string pointer xor eax, eax
- topofloop:
- add edx, 16 ; prevent loop carry dependency
- next:
- lea ecx, [esi+edx] ; address of fragment that we want to load
- and ecx, 0x0fff ; check least significant12 bits of addr for page boundary cmp ecx, 0x0ff0
- jg too_close_pgb ; branch to byte-granular if within 16 bytes of boundary lea ecx, [edi+edx] ; do the same check for each fragment of 2nd string
- and ecx, 0x0fff
- cmp ecx, 0x0ff0
- jg too_close_pgb
- movdqu xmm2, BYTE PTR[esi+edx]
- movdqu xmm1, BYTE PTR[edi+edx]
- pcmpistri xmm2, xmm1, 0x18 ; equal each
- ja topofloop
- jnc ret_tag
- add edx, ecx ; ecx points to the byte offset that differ
- not_equal:
- movzx eax, BYTE PTR[esi+edx] movzx edx, BYTE PTR[edi+edx] cmp eax, edx
- cmova eax, ONE
- cmovb eax, NEG_ONE
- jmp ret_tag
- too_close_pgb:
- add edx, 1 ; do byte granular compare movzx ecx, BYTE PTR[esi+edx-1]
- movzx ebx, BYTE PTR[edi+edx-1]
- cmp ecx, ebx
- jne inequality
- add ebx, ecx
- jnz next
- jmp ret_tag
- inequality:
- cmovb eax, NEG_ONE
- cmova eax, ONE ret_tag:
- mov [val], eax
- }
- return(val);
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement