runnig

SAD, sum of absolute differences in ASM (NEON instr set)

Nov 7th, 2012
352
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
ARM 2.10 KB | None | 0 0
  @ ---------------------------------------------------------------------
  @ rect_sad -- sum of absolute differences (SAD) over a rectangle of
  @ 8-bit samples, ARM (A32) + NEON, GNU as syntax.
  @
  @ C++ prototype encoded by the mangled symbol (the return type is not
  @ part of the mangling; the 32-bit sum is returned in r0):
  @
  @   rect_sad(const unsigned char *src_line, const unsigned char *targ_line,
  @            unsigned rect_w, unsigned rect_h, unsigned stride)
  @
  @ In:    r0   = src_line   first byte of the rectangle in image 1
  @        r1   = targ_line  first byte of the rectangle in image 2
  @        r2   = rect_w     rectangle width in bytes
  @        r3   = rect_h     rectangle height in rows
  @        [sp] = stride     bytes between row starts (applied to BOTH images)
  @ Out:   r0   = sum over the rectangle of |src - targ|
  @               (0 when rect_w == 0 or rect_h == 0)
  @ Clobb: r1-r3, ip, flags, q0-q2.  r4/r5 are saved and restored.
  @
  @ Limits: each row is accumulated in 16-bit lanes before being widened
  @ into the 32-bit total, so rows wider than roughly 2000 bytes can
  @ overflow a lane.  Any rect_w is accepted: whole 8-byte groups go
  @ through the vector loop, the remaining 0..7 bytes are loaded one lane
  @ at a time, so nothing outside the rectangle is read (apart from the
  @ pld hints).
  @ ---------------------------------------------------------------------

        .cpu    cortex-a8
        .fpu    neon
        .arm

        .text
        .align  2
        .global _Z8rect_sadPKhS0_jjj
        .type   _Z8rect_sadPKhS0_jjj, %function

_Z8rect_sadPKhS0_jjj:
        .fnstart

        src_r0      .req r0             @ running pointer into image 1
        targ_r1     .req r1             @ running pointer into image 2
        rect_w_r2   .req r2
        rect_h_r3   .req r3             @ only live until end_r3 is computed
        end_r3      .req r3             @ src value at which the last row is done
        stride_ip   .req r12            @ only live until gap_ip is computed
        gap_ip      .req r12            @ stride - rect_w: row end -> next row start
        vec_end_r4  .req r4             @ end of the whole 8-byte groups of this row
        line_end_r5 .req r5             @ end of this row (src + rect_w)

        @ Fetch the stack argument before anything is pushed, so its offset
        @ does not depend on how many registers the prologue saves.
        ldr     stride_ip, [sp]

        @ Empty rectangle: nothing to read, the SAD is 0.
        cmp     rect_w_r2, #0
        cmpne   rect_h_r3, #0
        moveq   r0, #0
        bxeq    lr

        stmfd   sp!, {r4, r5}
        .save   {r4, r5}

        pld     [src_r0]
        pld     [targ_r1]

        vmov.i32 q0, #0                                 @ q0 = 4 x u32 running total

        mla     end_r3, stride_ip, rect_h_r3, src_r0    @ end = src + rect_h * stride
        sub     gap_ip, stride_ip, rect_w_r2            @ gap = stride - rect_w

.Lrect_sad_row_loop:
        pld     [src_r0, #1024]
        pld     [targ_r1, #1024]

        add     line_end_r5, src_r0, rect_w_r2          @ line_end = src + rect_w
        bic     vec_end_r4, rect_w_r2, #7
        add     vec_end_r4, src_r0, vec_end_r4          @ vec_end  = src + (rect_w & ~7)
        vmov.i16 q1, #0                                 @ q1 = 8 x u16 per-row sums

        cmp     src_r0, vec_end_r4                      @ rect_w < 8: no whole group
        bhs     .Lrect_sad_tail

.Lrect_sad_vec_loop:
        vld1.8  {d4}, [src_r0]!                         @ d4 = src[0..7],  src  += 8
        vld1.8  {d5}, [targ_r1]!                        @ d5 = targ[0..7], targ += 8
        vabal.u8 q1, d4, d5                             @ q1 += |d4 - d5| (widened to u16)
        cmp     src_r0, vec_end_r4
        blo     .Lrect_sad_vec_loop                     @ pointers: unsigned compare

.Lrect_sad_tail:
        cmp     src_r0, line_end_r5
        bhs     .Lrect_sad_row_done                     @ rect_w is a multiple of 8

        @ Lanes 1..7 of d4/d5 are never written below; zero both so those
        @ lanes are equal and contribute |0 - 0| = 0 to the sum.
        vmov.i8 q2, #0

.Lrect_sad_tail_loop:
        vld1.8  {d4[0]}, [src_r0]!                      @ one byte into lane 0, src  += 1
        vld1.8  {d5[0]}, [targ_r1]!                     @ one byte into lane 0, targ += 1
        vabal.u8 q1, d4, d5
        cmp     src_r0, line_end_r5
        blo     .Lrect_sad_tail_loop

.Lrect_sad_row_done:
        vpadal.u16 q0, q1                               @ q0 += pairwise-widened q1

        @ Invariant: src == line_end here, so adding gap lands exactly on
        @ the start of the next row in both images.
        add     src_r0, src_r0, gap_ip
        add     targ_r1, targ_r1, gap_ip

        cmp     src_r0, end_r3
        blo     .Lrect_sad_row_loop

        @ Fold the four u32 lanes of q0 into r0.
        vpadd.i32 d0, d0, d1                            @ d0 = {q0[0]+q0[1], q0[2]+q0[3]}
        vmov    r0, r1, d0                              @ r0 = d0[0], r1 = d0[1]
        add     r0, r0, r1

        ldmfd   sp!, {r4, r5}
        bx      lr

        .unreq  src_r0
        .unreq  targ_r1
        .unreq  rect_w_r2
        .unreq  rect_h_r3
        .unreq  end_r3
        .unreq  stride_ip
        .unreq  gap_ip
        .unreq  vec_end_r4
        .unreq  line_end_r5

        .fnend
        .size   _Z8rect_sadPKhS0_jjj, .-_Z8rect_sadPKhS0_jjj
        .ident  "GCC: (GNU) 4.4.3"
        .section    .note.GNU-stack,"",%progbits
Advertisement
Add Comment
Please, Sign In to add comment