daily pastebin goal
77%
SHARE
TWEET

Untitled

a guest Aug 10th, 2018 64 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. RGB to grayscale conversion with ARM NEON
  /**
   * Convert packed 3-byte-per-pixel colour data to 8-bit grayscale.
   *
   * Each output byte is (28*c0 + 151*c1 + 77*c2) >> 8, where c0, c1 and c2 are
   * the three bytes of a pixel in memory order.  The weights sum to 256, so the
   * right shift by 8 normalises the weighted sum back into 0..255.
   *
   * NOTE(review): 28/151/77 are the blue/green/red multipliers, so the weighting
   * is exact for B,G,R byte order.  If the buffer really holds R,G,B the red and
   * blue weights end up swapped -- confirm against the caller.
   *
   * On 32-bit ARM with NEON, whole groups of 8 pixels are processed by the
   * inline assembly; any remaining pixels (and every pixel on other targets)
   * go through the equivalent scalar loop.
   *
   * @param dest      output buffer, at least numPixels bytes
   * @param src       input buffer, at least 3 * numPixels bytes
   * @param numPixels number of pixels to convert; values <= 0 do nothing
   */
  void neon_asm_convert(uint8_t * __restrict dest, uint8_t * __restrict src, int numPixels)
  {
      const unsigned w0 = 28u;   /* multiplier for byte 0 of each pixel */
      const unsigned w1 = 151u;  /* multiplier for byte 1 of each pixel */
      const unsigned w2 = 77u;   /* multiplier for byte 2 of each pixel */

      if (numPixels <= 0) {
          return;
      }

  #if defined(__arm__) && defined(__ARM_NEON)
      int blocks = numPixels >> 3;   /* 8 pixels per NEON iteration */

      /* The asm loop is bottom-tested: entering it with a count of 0 would
       * wrap the counter and run far past both buffers. */
      if (blocks > 0) {
          __asm__ volatile(
              /* Broadcast the three weights across 8 byte lanes. */
              "vdup.8    d4, %[w0]              \n"
              "vdup.8    d5, %[w1]              \n"
              "vdup.8    d6, %[w2]              \n"
              "0:                               \n"
              /* vld3 de-interleaves 8 three-byte pixels: d0 = first bytes,
               * d1 = second bytes, d2 = third bytes.  (vld4 splits on a
               * 4-byte stride and would mix channels for 3-byte pixels.) */
              "vld3.8    {d0-d2}, [%[src]]!     \n"
              /* Weighted sum in 16-bit lanes; 255 * 256 still fits in u16. */
              "vmull.u8  q7, d0, d4             \n"
              "vmlal.u8  q7, d1, d5             \n"
              "vmlal.u8  q7, d2, d6             \n"
              /* Divide q7 by 256 and narrow to 8 bits in d7. */
              "vshrn.u16 d7, q7, #8             \n"
              "vst1.8    {d7}, [%[dst]]!        \n"
              "subs      %[cnt], %[cnt], #1     \n"
              "bne       0b                     \n"
              /* All three are updated by the asm (post-increment / subs),
               * so they must be read-write operands, not plain inputs. */
              : [dst] "+r"(dest), [src] "+r"(src), [cnt] "+r"(blocks)
              : [w0] "r"(w0), [w1] "r"(w1), [w2] "r"(w2)
              /* q7 (d14/d15) is callee-saved under the AAPCS, so it has to be
               * declared; "memory" because the asm reads/writes via pointers;
               * "cc" because subs sets the flags. */
              : "cc", "memory", "d0", "d1", "d2", "d4", "d5", "d6", "d7", "q7"
          );
      }

      /* dest/src now point just past the NEON-processed pixels. */
      numPixels &= 7;
  #endif

      /* Scalar path: tail pixels on NEON targets, everything elsewhere. */
      for (int i = 0; i < numPixels; i++) {
          unsigned sum = w0 * src[0] + w1 * src[1] + w2 * src[2];
          *dest++ = (uint8_t)(sum >> 8);
          src += 3;
      }
  }
  66.    
  67. "vld4.8 {d0-d3}, [%1]! \n"
  68.    
  69. "vld3.8 {d0-d2}, [%1]! \n"
  70.    
  /**
   * Convert packed 3-byte-per-pixel colour data to 8-bit grayscale.
   *
   * Each output byte is (28*c0 + 151*c1 + 77*c2) >> 8, where c0, c1 and c2 are
   * the three bytes of a pixel in memory order.  The weights sum to 256, so the
   * right shift by 8 normalises the weighted sum back into 0..255.
   *
   * NOTE(review): 28/151/77 are the blue/green/red multipliers, so the weighting
   * is exact for B,G,R byte order.  If the buffer really holds R,G,B the red and
   * blue weights end up swapped -- confirm against the caller.
   *
   * On 32-bit ARM with NEON, whole groups of 8 pixels are processed by the
   * inline assembly; any remaining pixels (and every pixel on other targets)
   * go through the equivalent scalar loop.
   *
   * @param dest      output buffer, at least numPixels bytes
   * @param src       input buffer, at least 3 * numPixels bytes
   * @param numPixels number of pixels to convert; values <= 0 do nothing
   */
  void neon_asm_convert(uint8_t * __restrict dest, uint8_t * __restrict src, int numPixels)
  {
      const unsigned w0 = 28u;   /* multiplier for byte 0 of each pixel */
      const unsigned w1 = 151u;  /* multiplier for byte 1 of each pixel */
      const unsigned w2 = 77u;   /* multiplier for byte 2 of each pixel */

      if (numPixels <= 0) {
          return;
      }

  #if defined(__arm__) && defined(__ARM_NEON)
      /* Each loop iteration consumes 8 pixels, so the iteration count is
       * numPixels / 8 -- iterating numPixels times would overrun both
       * buffers eightfold. */
      int blocks = numPixels >> 3;

      /* The asm loop is bottom-tested: entering it with a count of 0 would
       * wrap the counter and run far past both buffers. */
      if (blocks > 0) {
          __asm__ volatile(
              /* Broadcast the three weights across 8 byte lanes. */
              "vdup.8    d4, %[w0]              \n"
              "vdup.8    d5, %[w1]              \n"
              "vdup.8    d6, %[w2]              \n"
              "0:                               \n"
              /* Load 8 pixels de-interleaved: d0 = first bytes,
               * d1 = second bytes, d2 = third bytes. */
              "vld3.8    {d0-d2}, [%[src]]!     \n"
              /* Weighted sum in 16-bit lanes; 255 * 256 still fits in u16. */
              "vmull.u8  q7, d0, d4             \n"
              "vmlal.u8  q7, d1, d5             \n"
              "vmlal.u8  q7, d2, d6             \n"
              /* Divide q7 by 256 and narrow to 8 bits in d7. */
              "vshrn.u16 d7, q7, #8             \n"
              "vst1.8    {d7}, [%[dst]]!        \n"
              "subs      %[cnt], %[cnt], #1     \n"
              "bne       0b                     \n"
              /* All three are updated by the asm (post-increment / subs),
               * so they must be read-write operands, not plain inputs. */
              : [dst] "+r"(dest), [src] "+r"(src), [cnt] "+r"(blocks)
              : [w0] "r"(w0), [w1] "r"(w1), [w2] "r"(w2)
              /* q7 (d14/d15) is callee-saved under the AAPCS, so it has to be
               * declared; "memory" because the asm reads/writes via pointers;
               * "cc" because subs sets the flags. */
              : "cc", "memory", "d0", "d1", "d2", "d4", "d5", "d6", "d7", "q7"
          );
      }

      /* dest/src now point just past the NEON-processed pixels. */
      numPixels &= 7;
  #endif

      /* Scalar path: tail pixels on NEON targets, everything elsewhere. */
      for (int i = 0; i < numPixels; i++) {
          unsigned sum = w0 * src[0] + w1 * src[1] + w2 * src[2];
          *dest++ = (uint8_t)(sum >> 8);
          src += 3;
      }
  }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top