Guest User

Untitled

a guest
Aug 10th, 2018
89
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.03 KB | None | 0 0
  1. RGB to grayscale conversion with ARM NEON
  /**
   * Convert packed 3-byte pixels to 8-bit grayscale.
   *
   * For every pixel the output byte is
   *     (28 * byte0 + 151 * byte1 + 77 * byte2) >> 8
   * where byte0..byte2 are the pixel's three bytes in memory order. The
   * weights sum to 256, so the result always fits in 8 bits.
   *
   * NOTE(review): the original comments call 28/151/77 the blue/green/red
   * multipliers, which matches B,G,R byte order. If src is really R,G,B,
   * swap the first and third weights -- confirm against the caller.
   *
   * @param dest       output buffer, at least numPixels bytes
   * @param src        input buffer, at least 3 * numPixels bytes
   * @param numPixels  number of pixels to convert; values <= 0 do nothing
   *
   * On 32-bit ARM with NEON, full groups of 8 pixels go through the vector
   * loop; the remaining 0..7 pixels (and every pixel on other targets) use
   * the scalar loop, which computes the same value.
   */
  void neon_asm_convert(uint8_t * __restrict dest, uint8_t * __restrict src, int numPixels)
  {
      /* Fixed-point channel weights, applied in memory order. */
      enum { WEIGHT_0 = 28, WEIGHT_1 = 151, WEIGHT_2 = 77 };

      if (numPixels <= 0) {
          return;
      }

      int tail = numPixels;

  #if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
      int blocks = numPixels / 8;

      if (blocks > 0) {
          tail = numPixels % 8;

          /*
           * dest, src and blocks are in/out operands because the
           * post-increment addressing and the subs modify them; afterwards
           * dest/src point just past the vector-processed data. The weight
           * inputs are consumed by the vdup instructions before any operand
           * is written.
           */
          __asm__ volatile(
              /* Broadcast the three weights into d4..d6. */
              "vdup.8    d4, %[w0]\n\t"
              "vdup.8    d5, %[w1]\n\t"
              "vdup.8    d6, %[w2]\n\t"
              "0:\n\t"
              /* De-interleave 8 three-byte pixels: d0/d1/d2 = byte 0/1/2. */
              "vld3.8    {d0-d2}, [%[src]]!\n\t"
              /* Weighted sum in 16-bit lanes of q7. */
              "vmull.u8  q7, d0, d4\n\t"
              "vmlal.u8  q7, d1, d5\n\t"
              "vmlal.u8  q7, d2, d6\n\t"
              /* Divide q7 by 256, narrow to 8 bits in d7, store 8 results. */
              "vshrn.u16 d7, q7, #8\n\t"
              "vst1.8    {d7}, [%[dst]]!\n\t"
              /* One group of 8 pixels done; loop until the count hits zero. */
              "subs      %[cnt], %[cnt], #1\n\t"
              "bne       0b\n\t"
              : [dst] "+r"(dest), [src] "+r"(src), [cnt] "+r"(blocks)
              : [w0] "r"(WEIGHT_0), [w1] "r"(WEIGHT_1), [w2] "r"(WEIGHT_2)
              /* q7 is d14/d15; listed so the compiler preserves them. */
              : "d0", "d1", "d2", "d4", "d5", "d6", "d7", "d14", "d15",
                "cc", "memory"
          );
      }
  #endif

      /* Scalar path: leftover pixels, or all pixels without NEON. */
      for (int i = 0; i < tail; i++) {
          const unsigned sum = (unsigned)WEIGHT_0 * src[3 * i]
                             + (unsigned)WEIGHT_1 * src[3 * i + 1]
                             + (unsigned)WEIGHT_2 * src[3 * i + 2];
          dest[i] = (uint8_t)(sum >> 8);
      }
  }
  66.  
  67. "vld4.8 {d0-d3}, [%1]!\n"
  68.  
  69. "vld3.8 {d0-d2}, [%1]!\n"
  70.  
  /**
   * Convert packed 3-byte pixels to 8-bit grayscale.
   *
   * Each output byte is (28 * byte0 + 151 * byte1 + 77 * byte2) >> 8, with
   * byte0..byte2 being the pixel's bytes in memory order. The weights add
   * up to 256, so the shifted sum never exceeds 255.
   *
   * NOTE(review): the original comments label 28/151/77 as blue/green/red,
   * i.e. B,G,R order in memory. Confirm the real channel order of src; for
   * R,G,B data the first and third weights would need swapping.
   *
   * @param dest       output buffer, at least numPixels bytes
   * @param src        input buffer, at least 3 * numPixels bytes
   * @param numPixels  pixel count (not a count of 8-pixel groups); values
   *                   <= 0 are a no-op
   *
   * The NEON loop (32-bit ARM only) consumes whole groups of 8 pixels; the
   * scalar loop finishes the remaining 0..7 pixels and is the complete
   * implementation on targets without NEON.
   */
  void neon_asm_convert(uint8_t * __restrict dest, uint8_t * __restrict src, int numPixels)
  {
      /* Fixed-point channel weights, applied in memory order. */
      enum { WEIGHT_0 = 28, WEIGHT_1 = 151, WEIGHT_2 = 77 };

      if (numPixels <= 0) {
          return;
      }

      int tail = numPixels;

  #if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
      /* The vector loop handles 8 pixels per iteration. */
      int groups = numPixels / 8;

      if (groups > 0) {
          tail = numPixels % 8;

          /*
           * dest, src and groups are read-write operands: the "!" writeback
           * and the subs change them, and dest/src are left pointing at the
           * first unprocessed pixel for the scalar loop below. The weight
           * inputs are only read by the leading vdup instructions.
           */
          __asm__ volatile(
              /* Broadcast the three weights into d4..d6. */
              "vdup.8    d4, %[w0]\n\t"
              "vdup.8    d5, %[w1]\n\t"
              "vdup.8    d6, %[w2]\n\t"
              "0:\n\t"
              /* Load 8 pixels, de-interleaved: d0/d1/d2 = byte 0/1/2. */
              "vld3.8    {d0-d2}, [%[src]]!\n\t"
              /* Weighted sum in 16-bit lanes of q7. */
              "vmull.u8  q7, d0, d4\n\t"
              "vmlal.u8  q7, d1, d5\n\t"
              "vmlal.u8  q7, d2, d6\n\t"
              /* Divide q7 by 256, narrow into d7, store 8 gray bytes. */
              "vshrn.u16 d7, q7, #8\n\t"
              "vst1.8    {d7}, [%[dst]]!\n\t"
              /* Decrement the group count; repeat until it reaches zero. */
              "subs      %[cnt], %[cnt], #1\n\t"
              "bne       0b\n\t"
              : [dst] "+r"(dest), [src] "+r"(src), [cnt] "+r"(groups)
              : [w0] "r"(WEIGHT_0), [w1] "r"(WEIGHT_1), [w2] "r"(WEIGHT_2)
              /* q7 is d14/d15; listed so the compiler preserves them. */
              : "d0", "d1", "d2", "d4", "d5", "d6", "d7", "d14", "d15",
                "cc", "memory"
          );
      }
  #endif

      /* Scalar path: leftover pixels, or all pixels without NEON. */
      for (int i = 0; i < tail; i++) {
          const unsigned sum = (unsigned)WEIGHT_0 * src[3 * i]
                             + (unsigned)WEIGHT_1 * src[3 * i + 1]
                             + (unsigned)WEIGHT_2 * src[3 * i + 2];
          dest[i] = (uint8_t)(sum >> 8);
      }
  }
Add Comment
Please, Sign In to add comment