Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- RGB to grayscale conversion with ARM NEON
/*
 * Convert packed 3-byte-per-pixel colour data to 8-bit grayscale.
 *
 * For every pixel the output byte is
 *     (28 * byte0 + 151 * byte1 + 77 * byte2) >> 8
 * The weights sum to 256, so the 16-bit intermediate never overflows
 * (255 * 256 = 65280) and the shift is an exact divide by 256.
 *
 * NOTE(review): the weights are applied as byte0 = blue, byte1 = green,
 * byte2 = red, i.e. B,G,R memory order.  If the caller really passes R,G,B
 * the 28 and 77 weights should be swapped -- confirm against the caller.
 *
 * dest      - receives numPixels grayscale bytes.
 * src       - 3 * numPixels bytes of packed pixel data; only read.
 * numPixels - number of pixels to convert; values <= 0 do nothing.
 *
 * On 32-bit ARM with NEON, whole groups of 8 pixels go through the inline
 * assembly loop; any remaining pixels (and every pixel on other targets)
 * go through the scalar loop, which computes exactly the same value.
 */
void neon_asm_convert(uint8_t * __restrict dest, uint8_t * __restrict src, int numPixels)
{
    uint8_t *out = dest;
    const uint8_t *in = src;
    int remaining = numPixels;

    if (remaining <= 0) {
        return;
    }

#if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
    int groups = remaining >> 3;            /* 8 pixels per NEON iteration */

    if (groups > 0) {                       /* a zero count would wrap in the asm loop */
        remaining &= 7;                     /* leftovers are finished by the scalar loop */

        __asm__ volatile(
            /* Build the three per-channel weight vectors. */
            "mov      r4, #28                 \n"   /* byte 0 weight (blue)  */
            "mov      r5, #151                \n"   /* byte 1 weight (green) */
            "mov      r6, #77                 \n"   /* byte 2 weight (red)   */
            "vdup.8   d4, r4                  \n"
            "vdup.8   d5, r5                  \n"
            "vdup.8   d6, r6                  \n"
            "0:                               \n"
            /* Load 8 pixels (24 bytes), de-interleaved by channel.  vld3 is
             * required for 3-byte pixels; vld4 would split with stride 4
             * and mix the channels. */
            "vld3.8   {d0-d2}, [%1]!          \n"
            /* Weighted sum in 16-bit lanes. */
            "vmull.u8 q7, d0, d4              \n"
            "vmlal.u8 q7, d1, d5              \n"
            "vmlal.u8 q7, d2, d6              \n"
            /* Divide q7 by 256, narrow to 8 bits in d7, store 8 results. */
            "vshrn.u16 d7, q7, #8             \n"
            "vst1.8   {d7}, [%0]!             \n"
            "subs     %2, %2, #1              \n"   /* one group done */
            "bne      0b                      \n"   /* loop until the count reaches zero */
            /* All three operands are modified by the asm, so they are
             * read-write outputs rather than plain inputs. */
            : "+r"(out), "+r"(in), "+r"(groups)
            :
            /* q7 (d14/d15) is callee-saved, so it must be declared. */
            : "r4", "r5", "r6",
              "d0", "d1", "d2", "d4", "d5", "d6", "d7", "q7",
              "cc", "memory"
        );
    }
#endif

    /* Scalar path: tail pixels on NEON targets, all pixels elsewhere. */
    for (; remaining > 0; --remaining) {
        *out++ = (uint8_t)((28u * in[0] + 151u * in[1] + 77u * in[2]) >> 8);
        in += 3;
    }
}
- "vld4.8 {d0-d3}, [%1]! \n"
- "vld3.8 {d0-d2}, [%1]! \n"
/*
 * Convert packed 3-byte-per-pixel colour data to 8-bit grayscale.
 *
 * Each output byte is (28 * byte0 + 151 * byte1 + 77 * byte2) >> 8.
 * The three weights add up to 256, so the 16-bit accumulator cannot
 * overflow and the final shift is an exact divide by 256.
 *
 * NOTE(review): the weights treat byte0 as blue, byte1 as green and byte2
 * as red (B,G,R order in memory).  If the input is actually R,G,B the 28
 * and 77 weights need to be exchanged -- confirm against the caller.
 *
 * dest      - output buffer, numPixels bytes are written.
 * src       - input buffer, 3 * numPixels bytes are read; not modified.
 * numPixels - pixel count; nothing happens for values <= 0.
 *
 * The NEON loop handles 8 pixels per iteration, so its trip count is
 * numPixels / 8 (not numPixels, which would run 8x past both buffers).
 * Pixels beyond the last full group, and all pixels on targets without
 * 32-bit ARM NEON, are converted by an equivalent scalar loop.
 */
void neon_asm_convert(uint8_t * __restrict dest, uint8_t * __restrict src, int numPixels)
{
    uint8_t *out = dest;
    const uint8_t *in = src;
    int left = numPixels;

    if (left <= 0) {
        return;
    }

#if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
    int blocks = left >> 3;                 /* number of full 8-pixel groups */

    if (blocks > 0) {                       /* entering with 0 would wrap the counter */
        left &= 7;                          /* the scalar loop finishes the rest */

        __asm__ volatile(
            /* Broadcast the three channel weights into d4..d6. */
            "mov      r4, #28                 \n"   /* byte 0 weight (blue)  */
            "mov      r5, #151                \n"   /* byte 1 weight (green) */
            "mov      r6, #77                 \n"   /* byte 2 weight (red)   */
            "vdup.8   d4, r4                  \n"
            "vdup.8   d5, r5                  \n"
            "vdup.8   d6, r6                  \n"
            "0:                               \n"
            /* Load 8 pixels, one channel per d register. */
            "vld3.8   {d0-d2}, [%1]!          \n"
            /* Weighted sum, widened to 16 bits per lane. */
            "vmull.u8 q7, d0, d4              \n"
            "vmlal.u8 q7, d1, d5              \n"
            "vmlal.u8 q7, d2, d6              \n"
            /* Divide q7 by 256 while narrowing into d7, then store. */
            "vshrn.u16 d7, q7, #8             \n"
            "vst1.8   {d7}, [%0]!             \n"
            "subs     %2, %2, #1              \n"   /* decrement group count */
            "bne      0b                      \n"   /* repeat until it reaches zero */
            /* Pointers are post-incremented and the counter decremented
             * inside the asm, so all three are read-write operands. */
            : "+r"(out), "+r"(in), "+r"(blocks)
            :
            /* Flags, memory and every NEON register touched are declared;
             * q7 is callee-saved and must not be clobbered silently. */
            : "r4", "r5", "r6",
              "d0", "d1", "d2", "d4", "d5", "d6", "d7", "q7",
              "cc", "memory"
        );
    }
#endif

    /* Scalar conversion of whatever the NEON loop did not cover. */
    while (left-- > 0) {
        *out++ = (uint8_t)((28u * in[0] + 151u * in[1] + 77u * in[2]) >> 8);
        in += 3;
    }
}
Add Comment
Please, Sign In to add comment