Don't like ads? PRO users don't see any ads ;-)
Guest

Untitled

By: a guest on May 7th, 2012  |  syntax: None  |  size: 1.33 KB  |  hits: 26  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. Fast squaring of an image into an (int) image - ARM NEON intrinsics - iOS Dev
  2. int *image_sqr_Baseaaddr = (int *) malloc(sizeof(int) * noOfPixels);   // output buffer: one int per pixel
  3.  // Scalar reference version. NOTE(review): buffer is sized by noOfPixels but the loop runs to newNoOfPixels - confirm newNoOfPixels <= noOfPixels.
  4. for (int px = 0; px < newNoOfPixels; px++)
  5.      image_sqr_Baseaaddr[px] = (int) image_scaled_Baseaaddr[px] * (int) image_scaled_Baseaaddr[px];   // widen to int, then square
  6.        
  7. #include <arm_neon.h>
  8.  
  9. // ...
  10.  
  11. int i;                                                  // shared with the scalar clean up loop that follows
  12.  // Square 16 pixels per iteration: 8 bit source -> 32 bit destination.
  13. for (i = 0; i + 16 <= newNoOfPixels; i += 16)           // SIMD loop (bound written so it cannot wrap if the count is unsigned)
  14. {
  15.     uint8x16_t v = vld1q_u8(&image_scaled_Baseaaddr[i]);// load 16 x 8 bit pixels
  16.  
  17.     uint8x8_t vlo = vget_low_u8(v);                     // split into 2 x 8 pixel halves
  18.     uint8x8_t vhi = vget_high_u8(v);
  19.  
  20.     uint16x8_t vl = vmull_u8(vlo, vlo);                 // widening square: 255 * 255 = 65025 fits in unsigned 16 bit
  21.     uint16x8_t vh = vmull_u8(vhi, vhi);                 // (a signed 16 bit multiply wraps negative for pixels >= 182)
  22.  
  23.     int32x4_t vll = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vl)));   // zero-extend to 4 x 32 bit vectors
  24.     int32x4_t vlh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vl)));
  25.     int32x4_t vhl = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vh)));
  26.     int32x4_t vhh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vh)));
  27.  
  28.     vst1q_s32(&image_sqr_Baseaaddr[i], vll);            // store 32 bit squared values
  29.     vst1q_s32(&image_sqr_Baseaaddr[i + 4], vlh);
  30.     vst1q_s32(&image_sqr_Baseaaddr[i + 8], vhl);
  31.     vst1q_s32(&image_sqr_Baseaaddr[i + 12], vhh);
  32. }
  33. while (i < newNoOfPixels)                               // scalar tail: continue from wherever i was left
  34. {
  35.     const int32_t pixel = (int32_t)image_scaled_Baseaaddr[i];   // widen before multiplying
  36.     image_sqr_Baseaaddr[i++] = pixel * pixel;
  37. }