Advertisement
Guest User

PMADDWD

a guest
Mar 23rd, 2014
229
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 2.63 KB | None | 0 0
  1. #include <smmintrin.h>
  2. #include <stdio.h>
  3. #include <stdint.h>
  4.  
  5.  
  6.  
  7. long dp32(const int32_t * const pA, const int32_t * const pB, long SIZE){
  8.     long valor = 0, i=0;
  9.  
  10.     __m128i vsum, vecPi, vecCi, vecQCi;
  11.  
  12.     vsum = _mm_set1_epi32(0);
  13.  
  14.     int sumDot[1];
  15.  
  16.     for( ; i<SIZE-3 ;i+=4){
  17.             vecPi = _mm_loadu_si128((__m128i *)&(pA)[i] );
  18.             vecCi = _mm_loadu_si128((__m128i *)&(pB)[i] );
  19.             vecQCi = _mm_mullo_epi32(vecPi,vecCi);
  20.             vsum = _mm_add_epi32(vsum,vecQCi);
  21.     }
  22.  
  23.     vsum = _mm_hadd_epi32(vsum, vsum);
  24.     vsum = _mm_hadd_epi32(vsum, vsum);
  25.     sumDot[0] = (int32_t)_mm_extract_epi32(vsum, 0);
  26.  
  27.     for( ; i<SIZE; i++)
  28.           valor += pA[i] * pB[i];
  29.  
  30.     return valor += sumDot[0];
  31. }
  32.  
  33. long dp16(const int16_t * const pA, const int16_t * const pB, long SIZE){
  34.     long valor = 0, i=0;
  35.  
  36.     __m128i vsum, vecPi, vecCi, vecQCi;
  37.  
  38.     vsum = _mm_set1_epi32(0);
  39.  
  40.     int sumDot[1];
  41.  
  42.     for( ; i<SIZE-7 ;i+=8){
  43.             vecPi  = _mm_loadu_si128((__m128i *)&(pA)[i] );
  44.             vecCi  = _mm_loadu_si128((__m128i *)&(pB)[i] );
  45.             vecQCi = _mm_madd_epi16(vecPi,vecCi);
  46.             vsum   = _mm_add_epi32(vsum, vecQCi);
  47.     }
  48.  
  49.     vsum = _mm_hadd_epi32(vsum, vsum);
  50.     vsum = _mm_hadd_epi32(vsum, vsum);
  51.     sumDot[0] = (int32_t)_mm_extract_epi32(vsum, 0);
  52.    
  53.     for( ; i<SIZE; i++)
  54.           valor += pA[i] * pB[i];
  55.  
  56.     return valor += sumDot[0];
  57. }
  58.  
/*
 * Test data for main(): SIZE is the element count handed to dp32()/dp16(),
 * and VECA / VECB expand to the initializer lists of the two operand vectors.
 * NOTE(review): dp32()/dp16() also name their length parameter SIZE; that is
 * only safe because this macro is defined after those functions.
 */
/* Alternative 9-element data set (disabled): */
//#define SIZE 9
//#define VECA  0, -1,  2,  -3,  4, -5,   6, -7,  8
//#define VECB -9, 10, 11, -12, 13, 14, -15, 16, 17
/* Active 40-element data set: */
#define SIZE 40
#define VECA 169, -341, 487, -50, 116, -66, 170, -217, -236, 72, -296, -469, -178, -1, 61, -26, -107, 593, -558, 125, 111, 111, 184, 141, -228, -490, -357, 535, 206, -85, -29, -216, -490, -265, 12, 250, 306, -291, -46, 180
#define VECB 398, 305, 268, -125, -96, -214, -284, 108, -37, 2, -402, -228, 243, 33, 76, 265, 3, -558, -323, -552, 419, 408, -217, -2, -440, -375, 153, -108, -79, -80, 299, 81, -385, 80, 53, 294, 170, -380, -164, -172
/*
 * Hand-worked reduction for the disabled 9-element data set: the first row
 * lists the element-wise products, each following row adds neighbouring
 * terms, and the final row is the expected dot product (-36).
 */
/*            0 -10 +22  +36 +52 -70  -90-112+136 */
/*            -10   +58      -18      -202   +136 */
/*            +48            -220            +136 */
/*            -172                           +136 */
/*            -36                                 */
  70.  
  71. int main(){
  72.     int32_t arrA32[] = {VECA};
  73.     int16_t arrA16[] = {VECA};
  74.     int32_t arrB32[] = {VECB};
  75.     int16_t arrB16[] = {VECB};
  76.    
  77.     long v32 = dp32(arrA32, arrB32, SIZE);
  78.     long v16 = dp16(arrA16, arrB16, SIZE);
  79.    
  80.     if(v32 == v16){
  81.         printf("Success! (32: %ld == %ld : 16)\n", v32, v16);
  82.     }else{
  83.         printf("Failure! (32: %ld != %ld : 16)\n", v32, v16);
  84.     }
  85.     return 0;
  86. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement