
SoundTouch-1.9.2_SIMD.patch

a guest
Feb 12th, 2016
  1. --- soundtouch-1.9.2.orig/include/STTypes.h Sun Sep 20 16:40:59 2015
  2. +++ soundtouch-1.9.2.SIMD/include/STTypes.h Sat Feb 13 00:04:07 2016
  3. @@ -50,8 +50,9 @@
  4.  #endif
  5.  
  6.  
  7. -// Helper macro for aligning pointer up to next 16-byte boundary
  8. -#define SOUNDTOUCH_ALIGN_POINTER_16(x)      ( ( (ulongptr)(x) + 15 ) & ~(ulongptr)15 )
  9. +// Helper macro for aligning a pointer up to the next ALIGN_SIZE-byte (64-byte) boundary
  10. +#define ALIGN_SIZE 64
  11. +#define SOUNDTOUCH_ALIGN_POINTER(x)    (((ulongptr)(x) + (ALIGN_SIZE)-1) & ~(ulongptr)((ALIGN_SIZE)-1))
  12.  
  13.  
  14.  #if (defined(__GNUC__) && !defined(ANDROID))
  15. @@ -98,8 +99,8 @@
  16.          ///   However, if you still prefer to select the sample format here
  17.          ///   also in GNU environment, then please #undef the INTEGER_SAMPLE
  18.          ///   and FLOAT_SAMPLE defines first as in comments above.
  19. -        //#define SOUNDTOUCH_INTEGER_SAMPLES     1    //< 16bit integer samples
  20. -        #define SOUNDTOUCH_FLOAT_SAMPLES       1    //< 32bit float samples
  21. +        #define SOUNDTOUCH_INTEGER_SAMPLES     1    //< 16bit integer samples
  22. +        //#define SOUNDTOUCH_FLOAT_SAMPLES       1    //< 32bit float samples
  23.      
  24.      #endif
  25.  
  26. @@ -143,8 +144,14 @@
  27.          #endif // SOUNDTOUCH_FLOAT_SAMPLES
  28.  
  29.          #ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
  30. -            // Allow MMX optimizations
  31. -            #define SOUNDTOUCH_ALLOW_MMX   1
  32. +            // Allow SSE2 optimizations
  33. +            #define SOUNDTOUCH_ALLOW_SSE2      1
  34. +            // Allow AVX2 optimizations
  35. +            #define SOUNDTOUCH_ALLOW_AVX2      1
  36. +            #ifndef _M_X64
  37. +                // Allow MMX optimizations
  38. +                #define SOUNDTOUCH_ALLOW_MMX   1
  39. +            #endif
  40.          #endif
  41.  
  42.      #else
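
The rewritten macro generalizes the old 16-byte alignment to ALIGN_SIZE = 64 bytes, which satisfies SSE2 (16-byte), AVX2 (32-byte) and cache-line alignment at once. A minimal self-contained sketch of the over-allocate-then-align idiom the patch applies throughout (uintptr_t stands in for SoundTouch's ulongptr here):

    // Sketch only: over-allocate, then round the pointer up to the next 64-byte boundary.
    #include <cstdint>
    #include <cstdio>

    #define ALIGN_SIZE 64
    #define SOUNDTOUCH_ALIGN_POINTER(x) (((uintptr_t)(x) + (ALIGN_SIZE)-1) & ~(uintptr_t)((ALIGN_SIZE)-1))

    int main()
    {
        // Allocate ALIGN_SIZE/sizeof(element) extra elements so the rounded-up pointer stays in bounds.
        short *unaligned = new short[1024 + ALIGN_SIZE / sizeof(short)];
        short *aligned   = (short *)SOUNDTOUCH_ALIGN_POINTER(unaligned);
        printf("%p -> %p (aligned %% 64 == %d)\n", (void *)unaligned, (void *)aligned,
               (int)((uintptr_t)aligned % 64));
        delete[] unaligned;   // free the original pointer, never the aligned one
        return 0;
    }
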
  43. --- soundtouch-1.9.2.orig/source/SoundTouch/cpu_detect.h    Sun Sep 20 16:40:59 2015
  44. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/cpu_detect.h    Sat Feb 13 03:25:37 2016
  45. @@ -50,6 +50,12 @@
  46.  #define SUPPORT_ALTIVEC     0x0004
  47.  #define SUPPORT_SSE         0x0008
  48.  #define SUPPORT_SSE2        0x0010
  49. +#define SUPPORT_AVX         0x0020
  50. +#define SUPPORT_XOP         0x0040
  51. +#define SUPPORT_FMA4        0x0080
  52. +#define SUPPORT_FMA3        0x0100
  53. +#define SUPPORT_AVX2        0x0200
  54. +#define SUPPORT_AVX512      0x0400
  55.  
  56.  /// Checks which instruction set extensions are supported by the CPU.
  57.  ///
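
SUPPORT_AVX2 joins the existing bitmask returned by detectCPUextensions(), so callers keep testing individual bits. A hedged usage sketch (assuming the function declarations in cpu_detect.h are otherwise unchanged):

    // Sketch: inspect the detected extension bits and report which integer path would be taken.
    #include <cstdio>
    #include "cpu_detect.h"

    void reportBestIntegerPath()
    {
        uint ext = detectCPUextensions();          // OR-combined SUPPORT_* bits, minus disabled ones
        if      (ext & SUPPORT_AVX2) printf("AVX2 routines\n");
        else if (ext & SUPPORT_SSE2) printf("SSE2 routines\n");
        else if (ext & SUPPORT_MMX)  printf("MMX routines\n");
        else                         printf("plain C routines\n");
    }
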
  58. --- soundtouch-1.9.2.orig/source/SoundTouch/cpu_detect_x86.cpp  Sun Sep 20 16:40:59 2015
  59. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/cpu_detect_x86.cpp  Sat Feb 13 03:29:07 2016
  60. @@ -39,6 +39,7 @@
  61.  //
  62.  ////////////////////////////////////////////////////////////////////////////////
  63.  
  64. +#include <stdint.h>
  65.  #include "cpu_detect.h"
  66.  #include "STTypes.h"
  67.  
  68. @@ -48,14 +49,24 @@
  69.     #if defined(__GNUC__) && defined(__i386__)
  70.         // gcc
  71.         #include "cpuid.h"
  72. -   #elif defined(_M_IX86)
  73. +   #elif defined(_M_IX86) || defined(_M_X64)
  74.         // windows non-gcc
  75.         #include <intrin.h>
  76.     #endif
  77.  
  78. -   #define bit_MMX     (1 << 23)
  79. -   #define bit_SSE     (1 << 25)
  80. -   #define bit_SSE2    (1 << 26)
  81. +   #define bit_MMX      (1 << 23)  // func 01: edx
  82. +   #define bit_SSE      (1 << 25)  // func 01: edx
  83. +   #define bit_SSE2     (1 << 26)  // func 01: edx
  84. +   #define bit_OSXSAVE  (1 << 27)  // func 01: ecx
  85. +   #define bit_AVX      (1 << 28)  // func 01: ecx
  86. +   #define bit_XOP      (1 << 11)  // func 0x80000001: ecx
  87. +   #define bit_FMA4     (1 << 16)  // func 0x80000001: ecx
  88. +   #define bit_FMA3     (1 << 12)  // func 01: ecx
  89. +   #define bit_AVX2     (1 <<  5)  // func 07 ecx=0: ebx
  90. +   #define bit_AVX512F  (1 << 16)  // func 07 ecx=0: ebx
  91. +   #define bit_AVX512PF (1 << 26)  // func 07 ecx=0: ebx
  92. +   #define bit_AVX512ER (1 << 27)  // func 07 ecx=0: ebx
  93. +   #define bit_AVX512CD (1 << 28)  // func 07 ecx=0: ebx
  94.  #endif
  95.  
  96.  
  97. @@ -82,31 +93,77 @@
  98.  /// If building for a 64bit system (no Itanium) and the user wants optimizations.
  99.  /// Return the OR of SUPPORT_{MMX,SSE,SSE2}. 11001 or 0x19.
  100.  /// Keep the _dwDisabledISA test (2 more operations, could be eliminated).
  101. +/*
  102.  #if ((defined(__GNUC__) && defined(__x86_64__)) \
  103.      || defined(_M_X64))  \
  104.      && defined(SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS)
  105.      return 0x19 & ~_dwDisabledISA;
  106. -
  107. +*/
  108.  /// If building for a 32bit system and the user wants optimizations.
  109.  /// Keep the _dwDisabledISA test (2 more operations, could be eliminated).
  110. -#elif ((defined(__GNUC__) && defined(__i386__)) \
  111. -    || defined(_M_IX86))  \
  112. +#if ((defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) \
  113. +    || (defined(_M_IX86) || defined(_M_X64)))  \
  114.      && defined(SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS)
  115.  
  116.      if (_dwDisabledISA == 0xffffffff) return 0;
  117. -
  118. +
  119. +    enum { UNKNOWN, INTEL, AMD } vendor = UNKNOWN;
  120. +    const uint32_t strIntel[] = { 0x756e6547, 0x49656e69, 0x6c65746e }; // "GenuineIntel"
  121. +    const uint32_t strAmd[]   = { 0x68747541, 0x69746E65, 0x444D4163 }; // "AuthenticAMD"
  122. +
  123.      uint res = 0;
  124.  
  125.  #if defined(__GNUC__)
  126.      // GCC version of cpuid. Requires GCC 4.3.0 or later for __cpuid intrinsic support.
  127. +    //    AVX support requires GCC 4.4 or later.
  128. +    //    AVX2 support requires GCC 4.7 or later.
  129.      uint eax, ebx, ecx, edx;  // unsigned int is the standard type. uint is defined by the compiler and not guaranteed to be portable.
  130.  
  131.      // Check if no cpuid support.
  132. -    if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx)) return 0; // always disable extensions.
  133. +    if (!__get_cpuid (0, &eax, &ebx, &ecx, &edx)) return 0; // always disable extensions.
  134.  
  135. -    if (edx & bit_MMX)  res = res | SUPPORT_MMX;
  136. -    if (edx & bit_SSE)  res = res | SUPPORT_SSE;
  137. -    if (edx & bit_SSE2) res = res | SUPPORT_SSE2;
  138. +    uint cpuidMaxFuncNum = eax;
  139. +    if      (ebx == strIntel[0] && edx == strIntel[1] && ecx == strIntel[2]) vendor = INTEL;
  140. +    else if (ebx == strAmd[0]   && edx == strAmd[1]   && ecx == strAmd[2])   vendor = AMD;
  141. +    __get_cpuid(1, &eax, &ebx, &ecx, &edx);
  142. +
  143. +    #if defined(__x86_64__)
  144. +                                        res = res | SUPPORT_SSE2 | SUPPORT_SSE | SUPPORT_MMX;
  145. +    #else
  146. +    if (edx & bit_MMX)                  res = res | SUPPORT_MMX;
  147. +    if (edx & bit_SSE)                  res = res | SUPPORT_SSE;
  148. +    if (edx & bit_SSE2)                 res = res | SUPPORT_SSE2;
  149. +    #endif
  150. +
  151. +    // Check for AVX only if GCC version is 4.4 or later
  152. +    #if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4
  153. +    if ((ecx & (bit_AVX | bit_OSXSAVE)) == (bit_AVX | bit_OSXSAVE))
  154. +    {
  155. +        #ifdef __APPLE__
  156. +         __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));    // MacPorts
  157. +        #else
  158. +        __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
  159. +        #endif
  160. +        uint64_t xcr0 = ((uint64_t)edx << 32LL) | eax;
  161. +        if ((xcr0 & 0x06) == 0x06)
  162. +        {
  163. +                                        res = res | SUPPORT_AVX;
  164. +            if (ecx & bit_FMA3)         res = res | SUPPORT_FMA3;
  165. +            if (cpuidMaxFuncNum >= 7)
  166. +            {
  167. +                __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
  168. +                if (ebx & bit_AVX2)     res = res | SUPPORT_AVX2;
  169. +                if ((ebx & bit_AVX512F) && ((xcr0 & 0xe0) == 0xe0))
  170. +                                        res = res | SUPPORT_AVX512;
  171. +            }
  172. +            if (vendor == AMD) {
  173. +                __get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
  174. +                if (ecx & bit_XOP)      res = res | SUPPORT_XOP;
  175. +                if (ecx & bit_FMA4)     res = res | SUPPORT_FMA4;
  176. +            }
  177. +        }
  178. +    }
  179. +    #endif
  180.  
  181.  #else
  182.      // Window / VS version of cpuid. Notice that Visual Studio 2005 or later required
  183. @@ -117,10 +174,45 @@
  184.      __cpuid(reg,0);
  185.      if ((unsigned int)reg[0] == 0) return 0; // always disable extensions.
  186.  
  187. +    int cpuidMaxFuncNum = reg[0];
  188. +    if      ((unsigned int)reg[1] == strIntel[0] && (unsigned int)reg[3] == strIntel[1] &&
  189. +             (unsigned int)reg[2] == strIntel[2]) vendor = INTEL;
  190. +    else if ((unsigned int)reg[1] == strAmd[0] && (unsigned int)reg[3] == strAmd[1] &&
  191. +             (unsigned int)reg[2] == strAmd[2]) vendor = AMD;
  192.      __cpuid(reg,1);
  193. -    if ((unsigned int)reg[3] & bit_MMX)  res = res | SUPPORT_MMX;
  194. -    if ((unsigned int)reg[3] & bit_SSE)  res = res | SUPPORT_SSE;
  195. -    if ((unsigned int)reg[3] & bit_SSE2) res = res | SUPPORT_SSE2;
  196. +    #if defined(_M_X64)
  197. +                        // note: MMX intrinsics cannot be compiled by Visual C++ for x64,
  198. +                        //       but the hardware and Windows still support the instructions.
  199. +                                                        res = res | SUPPORT_SSE2 | SUPPORT_SSE | SUPPORT_MMX;
  200. +    #else
  201. +    if ((unsigned int)reg[3] & bit_MMX)                 res = res | SUPPORT_MMX;
  202. +    if ((unsigned int)reg[3] & bit_SSE)                 res = res | SUPPORT_SSE;
  203. +    if ((unsigned int)reg[3] & bit_SSE2)                res = res | SUPPORT_SSE2;
  204. +    #endif
  205. +
  206. +    // Does the compiler support AVX? (i.e. is _XCR_XFEATURE_ENABLED_MASK defined?) -- VC++, etc.
  207. +    //#if (_MSC_FULL_VER >= 160040219)
  208. +    #ifdef _XCR_XFEATURE_ENABLED_MASK
  209. +    if (((unsigned int)reg[2] & (bit_AVX | bit_OSXSAVE)) == (bit_AVX | bit_OSXSAVE)) {
  210. +        uint64_t xcr0 = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
  211. +        if ((xcr0 & 0x06) == 0x06) {
  212. +            //                                            res = res | SUPPORT_AVX;
  213. +            //if ((unsigned int)reg[2] & bit_FMA3)        res = res | SUPPORT_FMA3;
  214. +            if (cpuidMaxFuncNum >= 7)
  215. +            {
  216. +                __cpuidex(reg, 7, 0);
  217. +                if ((unsigned int)reg[1] & bit_AVX2)    res = res | SUPPORT_AVX2;
  218. +            //    if (((unsigned int)reg[1] & bit_AVX512F) && (xcr0 & 0xe0) == 0xe0)
  219. +            //                                            res = res | SUPPORT_AVX512;
  220. +            //}
  221. +            //if (vendor == AMD) {
  222. +            //    __cpuid(reg, 0x80000001);
  223. +            //    if ((unsigned int)reg[2] & bit_XOP)     res = res | SUPPORT_XOP;
  224. +            //    if ((unsigned int)reg[2] & bit_FMA4)    res = res | SUPPORT_FMA4;
  225. +            }
  226. +        }
  227. +    }
  228. +    #endif
  229.  
  230.  #endif
  231.  
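
Both detection paths above implement the standard three-step AVX test: CPUID leaf 1 must report AVX and OSXSAVE, XGETBV(0) must show the OS saving XMM and YMM state (XCR0 bits 1-2), and only then is CPUID leaf 7 consulted for the AVX2 bit. A standalone GCC/Clang sketch of the same check, outside the patch, assuming a cpuid.h that provides __get_cpuid_count:

    // Sketch: minimal runtime AVX2 check mirroring the logic patched into cpu_detect_x86.cpp.
    #include <cpuid.h>
    #include <stdint.h>

    static bool cpuSupportsAvx2()
    {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return false;
        const unsigned int need = (1u << 27) | (1u << 28);            // OSXSAVE | AVX (leaf 1, ECX)
        if ((ecx & need) != need) return false;
        uint32_t lo, hi;
        __asm__ __volatile__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(0)); // read XCR0
        if ((lo & 0x06) != 0x06) return false;                        // OS saves XMM + YMM state
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) return false;
        return (ebx & (1u << 5)) != 0;                                // leaf 7, EBX bit 5 = AVX2
    }
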
  232. --- soundtouch-1.9.2.orig/source/SoundTouch/FIFOSampleBuffer.cpp    Sun Sep 20 16:40:59 2015
  233. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/FIFOSampleBuffer.cpp    Sat Feb 13 00:04:07 2016
  234. @@ -171,13 +171,13 @@
  235.          // enlarge the buffer in 4kbyte steps (round up to next 4k boundary)
  236.          sizeInBytes = (capacityRequirement * channels * sizeof(SAMPLETYPE) + 4095) & (uint)-4096;
  237.          assert(sizeInBytes % 2 == 0);
  238. -        tempUnaligned = new SAMPLETYPE[sizeInBytes / sizeof(SAMPLETYPE) + 16 / sizeof(SAMPLETYPE)];
  239. +        tempUnaligned = new SAMPLETYPE[sizeInBytes / sizeof(SAMPLETYPE) + (ALIGN_SIZE) / sizeof(SAMPLETYPE)];
  240.          if (tempUnaligned == NULL)
  241.          {
  242.              ST_THROW_RT_ERROR("Couldn't allocate memory!\n");
  243.          }
  244.          // Align the buffer to begin at 16byte cache line boundary for optimal performance
  245. -        temp = (SAMPLETYPE *)SOUNDTOUCH_ALIGN_POINTER_16(tempUnaligned);
  246. +        temp = (SAMPLETYPE *)SOUNDTOUCH_ALIGN_POINTER(tempUnaligned);
  247.          if (samplesInBuffer)
  248.          {
  249.              memcpy(temp, ptrBegin(), samplesInBuffer * channels * sizeof(SAMPLETYPE));
  250. --- soundtouch-1.9.2.orig/source/SoundTouch/FIRFilter.cpp   Sun Sep 20 16:40:59 2015
  251. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/FIRFilter.cpp   Sat Feb 13 00:04:07 2016
  252. @@ -303,6 +303,24 @@
  253.  
  254.      // Check if MMX/SSE instruction set extensions supported by CPU
  255.  
  256. +#ifdef SOUNDTOUCH_ALLOW_AVX2
  257. +    // AVX2 routines available only with integer sample types
  258. +    if (uExtensions & SUPPORT_AVX2)
  259. +    {
  260. +        return ::new FIRFilterAVX2;
  261. +    }
  262. +    else
  263. +#endif
  264. +
  265. +#ifdef SOUNDTOUCH_ALLOW_SSE2
  266. +    // SSE2 routines available only with integer sample types
  267. +    if (uExtensions & SUPPORT_SSE2)
  268. +    {
  269. +        return ::new FIRFilterSSE2;
  270. +    }
  271. +    else
  272. +#endif // SOUNDTOUCH_ALLOW_SSE2
  273. +
  274.  #ifdef SOUNDTOUCH_ALLOW_MMX
  275.      // MMX routines available only with integer sample types
  276.      if (uExtensions & SUPPORT_MMX)
  277. --- soundtouch-1.9.2.orig/source/SoundTouch/FIRFilter.h Sun Sep 20 16:40:59 2015
  278. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/FIRFilter.h Sat Feb 13 00:04:07 2016
  279. @@ -141,6 +141,42 @@
  280.  
  281.  #endif // SOUNDTOUCH_ALLOW_SSE
  282.  
  283. +
  284. +#ifdef SOUNDTOUCH_ALLOW_SSE2
  285. +    /// Class that implements SSE2 optimized functions exclusive for 16bit integer samples type.
  286. +    class FIRFilterSSE2 : public FIRFilter
  287. +    {
  288. +    protected:
  289. +        short *filterCoeffsUnalign;
  290. +        short *filterCoeffsAlign;
  291. +
  292. +        virtual uint evaluateFilterStereo(short *dest, const short *src, uint numSamples) const;
  293. +    public:
  294. +        FIRFilterSSE2();
  295. +        ~FIRFilterSSE2();
  296. +
  297. +        virtual void setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor);
  298. +    };
  299. +
  300. +#endif // SOUNDTOUCH_ALLOW_SSE2
  301. +
  302. +#ifdef SOUNDTOUCH_ALLOW_AVX2
  303. +    /// Class that implements AVX2 optimized functions exclusive for 16bit integer samples type.
  304. +    class FIRFilterAVX2 : public FIRFilter
  305. +    {
  306. +    protected:
  307. +        short *filterCoeffsUnalign;
  308. +        short *filterCoeffsAlign;
  309. +
  310. +        virtual uint evaluateFilterStereo(short *dest, const short *src, uint numSamples) const;
  311. +    public:
  312. +        FIRFilterAVX2();
  313. +        ~FIRFilterAVX2();
  314. +
  315. +        virtual void setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor);
  316. +    };
  317. +#endif // SOUNDTOUCH_ALLOW_AVX2
  318. +
  319.  }
  320.  
  321.  #endif  // FIRFilter_H
  322. --- soundtouch-1.9.2.orig/source/SoundTouch/mmx_optimized.cpp   Sun Sep 20 16:40:59 2015
  323. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/mmx_optimized.cpp   Sat Feb 13 09:53:14 2016
  324. @@ -316,8 +316,8 @@
  325.  
  326.      // Ensure that filter coeffs array is aligned to 16-byte boundary
  327.      delete[] filterCoeffsUnalign;
  328. -    filterCoeffsUnalign = new short[2 * newLength + 8];
  329. -    filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
  330. +    filterCoeffsUnalign = new short[2 * newLength + (ALIGN_SIZE)/sizeof(short)];
  331. +    filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER(filterCoeffsUnalign);
  332.  
  333.      // rearrange the filter coefficients for mmx routines
  334.      for (i = 0;i < length; i += 4)
  335. --- soundtouch-1.9.2.orig/source/SoundTouch/sse_optimized.cpp   Sun Sep 20 16:40:59 2015
  336. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/sse_optimized.cpp   Sat Feb 13 09:53:22 2016
  337. @@ -227,8 +227,8 @@
  338.      // also rearrange coefficients suitably for SSE
  339.      // Ensure that filter coeffs array is aligned to 16-byte boundary
  340.      delete[] filterCoeffsUnalign;
  341. -    filterCoeffsUnalign = new float[2 * newLength + 4];
  342. -    filterCoeffsAlign = (float *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
  343. +    filterCoeffsUnalign = new float[2 * newLength + (ALIGN_SIZE)/sizeof(float)];
  344. +    filterCoeffsAlign = (float *)SOUNDTOUCH_ALIGN_POINTER(filterCoeffsUnalign);
  345.  
  346.      fDivider = (float)resultDivider;
  347.  
  348. @@ -370,3 +370,543 @@
  349.  }
  350.  
  351.  #endif  // SOUNDTOUCH_ALLOW_SSE
  352. +
  353. +#ifdef SOUNDTOUCH_ALLOW_SSE2
  354. +
  355. +// SSE2 routines available only with integer sample type
  356. +// Also refer to MMX optimized routines.
  357. +
  358. +//////////////////////////////////////////////////////////////////////////////
  359. +//
  360. +// implementation of SSE2 optimized functions of class 'TDStretchSSE2'
  361. +//
  362. +//////////////////////////////////////////////////////////////////////////////
  363. +
  364. +#include "TDStretch.h"
  365. +#include <emmintrin.h>
  366. +#include <math.h>
  367. +
  368. +// Calculates cross correlation of two buffers
  369. +double TDStretchSSE2::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm)
  370. +{
  371. +    const   __m128i *pVec1      = (__m128i*)pV1;    // not 16byte aligned
  372. +    const   __m128i *pVec2      = (__m128i*)pV2;    // 16byte aligned
  373. +    const   __m128i shifter     = _mm_cvtsi32_si128(overlapDividerBitsNorm);
  374. +            __m128i accu        = _mm_setzero_si128();
  375. +            __m128i normaccu    = _mm_setzero_si128();
  376. +            __m128i v;                              // for temporary
  377. +
  378. +    // Process 16 samples (8 stereo sample pairs or 16 mono samples) in parallel
  379. +    // during each round for improved CPU-level parallelization.
  380. +
  381. +    for (int i = channels*overlapLength/16 ; i ; i--)
  382. +    {
  383. +        // Applies shifter immediately after product-sum to prevent overflow
  384. +        __m128i n0 = _mm_loadu_si128(pVec1);
  385. +        __m128i n1 = _mm_loadu_si128(pVec1+1);
  386. +        __m128i a0 = _mm_madd_epi16(n0, *pVec2++); // a0 = pVec1[0] * pVec2[0]
  387. +                n0 = _mm_madd_epi16(n0, n0);       // n0 = pVec1[0]^2
  388. +        __m128i a1 = _mm_madd_epi16(n1, *pVec2++); // a1 = pVec1[1] * pVec2[1]
  389. +                n1 = _mm_madd_epi16(n1, n1);       // n1 = pVec1[1]^2
  390. +                a0 = _mm_sra_epi32(a0, shifter);   // right arithmetic shift
  391. +                n0 = _mm_sra_epi32(n0, shifter);
  392. +                a1 = _mm_sra_epi32(a1, shifter);
  393. +                n1 = _mm_sra_epi32(n1, shifter);
  394. +        accu     = _mm_add_epi32(accu, a0);        // add to accumulator
  395. +        normaccu = _mm_add_epi32(normaccu, n0);
  396. +        accu     = _mm_add_epi32(accu, a1);
  397. +        normaccu = _mm_add_epi32(normaccu, n1);
  398. +        pVec1 += 2;
  399. +    }
  400. +    // sum total
  401. +    v    = _mm_srli_si128(accu, 4);
  402. +    accu = _mm_add_epi32(v, accu);
  403. +    v    = _mm_srli_si128(accu, 8);
  404. +    accu = _mm_add_epi32(v, accu);
  405. +    v        = _mm_srli_si128(normaccu, 4);
  406. +    normaccu = _mm_add_epi32(v, normaccu);
  407. +    v        = _mm_srli_si128(normaccu, 8);
  408. +    normaccu = _mm_add_epi32(v, normaccu);
  409. +
  410. +    __m128d Vcorr  = _mm_cvtepi32_pd(accu);        // int32 to double
  411. +    __m128d Vdnorm = _mm_cvtepi32_pd(normaccu);
  412. +    _mm_store_sd(&dnorm, Vdnorm);                  // feedback to dnorm
  413. +
  414. +    if (_mm_cvtsi128_si32(normaccu) > 0) {
  415. +        Vdnorm = _mm_sqrt_sd(Vdnorm, Vdnorm);
  416. +        Vcorr = _mm_div_sd(Vcorr, Vdnorm);
  417. +    }
  418. +    return _mm_cvtsd_f64(Vcorr);
  419. +}
  420. +
  421. +
  422. +/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
  423. +double TDStretchSSE2::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm)
  424. +{
  425. +    const   __m128i *pVec1     = (__m128i*)pV1;    // (unaligned)
  426. +    const   __m128i *pVec1prev = pVec1;            // for previous round normalizer
  427. +    const   __m128i *pVec2     = (__m128i*)pV2;    // (aligned)
  428. +    const   __m128i shifter    = _mm_cvtsi32_si128(overlapDividerBitsNorm);
  429. +            __m128i accu       = _mm_setzero_si128();
  430. +            __m128i norm       = _mm_setzero_si128();
  431. +            __m128i v;                              // for temporary
  432. +            __m128d vd;                             // for temporary
  433. +
  434. +    // Process 16 samples (8 stereo sample pairs or 16 mono samples) in parallel
  435. +    // during each round for improved CPU-level parallelization.
  436. +    for (int i = channels * overlapLength / 16 ; i ; i--)
  437. +    {
  438. +        // Applies shifter immediately after product-sum to prevent overflow
  439. +        const __m128i vec1[] = {
  440. +            _mm_loadu_si128(pVec1),
  441. +            _mm_loadu_si128(pVec1+1)
  442. +        };
  443. +        __m128i v1 = _mm_madd_epi16(vec1[0], pVec2[0]);
  444. +                v1 = _mm_sra_epi32(v1, shifter);
  445. +        __m128i v2 = _mm_madd_epi16(vec1[1], pVec2[1]);
  446. +                v2 = _mm_sra_epi32(v2, shifter);
  447. +        pVec1 += 2;
  448. +        accu = _mm_add_epi32(accu, v1);
  449. +        pVec2 += 2;
  450. +        accu = _mm_add_epi32(accu, v2);
  451. +    }
  452. +    v    = _mm_srli_si128(accu, 8);
  453. +    accu = _mm_add_epi32(v, accu);
  454. +    v    = _mm_srli_si128(accu, 4);
  455. +    accu = _mm_add_epi32(v, accu); // accu.m128i_i32[0] is sum total
  456. +
  457. +    // update normalizer with last samples of this round, and previous round
  458. +    for (int ch = channels; ch > 0; ch -= sizeof(*pVec1)/sizeof(*pV1)) {
  459. +        const __m128i vth = _mm_set_epi16(0,1,2,3,4,5,6,7);
  460. +        const __m128i vch = _mm_set1_epi16(ch);
  461. +        const __m128i vMask = _mm_cmpgt_epi16(vch, vth);
  462. +        __m128i vThis = _mm_loadu_si128(--pVec1);
  463. +        __m128i vPrev = _mm_loadu_si128(--pVec1prev);
  464. +        vThis = _mm_and_si128(vThis, vMask); // this round
  465. +        vPrev = _mm_and_si128(vPrev, vMask); // previous round
  466. +
  467. +        vThis = _mm_madd_epi16(vThis, vThis);
  468. +        vThis = _mm_sra_epi32(vThis, shifter);
  469. +        vPrev = _mm_madd_epi16(vPrev, vPrev);
  470. +        vPrev = _mm_sra_epi32(vPrev, shifter);
  471. +        norm  = _mm_add_epi32(norm, vThis);
  472. +        norm  = _mm_sub_epi32(norm, vPrev);
  473. +    }
  474. +    v    = _mm_srli_si128(norm, 8);
  475. +    norm = _mm_add_epi32(norm, v);
  476. +    v    = _mm_srli_si128(norm, 4);
  477. +    norm = _mm_add_epi32(norm, v); // norm.m128i_i32[0] is sum total
  478. +
  479. +    #if defined _WIN64
  480. +    __m128d Vcorr  = _mm_cvtepi32_pd(accu);
  481. +    __m128d Vdnorm = _mm_cvtepi32_pd(norm);
  482. +    vd = _mm_load_sd(&dnorm);
  483. +    Vdnorm = _mm_add_sd(vd, Vdnorm);
  484. +    _mm_store_sd(&dnorm, Vdnorm);  // feedback to dnorm
  485. +
  486. +    const __m128d dmin = _mm_set_sd(1e-9);
  487. +    if (_mm_comige_sd(Vdnorm, dmin)) {
  488. +        Vdnorm = _mm_sqrt_sd(Vdnorm, Vdnorm);
  489. +        Vcorr = _mm_div_sd(Vcorr, Vdnorm);
  490. +    }
  491. +    return _mm_cvtsd_f64(Vcorr);
  492. +    #else
  493. +    // This path is faster than the branch above on Pentium 4, but may be slower on x64.
  494. +    __m128d Vdnorm = _mm_cvtepi32_pd(norm);
  495. +    vd = _mm_load_sd(&dnorm);
  496. +    Vdnorm = _mm_add_sd(vd, Vdnorm);
  497. +    _mm_store_sd(&dnorm, Vdnorm);  // feedback to dnorm
  498. +
  499. +    double corr;
  500. +    vd = _mm_cvtepi32_pd(accu);
  501. +    _mm_store_sd(&corr, vd);
  502. +
  503. +    const __m128d dmin = _mm_set_sd(1e-9);
  504. +    if (_mm_comige_sd(Vdnorm, dmin)) {
  505. +        return corr / sqrt(dnorm); // x87 code may be generated here even with /arch:SSE2
  506. +    }
  507. +    return corr;
  508. +    #endif
  509. +}
  510. +
  511. +
  512. +// SSE2-optimized version of the function overlapStereo
  513. +void TDStretchSSE2::overlapStereo(short *output, const short *input)
  514. +{
  515. +    const   __m128i *pVinput  = (__m128i*)input;                // (unaligned)
  516. +    const   __m128i *pVMidBuf = (__m128i*)pMidBuffer;           // (aligned)
  517. +    const   __m128i shifter   = _mm_cvtsi32_si128(overlapDividerBitsPure + 1);
  518. +            // note: since _mm_set_epi16() is slow on Pentium 4, _mm_set_epi32() is used instead.
  519. +            __m128i adder     = _mm_set1_epi32(0x2fffe);        // [ 2, -2, 2, -2, 2, -2, 2, -2 ]
  520. +            __m128i mix1      = _mm_set_epi32(
  521. +                0x10000 | (unsigned short)(overlapLength-1),    // (short)[ 1, overlapLength-1,
  522. +                0x10000 | (unsigned short)(overlapLength-1),    //          1, overlapLength-1,
  523. +                          (unsigned short)overlapLength,        //          0, overlapLength,
  524. +                          (unsigned short)overlapLength);       //          0, overlapLength ]
  525. +            __m128i mix2      = _mm_add_epi16(mix1, adder);
  526. +            __m128i *pVdest   = (__m128i*)output;               // (unaligned)
  527. +    adder = _mm_add_epi16(adder, adder);
  528. +
  529. +    for (int i = overlapLength / 4 ; i ; i--)
  530. +    {
  531. +        const __m128i vi = _mm_loadu_si128(pVinput);
  532. +        const __m128i vm = _mm_load_si128(pVMidBuf);
  533. +        __m128i v1 = _mm_unpacklo_epi16(vm, vi);
  534. +        __m128i v2 = _mm_unpackhi_epi16(vm, vi);
  535. +        v1 = _mm_madd_epi16(v1, mix1);
  536. +        v2 = _mm_madd_epi16(v2, mix2);
  537. +        v1 = _mm_sra_epi32(v1, shifter);
  538. +        v2 = _mm_sra_epi32(v2, shifter);
  539. +        v1 = _mm_packs_epi32(v1, v2);
  540. +        _mm_storeu_si128(pVdest, v1);
  541. +
  542. +        mix1 = _mm_add_epi16(mix1, adder);
  543. +        mix2 = _mm_add_epi16(mix2, adder);
  544. +        pVMidBuf++;
  545. +        pVinput++;
  546. +        pVdest++;
  547. +    }
  548. +}
  549. +
  550. +
  551. +//////////////////////////////////////////////////////////////////////////////
  552. +//
  553. +// implementation of SSE2 optimized functions of class 'FIRFilter'
  554. +//
  555. +//////////////////////////////////////////////////////////////////////////////
  556. +
  557. +#include "FIRFilter.h"
  558. +
  559. +FIRFilterSSE2::FIRFilterSSE2() : FIRFilter()
  560. +{
  561. +    filterCoeffsAlign = NULL;
  562. +    filterCoeffsUnalign = NULL;
  563. +}
  564. +
  565. +
  566. +FIRFilterSSE2::~FIRFilterSSE2()
  567. +{
  568. +    delete[] filterCoeffsUnalign;
  569. +}
  570. +
  571. +
  572. +// (overloaded) Calculates filter coefficients for SSE2 routine
  573. +void FIRFilterSSE2::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
  574. +{
  575. +    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
  576. +
  577. +    // Ensure that the filter coeffs array is aligned to an ALIGN_SIZE-byte boundary
  578. +    delete[] filterCoeffsUnalign;
  579. +    filterCoeffsUnalign = new short[2 * newLength + (ALIGN_SIZE)/sizeof(short)];
  580. +    filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER(filterCoeffsUnalign);
  581. +    __m128i *VfilterCoeffsAlign = (__m128i*)filterCoeffsAlign;
  582. +
  583. +    // rearrange the filter coefficients for SSE2 routines
  584. +    for (uint i = 0; i < length; i += 4)
  585. +    {
  586. +        __m128i v = _mm_loadl_epi64((__m128i*)(coeffs + i)); // 3, 2, 1, 0
  587. +        v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 1, 2, 0)); // 3, 1, 2, 0
  588. +        v = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 0, 0));   // 3, 1, 3, 1, 2, 0, 2, 0
  589. +        _mm_store_si128(VfilterCoeffsAlign++, v);
  590. +    }
  591. +}
  592. +
  593. +
  594. +// sse2-optimized version of the filter routine for stereo sound
  595. +uint FIRFilterSSE2::evaluateFilterStereo(short *dest, const short *src, uint numSamples) const
  596. +{
  597. +    if (length < 2) return 0;
  598. +
  599. +    short *pVdest = dest;
  600. +
  601. +    for (uint i = (numSamples - length) >> 1 ; i ; i--)
  602. +    {
  603. +        const   __m128i *pVsrc    = (__m128i*)src;
  604. +        const   __m128i *pVfilter = (__m128i*)filterCoeffsAlign; //16byte aligned
  605. +                __m128i accu1     = _mm_setzero_si128();
  606. +                __m128i accu2     = _mm_setzero_si128();
  607. +
  608. +        for (uint j = lengthDiv8 * 2; j ; j--)
  609. +        {
  610. +            //           accu1                accu2
  611. +            // r0: s00*f00 + s04*f01    s02*f00 + s06*f01
  612. +            // r1: s01*f02 + s05*f03    s03*f02 + s07*f03
  613. +            // r2: s02*f04 + s06*f05    s04*f04 + s08*f05
  614. +            // r3: s03*f06 + s07*f07    s05*f06 + s09*f07
  615. +                  __m128i v0 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+0));
  616. +                  __m128i v2 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+2));
  617. +            const __m128i v4 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+4));
  618. +            const __m128i v6 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+6));
  619. +            const __m128i vf = _mm_load_si128(pVfilter);
  620. +            v0 = _mm_unpacklo_epi16(v0, v4);
  621. +            v2 = _mm_unpacklo_epi16(v2, v6);
  622. +            v0 = _mm_madd_epi16(v0, vf);
  623. +            v2 = _mm_madd_epi16(v2, vf);
  624. +            pVsrc++;
  625. +            accu1 = _mm_add_epi32(accu1, v0);
  626. +            pVfilter++;
  627. +            accu2 = _mm_add_epi32(accu2, v2);
  628. +        }
  629. +        // r0: accu1 - s00*f00 + s04*f01 + s02*f04 + s06*f05
  630. +        // r1:         s01*f02 + s05*f03 + s03*f06 + s07*f07
  631. +        // r2: accu2 - s02*f00 + s06*f01 + s04*f04 + s08*f05
  632. +        // r3:         s03*f02 + s07*f03 + s05*f06 + s09*f07
  633. +        const __m128i v1 = _mm_srli_si128(accu1, 8);
  634. +        const __m128i v2 = _mm_srli_si128(accu2, 8);
  635. +        accu1 = _mm_add_epi32(accu1, v1);
  636. +        accu2 = _mm_add_epi32(accu2, v2);
  637. +        accu1 = _mm_unpacklo_epi64(accu1, accu2);
  638. +        accu1 = _mm_srai_epi32(accu1, resultDivFactor);
  639. +        accu1 = _mm_packs_epi32(accu1, accu1);
  640. +        _mm_storel_epi64((__m128i*)pVdest, accu1);
  641. +        src += 4;
  642. +        pVdest += 4;
  643. +    }
  644. +    return (numSamples & -2) - length;
  645. +}
  646. +#endif  // SOUNDTOUCH_ALLOW_SSE2
  647. +
  648. +#ifdef SOUNDTOUCH_ALLOW_AVX2
  649. +
  650. +//////////////////////////////////////////////////////////////////////////////
  651. +//
  652. +// implementation of AVX2 optimized functions of class 'TDStretchAVX2'
  653. +//
  654. +//////////////////////////////////////////////////////////////////////////////
  655. +
  656. +#include "TDStretch.h"
  657. +#include <immintrin.h>
  658. +#include <math.h>
  659. +
  660. +// defined SOUNDTOUCH_INTEGER_SAMPLES
  661. +
  662. +// Calculates cross correlation of two buffers
  663. +double TDStretchAVX2::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm)
  664. +{
  665. +    const   __m256i *pVec1      = (__m256i*)pV1;    // not 32byte aligned
  666. +    const   __m256i *pVec2      = (__m256i*)pV2;    // 32byte aligned
  667. +    const   __m256i shifter     = _mm256_set1_epi32(overlapDividerBitsNorm);
  668. +            __m256i accu        = _mm256_setzero_si256();
  669. +            __m256i normaccu    = _mm256_setzero_si256();
  670. +
  671. +    // Process 16 samples (8 stereo sample pairs or 16 mono samples) in parallel
  672. +    // during each round for improved CPU-level parallelization.
  673. +
  674. +    for (int i = channels*overlapLength/16 ; i ; i--)
  675. +    {
  676. +        // Applies shifter immediately after product-sum to prevent overflow
  677. +        const __m256i v  = _mm256_loadu_si256(pVec1);
  678. +        __m256i v1 = _mm256_madd_epi16(v, *pVec2++);
  679. +        __m256i v2 = _mm256_madd_epi16(v, v);
  680. +                v1 = _mm256_srav_epi32(v1, shifter);
  681. +                v2 = _mm256_srav_epi32(v2, shifter);
  682. +        accu       = _mm256_add_epi32(accu, v1);
  683. +        normaccu   = _mm256_add_epi32(normaccu, v2);
  684. +        pVec1++;
  685. +    }
  686. +    normaccu = _mm256_hadd_epi32(normaccu, accu);
  687. +    __m128i vNorm = _mm256_extracti128_si256(normaccu, 1);
  688. +    _mm256_zeroupper();
  689. +    vNorm = _mm_add_epi32(_mm256_castsi256_si128(normaccu), vNorm);
  690. +    vNorm = _mm_hadd_epi32(vNorm, vNorm); // r1=sum(accu), r0=sum(normaccu)
  691. +
  692. +    __m128d vdNorm = _mm_cvtepi32_pd(vNorm);                              // xmm:r0=(double)sum(normaccu)
  693. +    __m128d vdCorr = _mm_shuffle_pd(vdNorm, vdNorm, _MM_SHUFFLE2(0,1));   // xmm:r0=(double)sum(accu)
  694. +    _mm_store_sd(&dnorm, vdNorm);
  695. +
  696. +    if (_mm_cvtsi128_si32(vNorm) > 0) {
  697. +        vdNorm = _mm_sqrt_sd(vdNorm, vdNorm);
  698. +        vdCorr = _mm_div_sd(vdCorr, vdNorm);
  699. +    }
  700. +    return _mm_cvtsd_f64(vdCorr);
  701. +}
  702. +
  703. +
  704. +/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
  705. +double TDStretchAVX2::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm)
  706. +{
  707. +    const   __m256i *pVec1      = (__m256i*)pV1;    // (unaligned)
  708. +    const   __m256i *pVec1prev  = pVec1;            // for previous round normalizer
  709. +    const   __m256i *pVec2      = (__m256i*)pV2;    // (32byte aligned)
  710. +    const   __m256i shifter     = _mm256_set1_epi32(overlapDividerBitsNorm);
  711. +    const   __m256i chThreshold = _mm256_set_epi16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
  712. +            __m256i accu        = _mm256_setzero_si256();
  713. +            __m256i norm        = _mm256_setzero_si256();
  714. +
  715. +    // Process 16 samples (8 stereo sample pairs or 16 mono samples) in parallel
  716. +    // during each round for improved CPU-level parallelization.
  717. +    for (int i = channels * overlapLength / 16 ; i ; i--)
  718. +    {
  719. +        // Applies shifter immediately after product-sum to prevent overflow
  720. +        __m256i v0 = _mm256_loadu_si256(pVec1++);
  721. +                v0 = _mm256_madd_epi16(v0, *pVec2++);
  722. +                v0 = _mm256_srav_epi32(v0, shifter);
  723. +              accu = _mm256_add_epi32(accu, v0);
  724. +    }
  725. +    __m128i vcorr = _mm256_extracti128_si256(accu, 1);
  726. +    vcorr = _mm_hadd_epi32(_mm256_castsi256_si128(accu), vcorr);
  727. +    vcorr = _mm_hadd_epi32(vcorr, /* unused */ vcorr);
  728. +    vcorr = _mm_hadd_epi32(vcorr, /* unused */ vcorr);  // xmm:r0 = sum total
  729. +
  730. +    // update normalizer with last samples of this round, and previous round
  731. +    for (int ch = channels; ch > 0; ch -= sizeof(*pVec1)/sizeof(*pV1)) {
  732. +        const __m256i restCh = _mm256_set1_epi16(ch);
  733. +        const __m256i vMask = _mm256_cmpgt_epi16(restCh, chThreshold);
  734. +              __m256i vThis = _mm256_loadu_si256(--pVec1);
  735. +              __m256i vPrev = _mm256_loadu_si256(--pVec1prev);
  736. +                      vThis = _mm256_and_si256(vThis, vMask);
  737. +                      vPrev = _mm256_and_si256(vPrev, vMask);
  738. +                      vThis = _mm256_madd_epi16(vThis, vThis);
  739. +                      vPrev = _mm256_madd_epi16(vPrev, vPrev);
  740. +                      vThis = _mm256_srav_epi32(vThis, shifter);
  741. +                      vPrev = _mm256_srav_epi32(vPrev, shifter);
  742. +        norm = _mm256_add_epi32(norm, vThis);
  743. +        norm = _mm256_sub_epi32(norm, vPrev);
  744. +    }
  745. +    __m128i vnorm = _mm256_extracti128_si256(norm, 1);
  746. +    _mm256_zeroupper();
  747. +    vnorm = _mm_hadd_epi32(_mm256_castsi256_si128(norm), vnorm);
  748. +    vnorm = _mm_hadd_epi32(vnorm, /* unused */ vnorm);
  749. +    vnorm = _mm_hadd_epi32(vnorm, /* unused */ vnorm);  // xmm:r0 = sum total
  750. +
  751. +    __m128d vdcorr = _mm_cvtepi32_pd(vcorr);
  752. +    __m128d vdnorm = _mm_cvtepi32_pd(vnorm);
  753. +    __m128d vd     = _mm_load_sd(&dnorm);
  754. +            vdnorm = _mm_add_sd(vdnorm, vd);
  755. +    _mm_store_sd(&dnorm, vdnorm);
  756. +
  757. +    const __m128d vdmin = _mm_set_sd(1e-9);
  758. +    if (_mm_comige_sd(vdnorm, vdmin)) {
  759. +        vdnorm = _mm_sqrt_sd(vdnorm, /* unused */ vdnorm);
  760. +        vdcorr = _mm_div_sd(vdcorr, vdnorm);
  761. +    }
  762. +    return _mm_cvtsd_f64(vdcorr);
  763. +}
  764. +
  765. +
  766. +// AVX2-optimized version of the function overlapStereo
  767. +void TDStretchAVX2::overlapStereo(short *output, const short *input)
  768. +{
  769. +    const   __m128i *pVinput  = (__m128i*)input;                // (not aligned)
  770. +    const   __m128i *pVMidBuf = (__m128i*)pMidBuffer;           // (32byte aligned)
  771. +    const   __m256i shifter   = _mm256_set1_epi32(overlapDividerBitsPure + 1);
  772. +    const   __m256i adder     = _mm256_set1_epi32(0x4fffc);     // [ 4,-4, 4,-4, 4,-4, 4,-4]
  773. +            __m128i *pVdest   = (__m128i*)output;               // (not aligned)
  774. +            __m256i mix       = _mm256_set_epi32(
  775. +                0x30000 | (unsigned short)(overlapLength-3),    // 3, overlapLength-3,
  776. +                0x30000 | (unsigned short)(overlapLength-3),
  777. +                0x20000 | (unsigned short)(overlapLength-2),    // 2, overlapLength-2,
  778. +                0x20000 | (unsigned short)(overlapLength-2),
  779. +                0x10000 | (unsigned short)(overlapLength-1),    // 1, overlapLength-1,
  780. +                0x10000 | (unsigned short)(overlapLength-1),
  781. +                          (unsigned short)overlapLength,        // 0, overlapLength,
  782. +                          (unsigned short)overlapLength);
  783. +
  784. +    for (int i = overlapLength / 4; i; i--)
  785. +    {
  786. +        const __m128i vinput  = _mm_loadu_si128(pVinput++);
  787. +        const __m128i vmidBuf = *pVMidBuf++;
  788. +        __m128i vh   = _mm_unpackhi_epi16(vmidBuf, vinput);
  789. +        __m128i vl   = _mm_unpacklo_epi16(vmidBuf, vinput);
  790. +        __m256i dest = _mm256_inserti128_si256(_mm256_castsi128_si256(vl), vh, 1);
  791. +                dest = _mm256_madd_epi16(dest, mix);
  792. +                dest = _mm256_srav_epi32(dest, shifter);
  793. +        __m128i v    = _mm256_extracti128_si256(dest, 1);
  794. +                v    = _mm_packs_epi32(_mm256_castsi256_si128(dest), v);
  795. +        _mm_storeu_si128(pVdest++, v);
  796. +        mix = _mm256_add_epi16(mix, adder); // update overlap multiplier
  797. +    }
  798. +    _mm256_zeroupper();
  799. +}
  800. +
  801. +
  802. +//////////////////////////////////////////////////////////////////////////////
  803. +//
  804. +// implementation of AVX2 optimized functions of class 'FIRFilter'
  805. +//
  806. +//////////////////////////////////////////////////////////////////////////////
  807. +
  808. +#include "FIRFilter.h"
  809. +
  810. +FIRFilterAVX2::FIRFilterAVX2() : FIRFilter()
  811. +{
  812. +    filterCoeffsAlign = NULL;
  813. +    filterCoeffsUnalign = NULL;
  814. +}
  815. +
  816. +
  817. +FIRFilterAVX2::~FIRFilterAVX2()
  818. +{
  819. +    delete[] filterCoeffsUnalign;
  820. +}
  821. +
  822. +
  823. +// (overloaded) Calculates filter coefficients for AVX2 routine
  824. +void FIRFilterAVX2::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
  825. +{
  826. +    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
  827. +
  828. +    // Ensure that the filter coeffs array is aligned to an ALIGN_SIZE-byte boundary
  829. +    delete[] filterCoeffsUnalign;
  830. +    filterCoeffsUnalign = new short[2 * newLength + (ALIGN_SIZE)/sizeof(short)];
  831. +    filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER(filterCoeffsUnalign);
  832. +    __m128i *VfilterCoeffsAlign = (__m128i*)filterCoeffsAlign;
  833. +
  834. +    // rearrange the filter coefficients for the AVX2 routine (same 128-bit layout as SSE2)
  835. +    for (uint i = 0; i < length; i += 4)
  836. +    {
  837. +        __m128i v = _mm_loadl_epi64((__m128i*)(coeffs + i)); // 3, 2, 1, 0
  838. +        v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 1, 2, 0)); // 3, 1, 2, 0
  839. +        v = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 0, 0));   // 3, 1, 3, 1, 2, 0, 2, 0
  840. +        _mm_store_si128(VfilterCoeffsAlign++, v);
  841. +    }
  842. +}
  843. +
  844. +
  845. +// AVX2-optimized version of the filter routine for stereo sound
  846. +uint FIRFilterAVX2::evaluateFilterStereo(short *dest, const short *src, uint numSamples) const
  847. +{
  848. +    if (length < 2) return 0;
  849. +
  850. +    short *pVdest = dest;
  851. +
  852. +    for (uint i = (numSamples - length) >> 1 ; i ; i--)
  853. +    {
  854. +        const   __m256i *pVsrc    = (__m256i*)src;
  855. +        const   __m256i *pVfilter = (__m256i*)filterCoeffsAlign; // 32byte aligned
  856. +                __m256i accu1     = _mm256_setzero_si256();
  857. +                __m256i accu2     = _mm256_setzero_si256();
  858. +
  859. +        for (uint j = lengthDiv8; j ; j--)
  860. +        {
  861. +            const __m256i vfilter = _mm256_load_si256(pVfilter);
  862. +                  __m256i v0 = _mm256_loadu_si256((__m256i*)((short*)pVsrc+0));
  863. +                  __m256i v2 = _mm256_loadu_si256((__m256i*)((short*)pVsrc+2));
  864. +            const __m256i v4 = _mm256_srli_si256(v0, 8);
  865. +            const __m256i v6 = _mm256_srli_si256(v2, 8);
  866. +                          v0 = _mm256_unpacklo_epi16(v0, v4);
  867. +                          v2 = _mm256_unpacklo_epi16(v2, v6);
  868. +                          v0 = _mm256_madd_epi16(v0, vfilter);
  869. +                          v2 = _mm256_madd_epi16(v2, vfilter);
  870. +            accu1 = _mm256_add_epi32(accu1, v0);
  871. +            accu2 = _mm256_add_epi32(accu2, v2);
  872. +            pVsrc++;
  873. +            pVfilter++;
  874. +        }
  875. +        accu1 = _mm256_shuffle_epi32(accu1, _MM_SHUFFLE(3,1,2,0));
  876. +        accu2 = _mm256_shuffle_epi32(accu2, _MM_SHUFFLE(3,1,2,0));
  877. +        accu1 = _mm256_hadd_epi32(accu1, accu2);
  878. +
  879. +        __m128i accu = _mm256_extracti128_si256(accu1, 1);
  880. +        accu = _mm_add_epi32(_mm256_castsi256_si128(accu1), accu);
  881. +        accu = _mm_srai_epi32(accu, resultDivFactor);
  882. +        accu = _mm_packs_epi32(accu, /* unused */ accu);
  883. +        _mm_storel_epi64((__m128i*)pVdest, accu);
  884. +
  885. +        src += 4;
  886. +        pVdest += 4;
  887. +    }
  888. +    _mm256_zeroupper();
  889. +    return (numSamples & -2) - length;
  890. +}
  891. +#endif  // SOUNDTOUCH_ALLOW_AVX2
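
The SSE2 and AVX2 calcCrossCorr routines above vectorize a simple scalar loop: a product-sum of the two 16-bit buffers (the correlation) and of the first buffer with itself (the norm), each pairwise sum right-shifted by overlapDividerBitsNorm so the 32-bit accumulators do not overflow, with the correlation finally divided by the square root of the norm. A plain scalar sketch for reference (not part of the patch; names follow TDStretch):

    // Scalar reference for the vectorized cross-correlation routines above.
    #include <math.h>

    double calcCrossCorrScalar(const short *pV1, const short *pV2,
                               int channels, int overlapLength,
                               int overlapDividerBitsNorm, double &dnorm)
    {
        long long corr = 0, norm = 0;
        for (int i = 0; i < channels * overlapLength; i += 2)
        {
            // shift each pairwise product-sum, just like the SIMD code does after _mm_madd_epi16
            corr += (pV1[i] * pV2[i] + pV1[i + 1] * pV2[i + 1]) >> overlapDividerBitsNorm;
            norm += (pV1[i] * pV1[i] + pV1[i + 1] * pV1[i + 1]) >> overlapDividerBitsNorm;
        }
        dnorm = (double)norm;
        return (norm > 0) ? (double)corr / sqrt((double)norm) : (double)corr;
    }
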
  892. --- soundtouch-1.9.2.orig/source/SoundTouch/TDStretch.cpp   Sun Sep 20 16:40:59 2015
  893. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/TDStretch.cpp   Sat Feb 13 00:04:07 2016
  894. @@ -721,9 +721,9 @@
  895.      {
  896.          delete[] pMidBufferUnaligned;
  897.  
  898. -        pMidBufferUnaligned = new SAMPLETYPE[overlapLength * channels + 16 / sizeof(SAMPLETYPE)];
  899. +        pMidBufferUnaligned = new SAMPLETYPE[overlapLength * channels + (ALIGN_SIZE) / sizeof(SAMPLETYPE)];
  900.          // ensure that 'pMidBuffer' is aligned to 16 byte boundary for efficiency
  901. -        pMidBuffer = (SAMPLETYPE *)SOUNDTOUCH_ALIGN_POINTER_16(pMidBufferUnaligned);
  902. +        pMidBuffer = (SAMPLETYPE *)SOUNDTOUCH_ALIGN_POINTER(pMidBufferUnaligned);
  903.  
  904.          clearMidBuffer();
  905.      }
  906. @@ -748,6 +748,24 @@
  907.  
  908.      // Check if MMX/SSE instruction set extensions supported by CPU
  909.  
  910. +#ifdef SOUNDTOUCH_ALLOW_AVX2
  911. +    // AVX2 routines available
  912. +    if (uExtensions & SUPPORT_AVX2)
  913. +    {
  914. +        return ::new TDStretchAVX2;
  915. +    }
  916. +    else
  917. +#endif
  918. +
  919. +#ifdef SOUNDTOUCH_ALLOW_SSE2
  920. +    // SSE2 routines available only with integer sample types
  921. +    if (uExtensions & SUPPORT_SSE2)
  922. +    {
  923. +        return ::new TDStretchSSE2;
  924. +    }
  925. +    else
  926. +#endif // SOUNDTOUCH_ALLOW_SSE2
  927. +
  928.  #ifdef SOUNDTOUCH_ALLOW_MMX
  929.      // MMX routines available only with integer sample types
  930.      if (uExtensions & SUPPORT_MMX)
  931. --- soundtouch-1.9.2.orig/source/SoundTouch/TDStretch.h Sun Sep 20 16:40:59 2015
  932. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/TDStretch.h Sat Feb 13 00:04:07 2016
  933. @@ -277,5 +277,29 @@
  934.  
  935.  #endif /// SOUNDTOUCH_ALLOW_SSE
  936.  
  937. +
  938. +#ifdef SOUNDTOUCH_ALLOW_SSE2
  939. +    /// Class that implements SSE2 optimized routines for 16bit integer samples type.
  940. +    class TDStretchSSE2 : public TDStretch
  941. +    {
  942. +    protected:
  943. +        double calcCrossCorr(const short *mixingPos, const short *compare, double &norm);
  944. +        double calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm);
  945. +        virtual void overlapStereo(short *output, const short *input);
  946. +    };
  947. +
  948. +#endif /// SOUNDTOUCH_ALLOW_SSE2
  949. +
  950. +#ifdef SOUNDTOUCH_ALLOW_AVX2
  951. +    /// Class that implements AVX2 optimized routines for 16bit integer samples type.
  952. +    class TDStretchAVX2 : public TDStretch
  953. +    {
  954. +    protected:
  955. +        double calcCrossCorr(const short *mixingPos, const short *compare, double &norm);
  956. +        double calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm);
  957. +        virtual void overlapStereo(short *output, const short *input);
  958. +    };
  959. +#endif /// SOUNDTOUCH_ALLOW_AVX2
  960. +
  961.  }
  962.  #endif  /// TDStretch_H
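
The diff is made against a pristine soundtouch-1.9.2 source tree, so it should apply from inside the unpacked directory with the usual strip-one-component invocation, e.g. patch -p1 < SoundTouch-1.9.2_SIMD.patch (the exact filename is whatever this paste was saved as). Note that the new SSE2/AVX2 paths exist only for 16-bit integer samples, which is why the STTypes.h hunk at the top switches the GCC default from SOUNDTOUCH_FLOAT_SAMPLES to SOUNDTOUCH_INTEGER_SAMPLES.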