
SoundTouch-1.9.2_SIMD.patch

a guest
Feb 12th, 2016
  1. --- soundtouch-1.9.2.orig/include/STTypes.h Sun Sep 20 16:40:59 2015
  2. +++ soundtouch-1.9.2.SIMD/include/STTypes.h Sat Feb 13 00:04:07 2016
  3. @@ -50,8 +50,9 @@
  4.  #endif
  5.  
  6.  
  7. -// Helper macro for aligning pointer up to next 16-byte boundary
  8. -#define SOUNDTOUCH_ALIGN_POINTER_16(x)      ( ( (ulongptr)(x) + 15 ) & ~(ulongptr)15 )
  9. +// Helper macro for aligning a pointer up to the next ALIGN_SIZE-byte (64-byte) boundary
  10. +#define ALIGN_SIZE 64
  11. +#define SOUNDTOUCH_ALIGN_POINTER(x)    (((ulongptr)(x) + (ALIGN_SIZE)-1) & ~(ulongptr)((ALIGN_SIZE)-1))
  12.  
  13.  
  14.  #if (defined(__GNUC__) && !defined(ANDROID))
  15. @@ -98,8 +99,8 @@
  16.          ///   However, if you still prefer to select the sample format here
  17.          ///   also in GNU environment, then please #undef the INTEGER_SAMPLE
  18.          ///   and FLOAT_SAMPLE defines first as in comments above.
  19. -        //#define SOUNDTOUCH_INTEGER_SAMPLES     1    //< 16bit integer samples
  20. -        #define SOUNDTOUCH_FLOAT_SAMPLES       1    //< 32bit float samples
  21. +        #define SOUNDTOUCH_INTEGER_SAMPLES     1    //< 16bit integer samples
  22. +        //#define SOUNDTOUCH_FLOAT_SAMPLES       1    //< 32bit float samples
  23.      
  24.      #endif
  25.  
  26. @@ -143,8 +144,14 @@
  27.          #endif // SOUNDTOUCH_FLOAT_SAMPLES
  28.  
  29.          #ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
  30. -            // Allow MMX optimizations
  31. -            #define SOUNDTOUCH_ALLOW_MMX   1
  32. +            // Allow SSE2 optimizations
  33. +            #define SOUNDTOUCH_ALLOW_SSE2      1
  34. +            // Allow AVX2 optimizations
  35. +            #define SOUNDTOUCH_ALLOW_AVX2      1
  36. +            #ifndef _M_X64
  37. +                // Allow MMX optimizations
  38. +                #define SOUNDTOUCH_ALLOW_MMX   1
  39. +            #endif
  40.          #endif
  41.  
  42.      #else
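
The rewritten macro generalizes the old 16-byte alignment to ALIGN_SIZE = 64 bytes, which satisfies SSE2 (16-byte), AVX2 (32-byte) and cache-line alignment at once. A minimal self-contained sketch of the over-allocate-then-align idiom the patch applies throughout (uintptr_t stands in for SoundTouch's ulongptr here):

    // Sketch only: over-allocate, then round the pointer up to the next 64-byte boundary.
    #include <cstdint>
    #include <cstdio>

    #define ALIGN_SIZE 64
    #define SOUNDTOUCH_ALIGN_POINTER(x) (((uintptr_t)(x) + (ALIGN_SIZE)-1) & ~(uintptr_t)((ALIGN_SIZE)-1))

    int main()
    {
        // Allocate ALIGN_SIZE/sizeof(element) extra elements so the rounded-up pointer stays in bounds.
        short *unaligned = new short[1024 + ALIGN_SIZE / sizeof(short)];
        short *aligned   = (short *)SOUNDTOUCH_ALIGN_POINTER(unaligned);
        printf("%p -> %p (aligned %% 64 == %d)\n", (void *)unaligned, (void *)aligned,
               (int)((uintptr_t)aligned % 64));
        delete[] unaligned;   // free the original pointer, never the aligned one
        return 0;
    }
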
  43. --- soundtouch-1.9.2.orig/source/SoundTouch/cpu_detect.h    Sun Sep 20 16:40:59 2015
  44. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/cpu_detect.h    Sat Feb 13 03:25:37 2016
  45. @@ -50,6 +50,12 @@
  46.  #define SUPPORT_ALTIVEC     0x0004
  47.  #define SUPPORT_SSE         0x0008
  48.  #define SUPPORT_SSE2        0x0010
  49. +#define SUPPORT_AVX         0x0020
  50. +#define SUPPORT_XOP         0x0040
  51. +#define SUPPORT_FMA4        0x0080
  52. +#define SUPPORT_FMA3        0x0100
  53. +#define SUPPORT_AVX2        0x0200
  54. +#define SUPPORT_AVX512      0x0400
  55.  
  56.  /// Checks which instruction set extensions are supported by the CPU.
  57.  ///
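
SUPPORT_AVX2 joins the existing bitmask returned by detectCPUextensions(), so callers keep testing individual bits. A hedged usage sketch (assuming the function declarations in cpu_detect.h are otherwise unchanged):

    // Sketch: inspect the detected extension bits and report which integer path would be taken.
    #include <cstdio>
    #include "cpu_detect.h"

    void reportBestIntegerPath()
    {
        uint ext = detectCPUextensions();          // OR-combined SUPPORT_* bits, minus disabled ones
        if      (ext & SUPPORT_AVX2) printf("AVX2 routines\n");
        else if (ext & SUPPORT_SSE2) printf("SSE2 routines\n");
        else if (ext & SUPPORT_MMX)  printf("MMX routines\n");
        else                         printf("plain C routines\n");
    }
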
  58. --- soundtouch-1.9.2.orig/source/SoundTouch/cpu_detect_x86.cpp  Sun Sep 20 16:40:59 2015
  59. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/cpu_detect_x86.cpp  Sat Feb 13 03:29:07 2016
  60. @@ -39,6 +39,7 @@
  61.  //
  62.  ////////////////////////////////////////////////////////////////////////////////
  63.  
  64. +#include <stdint.h>
  65.  #include "cpu_detect.h"
  66.  #include "STTypes.h"
  67.  
  68. @@ -48,14 +49,24 @@
  69.     #if defined(__GNUC__) && defined(__i386__)
  70.         // gcc
  71.         #include "cpuid.h"
  72. -   #elif defined(_M_IX86)
  73. +   #elif defined(_M_IX86) || defined(_M_X64)
  74.         // windows non-gcc
  75.         #include <intrin.h>
  76.     #endif
  77.  
  78. -   #define bit_MMX     (1 << 23)
  79. -   #define bit_SSE     (1 << 25)
  80. -   #define bit_SSE2    (1 << 26)
  81. +   #define bit_MMX      (1 << 23)  // func 01: edx
  82. +   #define bit_SSE      (1 << 25)  // func 01: edx
  83. +   #define bit_SSE2     (1 << 26)  // func 01: edx
  84. +   #define bit_OSXSAVE  (1 << 27)  // func 01: ecx
  85. +   #define bit_AVX      (1 << 28)  // func 01: ecx
  86. +   #define bit_XOP      (1 << 11)  // func 0x80000001: ecx
  87. +   #define bit_FMA4     (1 << 16)  // func 0x80000001: ecx
  88. +   #define bit_FMA3     (1 << 12)  // func 01: ecx
  89. +   #define bit_AVX2     (1 <<  5)  // func 07 ecx=0: ebx
  90. +   #define bit_AVX512F  (1 << 16)  // func 07 ecx=0: ebx
  91. +   #define bit_AVX512PF (1 << 26)  // func 07 ecx=0: ebx
  92. +   #define bit_AVX512ER (1 << 27)  // func 07 ecx=0: ebx
  93. +   #define bit_AVX512CD (1 << 28)  // func 07 ecx=0: ebx
  94.  #endif
  95.  
  96.  
  97. @@ -82,31 +93,77 @@
  98.  /// If building for a 64bit system (no Itanium) and the user wants optimizations.
  99.  /// Return the OR of SUPPORT_{MMX,SSE,SSE2}. 11001 or 0x19.
  100.  /// Keep the _dwDisabledISA test (2 more operations, could be eliminated).
  101. +/*
  102.  #if ((defined(__GNUC__) && defined(__x86_64__)) \
  103.      || defined(_M_X64))  \
  104.      && defined(SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS)
  105.      return 0x19 & ~_dwDisabledISA;
  106. -
  107. +*/
  108.  /// If building for a 32bit system and the user wants optimizations.
  109.  /// Keep the _dwDisabledISA test (2 more operations, could be eliminated).
  110. -#elif ((defined(__GNUC__) && defined(__i386__)) \
  111. -    || defined(_M_IX86))  \
  112. +#if ((defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) \
  113. +    || (defined(_M_IX86) || defined(_M_X64)))  \
  114.      && defined(SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS)
  115.  
  116.      if (_dwDisabledISA == 0xffffffff) return 0;
  117. -
  118. +
  119. +    enum { UNKNOWN, INTEL, AMD } vendor = UNKNOWN;
  120. +    const uint32_t strIntel[] = { 0x756e6547, 0x49656e69, 0x6c65746e }; // "GenuineIntel"
  121. +    const uint32_t strAmd[]   = { 0x68747541, 0x69746E65, 0x444D4163 }; // "AuthenticAMD"
  122. +
  123.      uint res = 0;
  124.  
  125.  #if defined(__GNUC__)
  126.      // GCC version of cpuid. Requires GCC 4.3.0 or later for __cpuid intrinsic support.
  127. +    //    AVX support requires GCC 4.4 or later.
  128. +    //    AVX2 support requires GCC 4.7 or later.
  129.      uint eax, ebx, ecx, edx;  // unsigned int is the standard type. uint is defined by the compiler and not guaranteed to be portable.
  130.  
  131.      // Check if no cpuid support.
  132. -    if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx)) return 0; // always disable extensions.
  133. +    if (!__get_cpuid (0, &eax, &ebx, &ecx, &edx)) return 0; // always disable extensions.
  134.  
  135. -    if (edx & bit_MMX)  res = res | SUPPORT_MMX;
  136. -    if (edx & bit_SSE)  res = res | SUPPORT_SSE;
  137. -    if (edx & bit_SSE2) res = res | SUPPORT_SSE2;
  138. +    uint cpuidMaxFuncNum = eax;
  139. +    if      (ebx == strIntel[0] && edx == strIntel[1] && ecx == strIntel[2]) vendor = INTEL;
  140. +    else if (ebx == strAmd[0]   && edx == strAmd[1]   && ecx == strAmd[2])   vendor = AMD;
  141. +    __get_cpuid(1, &eax, &ebx, &ecx, &edx);
  142. +
  143. +    #if defined(__x86_64__)
  144. +                                        res = res | SUPPORT_SSE2 | SUPPORT_SSE | SUPPORT_MMX;
  145. +    #else
  146. +    if (edx & bit_MMX)                  res = res | SUPPORT_MMX;
  147. +    if (edx & bit_SSE)                  res = res | SUPPORT_SSE;
  148. +    if (edx & bit_SSE2)                 res = res | SUPPORT_SSE2;
  149. +    #endif
  150. +
  151. +    // Check for AVX only if GCC version is 4.4 or later
  152. +    #if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4
  153. +    if ((ecx & (bit_AVX | bit_OSXSAVE)) == (bit_AVX | bit_OSXSAVE))
  154. +    {
  155. +        #ifdef __APPLE__
  156. +         __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));    // MacPorts
  157. +        #else
  158. +        __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
  159. +        #endif
  160. +        uint64_t xcr0 = ((uint64_t)edx << 32LL) | eax;
  161. +        if ((xcr0 & 0x06) == 0x06)
  162. +        {
  163. +                                        res = res | SUPPORT_AVX;
  164. +            if (ecx & bit_FMA3)         res = res | SUPPORT_FMA3;
  165. +            if (cpuidMaxFuncNum >= 7)
  166. +            {
  167. +                __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
  168. +                if (ebx & bit_AVX2)     res = res | SUPPORT_AVX2;
  169. +                if ((ebx & bit_AVX512F) && ((xcr0 & 0xe0) == 0xe0))
  170. +                                        res = res | SUPPORT_AVX512;
  171. +            }
  172. +            if (vendor == AMD) {
  173. +                __get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
  174. +                if (ecx & bit_XOP)      res = res | SUPPORT_XOP;
  175. +                if (ecx & bit_FMA4)     res = res | SUPPORT_FMA4;
  176. +            }
  177. +        }
  178. +    }
  179. +    #endif
  180.  
  181.  #else
  182.      // Window / VS version of cpuid. Notice that Visual Studio 2005 or later required
  183. @@ -117,10 +174,45 @@
  184.      __cpuid(reg,0);
  185.      if ((unsigned int)reg[0] == 0) return 0; // always disable extensions.
  186.  
  187. +    int cpuidMaxFuncNum = reg[0];
  188. +    if      ((unsigned int)reg[1] == strIntel[0] && (unsigned int)reg[3] == strIntel[1] &&
  189. +             (unsigned int)reg[2] == strIntel[2]) vendor = INTEL;
  190. +    else if ((unsigned int)reg[1] == strAmd[0] && (unsigned int)reg[3] == strAmd[1] &&
  191. +             (unsigned int)reg[2] == strAmd[2]) vendor = AMD;
  192.      __cpuid(reg,1);
  193. -    if ((unsigned int)reg[3] & bit_MMX)  res = res | SUPPORT_MMX;
  194. -    if ((unsigned int)reg[3] & bit_SSE)  res = res | SUPPORT_SSE;
  195. -    if ((unsigned int)reg[3] & bit_SSE2) res = res | SUPPORT_SSE2;
  196. +    #if defined(_M_X64)
  197. +                        // note: MMX intrinsics cannot be compiled by Visual C++ for x64,
  198. +                        //       but the hardware and Windows still support the instructions.
  199. +                                                        res = res | SUPPORT_SSE2 | SUPPORT_SSE | SUPPORT_MMX;
  200. +    #else
  201. +    if ((unsigned int)reg[3] & bit_MMX)                 res = res | SUPPORT_MMX;
  202. +    if ((unsigned int)reg[3] & bit_SSE)                 res = res | SUPPORT_SSE;
  203. +    if ((unsigned int)reg[3] & bit_SSE2)                res = res | SUPPORT_SSE2;
  204. +    #endif
  205. +
  206. +    // Does the compiler support AVX? (i.e. is _XCR_XFEATURE_ENABLED_MASK defined?) -- VC++, etc.
  207. +    //#if (_MSC_FULL_VER >= 160040219)
  208. +    #ifdef _XCR_XFEATURE_ENABLED_MASK
  209. +    if (((unsigned int)reg[2] & (bit_AVX | bit_OSXSAVE)) == (bit_AVX | bit_OSXSAVE)) {
  210. +        uint64_t xcr0 = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
  211. +        if ((xcr0 & 0x06) == 0x06) {
  212. +            //                                            res = res | SUPPORT_AVX;
  213. +            //if ((unsigned int)reg[2] & bit_FMA3)        res = res | SUPPORT_FMA3;
  214. +            if (cpuidMaxFuncNum >= 7)
  215. +            {
  216. +                __cpuidex(reg, 7, 0);
  217. +                if ((unsigned int)reg[1] & bit_AVX2)    res = res | SUPPORT_AVX2;
  218. +            //    if (((unsigned int)reg[1] & bit_AVX512F) && (xcr0 & 0xe0) == 0xe0)
  219. +            //                                            res = res | SUPPORT_AVX512;
  220. +            //}
  221. +            //if (vendor == AMD) {
  222. +            //    __cpuid(reg, 0x80000001);
  223. +            //    if ((unsigned int)reg[2] & bit_XOP)     res = res | SUPPORT_XOP;
  224. +            //    if ((unsigned int)reg[2] & bit_FMA4)    res = res | SUPPORT_FMA4;
  225. +            }
  226. +        }
  227. +    }
  228. +    #endif
  229.  
  230.  #endif
  231.  
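
Both detection paths above implement the standard three-step AVX test: CPUID leaf 1 must report AVX and OSXSAVE, XGETBV(0) must show the OS saving XMM and YMM state (XCR0 bits 1-2), and only then is CPUID leaf 7 consulted for the AVX2 bit. A standalone GCC/Clang sketch of the same check, outside the patch, assuming a cpuid.h that provides __get_cpuid_count:

    // Sketch: minimal runtime AVX2 check mirroring the logic patched into cpu_detect_x86.cpp.
    #include <cpuid.h>
    #include <stdint.h>

    static bool cpuSupportsAvx2()
    {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return false;
        const unsigned int need = (1u << 27) | (1u << 28);            // OSXSAVE | AVX (leaf 1, ECX)
        if ((ecx & need) != need) return false;
        uint32_t lo, hi;
        __asm__ __volatile__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(0)); // read XCR0
        if ((lo & 0x06) != 0x06) return false;                        // OS saves XMM + YMM state
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) return false;
        return (ebx & (1u << 5)) != 0;                                // leaf 7, EBX bit 5 = AVX2
    }
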
  232. --- soundtouch-1.9.2.orig/source/SoundTouch/FIFOSampleBuffer.cpp    Sun Sep 20 16:40:59 2015
  233. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/FIFOSampleBuffer.cpp    Sat Feb 13 00:04:07 2016
  234. @@ -171,13 +171,13 @@
  235.          // enlarge the buffer in 4kbyte steps (round up to next 4k boundary)
  236.          sizeInBytes = (capacityRequirement * channels * sizeof(SAMPLETYPE) + 4095) & (uint)-4096;
  237.          assert(sizeInBytes % 2 == 0);
  238. -        tempUnaligned = new SAMPLETYPE[sizeInBytes / sizeof(SAMPLETYPE) + 16 / sizeof(SAMPLETYPE)];
  239. +        tempUnaligned = new SAMPLETYPE[sizeInBytes / sizeof(SAMPLETYPE) + (ALIGN_SIZE) / sizeof(SAMPLETYPE)];
  240.          if (tempUnaligned == NULL)
  241.          {
  242.              ST_THROW_RT_ERROR("Couldn't allocate memory!\n");
  243.          }
  244.          // Align the buffer to begin at 16byte cache line boundary for optimal performance
  245. -        temp = (SAMPLETYPE *)SOUNDTOUCH_ALIGN_POINTER_16(tempUnaligned);
  246. +        temp = (SAMPLETYPE *)SOUNDTOUCH_ALIGN_POINTER(tempUnaligned);
  247.          if (samplesInBuffer)
  248.          {
  249.              memcpy(temp, ptrBegin(), samplesInBuffer * channels * sizeof(SAMPLETYPE));
  250. --- soundtouch-1.9.2.orig/source/SoundTouch/FIRFilter.cpp   Sun Sep 20 16:40:59 2015
  251. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/FIRFilter.cpp   Sat Feb 13 00:04:07 2016
  252. @@ -303,6 +303,24 @@
  253.  
  254.      // Check if MMX/SSE instruction set extensions supported by CPU
  255.  
  256. +#ifdef SOUNDTOUCH_ALLOW_AVX2
  257. +    // AVX2 routines available only with integer sample types
  258. +    if (uExtensions & SUPPORT_AVX2)
  259. +    {
  260. +        return ::new FIRFilterAVX2;
  261. +    }
  262. +    else
  263. +#endif
  264. +
  265. +#ifdef SOUNDTOUCH_ALLOW_SSE2
  266. +    // SSE2 routines available only with integer sample types
  267. +    if (uExtensions & SUPPORT_SSE2)
  268. +    {
  269. +        return ::new FIRFilterSSE2;
  270. +    }
  271. +    else
  272. +#endif // SOUNDTOUCH_ALLOW_SSE2
  273. +
  274.  #ifdef SOUNDTOUCH_ALLOW_MMX
  275.      // MMX routines available only with integer sample types
  276.      if (uExtensions & SUPPORT_MMX)
  277. --- soundtouch-1.9.2.orig/source/SoundTouch/FIRFilter.h Sun Sep 20 16:40:59 2015
  278. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/FIRFilter.h Sat Feb 13 00:04:07 2016
  279. @@ -141,6 +141,42 @@
  280.  
  281.  #endif // SOUNDTOUCH_ALLOW_SSE
  282.  
  283. +
  284. +#ifdef SOUNDTOUCH_ALLOW_SSE2
  285. +    /// Class that implements SSE2 optimized functions exclusive for 16bit integer samples type.
  286. +    class FIRFilterSSE2 : public FIRFilter
  287. +    {
  288. +    protected:
  289. +        short *filterCoeffsUnalign;
  290. +        short *filterCoeffsAlign;
  291. +
  292. +        virtual uint evaluateFilterStereo(short *dest, const short *src, uint numSamples) const;
  293. +    public:
  294. +        FIRFilterSSE2();
  295. +        ~FIRFilterSSE2();
  296. +
  297. +        virtual void setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor);
  298. +    };
  299. +
  300. +#endif // SOUNDTOUCH_ALLOW_SSE2
  301. +
  302. +#ifdef SOUNDTOUCH_ALLOW_AVX2
  303. +    /// Class that implements AVX2 optimized functions exclusive for 16bit integer samples type.
  304. +    class FIRFilterAVX2 : public FIRFilter
  305. +    {
  306. +    protected:
  307. +        short *filterCoeffsUnalign;
  308. +        short *filterCoeffsAlign;
  309. +
  310. +        virtual uint evaluateFilterStereo(short *dest, const short *src, uint numSamples) const;
  311. +    public:
  312. +        FIRFilterAVX2();
  313. +        ~FIRFilterAVX2();
  314. +
  315. +        virtual void setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor);
  316. +    };
  317. +#endif // SOUNDTOUCH_ALLOW_AVX2
  318. +
  319.  }
  320.  
  321.  #endif  // FIRFilter_H
  322. --- soundtouch-1.9.2.orig/source/SoundTouch/mmx_optimized.cpp   Sun Sep 20 16:40:59 2015
  323. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/mmx_optimized.cpp   Sat Feb 13 09:53:14 2016
  324. @@ -316,8 +316,8 @@
  325.  
  326.      // Ensure that filter coeffs array is aligned to 16-byte boundary
  327.      delete[] filterCoeffsUnalign;
  328. -    filterCoeffsUnalign = new short[2 * newLength + 8];
  329. -    filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
  330. +    filterCoeffsUnalign = new short[2 * newLength + (ALIGN_SIZE)/sizeof(short)];
  331. +    filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER(filterCoeffsUnalign);
  332.  
  333.      // rearrange the filter coefficients for mmx routines
  334.      for (i = 0;i < length; i += 4)
  335. --- soundtouch-1.9.2.orig/source/SoundTouch/sse_optimized.cpp   Sun Sep 20 16:40:59 2015
  336. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/sse_optimized.cpp   Sat Feb 13 09:53:22 2016
  337. @@ -227,8 +227,8 @@
  338.      // also rearrange coefficients suitably for SSE
  339.      // Ensure that filter coeffs array is aligned to 16-byte boundary
  340.      delete[] filterCoeffsUnalign;
  341. -    filterCoeffsUnalign = new float[2 * newLength + 4];
  342. -    filterCoeffsAlign = (float *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
  343. +    filterCoeffsUnalign = new float[2 * newLength + (ALIGN_SIZE)/sizeof(float)];
  344. +    filterCoeffsAlign = (float *)SOUNDTOUCH_ALIGN_POINTER(filterCoeffsUnalign);
  345.  
  346.      fDivider = (float)resultDivider;
  347.  
  348. @@ -370,3 +370,543 @@
  349.  }
  350.  
  351.  #endif  // SOUNDTOUCH_ALLOW_SSE
  352. +
  353. +#ifdef SOUNDTOUCH_ALLOW_SSE2
  354. +
  355. +// SSE2 routines available only with integer sample type
  356. +// Also refer to MMX optimized routines.
  357. +
  358. +//////////////////////////////////////////////////////////////////////////////
  359. +//
  360. +// implementation of SSE2 optimized functions of class 'TDStretchSSE2'
  361. +//
  362. +//////////////////////////////////////////////////////////////////////////////
  363. +
  364. +#include "TDStretch.h"
  365. +#include <emmintrin.h>
  366. +#include <math.h>
  367. +
  368. +// Calculates cross correlation of two buffers
  369. +double TDStretchSSE2::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm)
  370. +{
  371. +    const   __m128i *pVec1      = (__m128i*)pV1;    // not 16byte aligned
  372. +    const   __m128i *pVec2      = (__m128i*)pV2;    // 16byte aligned
  373. +    const   __m128i shifter     = _mm_cvtsi32_si128(overlapDividerBitsNorm);
  374. +            __m128i accu        = _mm_setzero_si128();
  375. +            __m128i normaccu    = _mm_setzero_si128();
  376. +            __m128i v;                              // for temporary
  377. +
  378. +    // Process 16 samples (8 stereo sample pairs or 16 mono samples) in parallel
  379. +    // during each round for improved CPU-level parallelization.
  380. +
  381. +    for (int i = channels*overlapLength/16 ; i ; i--)
  382. +    {
  383. +        // Applies shifter immediately after product-sum to prevent overflow
  384. +        __m128i n0 = _mm_loadu_si128(pVec1);
  385. +        __m128i n1 = _mm_loadu_si128(pVec1+1);
  386. +        __m128i a0 = _mm_madd_epi16(n0, *pVec2++); // a0 = pVec1[0] * pVec2[0]
  387. +                n0 = _mm_madd_epi16(n0, n0);       // n0 = pVec1[0]^2
  388. +        __m128i a1 = _mm_madd_epi16(n1, *pVec2++); // a1 = pVec1[1] * pVec2[1]
  389. +                n1 = _mm_madd_epi16(n1, n1);       // n1 = pVec1[1]^2
  390. +                a0 = _mm_sra_epi32(a0, shifter);   // right arithmetic shift
  391. +                n0 = _mm_sra_epi32(n0, shifter);
  392. +                a1 = _mm_sra_epi32(a1, shifter);
  393. +                n1 = _mm_sra_epi32(n1, shifter);
  394. +        accu     = _mm_add_epi32(accu, a0);        // add to accumulator
  395. +        normaccu = _mm_add_epi32(normaccu, n0);
  396. +        accu     = _mm_add_epi32(accu, a1);
  397. +        normaccu = _mm_add_epi32(normaccu, n1);
  398. +        pVec1 += 2;
  399. +    }
  400. +    // sum total
  401. +    v    = _mm_srli_si128(accu, 4);
  402. +    accu = _mm_add_epi32(v, accu);
  403. +    v    = _mm_srli_si128(accu, 8);
  404. +    accu = _mm_add_epi32(v, accu);
  405. +    v        = _mm_srli_si128(normaccu, 4);
  406. +    normaccu = _mm_add_epi32(v, normaccu);
  407. +    v        = _mm_srli_si128(normaccu, 8);
  408. +    normaccu = _mm_add_epi32(v, normaccu);
  409. +
  410. +    __m128d Vcorr  = _mm_cvtepi32_pd(accu);        // int32 to double
  411. +    __m128d Vdnorm = _mm_cvtepi32_pd(normaccu);
  412. +    _mm_store_sd(&dnorm, Vdnorm);                  // feedback to dnorm
  413. +
  414. +    if (_mm_cvtsi128_si32(normaccu) > 0) {
  415. +        Vdnorm = _mm_sqrt_sd(Vdnorm, Vdnorm);
  416. +        Vcorr = _mm_div_sd(Vcorr, Vdnorm);
  417. +    }
  418. +    return _mm_cvtsd_f64(Vcorr);
  419. +}
  420. +
  421. +
  422. +/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
  423. +double TDStretchSSE2::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm)
  424. +{
  425. +    const   __m128i *pVec1     = (__m128i*)pV1;    // (unaligned)
  426. +    const   __m128i *pVec1prev = pVec1;            // for previous round normalizer
  427. +    const   __m128i *pVec2     = (__m128i*)pV2;    // (aligned)
  428. +    const   __m128i shifter    = _mm_cvtsi32_si128(overlapDividerBitsNorm);
  429. +            __m128i accu       = _mm_setzero_si128();
  430. +            __m128i norm       = _mm_setzero_si128();
  431. +            __m128i v;                              // for temporary
  432. +            __m128d vd;                             // for temporary
  433. +
  434. +    // Process 16 samples (8 stereo sample pairs or 16 mono samples) in parallel
  435. +    // during each round for improved CPU-level parallelization.
  436. +    for (int i = channels * overlapLength / 16 ; i ; i--)
  437. +    {
  438. +        // Applies shifter immediately after product-sum to prevent overflow
  439. +        const __m128i vec1[] = {
  440. +            _mm_loadu_si128(pVec1),
  441. +            _mm_loadu_si128(pVec1+1)
  442. +        };
  443. +        __m128i v1 = _mm_madd_epi16(vec1[0], pVec2[0]);
  444. +                v1 = _mm_sra_epi32(v1, shifter);
  445. +        __m128i v2 = _mm_madd_epi16(vec1[1], pVec2[1]);
  446. +                v2 = _mm_sra_epi32(v2, shifter);
  447. +        pVec1 += 2;
  448. +        accu = _mm_add_epi32(accu, v1);
  449. +        pVec2 += 2;
  450. +        accu = _mm_add_epi32(accu, v2);
  451. +    }
  452. +    v    = _mm_srli_si128(accu, 8);
  453. +    accu = _mm_add_epi32(v, accu);
  454. +    v    = _mm_srli_si128(accu, 4);
  455. +    accu = _mm_add_epi32(v, accu); // accu.m128i_i32[0] is sum total
  456. +
  457. +    // update normalizer with last samples of this round, and previous round
  458. +    for (int ch = channels; ch > 0; ch -= sizeof(*pVec1)/sizeof(*pV1)) {
  459. +        const __m128i vth = _mm_set_epi16(0,1,2,3,4,5,6,7);
  460. +        const __m128i vch = _mm_set1_epi16(ch);
  461. +        const __m128i vMask = _mm_cmpgt_epi16(vch, vth);
  462. +        __m128i vThis = _mm_loadu_si128(--pVec1);
  463. +        __m128i vPrev = _mm_loadu_si128(--pVec1prev);
  464. +        vThis = _mm_and_si128(vThis, vMask); // this round
  465. +        vPrev = _mm_and_si128(vPrev, vMask); // previous round
  466. +
  467. +        vThis = _mm_madd_epi16(vThis, vThis);
  468. +        vThis = _mm_sra_epi32(vThis, shifter);
  469. +        vPrev = _mm_madd_epi16(vPrev, vPrev);
  470. +        vPrev = _mm_sra_epi32(vPrev, shifter);
  471. +        norm  = _mm_add_epi32(norm, vThis);
  472. +        norm  = _mm_sub_epi32(norm, vPrev);
  473. +    }
  474. +    v    = _mm_srli_si128(norm, 8);
  475. +    norm = _mm_add_epi32(norm, v);
  476. +    v    = _mm_srli_si128(norm, 4);
  477. +    norm = _mm_add_epi32(norm, v); // norm.m128i_i32[0] is sum total
  478. +
  479. +    #if defined _WIN64
  480. +    __m128d Vcorr  = _mm_cvtepi32_pd(accu);
  481. +    __m128d Vdnorm = _mm_cvtepi32_pd(norm);
  482. +    vd = _mm_load_sd(&dnorm);
  483. +    Vdnorm = _mm_add_sd(vd, Vdnorm);
  484. +    _mm_store_sd(&dnorm, Vdnorm);  // feedback to dnorm
  485. +
  486. +    const __m128d dmin = _mm_set_sd(1e-9);
  487. +    if (_mm_comige_sd(Vdnorm, dmin)) {
  488. +        Vdnorm = _mm_sqrt_sd(Vdnorm, Vdnorm);
  489. +        Vcorr = _mm_div_sd(Vcorr, Vdnorm);
  490. +    }
  491. +    return _mm_cvtsd_f64(Vcorr);
  492. +    #else
  493. +    // This path is faster than the branch above on Pentium 4, but may be slower on x64.
  494. +    __m128d Vdnorm = _mm_cvtepi32_pd(norm);
  495. +    vd = _mm_load_sd(&dnorm);
  496. +    Vdnorm = _mm_add_sd(vd, Vdnorm);
  497. +    _mm_store_sd(&dnorm, Vdnorm);  // feedback to dnorm
  498. +
  499. +    double corr;
  500. +    vd = _mm_cvtepi32_pd(accu);
  501. +    _mm_store_sd(&corr, vd);
  502. +
  503. +    const __m128d dmin = _mm_set_sd(1e-9);
  504. +    if (_mm_comige_sd(Vdnorm, dmin)) {
  505. +        return corr / sqrt(dnorm); // x87 code may be generated here even with /arch:SSE2
  506. +    }
  507. +    return corr;
  508. +    #endif
  509. +}
  510. +
  511. +
  512. +// SSE2-optimized version of the function overlapStereo
  513. +void TDStretchSSE2::overlapStereo(short *output, const short *input)
  514. +{
  515. +    const   __m128i *pVinput  = (__m128i*)input;                // (unaligned)
  516. +    const   __m128i *pVMidBuf = (__m128i*)pMidBuffer;           // (aligned)
  517. +    const   __m128i shifter   = _mm_cvtsi32_si128(overlapDividerBitsPure + 1);
  518. +            // note: since _mm_set_epi16() is slow on Pentium 4, _mm_set_epi32() is used instead.
  519. +            __m128i adder     = _mm_set1_epi32(0x2fffe);        // [ 2, -2, 2, -2, 2, -2, 2, -2 ]
  520. +            __m128i mix1      = _mm_set_epi32(
  521. +                0x10000 | (unsigned short)(overlapLength-1),    // (short)[ 1, overlapLength-1,
  522. +                0x10000 | (unsigned short)(overlapLength-1),    //          1, overlapLength-1,
  523. +                          (unsigned short)overlapLength,        //          0, overlapLength,
  524. +                          (unsigned short)overlapLength);       //          0, overlapLength ]
  525. +            __m128i mix2      = _mm_add_epi16(mix1, adder);
  526. +            __m128i *pVdest   = (__m128i*)output;               // (unaligned)
  527. +    adder = _mm_add_epi16(adder, adder);
  528. +
  529. +    for (int i = overlapLength / 4 ; i ; i--)
  530. +    {
  531. +        const __m128i vi = _mm_loadu_si128(pVinput);
  532. +        const __m128i vm = _mm_load_si128(pVMidBuf);
  533. +        __m128i v1 = _mm_unpacklo_epi16(vm, vi);
  534. +        __m128i v2 = _mm_unpackhi_epi16(vm, vi);
  535. +        v1 = _mm_madd_epi16(v1, mix1);
  536. +        v2 = _mm_madd_epi16(v2, mix2);
  537. +        v1 = _mm_sra_epi32(v1, shifter);
  538. +        v2 = _mm_sra_epi32(v2, shifter);
  539. +        v1 = _mm_packs_epi32(v1, v2);
  540. +        _mm_storeu_si128(pVdest, v1);
  541. +
  542. +        mix1 = _mm_add_epi16(mix1, adder);
  543. +        mix2 = _mm_add_epi16(mix2, adder);
  544. +        pVMidBuf++;
  545. +        pVinput++;
  546. +        pVdest++;
  547. +    }
  548. +}
  549. +
  550. +
  551. +//////////////////////////////////////////////////////////////////////////////
  552. +//
  553. +// implementation of SSE2 optimized functions of class 'FIRFilter'
  554. +//
  555. +//////////////////////////////////////////////////////////////////////////////
  556. +
  557. +#include "FIRFilter.h"
  558. +
  559. +FIRFilterSSE2::FIRFilterSSE2() : FIRFilter()
  560. +{
  561. +    filterCoeffsAlign = NULL;
  562. +    filterCoeffsUnalign = NULL;
  563. +}
  564. +
  565. +
  566. +FIRFilterSSE2::~FIRFilterSSE2()
  567. +{
  568. +    delete[] filterCoeffsUnalign;
  569. +}
  570. +
  571. +
  572. +// (overloaded) Calculates filter coefficients for SSE2 routine
  573. +void FIRFilterSSE2::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
  574. +{
  575. +    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
  576. +
  577. +    // Ensure that the filter coeffs array is aligned to an ALIGN_SIZE-byte boundary
  578. +    delete[] filterCoeffsUnalign;
  579. +    filterCoeffsUnalign = new short[2 * newLength + (ALIGN_SIZE)/sizeof(short)];
  580. +    filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER(filterCoeffsUnalign);
  581. +    __m128i *VfilterCoeffsAlign = (__m128i*)filterCoeffsAlign;
  582. +
  583. +    // rearrange the filter coefficients for SSE2 routines
  584. +    for (uint i = 0; i < length; i += 4)
  585. +    {
  586. +        __m128i v = _mm_loadl_epi64((__m128i*)(coeffs + i)); // 3, 2, 1, 0
  587. +        v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 1, 2, 0)); // 3, 1, 2, 0
  588. +        v = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 0, 0));   // 3, 1, 3, 1, 2, 0, 2, 0
  589. +        _mm_store_si128(VfilterCoeffsAlign++, v);
  590. +    }
  591. +}
  592. +
  593. +
  594. +// sse2-optimized version of the filter routine for stereo sound
  595. +uint FIRFilterSSE2::evaluateFilterStereo(short *dest, const short *src, uint numSamples) const
  596. +{
  597. +    if (length < 2) return 0;
  598. +
  599. +    short *pVdest = dest;
  600. +
  601. +    for (uint i = (numSamples - length) >> 1 ; i ; i--)
  602. +    {
  603. +        const   __m128i *pVsrc    = (__m128i*)src;
  604. +        const   __m128i *pVfilter = (__m128i*)filterCoeffsAlign; //16byte aligned
  605. +                __m128i accu1     = _mm_setzero_si128();
  606. +                __m128i accu2     = _mm_setzero_si128();
  607. +
  608. +        for (uint j = lengthDiv8 * 2; j ; j--)
  609. +        {
  610. +            //           accu1                accu2
  611. +            // r0: s00*f00 + s04*f01    s02*f00 + s06*f01
  612. +            // r1: s01*f02 + s05*f03    s03*f02 + s07*f03
  613. +            // r2: s02*f04 + s06*f05    s04*f04 + s08*f05
  614. +            // r3: s03*f06 + s07*f07    s05*f06 + s09*f07
  615. +                  __m128i v0 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+0));
  616. +                  __m128i v2 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+2));
  617. +            const __m128i v4 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+4));
  618. +            const __m128i v6 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+6));
  619. +            const __m128i vf = _mm_load_si128(pVfilter);
  620. +            v0 = _mm_unpacklo_epi16(v0, v4);
  621. +            v2 = _mm_unpacklo_epi16(v2, v6);
  622. +            v0 = _mm_madd_epi16(v0, vf);
  623. +            v2 = _mm_madd_epi16(v2, vf);
  624. +            pVsrc++;
  625. +            accu1 = _mm_add_epi32(accu1, v0);
  626. +            pVfilter++;
  627. +            accu2 = _mm_add_epi32(accu2, v2);
  628. +        }
  629. +        // r0: accu1 - s00*f00 + s04*f01 + s02*f04 + s06*f05
  630. +        // r1:         s01*f02 + s05*f03 + s03*f06 + s07*f07
  631. +        // r2: accu2 - s02*f00 + s06*f01 + s04*f04 + s08*f05
  632. +        // r3:         s03*f02 + s07*f03 + s05*f06 + s09*f07
  633. +        const __m128i v1 = _mm_srli_si128(accu1, 8);
  634. +        const __m128i v2 = _mm_srli_si128(accu2, 8);
  635. +        accu1 = _mm_add_epi32(accu1, v1);
  636. +        accu2 = _mm_add_epi32(accu2, v2);
  637. +        accu1 = _mm_unpacklo_epi64(accu1, accu2);
  638. +        accu1 = _mm_srai_epi32(accu1, resultDivFactor);
  639. +        accu1 = _mm_packs_epi32(accu1, accu1);
  640. +        _mm_storel_epi64((__m128i*)pVdest, accu1);
  641. +        src += 4;
  642. +        pVdest += 4;
  643. +    }
  644. +    return (numSamples & -2) - length;
  645. +}
  646. +#endif  // SOUNDTOUCH_ALLOW_SSE2
  647. +
  648. +#ifdef SOUNDTOUCH_ALLOW_AVX2
  649. +
  650. +//////////////////////////////////////////////////////////////////////////////
  651. +//
  652. +// implementation of AVX2 optimized functions of class 'TDStretchAVX2'
  653. +//
  654. +//////////////////////////////////////////////////////////////////////////////
  655. +
  656. +#include "TDStretch.h"
  657. +#include <immintrin.h>
  658. +#include <math.h>
  659. +
  660. +// defined SOUNDTOUCH_INTEGER_SAMPLES
  661. +
  662. +// Calculates cross correlation of two buffers
  663. +double TDStretchAVX2::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm)
  664. +{
  665. +    const   __m256i *pVec1      = (__m256i*)pV1;    // not 32byte aligned
  666. +    const   __m256i *pVec2      = (__m256i*)pV2;    // 32byte aligned
  667. +    const   __m256i shifter     = _mm256_set1_epi32(overlapDividerBitsNorm);
  668. +            __m256i accu        = _mm256_setzero_si256();
  669. +            __m256i normaccu    = _mm256_setzero_si256();
  670. +
  671. +    // Process 16 samples (8 stereo sample pairs or 16 mono samples) in parallel
  672. +    // during each round for improved CPU-level parallelization.
  673. +
  674. +    for (int i = channels*overlapLength/16 ; i ; i--)
  675. +    {
  676. +        // Applies shifter immediately after product-sum to prevent overflow
  677. +        const __m256i v  = _mm256_loadu_si256(pVec1);
  678. +        __m256i v1 = _mm256_madd_epi16(v, *pVec2++);
  679. +        __m256i v2 = _mm256_madd_epi16(v, v);
  680. +                v1 = _mm256_srav_epi32(v1, shifter);
  681. +                v2 = _mm256_srav_epi32(v2, shifter);
  682. +        accu       = _mm256_add_epi32(accu, v1);
  683. +        normaccu   = _mm256_add_epi32(normaccu, v2);
  684. +        pVec1++;
  685. +    }
  686. +    normaccu = _mm256_hadd_epi32(normaccu, accu);
  687. +    __m128i vNorm = _mm256_extracti128_si256(normaccu, 1);
  688. +    _mm256_zeroupper();
  689. +    vNorm = _mm_add_epi32(_mm256_castsi256_si128(normaccu), vNorm);
  690. +    vNorm = _mm_hadd_epi32(vNorm, vNorm); // r1=sum(accu), r0=sum(normaccu)
  691. +
  692. +    __m128d vdNorm = _mm_cvtepi32_pd(vNorm);                              // xmm:r0=(double)sum(normaccu)
  693. +    __m128d vdCorr = _mm_shuffle_pd(vdNorm, vdNorm, _MM_SHUFFLE2(0,1));   // xmm:r0=(double)sum(accu)
  694. +    _mm_store_sd(&dnorm, vdNorm);
  695. +
  696. +    if (_mm_cvtsi128_si32(vNorm) > 0) {
  697. +        vdNorm = _mm_sqrt_sd(vdNorm, vdNorm);
  698. +        vdCorr = _mm_div_sd(vdCorr, vdNorm);
  699. +    }
  700. +    return _mm_cvtsd_f64(vdCorr);
  701. +}
  702. +
  703. +
  704. +/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
  705. +double TDStretchAVX2::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm)
  706. +{
  707. +    const   __m256i *pVec1      = (__m256i*)pV1;    // (unaligned)
  708. +    const   __m256i *pVec1prev  = pVec1;            // for previous round normalizer
  709. +    const   __m256i *pVec2      = (__m256i*)pV2;    // (32byte aligned)
  710. +    const   __m256i shifter     = _mm256_set1_epi32(overlapDividerBitsNorm);
  711. +    const   __m256i chThreshold = _mm256_set_epi16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
  712. +            __m256i accu        = _mm256_setzero_si256();
  713. +            __m256i norm        = _mm256_setzero_si256();
  714. +
  715. +    // Process 16 samples (8 stereo sample pairs or 16 mono samples) in parallel
  716. +    // during each round for improved CPU-level parallelization.
  717. +    for (int i = channels * overlapLength / 16 ; i ; i--)
  718. +    {
  719. +        // Applies shifter immediately after product-sum to prevent overflow
  720. +        __m256i v0 = _mm256_loadu_si256(pVec1++);
  721. +                v0 = _mm256_madd_epi16(v0, *pVec2++);
  722. +                v0 = _mm256_srav_epi32(v0, shifter);
  723. +              accu = _mm256_add_epi32(accu, v0);
  724. +    }
  725. +    __m128i vcorr = _mm256_extracti128_si256(accu, 1);
  726. +    vcorr = _mm_hadd_epi32(_mm256_castsi256_si128(accu), vcorr);
  727. +    vcorr = _mm_hadd_epi32(vcorr, /* unused */ vcorr);
  728. +    vcorr = _mm_hadd_epi32(vcorr, /* unused */ vcorr);  // xmm:r0 = sum total
  729. +
  730. +    // update normalizer with last samples of this round, and previous round
  731. +    for (int ch = channels; ch > 0; ch -= sizeof(*pVec1)/sizeof(*pV1)) {
  732. +        const __m256i restCh = _mm256_set1_epi16(ch);
  733. +        const __m256i vMask = _mm256_cmpgt_epi16(restCh, chThreshold);
  734. +              __m256i vThis = _mm256_loadu_si256(--pVec1);
  735. +              __m256i vPrev = _mm256_loadu_si256(--pVec1prev);
  736. +                      vThis = _mm256_and_si256(vThis, vMask);
  737. +                      vPrev = _mm256_and_si256(vPrev, vMask);
  738. +                      vThis = _mm256_madd_epi16(vThis, vThis);
  739. +                      vPrev = _mm256_madd_epi16(vPrev, vPrev);
  740. +                      vThis = _mm256_srav_epi32(vThis, shifter);
  741. +                      vPrev = _mm256_srav_epi32(vPrev, shifter);
  742. +        norm = _mm256_add_epi32(norm, vThis);
  743. +        norm = _mm256_sub_epi32(norm, vPrev);
  744. +    }
  745. +    __m128i vnorm = _mm256_extracti128_si256(norm, 1);
  746. +    _mm256_zeroupper();
  747. +    vnorm = _mm_hadd_epi32(_mm256_castsi256_si128(norm), vnorm);
  748. +    vnorm = _mm_hadd_epi32(vnorm, /* unused */ vnorm);
  749. +    vnorm = _mm_hadd_epi32(vnorm, /* unused */ vnorm);  // xmm:r0 = sum total
  750. +
  751. +    __m128d vdcorr = _mm_cvtepi32_pd(vcorr);
  752. +    __m128d vdnorm = _mm_cvtepi32_pd(vnorm);
  753. +    __m128d vd     = _mm_load_sd(&dnorm);
  754. +            vdnorm = _mm_add_sd(vdnorm, vd);
  755. +    _mm_store_sd(&dnorm, vdnorm);
  756. +
  757. +    const __m128d vdmin = _mm_set_sd(1e-9);
  758. +    if (_mm_comige_sd(vdnorm, vdmin)) {
  759. +        vdnorm = _mm_sqrt_sd(vdnorm, /* unused */ vdnorm);
  760. +        vdcorr = _mm_div_sd(vdcorr, vdnorm);
  761. +    }
  762. +    return _mm_cvtsd_f64(vdcorr);
  763. +}
  764. +
  765. +
  766. +// AVX2-optimized version of the function overlapStereo
  767. +void TDStretchAVX2::overlapStereo(short *output, const short *input)
  768. +{
  769. +    const   __m128i *pVinput  = (__m128i*)input;                // (not aligned)
  770. +    const   __m128i *pVMidBuf = (__m128i*)pMidBuffer;           // (32byte aligned)
  771. +    const   __m256i shifter   = _mm256_set1_epi32(overlapDividerBitsPure + 1);
  772. +    const   __m256i adder     = _mm256_set1_epi32(0x4fffc);     // [ 4,-4, 4,-4, 4,-4, 4,-4]
  773. +            __m128i *pVdest   = (__m128i*)output;               // (not aligned)
  774. +            __m256i mix       = _mm256_set_epi32(
  775. +                0x30000 | (unsigned short)(overlapLength-3),    // 3, overlapLength-3,
  776. +                0x30000 | (unsigned short)(overlapLength-3),
  777. +                0x20000 | (unsigned short)(overlapLength-2),    // 2, overlapLength-2,
  778. +                0x20000 | (unsigned short)(overlapLength-2),
  779. +                0x10000 | (unsigned short)(overlapLength-1),    // 1, overlapLength-1,
  780. +                0x10000 | (unsigned short)(overlapLength-1),
  781. +                          (unsigned short)overlapLength,        // 0, overlapLength,
  782. +                          (unsigned short)overlapLength);
  783. +
  784. +    for (int i = overlapLength / 4; i; i--)
  785. +    {
  786. +        const __m128i vinput  = _mm_loadu_si128(pVinput++);
  787. +        const __m128i vmidBuf = *pVMidBuf++;
  788. +        __m128i vh   = _mm_unpackhi_epi16(vmidBuf, vinput);
  789. +        __m128i vl   = _mm_unpacklo_epi16(vmidBuf, vinput);
  790. +        __m256i dest = _mm256_inserti128_si256(_mm256_castsi128_si256(vl), vh, 1);
  791. +                dest = _mm256_madd_epi16(dest, mix);
  792. +                dest = _mm256_srav_epi32(dest, shifter);
  793. +        __m128i v    = _mm256_extracti128_si256(dest, 1);
  794. +                v    = _mm_packs_epi32(_mm256_castsi256_si128(dest), v);
  795. +        _mm_storeu_si128(pVdest++, v);
  796. +        mix = _mm256_add_epi16(mix, adder); // update overlap multiplier
  797. +    }
  798. +    _mm256_zeroupper();
  799. +}
  800. +
  801. +
  802. +//////////////////////////////////////////////////////////////////////////////
  803. +//
  804. +// implementation of AVX2 optimized functions of class 'FIRFilter'
  805. +//
  806. +//////////////////////////////////////////////////////////////////////////////
  807. +
  808. +#include "FIRFilter.h"
  809. +
  810. +FIRFilterAVX2::FIRFilterAVX2() : FIRFilter()
  811. +{
  812. +    filterCoeffsAlign = NULL;
  813. +    filterCoeffsUnalign = NULL;
  814. +}
  815. +
  816. +
  817. +FIRFilterAVX2::~FIRFilterAVX2()
  818. +{
  819. +    delete[] filterCoeffsUnalign;
  820. +}
  821. +
  822. +
  823. +// (overloaded) Calculates filter coefficients for AVX2 routine
  824. +void FIRFilterAVX2::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
  825. +{
  826. +    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
  827. +
  828. +    // Ensure that the filter coeffs array is aligned to an ALIGN_SIZE-byte boundary
  829. +    delete[] filterCoeffsUnalign;
  830. +    filterCoeffsUnalign = new short[2 * newLength + (ALIGN_SIZE)/sizeof(short)];
  831. +    filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER(filterCoeffsUnalign);
  832. +    __m128i *VfilterCoeffsAlign = (__m128i*)filterCoeffsAlign;
  833. +
  834. +    // rearrange the filter coefficients for the AVX2 routine (same 128-bit layout as SSE2)
  835. +    for (uint i = 0; i < length; i += 4)
  836. +    {
  837. +        __m128i v = _mm_loadl_epi64((__m128i*)(coeffs + i)); // 3, 2, 1, 0
  838. +        v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 1, 2, 0)); // 3, 1, 2, 0
  839. +        v = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 0, 0));   // 3, 1, 3, 1, 2, 0, 2, 0
  840. +        _mm_store_si128(VfilterCoeffsAlign++, v);
  841. +    }
  842. +}
  843. +
  844. +
  845. +// AVX2-optimized version of the filter routine for stereo sound
  846. +uint FIRFilterAVX2::evaluateFilterStereo(short *dest, const short *src, uint numSamples) const
  847. +{
  848. +    if (length < 2) return 0;
  849. +
  850. +    short *pVdest = dest;
  851. +
  852. +    for (uint i = (numSamples - length) >> 1 ; i ; i--)
  853. +    {
  854. +        const   __m256i *pVsrc    = (__m256i*)src;
  855. +        const   __m256i *pVfilter = (__m256i*)filterCoeffsAlign; // 32byte aligned
  856. +                __m256i accu1     = _mm256_setzero_si256();
  857. +                __m256i accu2     = _mm256_setzero_si256();
  858. +
  859. +        for (uint j = lengthDiv8; j ; j--)
  860. +        {
  861. +            const __m256i vfilter = _mm256_load_si256(pVfilter);
  862. +                  __m256i v0 = _mm256_loadu_si256((__m256i*)((short*)pVsrc+0));
  863. +                  __m256i v2 = _mm256_loadu_si256((__m256i*)((short*)pVsrc+2));
  864. +            const __m256i v4 = _mm256_srli_si256(v0, 8);
  865. +            const __m256i v6 = _mm256_srli_si256(v2, 8);
  866. +                          v0 = _mm256_unpacklo_epi16(v0, v4);
  867. +                          v2 = _mm256_unpacklo_epi16(v2, v6);
  868. +                          v0 = _mm256_madd_epi16(v0, vfilter);
  869. +                          v2 = _mm256_madd_epi16(v2, vfilter);
  870. +            accu1 = _mm256_add_epi32(accu1, v0);
  871. +            accu2 = _mm256_add_epi32(accu2, v2);
  872. +            pVsrc++;
  873. +            pVfilter++;
  874. +        }
  875. +        accu1 = _mm256_shuffle_epi32(accu1, _MM_SHUFFLE(3,1,2,0));
  876. +        accu2 = _mm256_shuffle_epi32(accu2, _MM_SHUFFLE(3,1,2,0));
  877. +        accu1 = _mm256_hadd_epi32(accu1, accu2);
  878. +
  879. +        __m128i accu = _mm256_extracti128_si256(accu1, 1);
  880. +        accu = _mm_add_epi32(_mm256_castsi256_si128(accu1), accu);
  881. +        accu = _mm_srai_epi32(accu, resultDivFactor);
  882. +        accu = _mm_packs_epi32(accu, /* unused */ accu);
  883. +        _mm_storel_epi64((__m128i*)pVdest, accu);
  884. +
  885. +        src += 4;
  886. +        pVdest += 4;
  887. +    }
  888. +    _mm256_zeroupper();
  889. +    return (numSamples & -2) - length;
  890. +}
  891. +#endif  // SOUNDTOUCH_ALLOW_AVX2
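
The SSE2 and AVX2 calcCrossCorr routines above vectorize a simple scalar loop: a product-sum of the two 16-bit buffers (the correlation) and of the first buffer with itself (the norm), each pairwise sum right-shifted by overlapDividerBitsNorm so the 32-bit accumulators do not overflow, with the correlation finally divided by the square root of the norm. A plain scalar sketch for reference (not part of the patch; names follow TDStretch):

    // Scalar reference for the vectorized cross-correlation routines above.
    #include <math.h>

    double calcCrossCorrScalar(const short *pV1, const short *pV2,
                               int channels, int overlapLength,
                               int overlapDividerBitsNorm, double &dnorm)
    {
        long long corr = 0, norm = 0;
        for (int i = 0; i < channels * overlapLength; i += 2)
        {
            // shift each pairwise product-sum, just like the SIMD code does after _mm_madd_epi16
            corr += (pV1[i] * pV2[i] + pV1[i + 1] * pV2[i + 1]) >> overlapDividerBitsNorm;
            norm += (pV1[i] * pV1[i] + pV1[i + 1] * pV1[i + 1]) >> overlapDividerBitsNorm;
        }
        dnorm = (double)norm;
        return (norm > 0) ? (double)corr / sqrt((double)norm) : (double)corr;
    }
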
  892. --- soundtouch-1.9.2.orig/source/SoundTouch/TDStretch.cpp   Sun Sep 20 16:40:59 2015
  893. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/TDStretch.cpp   Sat Feb 13 00:04:07 2016
  894. @@ -721,9 +721,9 @@
  895.      {
  896.          delete[] pMidBufferUnaligned;
  897.  
  898. -        pMidBufferUnaligned = new SAMPLETYPE[overlapLength * channels + 16 / sizeof(SAMPLETYPE)];
  899. +        pMidBufferUnaligned = new SAMPLETYPE[overlapLength * channels + (ALIGN_SIZE) / sizeof(SAMPLETYPE)];
  900.          // ensure that 'pMidBuffer' is aligned to 16 byte boundary for efficiency
  901. -        pMidBuffer = (SAMPLETYPE *)SOUNDTOUCH_ALIGN_POINTER_16(pMidBufferUnaligned);
  902. +        pMidBuffer = (SAMPLETYPE *)SOUNDTOUCH_ALIGN_POINTER(pMidBufferUnaligned);
  903.  
  904.          clearMidBuffer();
  905.      }
  906. @@ -748,6 +748,24 @@
  907.  
  908.      // Check if MMX/SSE instruction set extensions supported by CPU
  909.  
  910. +#ifdef SOUNDTOUCH_ALLOW_AVX2
  911. +    // AVX2 routines available
  912. +    if (uExtensions & SUPPORT_AVX2)
  913. +    {
  914. +        return ::new TDStretchAVX2;
  915. +    }
  916. +    else
  917. +#endif
  918. +
  919. +#ifdef SOUNDTOUCH_ALLOW_SSE2
  920. +    // SSE2 routines available only with integer sample types
  921. +    if (uExtensions & SUPPORT_SSE2)
  922. +    {
  923. +        return ::new TDStretchSSE2;
  924. +    }
  925. +    else
  926. +#endif // SOUNDTOUCH_ALLOW_SSE2
  927. +
  928.  #ifdef SOUNDTOUCH_ALLOW_MMX
  929.      // MMX routines available only with integer sample types
  930.      if (uExtensions & SUPPORT_MMX)
  931. --- soundtouch-1.9.2.orig/source/SoundTouch/TDStretch.h Sun Sep 20 16:40:59 2015
  932. +++ soundtouch-1.9.2.SIMD/source/SoundTouch/TDStretch.h Sat Feb 13 00:04:07 2016
  933. @@ -277,5 +277,29 @@
  934.  
  935.  #endif /// SOUNDTOUCH_ALLOW_SSE
  936.  
  937. +
  938. +#ifdef SOUNDTOUCH_ALLOW_SSE2
  939. +    /// Class that implements SSE2 optimized routines for 16bit integer samples type.
  940. +    class TDStretchSSE2 : public TDStretch
  941. +    {
  942. +    protected:
  943. +        double calcCrossCorr(const short *mixingPos, const short *compare, double &norm);
  944. +        double calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm);
  945. +        virtual void overlapStereo(short *output, const short *input);
  946. +    };
  947. +
  948. +#endif /// SOUNDTOUCH_ALLOW_SSE2
  949. +
  950. +#ifdef SOUNDTOUCH_ALLOW_AVX2
  951. +    /// Class that implements AVX2 optimized routines for 16bit integer samples type.
  952. +    class TDStretchAVX2 : public TDStretch
  953. +    {
  954. +    protected:
  955. +        double calcCrossCorr(const short *mixingPos, const short *compare, double &norm);
  956. +        double calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm);
  957. +        virtual void overlapStereo(short *output, const short *input);
  958. +    };
  959. +#endif /// SOUNDTOUCH_ALLOW_AVX2
  960. +
  961.  }
  962.  #endif  /// TDStretch_H
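
The diff is made against a pristine soundtouch-1.9.2 source tree, so it should apply from inside the unpacked directory with the usual strip-one-component invocation, e.g. patch -p1 < SoundTouch-1.9.2_SIMD.patch (the exact filename is whatever this paste was saved as). Note that the new SSE2/AVX2 paths exist only for 16-bit integer samples, which is why the STTypes.h hunk at the top switches the GCC default from SOUNDTOUCH_FLOAT_SAMPLES to SOUNDTOUCH_INTEGER_SAMPLES.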