SoundTouch-1.9.2_SSE2.patch

a guest
Feb 11th, 2016
243
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Diff 15.95 KB | None | 0 0
--- soundtouch-1.9.2.orig/include/STTypes.h Sun Sep 20 16:40:59 2015
+++ soundtouch-1.9.2.SSE2/include/STTypes.h Fri Feb 12 01:54:14 2016
@@ -98,8 +98,8 @@
         ///   However, if you still prefer to select the sample format here
         ///   also in GNU environment, then please #undef the INTEGER_SAMPLE
         ///   and FLOAT_SAMPLE defines first as in comments above.
-        //#define SOUNDTOUCH_INTEGER_SAMPLES     1    //< 16bit integer samples
-        #define SOUNDTOUCH_FLOAT_SAMPLES       1    //< 32bit float samples
+        #define SOUNDTOUCH_INTEGER_SAMPLES     1    //< 16bit integer samples
+        //#define SOUNDTOUCH_FLOAT_SAMPLES       1    //< 32bit float samples
     
     #endif
 
@@ -143,8 +143,12 @@
         #endif // SOUNDTOUCH_FLOAT_SAMPLES
 
         #ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
-            // Allow MMX optimizations
-            #define SOUNDTOUCH_ALLOW_MMX   1
+            // Allow SSE2 optimizations
+            #define SOUNDTOUCH_ALLOW_SSE2      1
+            #ifndef _M_X64
+                // Allow MMX optimizations
+                #define SOUNDTOUCH_ALLOW_MMX   1
+            #endif
         #endif
 
     #else
--- soundtouch-1.9.2.orig/source/SoundTouch/FIRFilter.cpp   Sun Sep 20 16:40:59 2015
+++ soundtouch-1.9.2.SSE2/source/SoundTouch/FIRFilter.cpp   Fri Feb 12 01:54:14 2016
@@ -303,6 +303,15 @@
 
     // Check if MMX/SSE instruction set extensions supported by CPU
 
+#ifdef SOUNDTOUCH_ALLOW_SSE2
+    // SSE2 routines available only with integer sample types
+    if (uExtensions & SUPPORT_SSE2)
+    {
+        return ::new FIRFilterSSE2;
+    }
+    else
+#endif // SOUNDTOUCH_ALLOW_SSE2
+
 #ifdef SOUNDTOUCH_ALLOW_MMX
     // MMX routines available only with integer sample types
     if (uExtensions & SUPPORT_MMX)
--- soundtouch-1.9.2.orig/source/SoundTouch/FIRFilter.h Sun Sep 20 16:40:59 2015
+++ soundtouch-1.9.2.SSE2/source/SoundTouch/FIRFilter.h Fri Feb 12 01:54:14 2016
@@ -141,6 +141,25 @@
 
 #endif // SOUNDTOUCH_ALLOW_SSE
 
+
+#ifdef SOUNDTOUCH_ALLOW_SSE2
+    /// Class that implements SSE2 optimized functions exclusive for 16bit integer samples type.
+    class FIRFilterSSE2 : public FIRFilter
+    {
+    protected:
+        short *filterCoeffsUnalign;
+        short *filterCoeffsAlign;
+
+        virtual uint evaluateFilterStereo(short *dest, const short *src, uint numSamples) const;
+    public:
+        FIRFilterSSE2();
+        ~FIRFilterSSE2();
+
+        virtual void setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor);
+    };
+
+#endif // SOUNDTOUCH_ALLOW_SSE2
+
 }
 
 #endif  // FIRFilter_H
--- soundtouch-1.9.2.orig/source/SoundTouch/sse_optimized.cpp   Sun Sep 20 16:40:59 2015
+++ soundtouch-1.9.2.SSE2/source/SoundTouch/sse_optimized.cpp   Fri Feb 12 11:54:13 2016
@@ -370,3 +370,282 @@
 }
 
 #endif  // SOUNDTOUCH_ALLOW_SSE
+
+#ifdef SOUNDTOUCH_ALLOW_SSE2
+
+// SSE2 routines available only with integer sample type
+// Also refer to MMX optimized routines.
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// implementation of SSE2 optimized functions of class 'TDStretchSSE2'
+//
+//////////////////////////////////////////////////////////////////////////////
+
+#include "TDStretch.h"
+#include <emmintrin.h>
+#include <math.h>
+
+// Calculates cross correlation of two buffers
+double TDStretchSSE2::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm)
+{
+    const   __m128i *pVec1      = (__m128i*)pV1;    // not 16byte aligned
+    const   __m128i *pVec2      = (__m128i*)pV2;    // 16byte aligned
+    const   __m128i shifter     = _mm_cvtsi32_si128(overlapDividerBitsNorm);
+            __m128i accu        = _mm_setzero_si128();
+            __m128i normaccu    = _mm_setzero_si128();
+            __m128i v;                              // for temporary
+
+    // Process 16 short values (8 stereo samples or 16 mono samples) during
+    // each round for improved CPU-level parallelization.
+
+    for (int i = channels*overlapLength/16 ; i ; i--)
+    {
+        // Applies shifter immediately after product-sum to prevent overflow
+        __m128i n0 = _mm_loadu_si128(pVec1);
+        __m128i n1 = _mm_loadu_si128(pVec1+1);
+        __m128i a0 = _mm_madd_epi16(n0, *pVec2++); // a0 = pVec1[0] * pVec2[0]
+                n0 = _mm_madd_epi16(n0, n0);       // n0 = pVec1[0]^2
+        __m128i a1 = _mm_madd_epi16(n1, *pVec2++); // a1 = pVec1[1] * pVec2[1]
+                n1 = _mm_madd_epi16(n1, n1);       // n1 = pVec1[1]^2
+                a0 = _mm_sra_epi32(a0, shifter);   // right arithmetic shift
+                n0 = _mm_sra_epi32(n0, shifter);
+                a1 = _mm_sra_epi32(a1, shifter);
+                n1 = _mm_sra_epi32(n1, shifter);
+        accu     = _mm_add_epi32(accu, a0);        // add to accumulator
+        normaccu = _mm_add_epi32(normaccu, n0);
+        accu     = _mm_add_epi32(accu, a1);
+        normaccu = _mm_add_epi32(normaccu, n1);
+        pVec1 += 2;
+    }
+    // sum total
+    v    = _mm_srli_si128(accu, 4);
+    accu = _mm_add_epi32(v, accu);
+    v    = _mm_srli_si128(accu, 8);
+    accu = _mm_add_epi32(v, accu);
+    v        = _mm_srli_si128(normaccu, 4);
+    normaccu = _mm_add_epi32(v, normaccu);
+    v        = _mm_srli_si128(normaccu, 8);
+    normaccu = _mm_add_epi32(v, normaccu);
+
+    __m128d Vcorr  = _mm_cvtepi32_pd(accu);        // int32 to double
+    __m128d Vdnorm = _mm_cvtepi32_pd(normaccu);
+    _mm_store_sd(&dnorm, Vdnorm);                  // feedback to dnorm
+
+    if (_mm_cvtsi128_si32(normaccu) > 0) {
+        Vdnorm = _mm_sqrt_sd(Vdnorm, Vdnorm);
+        Vcorr = _mm_div_sd(Vcorr, Vdnorm);
+    }
+    return _mm_cvtsd_f64(Vcorr);
+}
+
+
+/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
+double TDStretchSSE2::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm)
+{
+    const   __m128i *pVec1     = (__m128i*)pV1;    // (unaligned)
+    const   __m128i *pVec1prev = pVec1;            // for previous round normalizer
+    const   __m128i *pVec2     = (__m128i*)pV2;    // (aligned)
+    const   __m128i shifter    = _mm_cvtsi32_si128(overlapDividerBitsNorm);
+            __m128i accu       = _mm_setzero_si128();
+            __m128i norm       = _mm_setzero_si128();
+            __m128i v;                              // for temporary
+            __m128d vd;                             // for temporary
+
+    // Process 16 short values (8 stereo samples or 16 mono samples) during
+    // each round for improved CPU-level parallelization.
+    for (int i = channels * overlapLength / 16 ; i ; i--)
+    {
+        // Applies shifter immediately after product-sum to prevent overflow
+        const __m128i vec1[] = {
+            _mm_loadu_si128(pVec1),
+            _mm_loadu_si128(pVec1+1)
+        };
+        __m128i v1 = _mm_madd_epi16(vec1[0], pVec2[0]);
+                v1 = _mm_sra_epi32(v1, shifter);
+        __m128i v2 = _mm_madd_epi16(vec1[1], pVec2[1]);
+                v2 = _mm_sra_epi32(v2, shifter);
+        pVec1 += 2;
+        accu = _mm_add_epi32(accu, v1);
+        pVec2 += 2;
+        accu = _mm_add_epi32(accu, v2);
+    }
+    v    = _mm_srli_si128(accu, 8);
+    accu = _mm_add_epi32(v, accu);
+    v    = _mm_srli_si128(accu, 4);
+    accu = _mm_add_epi32(v, accu); // accu.m128i_i32[0] is sum total
+
+    // update normalizer with last samples of this round, and previous round
+    for (int ch = channels; ch > 0; ch -= sizeof(*pVec1)/sizeof(*pV1)) {
+        const __m128i vth = _mm_set_epi16(0,1,2,3,4,5,6,7);
+        const __m128i vch = _mm_set1_epi16(ch);
+        const __m128i vMask = _mm_cmpgt_epi16(vch, vth);
+        __m128i vThis = _mm_loadu_si128(--pVec1);
+        __m128i vPrev = _mm_loadu_si128(--pVec1prev);
+        vThis = _mm_and_si128(vThis, vMask); // this round
+        vPrev = _mm_and_si128(vPrev, vMask); // previous round
+
+        vThis = _mm_madd_epi16(vThis, vThis);
+        vThis = _mm_sra_epi32(vThis, shifter);
+        vPrev = _mm_madd_epi16(vPrev, vPrev);
+        vPrev = _mm_sra_epi32(vPrev, shifter);
+        norm  = _mm_add_epi32(norm, vThis);
+        norm  = _mm_sub_epi32(norm, vPrev);
+    }
+    v    = _mm_srli_si128(norm, 8);
+    norm = _mm_add_epi32(norm, v);
+    v    = _mm_srli_si128(norm, 4);
+    norm = _mm_add_epi32(norm, v); // norm.m128i_i32[0] is sum total
+
+    __m128d Vcorr  = _mm_cvtepi32_pd(accu);
+    __m128d Vdnorm = _mm_cvtepi32_pd(norm);
+    vd = _mm_load_sd(&dnorm);
+    Vdnorm = _mm_add_sd(vd, Vdnorm);
+    _mm_store_sd(&dnorm, Vdnorm);  // feedback to dnorm
+
+    const __m128d dmin = _mm_set_sd(1e-9);
+    if (_mm_comige_sd(Vdnorm, dmin)) {
+        Vdnorm = _mm_sqrt_sd(Vdnorm, Vdnorm);
+        Vcorr = _mm_div_sd(Vcorr, Vdnorm);
+    }
+    return _mm_cvtsd_f64(Vcorr);
+}
+
+
+// SSE2-optimized version of the function overlapStereo
+void TDStretchSSE2::overlapStereo(short *output, const short *input)
+{
+    const   __m128i *pVinput  = (__m128i*)input;                // (unaligned)
+    const   __m128i *pVMidBuf = (__m128i*)pMidBuffer;           // (aligned)
+    const   __m128i shifter   = _mm_cvtsi32_si128(overlapDividerBitsPure + 1);
+            // note: Since _mm_set_epi16() is slow on Pentium 4, _mm_set_epi32() is used instead.
+            __m128i adder     = _mm_set1_epi32(0x2fffe);        // [ 2, -2, 2, -2, 2, -2, 2, -2 ]
+            __m128i mix1      = _mm_set_epi32(
+                0x10000 | (unsigned short)(overlapLength-1),    // (short)[ 1, overlapLength-1,
+                0x10000 | (unsigned short)(overlapLength-1),    //          1, overlapLength-1,
+                          (unsigned short)overlapLength,        //          0, overlapLength,
+                          (unsigned short)overlapLength);       //          0, overlapLength ]
+            __m128i mix2      = _mm_add_epi16(mix1, adder);
+            __m128i *pVdest   = (__m128i*)output;               // (unaligned)
+    adder = _mm_add_epi16(adder, adder);
+
+    for (int i = overlapLength / 4 ; i ; i--)
+    {
+        const __m128i vi = _mm_loadu_si128(pVinput);
+        const __m128i vm = _mm_load_si128(pVMidBuf);
+        __m128i v1 = _mm_unpacklo_epi16(vm, vi);
+        __m128i v2 = _mm_unpackhi_epi16(vm, vi);
+        v1 = _mm_madd_epi16(v1, mix1);
+        v2 = _mm_madd_epi16(v2, mix2);
+        v1 = _mm_sra_epi32(v1, shifter);
+        v2 = _mm_sra_epi32(v2, shifter);
+        v1 = _mm_packs_epi32(v1, v2);
+        _mm_storeu_si128(pVdest, v1);
+
+        mix1 = _mm_add_epi16(mix1, adder);
+        mix2 = _mm_add_epi16(mix2, adder);
+        pVMidBuf++;
+        pVinput++;
+        pVdest++;
+    }
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// implementation of SSE2 optimized functions of class 'FIRFilter'
+//
+//////////////////////////////////////////////////////////////////////////////
+
+#include "FIRFilter.h"
+
+FIRFilterSSE2::FIRFilterSSE2() : FIRFilter()
+{
+    filterCoeffsAlign = NULL;
+    filterCoeffsUnalign = NULL;
+}
+
+
+FIRFilterSSE2::~FIRFilterSSE2()
+{
+    delete[] filterCoeffsUnalign;
+    filterCoeffsAlign = NULL;
+    filterCoeffsUnalign = NULL;
+}
+
+
+// (overloaded) Calculates filter coefficients for SSE2 routine
+void FIRFilterSSE2::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
+{
+    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
+
+    // Ensure that filter coeffs array is aligned to 16-byte boundary
+    delete[] filterCoeffsUnalign;
+    filterCoeffsUnalign = new short[2 * newLength + 8];
+    filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
+    __m128i *VfilterCoeffsAlign = (__m128i*)filterCoeffsAlign;
+
+    // rearrange the filter coefficients for SSE2 routines
+    for (uint i = 0; i < length; i += 4)
+    {
+        __m128i v = _mm_loadl_epi64((__m128i*)(coeffs + i)); // 3, 2, 1, 0
+        v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 1, 2, 0)); // 3, 1, 2, 0
+        v = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 0, 0));   // 3, 1, 3, 1, 2, 0, 2, 0
+        _mm_store_si128(VfilterCoeffsAlign++, v);
+    }
+}
+
+
+// sse2-optimized version of the filter routine for stereo sound
+uint FIRFilterSSE2::evaluateFilterStereo(short *dest, const short *src, uint numSamples) const
+{
+    if (length < 2) return 0;
+
+    short *pVdest = dest;
+
+    for (uint i = (numSamples - length) >> 1 ; i ; i--)
+    {
+        const   __m128i *pVsrc    = (__m128i*)src;
+        const   __m128i *pVfilter = (__m128i*)filterCoeffsAlign; //16byte aligned
+                __m128i accu1     = _mm_setzero_si128();
+                __m128i accu2     = _mm_setzero_si128();
+
+        for (uint j = lengthDiv8 * 2; j ; j--)
+        {
+            //           accu1                accu2
+            // r0: s00*f00 + s04*f01    s02*f00 + s06*f01
+            // r1: s01*f02 + s05*f03    s03*f02 + s07*f03
+            // r2: s02*f04 + s06*f05    s04*f04 + s08*f05
+            // r3: s03*f06 + s07*f07    s05*f06 + s09*f07
+                  __m128i v0 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+0));
+                  __m128i v2 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+2));
+            const __m128i v4 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+4));
+            const __m128i v6 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+6));
+            const __m128i vf = _mm_load_si128(pVfilter);
+            v0 = _mm_unpacklo_epi16(v0, v4);
+            v2 = _mm_unpacklo_epi16(v2, v6);
+            v0 = _mm_madd_epi16(v0, vf);
+            v2 = _mm_madd_epi16(v2, vf);
+            pVsrc++;
+            accu1 = _mm_add_epi32(accu1, v0);
+            pVfilter++;
+            accu2 = _mm_add_epi32(accu2, v2);
+        }
+        // r0: accu1 - s00*f00 + s04*f01 + s02*f04 + s06*f05
+        // r1:         s01*f02 + s05*f03 + s03*f06 + s07*f07
+        // r2: accu2 - s02*f00 + s06*f01 + s04*f04 + s08*f05
+        // r3:         s03*f02 + s07*f03 + s05*f06 + s09*f07
+        const __m128i v1 = _mm_srli_si128(accu1, 8);
+        const __m128i v2 = _mm_srli_si128(accu2, 8);
+        accu1 = _mm_add_epi32(accu1, v1);
+        accu2 = _mm_add_epi32(accu2, v2);
+        accu1 = _mm_unpacklo_epi64(accu1, accu2);
+        accu1 = _mm_srai_epi32(accu1, resultDivFactor);
+        accu1 = _mm_packs_epi32(accu1, accu1);
+        _mm_storel_epi64((__m128i*)pVdest, accu1);
+        src += 4;
+        pVdest += 4;
+    }
+    return (numSamples & -2) - length;
+}
+#endif  // SOUNDTOUCH_ALLOW_SSE2
--- soundtouch-1.9.2.orig/source/SoundTouch/TDStretch.cpp   Sun Sep 20 16:40:59 2015
+++ soundtouch-1.9.2.SSE2/source/SoundTouch/TDStretch.cpp   Fri Feb 12 01:54:14 2016
@@ -748,6 +748,15 @@
 
     // Check if MMX/SSE instruction set extensions supported by CPU
 
+#ifdef SOUNDTOUCH_ALLOW_SSE2
+    // SSE2 routines available only with integer sample types
+    if (uExtensions & SUPPORT_SSE2)
+    {
+        return ::new TDStretchSSE2;
+    }
+    else
+#endif // SOUNDTOUCH_ALLOW_SSE2
+
 #ifdef SOUNDTOUCH_ALLOW_MMX
     // MMX routines available only with integer sample types
    if (uExtensions & SUPPORT_MMX)
--- soundtouch-1.9.2.orig/source/SoundTouch/TDStretch.h Sun Sep 20 16:40:59 2015
+++ soundtouch-1.9.2.SSE2/source/SoundTouch/TDStretch.h Fri Feb 12 12:14:54 2016
@@ -277,5 +277,18 @@
 
 #endif /// SOUNDTOUCH_ALLOW_SSE
 
+
+#ifdef SOUNDTOUCH_ALLOW_SSE2
+    /// Class that implements SSE2 optimized routines for 16bit integer samples type.
+    class TDStretchSSE2 : public TDStretch
+    {
+    protected:
+        double calcCrossCorr(const short *mixingPos, const short *compare, double &norm);
+        double calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm);
+        virtual void overlapStereo(short *output, const short *input);
+    };
+
+#endif /// SOUNDTOUCH_ALLOW_SSE2
+
 }
 #endif  /// TDStretch_H
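
For reference only (not part of the patch above): a plain scalar sketch of the cross correlation that the SSE2 calcCrossCorr routine computes. The free-standing name calcCrossCorrScalar and its explicit parameters are illustrative assumptions; in SoundTouch itself, channels, overlapLength and overlapDividerBitsNorm are TDStretch member variables. The vector code right-shifts each pairwise product-sum before accumulating, so low-order rounding may differ slightly from this sketch.

    // Scalar reference for the SSE2 calcCrossCorr -- a minimal sketch, assuming
    // 16-bit integer samples and the same overlapDividerBitsNorm scaling.
    #include <math.h>

    double calcCrossCorrScalar(const short *pV1, const short *pV2, double &dnorm,
                               int channels, int overlapLength,
                               int overlapDividerBitsNorm)
    {
        long long corr = 0;
        long long norm = 0;
        for (int i = 0; i < channels * overlapLength; i += 2)
        {
            // Accumulate scaled-down product-sums two samples at a time,
            // mirroring the _mm_madd_epi16 / _mm_sra_epi32 pairing.
            corr += ((long long)pV1[i] * pV2[i] + (long long)pV1[i + 1] * pV2[i + 1])
                    >> overlapDividerBitsNorm;
            norm += ((long long)pV1[i] * pV1[i] + (long long)pV1[i + 1] * pV1[i + 1])
                    >> overlapDividerBitsNorm;
        }
        dnorm = (double)norm;                      // feedback to dnorm, as in the SSE2 version
        if (norm > 0) return (double)corr / sqrt((double)norm);
        return (double)corr;
    }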