- --- soundtouch-1.9.2.orig/include/STTypes.h Sun Sep 20 16:40:59 2015
- +++ soundtouch-1.9.2.SIMD/include/STTypes.h Sat Feb 13 00:04:07 2016
- @@ -50,8 +50,9 @@
- #endif
- -// Helper macro for aligning pointer up to next 16-byte boundary
- -#define SOUNDTOUCH_ALIGN_POINTER_16(x) ( ( (ulongptr)(x) + 15 ) & ~(ulongptr)15 )
- +// Helper macro for aligning a pointer up to the next ALIGN_SIZE (64-byte) boundary
- +#define ALIGN_SIZE 64
- +#define SOUNDTOUCH_ALIGN_POINTER(x) (((ulongptr)(x) + (ALIGN_SIZE)-1) & ~(ulongptr)((ALIGN_SIZE)-1))
- #if (defined(__GNUC__) && !defined(ANDROID))
- @@ -98,8 +99,8 @@
- /// However, if you still prefer to select the sample format here
- /// also in GNU environment, then please #undef the INTEGER_SAMPLE
- /// and FLOAT_SAMPLE defines first as in comments above.
- - //#define SOUNDTOUCH_INTEGER_SAMPLES 1 //< 16bit integer samples
- - #define SOUNDTOUCH_FLOAT_SAMPLES 1 //< 32bit float samples
- + #define SOUNDTOUCH_INTEGER_SAMPLES 1 //< 16bit integer samples
- + //#define SOUNDTOUCH_FLOAT_SAMPLES 1 //< 32bit float samples
- #endif
- @@ -143,8 +144,14 @@
- #endif // SOUNDTOUCH_FLOAT_SAMPLES
- #ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
- - // Allow MMX optimizations
- - #define SOUNDTOUCH_ALLOW_MMX 1
- + // Allow SSE2 optimizations
- + #define SOUNDTOUCH_ALLOW_SSE2 1
- + // Allow AVX2 optimizations
- + #define SOUNDTOUCH_ALLOW_AVX2 1
- + #ifndef _M_X64
- + // Allow MMX optimizations
- + #define SOUNDTOUCH_ALLOW_MMX 1
- + #endif
- #endif
- #else
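For reference, the new SOUNDTOUCH_ALIGN_POINTER macro rounds an address up to the next ALIGN_SIZE boundary with the usual add-then-mask trick. A minimal standalone sketch of the same arithmetic (not part of the patch; uintptr_t stands in for SoundTouch's ulongptr typedef):

    #include <cstdint>
    #include <cstdio>

    #define ALIGN_SIZE 64
    #define ALIGN_POINTER(x) (((uintptr_t)(x) + (ALIGN_SIZE) - 1) & ~(uintptr_t)((ALIGN_SIZE) - 1))

    int main()
    {
        char raw[128 + ALIGN_SIZE];                  // over-allocate by the alignment amount
        char *aligned = (char *)ALIGN_POINTER(raw);  // e.g. 0x...1007 rounds up to 0x...1040
        std::printf("raw=%p aligned=%p\n", (void *)raw, (void *)aligned);
        return 0;
    }

Over-allocating by ALIGN_SIZE extra bytes, as the buffer changes below do, guarantees the rounded-up pointer still lies inside the allocation.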
- --- soundtouch-1.9.2.orig/source/SoundTouch/cpu_detect.h Sun Sep 20 16:40:59 2015
- +++ soundtouch-1.9.2.SIMD/source/SoundTouch/cpu_detect.h Sat Feb 13 03:25:37 2016
- @@ -50,6 +50,12 @@
- #define SUPPORT_ALTIVEC 0x0004
- #define SUPPORT_SSE 0x0008
- #define SUPPORT_SSE2 0x0010
- +#define SUPPORT_AVX 0x0020
- +#define SUPPORT_XOP 0x0040
- +#define SUPPORT_FMA4 0x0080
- +#define SUPPORT_FMA3 0x0100
- +#define SUPPORT_AVX2 0x0200
- +#define SUPPORT_AVX512 0x0400
- /// Checks which instruction set extensions are supported by the CPU.
- ///
- --- soundtouch-1.9.2.orig/source/SoundTouch/cpu_detect_x86.cpp Sun Sep 20 16:40:59 2015
- +++ soundtouch-1.9.2.SIMD/source/SoundTouch/cpu_detect_x86.cpp Sat Feb 13 03:29:07 2016
- @@ -39,6 +39,7 @@
- //
- ////////////////////////////////////////////////////////////////////////////////
- +#include <stdint.h>
- #include "cpu_detect.h"
- #include "STTypes.h"
- @@ -48,14 +49,24 @@
- #if defined(__GNUC__) && defined(__i386__)
- // gcc
- #include "cpuid.h"
- - #elif defined(_M_IX86)
- + #elif defined(_M_IX86) || defined(_M_X64)
- // windows non-gcc
- #include <intrin.h>
- #endif
- - #define bit_MMX (1 << 23)
- - #define bit_SSE (1 << 25)
- - #define bit_SSE2 (1 << 26)
- + #define bit_MMX (1 << 23) // func 01: edx
- + #define bit_SSE (1 << 25) // func 01: edx
- + #define bit_SSE2 (1 << 26) // func 01: edx
- + #define bit_OSXSAVE (1 << 27) // func 01: ecx
- + #define bit_AVX (1 << 28) // func 01: ecx
- + #define bit_XOP (1 << 11) // func 0x80000001: ecx
- + #define bit_FMA4 (1 << 16) // func 0x80000001: ecx
- + #define bit_FMA3 (1 << 12) // func 01: ecx
- + #define bit_AVX2 (1 << 5) // func 07 ecx=0: ebx
- + #define bit_AVX512F (1 << 16) // func 07 ecx=0: ebx
- + #define bit_AVX512PF (1 << 26) // func 07 ecx=0: ebx
- + #define bit_AVX512ER (1 << 27) // func 07 ecx=0: ebx
- + #define bit_AVX512CD (1 << 28) // func 07 ecx=0: ebx
- #endif
- @@ -82,31 +93,77 @@
- /// If building for a 64bit system (no Itanium) and the user wants optimizations.
- /// Return the OR of SUPPORT_{MMX,SSE,SSE2}. 11001 or 0x19.
- /// Keep the _dwDisabledISA test (2 more operations, could be eliminated).
- +/*
- #if ((defined(__GNUC__) && defined(__x86_64__)) \
- || defined(_M_X64)) \
- && defined(SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS)
- return 0x19 & ~_dwDisabledISA;
- -
- +*/
- /// If building for a 32bit system and the user wants optimizations.
- /// Keep the _dwDisabledISA test (2 more operations, could be eliminated).
- -#elif ((defined(__GNUC__) && defined(__i386__)) \
- - || defined(_M_IX86)) \
- +#if ((defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) \
- + || (defined(_M_IX86) || defined(_M_X64))) \
- && defined(SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS)
- if (_dwDisabledISA == 0xffffffff) return 0;
- -
- +
- + enum { UNKNOWN, INTEL, AMD } vendor = UNKNOWN;
- + const uint32_t strIntel[] = { 0x756e6547, 0x49656e69, 0x6c65746e }; // "GenuineIntel"
- + const uint32_t strAmd[] = { 0x68747541, 0x69746E65, 0x444D4163 }; // "AuthenticAMD"
- +
- uint res = 0;
- #if defined(__GNUC__)
- // GCC version of cpuid. Requires GCC 4.3.0 or later for __cpuid intrinsic support.
- + // AVX intrinsics require GCC 4.4 or later.
- + // AVX2 intrinsics require GCC 4.7 or later.
- uint eax, ebx, ecx, edx; // unsigned int is the standard type. uint is defined by the compiler and not guaranteed to be portable.
- // Check if no cpuid support.
- - if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx)) return 0; // always disable extensions.
- + if (!__get_cpuid (0, &eax, &ebx, &ecx, &edx)) return 0; // always disable extensions.
- - if (edx & bit_MMX) res = res | SUPPORT_MMX;
- - if (edx & bit_SSE) res = res | SUPPORT_SSE;
- - if (edx & bit_SSE2) res = res | SUPPORT_SSE2;
- + uint cpuidMaxFuncNum = eax;
- + if (ebx == strIntel[0] && edx == strIntel[1] && ecx == strIntel[2]) vendor = INTEL;
- + else if (ebx == strAmd[0] && edx == strAmd[1] && ecx == strAmd[2]) vendor = AMD;
- + __get_cpuid(1, &eax, &ebx, &ecx, &edx);
- +
- + #if defined(__x86_64__)
- + res = res | SUPPORT_SSE2 | SUPPORT_SSE | SUPPORT_MMX;
- + #else
- + if (edx & bit_MMX) res = res | SUPPORT_MMX;
- + if (edx & bit_SSE) res = res | SUPPORT_SSE;
- + if (edx & bit_SSE2) res = res | SUPPORT_SSE2;
- + #endif
- +
- + // Check AVX if GCC version 4.4 or later
- + #if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4
- + if ((ecx & (bit_AVX | bit_OSXSAVE)) == (bit_AVX | bit_OSXSAVE))
- + {
- + #ifdef __APPLE__
- + __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0)); // MacPorts
- + #else
- + __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
- + #endif
- + uint64_t xcr0 = ((uint64_t)edx << 32LL) | eax;
- + if ((xcr0 & 0x06) == 0x06)
- + {
- + res = res | SUPPORT_AVX;
- + if (ecx & bit_FMA3) res = res | SUPPORT_FMA3;
- + if (cpuidMaxFuncNum >= 7)
- + {
- + __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
- + if (ebx & bit_AVX2) res = res | SUPPORT_AVX2;
- + if ((ebx & bit_AVX512F) && ((xcr0 & 0xe0) == 0xe0))
- + res = res | SUPPORT_AVX512;
- + }
- + if (vendor == AMD) {
- + __get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
- + if (ecx & bit_XOP) res = res | SUPPORT_XOP;
- + if (ecx & bit_FMA4) res = res | SUPPORT_FMA4;
- + }
- + }
- + }
- + #endif
- #else
- // Window / VS version of cpuid. Notice that Visual Studio 2005 or later required
- @@ -117,10 +174,45 @@
- __cpuid(reg,0);
- if ((unsigned int)reg[0] == 0) return 0; // always disable extensions.
- + int cpuidMaxFuncNum = reg[0];
- + if ((unsigned int)reg[1] == strIntel[0] && (unsigned int)reg[3] == strIntel[1] &&
- + (unsigned int)reg[2] == strIntel[2]) vendor = INTEL;
- + else if ((unsigned int)reg[1] == strAmd[0] && (unsigned int)reg[3] == strAmd[1] &&
- + (unsigned int)reg[2] == strAmd[2]) vendor = AMD;
- __cpuid(reg,1);
- - if ((unsigned int)reg[3] & bit_MMX) res = res | SUPPORT_MMX;
- - if ((unsigned int)reg[3] & bit_SSE) res = res | SUPPORT_SSE;
- - if ((unsigned int)reg[3] & bit_SSE2) res = res | SUPPORT_SSE2;
- + #if defined(_M_X64)
- + // note: MMX intrinsics cannot be compiled by Visual C++ for x64,
- + // but the hardware and Windows still execute MMX code.
- + res = res | SUPPORT_SSE2 | SUPPORT_SSE | SUPPORT_MMX;
- + #else
- + if ((unsigned int)reg[3] & bit_MMX) res = res | SUPPORT_MMX;
- + if ((unsigned int)reg[3] & bit_SSE) res = res | SUPPORT_SSE;
- + if ((unsigned int)reg[3] & bit_SSE2) res = res | SUPPORT_SSE2;
- + #endif
- +
- + // Does the compiler support AVX? (i.e. is _XCR_XFEATURE_ENABLED_MASK defined?) -- VC++, etc.
- + //#if (_MSC_FULL_VER >= 160040219)
- + #ifdef _XCR_XFEATURE_ENABLED_MASK
- + if (((unsigned int)reg[2] & (bit_AVX | bit_OSXSAVE)) == (bit_AVX | bit_OSXSAVE)) {
- + uint64_t xcr0 = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
- + if ((xcr0 & 0x06) == 0x06) {
- + // res = res | SUPPORT_AVX;
- + //if ((unsigned int)reg[2] & bit_FMA3) res = res | SUPPORT_FMA3;
- + if (cpuidMaxFuncNum >= 7)
- + {
- + __cpuidex(reg, 7, 0);
- + if ((unsigned int)reg[1] & bit_AVX2) res = res | SUPPORT_AVX2;
- + // if (((unsigned int)reg[1] & bit_AVX512F) && (xcr0 & 0xe0) == 0xe0)
- + // res = res | SUPPORT_AVX512;
- + //}
- + //if (vendor == AMD) {
- + // __cpuid(reg, 0x80000001);
- + // if ((unsigned int)reg[2] & bit_XOP) res = res | SUPPORT_XOP;
- + // if ((unsigned int)reg[2] & bit_FMA4) res = res | SUPPORT_FMA4;
- + }
- + }
- + }
- + #endif
- #endif
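The detection added above reduces to three steps: CPUID leaf 1 must report both AVX and OSXSAVE, XGETBV(XCR0) must show that the OS saves XMM and YMM state (bits 1 and 2), and CPUID leaf 7 (ECX=0) EBX bit 5 must report AVX2. A condensed standalone sketch of that check for GCC on x86, assuming a cpuid.h that provides __get_cpuid_count (the patch relies on it as well):

    #include <cpuid.h>
    #include <stdint.h>

    // Non-zero when both the CPU and the OS allow AVX2 code to run.
    static int avx2_usable(void)
    {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 0;
        const unsigned int need = (1u << 27) | (1u << 28);            // OSXSAVE | AVX (leaf 1, ECX)
        if ((ecx & need) != need) return 0;
        uint32_t lo, hi;
        __asm__ __volatile__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(0)); // read XCR0
        uint64_t xcr0 = ((uint64_t)hi << 32) | lo;
        if ((xcr0 & 0x6) != 0x6) return 0;                            // XMM + YMM state enabled by the OS
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) return 0;
        return (ebx & (1u << 5)) != 0;                                // leaf 7, EBX bit 5 = AVX2
    }

The patch folds the same steps into detect(), and on MSVC uses __cpuidex/_xgetbv instead of the GCC intrinsics.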
- --- soundtouch-1.9.2.orig/source/SoundTouch/FIFOSampleBuffer.cpp Sun Sep 20 16:40:59 2015
- +++ soundtouch-1.9.2.SIMD/source/SoundTouch/FIFOSampleBuffer.cpp Sat Feb 13 00:04:07 2016
- @@ -171,13 +171,13 @@
- // enlarge the buffer in 4kbyte steps (round up to next 4k boundary)
- sizeInBytes = (capacityRequirement * channels * sizeof(SAMPLETYPE) + 4095) & (uint)-4096;
- assert(sizeInBytes % 2 == 0);
- - tempUnaligned = new SAMPLETYPE[sizeInBytes / sizeof(SAMPLETYPE) + 16 / sizeof(SAMPLETYPE)];
- + tempUnaligned = new SAMPLETYPE[sizeInBytes / sizeof(SAMPLETYPE) + (ALIGN_SIZE) / sizeof(SAMPLETYPE)];
- if (tempUnaligned == NULL)
- {
- ST_THROW_RT_ERROR("Couldn't allocate memory!\n");
- }
- // Align the buffer to begin at 16byte cache line boundary for optimal performance
- - temp = (SAMPLETYPE *)SOUNDTOUCH_ALIGN_POINTER_16(tempUnaligned);
- + temp = (SAMPLETYPE *)SOUNDTOUCH_ALIGN_POINTER(tempUnaligned);
- if (samplesInBuffer)
- {
- memcpy(temp, ptrBegin(), samplesInBuffer * channels * sizeof(SAMPLETYPE));
- --- soundtouch-1.9.2.orig/source/SoundTouch/FIRFilter.cpp Sun Sep 20 16:40:59 2015
- +++ soundtouch-1.9.2.SIMD/source/SoundTouch/FIRFilter.cpp Sat Feb 13 00:04:07 2016
- @@ -303,6 +303,24 @@
- // Check if MMX/SSE instruction set extensions supported by CPU
- +#ifdef SOUNDTOUCH_ALLOW_AVX2
- + // AVX2 routines available only with integer sample types
- + if (uExtensions & SUPPORT_AVX2)
- + {
- + return ::new FIRFilterAVX2;
- + }
- + else
- +#endif
- +
- +#ifdef SOUNDTOUCH_ALLOW_SSE2
- + // SSE2 routines available only with integer sample types
- + if (uExtensions & SUPPORT_SSE2)
- + {
- + return ::new FIRFilterSSE2;
- + }
- + else
- +#endif // SOUNDTOUCH_ALLOW_SSE2
- +
- #ifdef SOUNDTOUCH_ALLOW_MMX
- // MMX routines available only with integer sample types
- if (uExtensions & SUPPORT_MMX)
- --- soundtouch-1.9.2.orig/source/SoundTouch/FIRFilter.h Sun Sep 20 16:40:59 2015
- +++ soundtouch-1.9.2.SIMD/source/SoundTouch/FIRFilter.h Sat Feb 13 00:04:07 2016
- @@ -141,6 +141,42 @@
- #endif // SOUNDTOUCH_ALLOW_SSE
- +
- +#ifdef SOUNDTOUCH_ALLOW_SSE2
- + /// Class that implements SSE2 optimized functions exclusive for 16bit integer samples type.
- + class FIRFilterSSE2 : public FIRFilter
- + {
- + protected:
- + short *filterCoeffsUnalign;
- + short *filterCoeffsAlign;
- +
- + virtual uint evaluateFilterStereo(short *dest, const short *src, uint numSamples) const;
- + public:
- + FIRFilterSSE2();
- + ~FIRFilterSSE2();
- +
- + virtual void setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor);
- + };
- +
- +#endif // SOUNDTOUCH_ALLOW_SSE2
- +
- +#ifdef SOUNDTOUCH_ALLOW_AVX2
- + /// Class that implements AVX2 optimized functions exclusive for 16bit integer samples type.
- + class FIRFilterAVX2 : public FIRFilter
- + {
- + protected:
- + short *filterCoeffsUnalign;
- + short *filterCoeffsAlign;
- +
- + virtual uint evaluateFilterStereo(short *dest, const short *src, uint numSamples) const;
- + public:
- + FIRFilterAVX2();
- + ~FIRFilterAVX2();
- +
- + virtual void setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor);
- + };
- +#endif // SOUNDTOUCH_ALLOW_AVX2
- +
- }
- #endif // FIRFilter_H
- --- soundtouch-1.9.2.orig/source/SoundTouch/mmx_optimized.cpp Sun Sep 20 16:40:59 2015
- +++ soundtouch-1.9.2.SIMD/source/SoundTouch/mmx_optimized.cpp Sat Feb 13 09:53:14 2016
- @@ -316,8 +316,8 @@
- // Ensure that filter coeffs array is aligned to 16-byte boundary
- delete[] filterCoeffsUnalign;
- - filterCoeffsUnalign = new short[2 * newLength + 8];
- - filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
- + filterCoeffsUnalign = new short[2 * newLength + (ALIGN_SIZE)/sizeof(short)];
- + filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER(filterCoeffsUnalign);
- // rearrange the filter coefficients for mmx routines
- for (i = 0;i < length; i += 4)
- --- soundtouch-1.9.2.orig/source/SoundTouch/sse_optimized.cpp Sun Sep 20 16:40:59 2015
- +++ soundtouch-1.9.2.SIMD/source/SoundTouch/sse_optimized.cpp Sat Feb 13 09:53:22 2016
- @@ -227,8 +227,8 @@
- // also rearrange coefficients suitably for SSE
- // Ensure that filter coeffs array is aligned to 16-byte boundary
- delete[] filterCoeffsUnalign;
- - filterCoeffsUnalign = new float[2 * newLength + 4];
- - filterCoeffsAlign = (float *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
- + filterCoeffsUnalign = new float[2 * newLength + (ALIGN_SIZE)/sizeof(float)];
- + filterCoeffsAlign = (float *)SOUNDTOUCH_ALIGN_POINTER(filterCoeffsUnalign);
- fDivider = (float)resultDivider;
- @@ -370,3 +370,543 @@
- }
- #endif // SOUNDTOUCH_ALLOW_SSE
- +
- +#ifdef SOUNDTOUCH_ALLOW_SSE2
- +
- +// SSE2 routines available only with integer sample type
- +// Also refer to MMX optimized routines.
- +
- +//////////////////////////////////////////////////////////////////////////////
- +//
- +// implementation of SSE2 optimized functions of class 'TDStretchSSE2'
- +//
- +//////////////////////////////////////////////////////////////////////////////
- +
- +#include "TDStretch.h"
- +#include <emmintrin.h>
- +#include <math.h>
- +
- +// Calculates cross correlation of two buffers
- +double TDStretchSSE2::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm)
- +{
- + const __m128i *pVec1 = (__m128i*)pV1; // not 16byte aligned
- + const __m128i *pVec2 = (__m128i*)pV2; // 16byte aligned
- + const __m128i shifter = _mm_cvtsi32_si128(overlapDividerBitsNorm);
- + __m128i accu = _mm_setzero_si128();
- + __m128i normaccu = _mm_setzero_si128();
- + __m128i v; // for temporary
- +
- + // Process 8 parallel sets of 2 * stereo samples or 16 * mono samples
- + // during each round for improved CPU-level parallellization.
- +
- + for (int i = channels*overlapLength/16 ; i ; i--)
- + {
- + // Applies shifter immediately after product-sum to prevent overflow
- + __m128i n0 = _mm_loadu_si128(pVec1);
- + __m128i n1 = _mm_loadu_si128(pVec1+1);
- + __m128i a0 = _mm_madd_epi16(n0, *pVec2++); // a0 = pVec1[0] * pVec2[0]
- + n0 = _mm_madd_epi16(n0, n0); // n0 = pVec1[0]^2
- + __m128i a1 = _mm_madd_epi16(n1, *pVec2++); // a1 = pVec1[1] * pVec2[1]
- + n1 = _mm_madd_epi16(n1, n1); // n1 = pVec1[1]^2
- + a0 = _mm_sra_epi32(a0, shifter); // right arithmetic shift
- + n0 = _mm_sra_epi32(n0, shifter);
- + a1 = _mm_sra_epi32(a1, shifter);
- + n1 = _mm_sra_epi32(n1, shifter);
- + accu = _mm_add_epi32(accu, a0); // add to accumulator
- + normaccu = _mm_add_epi32(normaccu, n0);
- + accu = _mm_add_epi32(accu, a1);
- + normaccu = _mm_add_epi32(normaccu, n1);
- + pVec1 += 2;
- + }
- + // sum total
- + v = _mm_srli_si128(accu, 4);
- + accu = _mm_add_epi32(v, accu);
- + v = _mm_srli_si128(accu, 8);
- + accu = _mm_add_epi32(v, accu);
- + v = _mm_srli_si128(normaccu, 4);
- + normaccu = _mm_add_epi32(v, normaccu);
- + v = _mm_srli_si128(normaccu, 8);
- + normaccu = _mm_add_epi32(v, normaccu);
- +
- + __m128d Vcorr = _mm_cvtepi32_pd(accu); // int32 to double
- + __m128d Vdnorm = _mm_cvtepi32_pd(normaccu);
- + _mm_store_sd(&dnorm, Vdnorm); // feedback to dnorm
- +
- + if (_mm_cvtsi128_si32(normaccu) > 0) {
- + Vdnorm = _mm_sqrt_sd(Vdnorm, Vdnorm);
- + Vcorr = _mm_div_sd(Vcorr, Vdnorm);
- + }
- + return _mm_cvtsd_f64(Vcorr);
- +}
- +
- +
- +/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
- +double TDStretchSSE2::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm)
- +{
- + const __m128i *pVec1 = (__m128i*)pV1; // (unaligned)
- + const __m128i *pVec1prev = pVec1; // for previous round normalizer
- + const __m128i *pVec2 = (__m128i*)pV2; // (aligned)
- + const __m128i shifter = _mm_cvtsi32_si128(overlapDividerBitsNorm);
- + __m128i accu = _mm_setzero_si128();
- + __m128i norm = _mm_setzero_si128();
- + __m128i v; // for temporary
- + __m128d vd; // for temporary
- +
- + // Process 8 parallel sets of 2 * stereo samples or 16 * mono samples
- + // during each round for improved CPU-level parallellization.
- + for (int i = channels * overlapLength / 16 ; i ; i--)
- + {
- + // Applies shifter immediately after product-sum to prevent overflow
- + const __m128i vec1[] = {
- + _mm_loadu_si128(pVec1),
- + _mm_loadu_si128(pVec1+1)
- + };
- + __m128i v1 = _mm_madd_epi16(vec1[0], pVec2[0]);
- + v1 = _mm_sra_epi32(v1, shifter);
- + __m128i v2 = _mm_madd_epi16(vec1[1], pVec2[1]);
- + v2 = _mm_sra_epi32(v2, shifter);
- + pVec1 += 2;
- + accu = _mm_add_epi32(accu, v1);
- + pVec2 += 2;
- + accu = _mm_add_epi32(accu, v2);
- + }
- + v = _mm_srli_si128(accu, 8);
- + accu = _mm_add_epi32(v, accu);
- + v = _mm_srli_si128(accu, 4);
- + accu = _mm_add_epi32(v, accu); // accu.m128i_i32[0] is sum total
- +
- + // update normalizer with last samples of this round, and previous round
- + for (int ch = channels; ch > 0; ch -= sizeof(*pVec1)/sizeof(*pV1)) {
- + const __m128i vth = _mm_set_epi16(0,1,2,3,4,5,6,7);
- + const __m128i vch = _mm_set1_epi16(ch);
- + const __m128i vMask = _mm_cmpgt_epi16(vch, vth);
- + __m128i vThis = _mm_loadu_si128(--pVec1);
- + __m128i vPrev = _mm_loadu_si128(--pVec1prev);
- + vThis = _mm_and_si128(vThis, vMask); // this round
- + vPrev = _mm_and_si128(vPrev, vMask); // previous round
- +
- + vThis = _mm_madd_epi16(vThis, vThis);
- + vThis = _mm_sra_epi32(vThis, shifter);
- + vPrev = _mm_madd_epi16(vPrev, vPrev);
- + vPrev = _mm_sra_epi32(vPrev, shifter);
- + norm = _mm_add_epi32(norm, vThis);
- + norm = _mm_sub_epi32(norm, vPrev);
- + }
- + v = _mm_srli_si128(norm, 8);
- + norm = _mm_add_epi32(norm, v);
- + v = _mm_srli_si128(norm, 4);
- + norm = _mm_add_epi32(norm, v); // norm.m128i_i32[0] is sum total
- +
- + #if defined _WIN64
- + __m128d Vcorr = _mm_cvtepi32_pd(accu);
- + __m128d Vdnorm = _mm_cvtepi32_pd(norm);
- + vd = _mm_load_sd(&dnorm);
- + Vdnorm = _mm_add_sd(vd, Vdnorm);
- + _mm_store_sd(&dnorm, Vdnorm); // feedback to dnorm
- +
- + const __m128d dmin = _mm_set_sd(1e-9);
- + if (_mm_comige_sd(Vdnorm, dmin)) {
- + Vdnorm = _mm_sqrt_sd(Vdnorm, Vdnorm);
- + Vcorr = _mm_div_sd(Vcorr, Vdnorm);
- + }
- + return _mm_cvtsd_f64(Vcorr);
- + #else
- + // This path is faster than the one above on the Pentium 4, but on x64 it may be slower.
- + __m128d Vdnorm = _mm_cvtepi32_pd(norm);
- + vd = _mm_load_sd(&dnorm);
- + Vdnorm = _mm_add_sd(vd, Vdnorm);
- + _mm_store_sd(&dnorm, Vdnorm); // feedback to dnorm
- +
- + double corr;
- + vd = _mm_cvtepi32_pd(accu);
- + _mm_store_sd(&corr, vd);
- +
- + const __m128d dmin = _mm_set_sd(1e-9);
- + if (_mm_comige_sd(Vdnorm, dmin)) {
- + return corr / sqrt(dnorm); // x87 is used even if it specifies -arch:SSE2
- + }
- + return corr;
- + #endif
- +}
- +
- +
- +// SSE2-optimized version of the function overlapStereo
- +void TDStretchSSE2::overlapStereo(short *output, const short *input)
- +{
- + const __m128i *pVinput = (__m128i*)input; // (unaligned)
- + const __m128i *pVMidBuf = (__m128i*)pMidBuffer; // (aligned)
- + const __m128i shifter = _mm_cvtsi32_si128(overlapDividerBitsPure + 1);
- + // note: Since _mm_set_epi16() is slow at Pentium4, _mm_set_epi32() is substituted.
- + __m128i adder = _mm_set1_epi32(0x2fffe); // [ 2, -2, 2, -2, 2, -2, 2, -2 ]
- + __m128i mix1 = _mm_set_epi32(
- + 0x10000 | (unsigned short)(overlapLength-1), // (short)[ 1, overlapLength-1,
- + 0x10000 | (unsigned short)(overlapLength-1), // 1, overlapLength-1,
- + (unsigned short)overlapLength, // 0, overlapLength,
- + (unsigned short)overlapLength); // 0, overlapLength ]
- + __m128i mix2 = _mm_add_epi16(mix1, adder);
- + __m128i *pVdest = (__m128i*)output; // (unaligned)
- + adder = _mm_add_epi16(adder, adder);
- +
- + for (int i = overlapLength / 4 ; i ; i--)
- + {
- + const __m128i vi = _mm_loadu_si128(pVinput);
- + const __m128i vm = _mm_load_si128(pVMidBuf);
- + __m128i v1 = _mm_unpacklo_epi16(vm, vi);
- + __m128i v2 = _mm_unpackhi_epi16(vm, vi);
- + v1 = _mm_madd_epi16(v1, mix1);
- + v2 = _mm_madd_epi16(v2, mix2);
- + v1 = _mm_sra_epi32(v1, shifter);
- + v2 = _mm_sra_epi32(v2, shifter);
- + v1 = _mm_packs_epi32(v1, v2);
- + _mm_storeu_si128(pVdest, v1);
- +
- + mix1 = _mm_add_epi16(mix1, adder);
- + mix2 = _mm_add_epi16(mix2, adder);
- + pVMidBuf++;
- + pVinput++;
- + pVdest++;
- + }
- +}
- +
- +
- +//////////////////////////////////////////////////////////////////////////////
- +//
- +// implementation of SSE2 optimized functions of class 'FIRFilter'
- +//
- +//////////////////////////////////////////////////////////////////////////////
- +
- +#include "FIRFilter.h"
- +
- +FIRFilterSSE2::FIRFilterSSE2() : FIRFilter()
- +{
- + filterCoeffsAlign = NULL;
- + filterCoeffsUnalign = NULL;
- +}
- +
- +
- +FIRFilterSSE2::~FIRFilterSSE2()
- +{
- + delete[] filterCoeffsUnalign;
- +}
- +
- +
- +// (overloaded) Calculates filter coefficients for SSE2 routine
- +void FIRFilterSSE2::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
- +{
- + FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
- +
- + // Ensure that filter coeffs array is aligned to 16-byte boundary
- + delete[] filterCoeffsUnalign;
- + filterCoeffsUnalign = new short[2 * newLength + (ALIGN_SIZE)/sizeof(short)];
- + filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER(filterCoeffsUnalign);
- + __m128i *VfilterCoeffsAlign = (__m128i*)filterCoeffsAlign;
- +
- + // rearrange the filter coefficients for SSE2 routines
- + for (uint i = 0; i < length; i += 4)
- + {
- + __m128i v = _mm_loadl_epi64((__m128i*)(coeffs + i)); // 3, 2, 1, 0
- + v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 1, 2, 0)); // 3, 1, 2, 0
- + v = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 0, 0)); // 3, 1, 3, 1, 2, 0, 2, 0
- + _mm_store_si128(VfilterCoeffsAlign++, v);
- + }
- +}
- +
- +
- +// sse2-optimized version of the filter routine for stereo sound
- +uint FIRFilterSSE2::evaluateFilterStereo(short *dest, const short *src, uint numSamples) const
- +{
- + if (length < 2) return 0;
- +
- + short *pVdest = dest;
- +
- + for (uint i = (numSamples - length) >> 1 ; i ; i--)
- + {
- + const __m128i *pVsrc = (__m128i*)src;
- + const __m128i *pVfilter = (__m128i*)filterCoeffsAlign; //16byte aligned
- + __m128i accu1 = _mm_setzero_si128();
- + __m128i accu2 = _mm_setzero_si128();
- +
- + for (uint j = lengthDiv8 * 2; j ; j--)
- + {
- + // accu1 accu2
- + // r0: s00*f00 + s04*f01 s02*f00 + s06*f01
- + // r1: s01*f02 + s05*f03 s03*f02 + s07*f03
- + // r2: s02*f04 + s06*f05 s04*f04 + s08*f05
- + // r3: s03*f06 + s07*f07 s05*f06 + s09*f07
- + __m128i v0 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+0));
- + __m128i v2 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+2));
- + const __m128i v4 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+4));
- + const __m128i v6 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+6));
- + const __m128i vf = _mm_load_si128(pVfilter);
- + v0 = _mm_unpacklo_epi16(v0, v4);
- + v2 = _mm_unpacklo_epi16(v2, v6);
- + v0 = _mm_madd_epi16(v0, vf);
- + v2 = _mm_madd_epi16(v2, vf);
- + pVsrc++;
- + accu1 = _mm_add_epi32(accu1, v0);
- + pVfilter++;
- + accu2 = _mm_add_epi32(accu2, v2);
- + }
- + // r0: accu1 - s00*f00 + s04*f01 + s02*f04 + s06*f05
- + // r1: s01*f02 + s05*f03 + s03*f06 + s07*f07
- + // r2: accu2 - s02*f00 + s06*f01 + s04*f04 + s08*f05
- + // r3: s03*f02 + s07*f03 + s05*f06 + s09*f07
- + const __m128i v1 = _mm_srli_si128(accu1, 8);
- + const __m128i v2 = _mm_srli_si128(accu2, 8);
- + accu1 = _mm_add_epi32(accu1, v1);
- + accu2 = _mm_add_epi32(accu2, v2);
- + accu1 = _mm_unpacklo_epi64(accu1, accu2);
- + accu1 = _mm_srai_epi32(accu1, resultDivFactor);
- + accu1 = _mm_packs_epi32(accu1, accu1);
- + _mm_storel_epi64((__m128i*)pVdest, accu1);
- + src += 4;
- + pVdest += 4;
- + }
- + return (numSamples & -2) - length;
- +}
- +#endif // SOUNDTOUCH_ALLOW_SSE2
- +
- +#ifdef SOUNDTOUCH_ALLOW_AVX2
- +
- +//////////////////////////////////////////////////////////////////////////////
- +//
- +// implementation of AVX2 optimized functions of class 'TDStretchAVX2'
- +//
- +//////////////////////////////////////////////////////////////////////////////
- +
- +#include "TDStretch.h"
- +#include <immintrin.h>
- +#include <math.h>
- +
- +// These routines assume SOUNDTOUCH_INTEGER_SAMPLES (16-bit integer samples).
- +
- +// Calculates cross correlation of two buffers
- +double TDStretchAVX2::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm)
- +{
- + const __m256i *pVec1 = (__m256i*)pV1; // not 32byte aligned
- + const __m256i *pVec2 = (__m256i*)pV2; // 32byte aligned
- + const __m256i shifter = _mm256_set1_epi32(overlapDividerBitsNorm);
- + __m256i accu = _mm256_setzero_si256();
- + __m256i normaccu = _mm256_setzero_si256();
- +
- + // Process 8 parallel sets of 2 * stereo samples or 16 * mono samples
- + // during each round for improved CPU-level parallellization.
- +
- + for (int i = channels*overlapLength/16 ; i ; i--)
- + {
- + // Applies shifter immediately after product-sum to prevent overflow
- + const __m256i v = _mm256_loadu_si256(pVec1);
- + __m256i v1 = _mm256_madd_epi16(v, *pVec2++);
- + __m256i v2 = _mm256_madd_epi16(v, v);
- + v1 = _mm256_srav_epi32(v1, shifter);
- + v2 = _mm256_srav_epi32(v2, shifter);
- + accu = _mm256_add_epi32(accu, v1);
- + normaccu = _mm256_add_epi32(normaccu, v2);
- + pVec1++;
- + }
- + normaccu = _mm256_hadd_epi32(normaccu, accu);
- + __m128i vNorm = _mm256_extracti128_si256(normaccu, 1);
- + _mm256_zeroupper();
- + vNorm = _mm_add_epi32(_mm256_castsi256_si128(normaccu), vNorm);
- + vNorm = _mm_hadd_epi32(vNorm, vNorm); // r1=sum(accu), r0=sum(normaccu)
- +
- + __m128d vdNorm = _mm_cvtepi32_pd(vNorm); // xmm:r0=(double)sum(normaccu)
- + __m128d vdCorr = _mm_shuffle_pd(vdNorm, vdNorm, _MM_SHUFFLE2(0,1)); // xmm:r0=(double)sum(accu)
- + _mm_store_sd(&dnorm, vdNorm);
- +
- + if (_mm_cvtsi128_si32(vNorm) > 0) {
- + vdNorm = _mm_sqrt_sd(vdNorm, vdNorm);
- + vdCorr = _mm_div_sd(vdCorr, vdNorm);
- + }
- + return _mm_cvtsd_f64(vdCorr);
- +}
- +
- +
- +/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
- +double TDStretchAVX2::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm)
- +{
- + const __m256i *pVec1 = (__m256i*)pV1; // (unaligned)
- + const __m256i *pVec1prev = pVec1; // for previous round normalizer
- + const __m256i *pVec2 = (__m256i*)pV2; // (32byte aligned)
- + const __m256i shifter = _mm256_set1_epi32(overlapDividerBitsNorm);
- + const __m256i chThreshold = _mm256_set_epi16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
- + __m256i accu = _mm256_setzero_si256();
- + __m256i norm = _mm256_setzero_si256();
- +
- + // Process 8 parallel sets of 2 * stereo samples or 16 * mono samples
- + // during each round for improved CPU-level parallellization.
- + for (int i = channels * overlapLength / 16 ; i ; i--)
- + {
- + // Applies shifter immediately after product-sum to prevent overflow
- + __m256i v0 = _mm256_loadu_si256(pVec1++);
- + v0 = _mm256_madd_epi16(v0, *pVec2++);
- + v0 = _mm256_srav_epi32(v0, shifter);
- + accu = _mm256_add_epi32(accu, v0);
- + }
- + __m128i vcorr = _mm256_extracti128_si256(accu, 1);
- + vcorr = _mm_hadd_epi32(_mm256_castsi256_si128(accu), vcorr);
- + vcorr = _mm_hadd_epi32(vcorr, /* unused */ vcorr);
- + vcorr = _mm_hadd_epi32(vcorr, /* unused */ vcorr); // xmm:r0 = sum total
- +
- + // update normalizer with last samples of this round, and previous round
- + for (int ch = channels; ch > 0; ch -= sizeof(*pVec1)/sizeof(*pV1)) {
- + const __m256i restCh = _mm256_set1_epi16(ch);
- + const __m256i vMask = _mm256_cmpgt_epi16(restCh, chThreshold);
- + __m256i vThis = _mm256_loadu_si256(--pVec1);
- + __m256i vPrev = _mm256_loadu_si256(--pVec1prev);
- + vThis = _mm256_and_si256(vThis, vMask);
- + vPrev = _mm256_and_si256(vPrev, vMask);
- + vThis = _mm256_madd_epi16(vThis, vThis);
- + vPrev = _mm256_madd_epi16(vPrev, vPrev);
- + vThis = _mm256_srav_epi32(vThis, shifter);
- + vPrev = _mm256_srav_epi32(vPrev, shifter);
- + norm = _mm256_add_epi32(norm, vThis);
- + norm = _mm256_sub_epi32(norm, vPrev);
- + }
- + __m128i vnorm = _mm256_extracti128_si256(norm, 1);
- + _mm256_zeroupper();
- + vnorm = _mm_hadd_epi32(_mm256_castsi256_si128(norm), vnorm);
- + vnorm = _mm_hadd_epi32(vnorm, /* unused */ vnorm);
- + vnorm = _mm_hadd_epi32(vnorm, /* unused */ vnorm); // xmm:r0 = sum total
- +
- + __m128d vdcorr = _mm_cvtepi32_pd(vcorr);
- + __m128d vdnorm = _mm_cvtepi32_pd(vnorm);
- + __m128d vd = _mm_load_sd(&dnorm);
- + vdnorm = _mm_add_sd(vdnorm, vd);
- + _mm_store_sd(&dnorm, vdnorm);
- +
- + const __m128d vdmin = _mm_set_sd(1e-9);
- + if (_mm_comige_sd(vdnorm, vdmin)) {
- + vdnorm = _mm_sqrt_sd(vdnorm, vdnorm);
- + vdcorr = _mm_div_sd(vdcorr, vdnorm);
- + }
- + return _mm_cvtsd_f64(vdcorr);
- +}
- +
- +
- +// AVX2-optimized version of the function overlapStereo
- +void TDStretchAVX2::overlapStereo(short *output, const short *input)
- +{
- + const __m128i *pVinput = (__m128i*)input; // (not aligned)
- + const __m128i *pVMidBuf = (__m128i*)pMidBuffer; // (32byte aligned)
- + const __m256i shifter = _mm256_set1_epi32(overlapDividerBitsPure + 1);
- + const __m256i adder = _mm256_set1_epi32(0x4fffc); // [ 4,-4, 4,-4, 4,-4, 4,-4]
- + __m128i *pVdest = (__m128i*)output; // (not aligned)
- + __m256i mix = _mm256_set_epi32(
- + 0x30000 | (unsigned short)(overlapLength-3), // 3, overlapLength-3,
- + 0x30000 | (unsigned short)(overlapLength-3),
- + 0x20000 | (unsigned short)(overlapLength-2), // 2, overlapLength-2,
- + 0x20000 | (unsigned short)(overlapLength-2),
- + 0x10000 | (unsigned short)(overlapLength-1), // 1, overlapLength-1,
- + 0x10000 | (unsigned short)(overlapLength-1),
- + (unsigned short)overlapLength, // 0, overlapLength,
- + (unsigned short)overlapLength);
- +
- + for (int i = overlapLength / 4; i > 0; i--)
- + {
- + const __m128i vinput = _mm_loadu_si128(pVinput++);
- + const __m128i vmidBuf = *pVMidBuf++;
- + __m128i vh = _mm_unpackhi_epi16(vmidBuf, vinput);
- + __m128i vl = _mm_unpacklo_epi16(vmidBuf, vinput);
- + __m256i dest = _mm256_inserti128_si256(_mm256_castsi128_si256(vl), vh, 1);
- + dest = _mm256_madd_epi16(dest, mix);
- + dest = _mm256_srav_epi32(dest, shifter);
- + __m128i v = _mm256_extracti128_si256(dest, 1);
- + v = _mm_packs_epi32(_mm256_castsi256_si128(dest), v);
- + _mm_storeu_si128(pVdest++, v);
- + mix = _mm256_add_epi16(mix, adder); // update overlap multiplier
- + }
- + _mm256_zeroupper();
- +}
- +
- +
- +//////////////////////////////////////////////////////////////////////////////
- +//
- +// implementation of AVX2 optimized functions of class 'FIRFilter'
- +//
- +//////////////////////////////////////////////////////////////////////////////
- +
- +#include "FIRFilter.h"
- +
- +FIRFilterAVX2::FIRFilterAVX2() : FIRFilter()
- +{
- + filterCoeffsAlign = NULL;
- + filterCoeffsUnalign = NULL;
- +}
- +
- +
- +FIRFilterAVX2::~FIRFilterAVX2()
- +{
- + delete[] filterCoeffsUnalign;
- +}
- +
- +
- +// (overloaded) Calculates filter coefficients for AVX2 routine
- +void FIRFilterAVX2::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
- +{
- + FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
- +
- + // Ensure that filter coeffs array is aligned to 32-byte boundary
- + delete[] filterCoeffsUnalign;
- + filterCoeffsUnalign = new short[2 * newLength + (ALIGN_SIZE)/sizeof(short)];
- + filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER(filterCoeffsUnalign);
- + __m128i *VfilterCoeffsAlign = (__m128i*)filterCoeffsAlign;
- +
- + // rearrange the filter coefficients for the AVX2 routine (same interleaved layout as SSE2)
- + for (uint i = 0; i < length; i += 4)
- + {
- + __m128i v = _mm_loadl_epi64((__m128i*)(coeffs + i)); // 3, 2, 1, 0
- + v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 1, 2, 0)); // 3, 1, 2, 0
- + v = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 0, 0)); // 3, 1, 3, 1, 2, 0, 2, 0
- + _mm_store_si128(VfilterCoeffsAlign++, v);
- + }
- +}
- +
- +
- +// AVX2-optimized version of the filter routine for stereo sound
- +uint FIRFilterAVX2::evaluateFilterStereo(short *dest, const short *src, uint numSamples) const
- +{
- + if (length < 2) return 0;
- +
- + short *pVdest = dest;
- +
- + for (uint i = (numSamples - length) >> 1 ; i ; i--)
- + {
- + const __m256i *pVsrc = (__m256i*)src;
- + const __m256i *pVfilter = (__m256i*)filterCoeffsAlign; // 32byte aligned
- + __m256i accu1 = _mm256_setzero_si256();
- + __m256i accu2 = _mm256_setzero_si256();
- +
- + for (uint j = lengthDiv8; j ; j--)
- + {
- + const __m256i vfilter = _mm256_load_si256(pVfilter);
- + __m256i v0 = _mm256_loadu_si256((__m256i*)((short*)pVsrc+0));
- + __m256i v2 = _mm256_loadu_si256((__m256i*)((short*)pVsrc+2));
- + const __m256i v4 = _mm256_srli_si256(v0, 8);
- + const __m256i v6 = _mm256_srli_si256(v2, 8);
- + v0 = _mm256_unpacklo_epi16(v0, v4);
- + v2 = _mm256_unpacklo_epi16(v2, v6);
- + v0 = _mm256_madd_epi16(v0, vfilter);
- + v2 = _mm256_madd_epi16(v2, vfilter);
- + accu1 = _mm256_add_epi32(accu1, v0);
- + accu2 = _mm256_add_epi32(accu2, v2);
- + pVsrc++;
- + pVfilter++;
- + }
- + accu1 = _mm256_shuffle_epi32(accu1, _MM_SHUFFLE(3,1,2,0));
- + accu2 = _mm256_shuffle_epi32(accu2, _MM_SHUFFLE(3,1,2,0));
- + accu1 = _mm256_hadd_epi32(accu1, accu2);
- +
- + __m128i accu = _mm256_extracti128_si256(accu1, 1);
- + accu = _mm_add_epi32(_mm256_castsi256_si128(accu1), accu);
- + accu = _mm_srai_epi32(accu, resultDivFactor);
- + accu = _mm_packs_epi32(accu, /* unused */ accu);
- + _mm_storel_epi64((__m128i*)pVdest, accu);
- +
- + src += 4;
- + pVdest += 4;
- + }
- + _mm256_zeroupper();
- + return (numSamples & -2) - length;
- +}
- +#endif // SOUNDTOUCH_ALLOW_AVX2
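Both the SSE2 and AVX2 kernels above finish by horizontally reducing a packed 32-bit accumulator (the shift/add pairs after the main loops). The idiom in isolation, as a standalone sketch:

    #include <emmintrin.h>
    #include <cstdio>

    // Sum of the four 32-bit lanes of an SSE2 register, using the same
    // byte-shift/add sequence the calcCrossCorr loops apply to their accumulators.
    static int hsum_epi32(__m128i v)
    {
        v = _mm_add_epi32(v, _mm_srli_si128(v, 8));   // lanes become [a+c, b+d, c, d]
        v = _mm_add_epi32(v, _mm_srli_si128(v, 4));   // lane 0 now holds a+b+c+d
        return _mm_cvtsi128_si32(v);
    }

    int main()
    {
        __m128i v = _mm_set_epi32(4, 3, 2, 1);
        std::printf("%d\n", hsum_epi32(v));           // prints 10
        return 0;
    }

The AVX2 variants reach the same result with _mm256_hadd_epi32 followed by an extract and add of the upper 128-bit half.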
- --- soundtouch-1.9.2.orig/source/SoundTouch/TDStretch.cpp Sun Sep 20 16:40:59 2015
- +++ soundtouch-1.9.2.SIMD/source/SoundTouch/TDStretch.cpp Sat Feb 13 00:04:07 2016
- @@ -721,9 +721,9 @@
- {
- delete[] pMidBufferUnaligned;
- - pMidBufferUnaligned = new SAMPLETYPE[overlapLength * channels + 16 / sizeof(SAMPLETYPE)];
- + pMidBufferUnaligned = new SAMPLETYPE[overlapLength * channels + (ALIGN_SIZE) / sizeof(SAMPLETYPE)];
- // ensure that 'pMidBuffer' is aligned to 16 byte boundary for efficiency
- - pMidBuffer = (SAMPLETYPE *)SOUNDTOUCH_ALIGN_POINTER_16(pMidBufferUnaligned);
- + pMidBuffer = (SAMPLETYPE *)SOUNDTOUCH_ALIGN_POINTER(pMidBufferUnaligned);
- clearMidBuffer();
- }
- @@ -748,6 +748,24 @@
- // Check if MMX/SSE instruction set extensions supported by CPU
- +#ifdef SOUNDTOUCH_ALLOW_AVX2
- + // AVX2 routines available
- + if (uExtensions & SUPPORT_AVX2)
- + {
- + return ::new TDStretchAVX2;
- + }
- + else
- +#endif
- +
- +#ifdef SOUNDTOUCH_ALLOW_SSE2
- + // SSE2 routines available only with integer sample types
- + if (uExtensions & SUPPORT_SSE2)
- + {
- + return ::new TDStretchSSE2;
- + }
- + else
- +#endif // SOUNDTOUCH_ALLOW_SSE2
- +
- #ifdef SOUNDTOUCH_ALLOW_MMX
- // MMX routines available only with integer sample types
- if (uExtensions & SUPPORT_MMX)
- --- soundtouch-1.9.2.orig/source/SoundTouch/TDStretch.h Sun Sep 20 16:40:59 2015
- +++ soundtouch-1.9.2.SIMD/source/SoundTouch/TDStretch.h Sat Feb 13 00:04:07 2016
- @@ -277,5 +277,29 @@
- #endif /// SOUNDTOUCH_ALLOW_SSE
- +
- +#ifdef SOUNDTOUCH_ALLOW_SSE2
- + /// Class that implements SSE2 optimized routines for 16bit integer samples type.
- + class TDStretchSSE2 : public TDStretch
- + {
- + protected:
- + double calcCrossCorr(const short *mixingPos, const short *compare, double &norm);
- + double calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm);
- + virtual void overlapStereo(short *output, const short *input);
- + };
- +
- +#endif /// SOUNDTOUCH_ALLOW_SSE2
- +
- +#ifdef SOUNDTOUCH_ALLOW_AVX2
- + /// Class that implements AVX2 optimized routines for 16bit integer samples type.
- + class TDStretchAVX2 : public TDStretch
- + {
- + protected:
- + double calcCrossCorr(const short *mixingPos, const short *compare, double &norm);
- + double calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm);
- + virtual void overlapStereo(short *output, const short *input);
- + };
- +#endif /// SOUNDTOUCH_ALLOW_AVX2
- +
- }
- #endif /// TDStretch_H