Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- --- soundtouch-1.9.2.orig/include/STTypes.h Sun Sep 20 16:40:59 2015
- +++ soundtouch-1.9.2.SSE2/include/STTypes.h Fri Feb 12 01:54:14 2016
- @@ -98,8 +98,8 @@
- /// However, if you still prefer to select the sample format here
- /// also in GNU environment, then please #undef the INTEGER_SAMPLE
- /// and FLOAT_SAMPLE defines first as in comments above.
- - //#define SOUNDTOUCH_INTEGER_SAMPLES 1 //< 16bit integer samples
- - #define SOUNDTOUCH_FLOAT_SAMPLES 1 //< 32bit float samples
- + #define SOUNDTOUCH_INTEGER_SAMPLES 1 //< 16bit integer samples
- + //#define SOUNDTOUCH_FLOAT_SAMPLES 1 //< 32bit float samples
- #endif
- @@ -143,8 +143,12 @@
- #endif // SOUNDTOUCH_FLOAT_SAMPLES
- #ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
- - // Allow MMX optimizations
- - #define SOUNDTOUCH_ALLOW_MMX 1
- + // Allow SSE2 optimizations
- + #define SOUNDTOUCH_ALLOW_SSE2 1
- + #ifndef _M_X64
- + // Allow MMX optimizations
- + #define SOUNDTOUCH_ALLOW_MMX 1
- + #endif
- #endif
- #else
- --- soundtouch-1.9.2.orig/source/SoundTouch/FIRFilter.cpp Sun Sep 20 16:40:59 2015
- +++ soundtouch-1.9.2.SSE2/source/SoundTouch/FIRFilter.cpp Fri Feb 12 01:54:14 2016
- @@ -303,6 +303,15 @@
- // Check if MMX/SSE instruction set extensions supported by CPU
- +#ifdef SOUNDTOUCH_ALLOW_SSE2
- + // SSE2 routines available only with integer sample types
- + if (uExtensions & SUPPORT_SSE2)
- + {
- + return ::new FIRFilterSSE2;
- + }
- + else
- +#endif // SOUNDTOUCH_ALLOW_SSE2
- +
- #ifdef SOUNDTOUCH_ALLOW_MMX
- // MMX routines available only with integer sample types
- if (uExtensions & SUPPORT_MMX)
- --- soundtouch-1.9.2.orig/source/SoundTouch/FIRFilter.h Sun Sep 20 16:40:59 2015
- +++ soundtouch-1.9.2.SSE2/source/SoundTouch/FIRFilter.h Fri Feb 12 01:54:14 2016
- @@ -141,6 +141,25 @@
- #endif // SOUNDTOUCH_ALLOW_SSE
- +
- +#ifdef SOUNDTOUCH_ALLOW_SSE2
- + /// FIR filter subclass whose stereo filtering routine uses SSE2 intrinsics; exclusive to the 16bit integer sample type.
- + class FIRFilterSSE2 : public FIRFilter
- + {
- + protected:
- + short *filterCoeffsUnalign; ///< Raw new[] allocation backing the SSE2 coefficient table; this is the pointer that gets delete[]d.
- + short *filterCoeffsAlign; ///< 16-byte aligned pointer into filterCoeffsUnalign where the rearranged coefficients are stored.
- +
- + virtual uint evaluateFilterStereo(short *dest, const short *src, uint numSamples) const; ///< SSE2 stereo filtering routine.
- + public:
- + FIRFilterSSE2(); ///< Starts with no coefficient table (both pointers NULL).
- + ~FIRFilterSSE2(); ///< Releases the coefficient table.
- +
- + virtual void setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor); ///< Also (re)builds the aligned, rearranged coefficient table.
- + };
- +
- +#endif // SOUNDTOUCH_ALLOW_SSE2
- +
- }
- #endif // FIRFilter_H
- --- soundtouch-1.9.2.orig/source/SoundTouch/sse_optimized.cpp Sun Sep 20 16:40:59 2015
- +++ soundtouch-1.9.2.SSE2/source/SoundTouch/sse_optimized.cpp Fri Feb 12 11:54:13 2016
- @@ -370,3 +370,282 @@
- }
- #endif // SOUNDTOUCH_ALLOW_SSE
- +
- +#ifdef SOUNDTOUCH_ALLOW_SSE2
- +
- +// SSE2 routines available only with integer sample type
- +// Also refer to MMX optimized routines.
- +
- +//////////////////////////////////////////////////////////////////////////////
- +//
- +// implementation of SSE2 optimized functions of class 'TDStretchSSE2'
- +//
- +//////////////////////////////////////////////////////////////////////////////
- +
- +#include "TDStretch.h"
- +#include <emmintrin.h>
- +#include <math.h>
- +
- +// Calculates the cross-correlation of pV1 against pV2, divided by the square root of pV1's sum of squares; the un-rooted sum is returned through dnorm
- +double TDStretchSSE2::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm)
- +{
- + const __m128i *pVec1 = (__m128i*)pV1; // read with _mm_loadu_si128, so need not be 16-byte aligned
- + const __m128i *pVec2 = (__m128i*)pV2; // dereferenced directly, so expected to be 16-byte aligned
- + const __m128i shifter = _mm_cvtsi32_si128(overlapDividerBitsNorm); // shift count applied to every product-sum
- + __m128i accu = _mm_setzero_si128(); // four 32-bit partial sums of pV1 * pV2
- + __m128i normaccu = _mm_setzero_si128(); // four 32-bit partial sums of pV1 * pV1
- + __m128i v; // scratch for the horizontal sums
- +
- + // Each round consumes two 128-bit vectors (16 shorts) from each buffer and
- + // interleaves the independent operations for improved CPU-level parallelization.
- +
- + for (int i = channels*overlapLength/16 ; i ; i--)
- + {
- + // Shift right directly after each multiply-add so the 32-bit accumulators do not overflow
- + __m128i n0 = _mm_loadu_si128(pVec1);
- + __m128i n1 = _mm_loadu_si128(pVec1+1);
- + __m128i a0 = _mm_madd_epi16(n0, *pVec2++); // a0 = pVec1[0] * pVec2[0], adjacent products summed pairwise
- + n0 = _mm_madd_epi16(n0, n0); // n0 = pVec1[0]^2
- + __m128i a1 = _mm_madd_epi16(n1, *pVec2++); // a1 = pVec1[1] * pVec2[1]
- + n1 = _mm_madd_epi16(n1, n1); // n1 = pVec1[1]^2
- + a0 = _mm_sra_epi32(a0, shifter); // arithmetic (sign-preserving) right shift
- + n0 = _mm_sra_epi32(n0, shifter);
- + a1 = _mm_sra_epi32(a1, shifter);
- + n1 = _mm_sra_epi32(n1, shifter);
- + accu = _mm_add_epi32(accu, a0); // add to the correlation accumulator
- + normaccu = _mm_add_epi32(normaccu, n0); // add to the norm accumulator
- + accu = _mm_add_epi32(accu, a1);
- + normaccu = _mm_add_epi32(normaccu, n1);
- + pVec1 += 2;
- + }
- + // horizontal add: fold the four 32-bit lanes so that lane 0 holds the sum total
- + v = _mm_srli_si128(accu, 4);
- + accu = _mm_add_epi32(v, accu);
- + v = _mm_srli_si128(accu, 8);
- + accu = _mm_add_epi32(v, accu);
- + v = _mm_srli_si128(normaccu, 4);
- + normaccu = _mm_add_epi32(v, normaccu);
- + v = _mm_srli_si128(normaccu, 8);
- + normaccu = _mm_add_epi32(v, normaccu);
- +
- + __m128d Vcorr = _mm_cvtepi32_pd(accu); // int32 to double
- + __m128d Vdnorm = _mm_cvtepi32_pd(normaccu);
- + _mm_store_sd(&dnorm, Vdnorm); // hand the un-rooted norm back to the caller
- +
- + if (_mm_cvtsi128_si32(normaccu) > 0) { // normalize only when the norm is positive (no division by zero)
- + Vdnorm = _mm_sqrt_sd(Vdnorm, Vdnorm);
- + Vcorr = _mm_div_sd(Vcorr, Vdnorm);
- + }
- + return _mm_cvtsd_f64(Vcorr);
- +}
- +
- +
- +/// Calculates the cross-correlation while updating the caller's running "norm" incrementally from its previous value instead of recomputing it
- +double TDStretchSSE2::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm)
- +{
- + const __m128i *pVec1 = (__m128i*)pV1; // (unaligned; read with _mm_loadu_si128)
- + const __m128i *pVec1prev = pVec1; // used to read the samples just before pV1 (previous round's share of the normalizer)
- + const __m128i *pVec2 = (__m128i*)pV2; // (aligned; indexed directly)
- + const __m128i shifter = _mm_cvtsi32_si128(overlapDividerBitsNorm); // shift count applied to every product-sum
- + __m128i accu = _mm_setzero_si128(); // four 32-bit partial sums of pV1 * pV2
- + __m128i norm = _mm_setzero_si128(); // four 32-bit partial sums of the normalizer delta
- + __m128i v; // scratch for the integer horizontal sums
- + __m128d vd; // scratch holding the caller's previous dnorm
- +
- + // Each round consumes two 128-bit vectors (16 shorts) from each buffer and
- + // interleaves the independent operations for improved CPU-level parallelization.
- + for (int i = channels * overlapLength / 16 ; i ; i--)
- + {
- + // Shift right directly after each multiply-add so the 32-bit accumulator does not overflow
- + const __m128i vec1[] = {
- + _mm_loadu_si128(pVec1),
- + _mm_loadu_si128(pVec1+1)
- + };
- + __m128i v1 = _mm_madd_epi16(vec1[0], pVec2[0]);
- + v1 = _mm_sra_epi32(v1, shifter);
- + __m128i v2 = _mm_madd_epi16(vec1[1], pVec2[1]);
- + v2 = _mm_sra_epi32(v2, shifter);
- + pVec1 += 2;
- + accu = _mm_add_epi32(accu, v1);
- + pVec2 += 2;
- + accu = _mm_add_epi32(accu, v2);
- + }
- + v = _mm_srli_si128(accu, 8);
- + accu = _mm_add_epi32(v, accu);
- + v = _mm_srli_si128(accu, 4);
- + accu = _mm_add_epi32(v, accu); // lane 0 of accu now holds the sum total
- +
- + // update normalizer: add the squares of the last `channels` shorts of this window, subtract those of the `channels` shorts just before pV1
- + for (int ch = channels; ch > 0; ch -= sizeof(*pVec1)/sizeof(*pV1)) { // steps back one 8-short vector per pass
- + const __m128i vth = _mm_set_epi16(0,1,2,3,4,5,6,7); // lanes 7..0 hold 0..7
- + const __m128i vch = _mm_set1_epi16(ch);
- + const __m128i vMask = _mm_cmpgt_epi16(vch, vth); // all-ones in the lanes where vth < ch, i.e. the top `ch` lanes
- + __m128i vThis = _mm_loadu_si128(--pVec1); // vector ending at the end of the processed window
- + __m128i vPrev = _mm_loadu_si128(--pVec1prev); // vector ending right before pV1 (reads 16 bytes in front of pV1)
- + vThis = _mm_and_si128(vThis, vMask); // this round
- + vPrev = _mm_and_si128(vPrev, vMask); // previous round
- +
- + vThis = _mm_madd_epi16(vThis, vThis);
- + vThis = _mm_sra_epi32(vThis, shifter);
- + vPrev = _mm_madd_epi16(vPrev, vPrev);
- + vPrev = _mm_sra_epi32(vPrev, shifter);
- + norm = _mm_add_epi32(norm, vThis);
- + norm = _mm_sub_epi32(norm, vPrev);
- + }
- + v = _mm_srli_si128(norm, 8);
- + norm = _mm_add_epi32(norm, v);
- + v = _mm_srli_si128(norm, 4);
- + norm = _mm_add_epi32(norm, v); // lane 0 of norm now holds the sum total
- +
- + __m128d Vcorr = _mm_cvtepi32_pd(accu);
- + __m128d Vdnorm = _mm_cvtepi32_pd(norm);
- + vd = _mm_load_sd(&dnorm);
- + Vdnorm = _mm_add_sd(vd, Vdnorm); // previous norm + delta computed above
- + _mm_store_sd(&dnorm, Vdnorm); // hand the updated, un-rooted norm back to the caller
- +
- + const __m128d dmin = _mm_set_sd(1e-9);
- + if (_mm_comige_sd(Vdnorm, dmin)) { // normalize only when norm >= 1e-9 (no division by ~zero)
- + Vdnorm = _mm_sqrt_sd(Vdnorm, Vdnorm);
- + Vcorr = _mm_div_sd(Vcorr, Vdnorm);
- + }
- + return _mm_cvtsd_f64(Vcorr);
- +}
- +
- +
- +// SSE2-optimized version of overlapStereo: writes to 'output' a weighted mix that fades from pMidBuffer to 'input' over overlapLength stereo frames
- +void TDStretchSSE2::overlapStereo(short *output, const short *input)
- +{
- + const __m128i *pVinput = (__m128i*)input; // (unaligned; read with _mm_loadu_si128)
- + const __m128i *pVMidBuf = (__m128i*)pMidBuffer; // (aligned; read with _mm_load_si128)
- + const __m128i shifter = _mm_cvtsi32_si128(overlapDividerBitsPure + 1); // shift count applied to every weighted sum
- + // note: Since _mm_set_epi16() is slow on Pentium 4, _mm_set_epi32() is used instead.
- + __m128i adder = _mm_set1_epi32(0x2fffe); // [ 2, -2, 2, -2, 2, -2, 2, -2 ] (16-bit lanes, high to low)
- + __m128i mix1 = _mm_set_epi32(
- + 0x10000 | (unsigned short)(overlapLength-1), // (short)[ 1, overlapLength-1,
- + 0x10000 | (unsigned short)(overlapLength-1), // 1, overlapLength-1,
- + (unsigned short)overlapLength, // 0, overlapLength,
- + (unsigned short)overlapLength); // 0, overlapLength ] = (input weight, mid weight) pairs for frames 0 and 1
- + __m128i mix2 = _mm_add_epi16(mix1, adder); // weights for frames 2 and 3: input weight +2, mid weight -2
- + __m128i *pVdest = (__m128i*)output; // (unaligned; written with _mm_storeu_si128)
- + adder = _mm_add_epi16(adder, adder); // doubled to [ 4, -4, ... ]: every round advances four frames
- +
- + for (int i = overlapLength / 4 ; i ; i--)
- + {
- + const __m128i vi = _mm_loadu_si128(pVinput);
- + const __m128i vm = _mm_load_si128(pVMidBuf);
- + __m128i v1 = _mm_unpacklo_epi16(vm, vi); // interleave (mid, input) pairs of the low four shorts
- + __m128i v2 = _mm_unpackhi_epi16(vm, vi); // interleave (mid, input) pairs of the high four shorts
- + v1 = _mm_madd_epi16(v1, mix1); // mid * mid weight + input * input weight, per sample
- + v2 = _mm_madd_epi16(v2, mix2);
- + v1 = _mm_sra_epi32(v1, shifter);
- + v2 = _mm_sra_epi32(v2, shifter);
- + v1 = _mm_packs_epi32(v1, v2); // back to 16-bit with signed saturation
- + _mm_storeu_si128(pVdest, v1);
- +
- + mix1 = _mm_add_epi16(mix1, adder);
- + mix2 = _mm_add_epi16(mix2, adder);
- + pVMidBuf++;
- + pVinput++;
- + pVdest++;
- + }
- +}
- +
- +
- +//////////////////////////////////////////////////////////////////////////////
- +//
- +// implementation of SSE2 optimized functions of class 'FIRFilterSSE2'
- +//
- +//////////////////////////////////////////////////////////////////////////////
- +
- +#include "FIRFilter.h"
- +
- +FIRFilterSSE2::FIRFilterSSE2() : FIRFilter() // starts without a coefficient table; setCoefficients() allocates it
- +{
- + filterCoeffsAlign = NULL; // aligned view into the table
- + filterCoeffsUnalign = NULL; // raw allocation backing the table
- +}
- +
- +
- +FIRFilterSSE2::~FIRFilterSSE2() // releases the coefficient table built by setCoefficients()
- +{
- + delete[] filterCoeffsUnalign; // only the raw allocation is freed; filterCoeffsAlign merely points into it
- + filterCoeffsAlign = NULL;
- + filterCoeffsUnalign = NULL;
- +}
- +
- +
- +// (overridden) Passes the coefficients to the base class, then builds a rearranged, 16-byte aligned copy for the SSE2 routine
- +void FIRFilterSSE2::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
- +{
- + FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
- +
- + // Ensure that filter coeffs array is aligned to 16-byte boundary: 8 shorts per 4 coefficients, plus 8 shorts (16 bytes) of slack for the round-up
- + delete[] filterCoeffsUnalign; // drop any table left from an earlier call
- + filterCoeffsUnalign = new short[2 * newLength + 8];
- + filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
- + __m128i *VfilterCoeffsAlign = (__m128i*)filterCoeffsAlign;
- +
- + // rearrange the filter coefficients for SSE2 routines: each group of 4 coefficients becomes one 128-bit vector
- + for (uint i = 0; i < length; i += 4) // NOTE(review): bounded by member 'length', presumably set by FIRFilter::setCoefficients above - confirm
- + {
- + __m128i v = _mm_loadl_epi64((__m128i*)(coeffs + i)); // 3, 2, 1, 0 (coefficient indices, high lane to low)
- + v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 1, 2, 0)); // 3, 1, 2, 0
- + v = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 0, 0)); // 3, 1, 3, 1, 2, 0, 2, 0
- + _mm_store_si128(VfilterCoeffsAlign++, v); // aligned store into the table
- + }
- +}
- +
- +
- +// SSE2-optimized version of the filter routine for stereo sound; uses the rearranged table in filterCoeffsAlign
- +uint FIRFilterSSE2::evaluateFilterStereo(short *dest, const short *src, uint numSamples) const
- +{
- + if (length < 2) return 0; // produce no output when length < 2
- +
- + short *pVdest = dest;
- +
- + for (uint i = (numSamples - length) >> 1 ; i ; i--) // each round writes 4 shorts to dest and advances src by 4 shorts
- + {
- + const __m128i *pVsrc = (__m128i*)src;
- + const __m128i *pVfilter = (__m128i*)filterCoeffsAlign; // 16-byte aligned
- + __m128i accu1 = _mm_setzero_si128(); // partial sums for the output starting at src
- + __m128i accu2 = _mm_setzero_si128(); // partial sums for the output starting at src + 2 shorts
- +
- + for (uint j = lengthDiv8 * 2; j ; j--) // one 128-bit coefficient vector and 8 source shorts per pass
- + {
- + // accu1 accu2
- + // r0: s00*f00 + s04*f01 s02*f00 + s06*f01
- + // r1: s01*f02 + s05*f03 s03*f02 + s07*f03
- + // r2: s02*f04 + s06*f05 s04*f04 + s08*f05
- + // r3: s03*f06 + s07*f07 s05*f06 + s09*f07
- + __m128i v0 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+0));
- + __m128i v2 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+2));
- + const __m128i v4 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+4));
- + const __m128i v6 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+6));
- + const __m128i vf = _mm_load_si128(pVfilter);
- + v0 = _mm_unpacklo_epi16(v0, v4); // pair each short with the one 4 shorts later
- + v2 = _mm_unpacklo_epi16(v2, v6);
- + v0 = _mm_madd_epi16(v0, vf);
- + v2 = _mm_madd_epi16(v2, vf);
- + pVsrc++;
- + accu1 = _mm_add_epi32(accu1, v0);
- + pVfilter++;
- + accu2 = _mm_add_epi32(accu2, v2);
- + }
- + // r0: accu1 - s00*f00 + s04*f01 + s02*f04 + s06*f05
- + // r1: s01*f02 + s05*f03 + s03*f06 + s07*f07
- + // r2: accu2 - s02*f00 + s06*f01 + s04*f04 + s08*f05
- + // r3: s03*f02 + s07*f03 + s05*f06 + s09*f07
- + const __m128i v1 = _mm_srli_si128(accu1, 8);
- + const __m128i v2 = _mm_srli_si128(accu2, 8);
- + accu1 = _mm_add_epi32(accu1, v1); // fold high half onto low half
- + accu2 = _mm_add_epi32(accu2, v2);
- + accu1 = _mm_unpacklo_epi64(accu1, accu2); // low halves of accu1 and accu2 side by side
- + accu1 = _mm_srai_epi32(accu1, resultDivFactor); // scale the sums down by resultDivFactor bits
- + accu1 = _mm_packs_epi32(accu1, accu1); // back to 16-bit with signed saturation
- + _mm_storel_epi64((__m128i*)pVdest, accu1); // store the low 4 shorts
- + src += 4;
- + pVdest += 4;
- + }
- + return (numSamples & -2) - length; // numSamples rounded down to even, minus length
- +}
- +#endif // SOUNDTOUCH_ALLOW_SSE2
- --- soundtouch-1.9.2.orig/source/SoundTouch/TDStretch.cpp Sun Sep 20 16:40:59 2015
- +++ soundtouch-1.9.2.SSE2/source/SoundTouch/TDStretch.cpp Fri Feb 12 01:54:14 2016
- @@ -748,6 +748,15 @@
- // Check if MMX/SSE instruction set extensions supported by CPU
- +#ifdef SOUNDTOUCH_ALLOW_SSE2
- + // SSE2 routines available only with integer sample types
- + if (uExtensions & SUPPORT_SSE2)
- + {
- + return ::new TDStretchSSE2;
- + }
- + else
- +#endif // SOUNDTOUCH_ALLOW_SSE2
- +
- #ifdef SOUNDTOUCH_ALLOW_MMX
- // MMX routines available only with integer sample types
- if (uExtensions & SUPPORT_MMX)
- --- soundtouch-1.9.2.orig/source/SoundTouch/TDStretch.h Sun Sep 20 16:40:59 2015
- +++ soundtouch-1.9.2.SSE2/source/SoundTouch/TDStretch.h Fri Feb 12 12:14:54 2016
- @@ -277,5 +277,18 @@
- #endif /// SOUNDTOUCH_ALLOW_SSE
- +
- +#ifdef SOUNDTOUCH_ALLOW_SSE2
- + /// TDStretch subclass that replaces the cross-correlation and stereo overlap routines with SSE2 versions for the 16bit integer sample type.
- + class TDStretchSSE2 : public TDStretch
- + {
- + protected:
- + double calcCrossCorr(const short *mixingPos, const short *compare, double &norm); ///< SSE2 cross-correlation; presumably overrides a TDStretch virtual - confirm
- + double calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm); ///< SSE2 cross-correlation that updates 'norm' incrementally
- + virtual void overlapStereo(short *output, const short *input); ///< SSE2 stereo overlap (cross-fade) routine
- + };
- +
- +#endif /// SOUNDTOUCH_ALLOW_SSE2
- +
- }
- #endif /// TDStretch_H
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement