SoundTouch-1.9.2_SSE2.patch

a guest
Feb 11th, 2016
243
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Diff 15.95 KB | None | 0 0
--- soundtouch-1.9.2.orig/include/STTypes.h Sun Sep 20 16:40:59 2015
+++ soundtouch-1.9.2.SSE2/include/STTypes.h Fri Feb 12 01:54:14 2016
@@ -98,8 +98,8 @@
         ///   However, if you still prefer to select the sample format here
         ///   also in GNU environment, then please #undef the INTEGER_SAMPLE
         ///   and FLOAT_SAMPLE defines first as in comments above.
-        //#define SOUNDTOUCH_INTEGER_SAMPLES     1    //< 16bit integer samples
-        #define SOUNDTOUCH_FLOAT_SAMPLES       1    //< 32bit float samples
+        #define SOUNDTOUCH_INTEGER_SAMPLES     1    //< 16bit integer samples
+        //#define SOUNDTOUCH_FLOAT_SAMPLES       1    //< 32bit float samples
     
     #endif
 
@@ -143,8 +143,12 @@
         #endif // SOUNDTOUCH_FLOAT_SAMPLES
 
         #ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
-            // Allow MMX optimizations
-            #define SOUNDTOUCH_ALLOW_MMX   1
+            // Allow SSE2 optimizations
+            #define SOUNDTOUCH_ALLOW_SSE2      1
+            #ifndef _M_X64
+                // Allow MMX optimizations
+                #define SOUNDTOUCH_ALLOW_MMX   1
+            #endif
         #endif
 
     #else
--- soundtouch-1.9.2.orig/source/SoundTouch/FIRFilter.cpp   Sun Sep 20 16:40:59 2015
+++ soundtouch-1.9.2.SSE2/source/SoundTouch/FIRFilter.cpp   Fri Feb 12 01:54:14 2016
@@ -303,6 +303,15 @@
 
     // Check if MMX/SSE instruction set extensions supported by CPU
 
+#ifdef SOUNDTOUCH_ALLOW_SSE2
+    // SSE2 routines available only with integer sample types
+    if (uExtensions & SUPPORT_SSE2)
+    {
+        return ::new FIRFilterSSE2;
+    }
+    else
+#endif // SOUNDTOUCH_ALLOW_SSE2
+
 #ifdef SOUNDTOUCH_ALLOW_MMX
     // MMX routines available only with integer sample types
     if (uExtensions & SUPPORT_MMX)
--- soundtouch-1.9.2.orig/source/SoundTouch/FIRFilter.h Sun Sep 20 16:40:59 2015
+++ soundtouch-1.9.2.SSE2/source/SoundTouch/FIRFilter.h Fri Feb 12 01:54:14 2016
@@ -141,6 +141,25 @@
 
 #endif // SOUNDTOUCH_ALLOW_SSE
 
+
+#ifdef SOUNDTOUCH_ALLOW_SSE2
+    /// Class that implements SSE2 optimized functions exclusive for 16bit integer samples type.
+    class FIRFilterSSE2 : public FIRFilter
+    {
+    protected:
+        short *filterCoeffsUnalign;
+        short *filterCoeffsAlign;
+
+        virtual uint evaluateFilterStereo(short *dest, const short *src, uint numSamples) const;
+    public:
+        FIRFilterSSE2();
+        ~FIRFilterSSE2();
+
+        virtual void setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor);
+    };
+
+#endif // SOUNDTOUCH_ALLOW_SSE2
+
 }
 
 #endif  // FIRFilter_H
--- soundtouch-1.9.2.orig/source/SoundTouch/sse_optimized.cpp   Sun Sep 20 16:40:59 2015
+++ soundtouch-1.9.2.SSE2/source/SoundTouch/sse_optimized.cpp   Fri Feb 12 11:54:13 2016
@@ -370,3 +370,282 @@
 }
 
 #endif  // SOUNDTOUCH_ALLOW_SSE
+
+#ifdef SOUNDTOUCH_ALLOW_SSE2
+
+// SSE2 routines available only with integer sample type
+// Also refer to MMX optimized routines.
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// implementation of SSE2 optimized functions of class 'TDStretchSSE2'
+//
+//////////////////////////////////////////////////////////////////////////////
+
+#include "TDStretch.h"
+#include <emmintrin.h>
+#include <math.h>
+
+// Calculates cross correlation of two buffers
+double TDStretchSSE2::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm)
+{
+    const   __m128i *pVec1      = (__m128i*)pV1;    // not 16byte aligned
+    const   __m128i *pVec2      = (__m128i*)pV2;    // 16byte aligned
+    const   __m128i shifter     = _mm_cvtsi32_si128(overlapDividerBitsNorm);
+            __m128i accu        = _mm_setzero_si128();
+            __m128i normaccu    = _mm_setzero_si128();
+            __m128i v;                              // for temporary
+
+    // Process 16 short values (8 stereo samples or 16 mono samples) during
+    // each round for improved CPU-level parallelization.
+
+    for (int i = channels*overlapLength/16 ; i ; i--)
+    {
+        // Applies shifter immediately after product-sum to prevent overflow
+        __m128i n0 = _mm_loadu_si128(pVec1);
+        __m128i n1 = _mm_loadu_si128(pVec1+1);
+        __m128i a0 = _mm_madd_epi16(n0, *pVec2++); // a0 = pVec1[0] * pVec2[0]
+                n0 = _mm_madd_epi16(n0, n0);       // n0 = pVec1[0]^2
+        __m128i a1 = _mm_madd_epi16(n1, *pVec2++); // a1 = pVec1[1] * pVec2[1]
+                n1 = _mm_madd_epi16(n1, n1);       // n1 = pVec1[1]^2
+                a0 = _mm_sra_epi32(a0, shifter);   // right arithmetic shift
+                n0 = _mm_sra_epi32(n0, shifter);
+                a1 = _mm_sra_epi32(a1, shifter);
+                n1 = _mm_sra_epi32(n1, shifter);
+        accu     = _mm_add_epi32(accu, a0);        // add to accumulator
+        normaccu = _mm_add_epi32(normaccu, n0);
+        accu     = _mm_add_epi32(accu, a1);
+        normaccu = _mm_add_epi32(normaccu, n1);
+        pVec1 += 2;
+    }
+    // sum total
+    v    = _mm_srli_si128(accu, 4);
+    accu = _mm_add_epi32(v, accu);
+    v    = _mm_srli_si128(accu, 8);
+    accu = _mm_add_epi32(v, accu);
+    v        = _mm_srli_si128(normaccu, 4);
+    normaccu = _mm_add_epi32(v, normaccu);
+    v        = _mm_srli_si128(normaccu, 8);
+    normaccu = _mm_add_epi32(v, normaccu);
+
+    __m128d Vcorr  = _mm_cvtepi32_pd(accu);        // int32 to double
+    __m128d Vdnorm = _mm_cvtepi32_pd(normaccu);
+    _mm_store_sd(&dnorm, Vdnorm);                  // feedback to dnorm
+
+    if (_mm_cvtsi128_si32(normaccu) > 0) {
+        Vdnorm = _mm_sqrt_sd(Vdnorm, Vdnorm);
+        Vcorr = _mm_div_sd(Vcorr, Vdnorm);
+    }
+    return _mm_cvtsd_f64(Vcorr);
+}
+
+
+/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
+double TDStretchSSE2::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm)
+{
+    const   __m128i *pVec1     = (__m128i*)pV1;    // (unaligned)
+    const   __m128i *pVec1prev = pVec1;            // for previous round normalizer
+    const   __m128i *pVec2     = (__m128i*)pV2;    // (aligned)
+    const   __m128i shifter    = _mm_cvtsi32_si128(overlapDividerBitsNorm);
+            __m128i accu       = _mm_setzero_si128();
+            __m128i norm       = _mm_setzero_si128();
+            __m128i v;                              // for temporary
+            __m128d vd;                             // for temporary
+
+    // Process 16 short values (8 stereo samples or 16 mono samples) during
+    // each round for improved CPU-level parallelization.
+    for (int i = channels * overlapLength / 16 ; i ; i--)
+    {
+        // Applies shifter immediately after product-sum to prevent overflow
+        const __m128i vec1[] = {
+            _mm_loadu_si128(pVec1),
+            _mm_loadu_si128(pVec1+1)
+        };
+        __m128i v1 = _mm_madd_epi16(vec1[0], pVec2[0]);
+                v1 = _mm_sra_epi32(v1, shifter);
+        __m128i v2 = _mm_madd_epi16(vec1[1], pVec2[1]);
+                v2 = _mm_sra_epi32(v2, shifter);
+        pVec1 += 2;
+        accu = _mm_add_epi32(accu, v1);
+        pVec2 += 2;
+        accu = _mm_add_epi32(accu, v2);
+    }
+    v    = _mm_srli_si128(accu, 8);
+    accu = _mm_add_epi32(v, accu);
+    v    = _mm_srli_si128(accu, 4);
+    accu = _mm_add_epi32(v, accu); // accu.m128i_i32[0] is sum total
+
+    // update normalizer with last samples of this round, and previous round
+    for (int ch = channels; ch > 0; ch -= sizeof(*pVec1)/sizeof(*pV1)) {
+        const __m128i vth = _mm_set_epi16(0,1,2,3,4,5,6,7);
+        const __m128i vch = _mm_set1_epi16(ch);
+        const __m128i vMask = _mm_cmpgt_epi16(vch, vth);
+        __m128i vThis = _mm_loadu_si128(--pVec1);
+        __m128i vPrev = _mm_loadu_si128(--pVec1prev);
+        vThis = _mm_and_si128(vThis, vMask); // this round
+        vPrev = _mm_and_si128(vPrev, vMask); // previous round
+
+        vThis = _mm_madd_epi16(vThis, vThis);
+        vThis = _mm_sra_epi32(vThis, shifter);
+        vPrev = _mm_madd_epi16(vPrev, vPrev);
+        vPrev = _mm_sra_epi32(vPrev, shifter);
+        norm  = _mm_add_epi32(norm, vThis);
+        norm  = _mm_sub_epi32(norm, vPrev);
+    }
+    v    = _mm_srli_si128(norm, 8);
+    norm = _mm_add_epi32(norm, v);
+    v    = _mm_srli_si128(norm, 4);
+    norm = _mm_add_epi32(norm, v); // norm.m128i_i32[0] is sum total
+
+    __m128d Vcorr  = _mm_cvtepi32_pd(accu);
+    __m128d Vdnorm = _mm_cvtepi32_pd(norm);
+    vd = _mm_load_sd(&dnorm);
+    Vdnorm = _mm_add_sd(vd, Vdnorm);
+    _mm_store_sd(&dnorm, Vdnorm);  // feedback to dnorm
+
+    const __m128d dmin = _mm_set_sd(1e-9);
+    if (_mm_comige_sd(Vdnorm, dmin)) {
+        Vdnorm = _mm_sqrt_sd(Vdnorm, Vdnorm);
+        Vcorr = _mm_div_sd(Vcorr, Vdnorm);
+    }
+    return _mm_cvtsd_f64(Vcorr);
+}
+
+
+// SSE2-optimized version of the function overlapStereo
+void TDStretchSSE2::overlapStereo(short *output, const short *input)
+{
+    const   __m128i *pVinput  = (__m128i*)input;                // (unaligned)
+    const   __m128i *pVMidBuf = (__m128i*)pMidBuffer;           // (aligned)
+    const   __m128i shifter   = _mm_cvtsi32_si128(overlapDividerBitsPure + 1);
+            // note: Since _mm_set_epi16() is slow on Pentium 4, _mm_set_epi32() is used instead.
+            __m128i adder     = _mm_set1_epi32(0x2fffe);        // [ 2, -2, 2, -2, 2, -2, 2, -2 ]
+            __m128i mix1      = _mm_set_epi32(
+                0x10000 | (unsigned short)(overlapLength-1),    // (short)[ 1, overlapLength-1,
+                0x10000 | (unsigned short)(overlapLength-1),    //          1, overlapLength-1,
+                          (unsigned short)overlapLength,        //          0, overlapLength,
+                          (unsigned short)overlapLength);       //          0, overlapLength ]
+            __m128i mix2      = _mm_add_epi16(mix1, adder);
+            __m128i *pVdest   = (__m128i*)output;               // (unaligned)
+    adder = _mm_add_epi16(adder, adder);
+
+    for (int i = overlapLength / 4 ; i ; i--)
+    {
+        const __m128i vi = _mm_loadu_si128(pVinput);
+        const __m128i vm = _mm_load_si128(pVMidBuf);
+        __m128i v1 = _mm_unpacklo_epi16(vm, vi);
+        __m128i v2 = _mm_unpackhi_epi16(vm, vi);
+        v1 = _mm_madd_epi16(v1, mix1);
+        v2 = _mm_madd_epi16(v2, mix2);
+        v1 = _mm_sra_epi32(v1, shifter);
+        v2 = _mm_sra_epi32(v2, shifter);
+        v1 = _mm_packs_epi32(v1, v2);
+        _mm_storeu_si128(pVdest, v1);
+
+        mix1 = _mm_add_epi16(mix1, adder);
+        mix2 = _mm_add_epi16(mix2, adder);
+        pVMidBuf++;
+        pVinput++;
+        pVdest++;
+    }
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// implementation of SSE2 optimized functions of class 'FIRFilter'
+//
+//////////////////////////////////////////////////////////////////////////////
+
+#include "FIRFilter.h"
+
+FIRFilterSSE2::FIRFilterSSE2() : FIRFilter()
+{
+    filterCoeffsAlign = NULL;
+    filterCoeffsUnalign = NULL;
+}
+
+
+FIRFilterSSE2::~FIRFilterSSE2()
+{
+    delete[] filterCoeffsUnalign;
+    filterCoeffsAlign = NULL;
+    filterCoeffsUnalign = NULL;
+}
+
+
+// (overloaded) Calculates filter coefficients for SSE2 routine
+void FIRFilterSSE2::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
+{
+    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
+
+    // Ensure that filter coeffs array is aligned to 16-byte boundary
+    delete[] filterCoeffsUnalign;
+    filterCoeffsUnalign = new short[2 * newLength + 8];
+    filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
+    __m128i *VfilterCoeffsAlign = (__m128i*)filterCoeffsAlign;
+
+    // rearrange the filter coefficients for SSE2 routines
+    for (uint i = 0; i < length; i += 4)
+    {
+        __m128i v = _mm_loadl_epi64((__m128i*)(coeffs + i)); // 3, 2, 1, 0
+        v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 1, 2, 0)); // 3, 1, 2, 0
+        v = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 0, 0));   // 3, 1, 3, 1, 2, 0, 2, 0
+        _mm_store_si128(VfilterCoeffsAlign++, v);
+    }
+}
+
+
+// sse2-optimized version of the filter routine for stereo sound
+uint FIRFilterSSE2::evaluateFilterStereo(short *dest, const short *src, uint numSamples) const
+{
+    if (length < 2) return 0;
+
+    short *pVdest = dest;
+
+    for (uint i = (numSamples - length) >> 1 ; i ; i--)
+    {
+        const   __m128i *pVsrc    = (__m128i*)src;
+        const   __m128i *pVfilter = (__m128i*)filterCoeffsAlign; //16byte aligned
+                __m128i accu1     = _mm_setzero_si128();
+                __m128i accu2     = _mm_setzero_si128();
+
+        for (uint j = lengthDiv8 * 2; j ; j--)
+        {
+            //           accu1                accu2
+            // r0: s00*f00 + s04*f01    s02*f00 + s06*f01
+            // r1: s01*f02 + s05*f03    s03*f02 + s07*f03
+            // r2: s02*f04 + s06*f05    s04*f04 + s08*f05
+            // r3: s03*f06 + s07*f07    s05*f06 + s09*f07
+                  __m128i v0 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+0));
+                  __m128i v2 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+2));
+            const __m128i v4 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+4));
+            const __m128i v6 = _mm_loadl_epi64((__m128i*)((short*)pVsrc+6));
+            const __m128i vf = _mm_load_si128(pVfilter);
+            v0 = _mm_unpacklo_epi16(v0, v4);
+            v2 = _mm_unpacklo_epi16(v2, v6);
+            v0 = _mm_madd_epi16(v0, vf);
+            v2 = _mm_madd_epi16(v2, vf);
+            pVsrc++;
+            accu1 = _mm_add_epi32(accu1, v0);
+            pVfilter++;
+            accu2 = _mm_add_epi32(accu2, v2);
+        }
+        // r0: accu1 - s00*f00 + s04*f01 + s02*f04 + s06*f05
+        // r1:         s01*f02 + s05*f03 + s03*f06 + s07*f07
+        // r2: accu2 - s02*f00 + s06*f01 + s04*f04 + s08*f05
+        // r3:         s03*f02 + s07*f03 + s05*f06 + s09*f07
+        const __m128i v1 = _mm_srli_si128(accu1, 8);
+        const __m128i v2 = _mm_srli_si128(accu2, 8);
+        accu1 = _mm_add_epi32(accu1, v1);
+        accu2 = _mm_add_epi32(accu2, v2);
+        accu1 = _mm_unpacklo_epi64(accu1, accu2);
+        accu1 = _mm_srai_epi32(accu1, resultDivFactor);
+        accu1 = _mm_packs_epi32(accu1, accu1);
+        _mm_storel_epi64((__m128i*)pVdest, accu1);
+        src += 4;
+        pVdest += 4;
+    }
+    return (numSamples & -2) - length;
+}
+#endif  // SOUNDTOUCH_ALLOW_SSE2
--- soundtouch-1.9.2.orig/source/SoundTouch/TDStretch.cpp   Sun Sep 20 16:40:59 2015
+++ soundtouch-1.9.2.SSE2/source/SoundTouch/TDStretch.cpp   Fri Feb 12 01:54:14 2016
@@ -748,6 +748,15 @@
 
     // Check if MMX/SSE instruction set extensions supported by CPU
 
+#ifdef SOUNDTOUCH_ALLOW_SSE2
+    // SSE2 routines available only with integer sample types
+    if (uExtensions & SUPPORT_SSE2)
+    {
+        return ::new TDStretchSSE2;
+    }
+    else
+#endif // SOUNDTOUCH_ALLOW_SSE2
+
 #ifdef SOUNDTOUCH_ALLOW_MMX
     // MMX routines available only with integer sample types
    if (uExtensions & SUPPORT_MMX)
--- soundtouch-1.9.2.orig/source/SoundTouch/TDStretch.h Sun Sep 20 16:40:59 2015
+++ soundtouch-1.9.2.SSE2/source/SoundTouch/TDStretch.h Fri Feb 12 12:14:54 2016
@@ -277,5 +277,18 @@
 
 #endif /// SOUNDTOUCH_ALLOW_SSE
 
+
+#ifdef SOUNDTOUCH_ALLOW_SSE2
+    /// Class that implements SSE2 optimized routines for 16bit integer samples type.
+    class TDStretchSSE2 : public TDStretch
+    {
+    protected:
+        double calcCrossCorr(const short *mixingPos, const short *compare, double &norm);
+        double calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm);
+        virtual void overlapStereo(short *output, const short *input);
+    };
+
+#endif /// SOUNDTOUCH_ALLOW_SSE2
+
 }
 #endif  /// TDStretch_H
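
For reference only (not part of the patch above): a plain scalar sketch of the cross correlation that the SSE2 calcCrossCorr routine computes. The free-standing name calcCrossCorrScalar and its explicit parameters are illustrative assumptions; in SoundTouch itself, channels, overlapLength and overlapDividerBitsNorm are TDStretch member variables. The vector code right-shifts each pairwise product-sum before accumulating, so low-order rounding may differ slightly from this sketch.

    // Scalar reference for the SSE2 calcCrossCorr -- a minimal sketch, assuming
    // 16-bit integer samples and the same overlapDividerBitsNorm scaling.
    #include <math.h>

    double calcCrossCorrScalar(const short *pV1, const short *pV2, double &dnorm,
                               int channels, int overlapLength,
                               int overlapDividerBitsNorm)
    {
        long long corr = 0;
        long long norm = 0;
        for (int i = 0; i < channels * overlapLength; i += 2)
        {
            // Accumulate scaled-down product-sums two samples at a time,
            // mirroring the _mm_madd_epi16 / _mm_sra_epi32 pairing.
            corr += ((long long)pV1[i] * pV2[i] + (long long)pV1[i + 1] * pV2[i + 1])
                    >> overlapDividerBitsNorm;
            norm += ((long long)pV1[i] * pV1[i] + (long long)pV1[i + 1] * pV1[i + 1])
                    >> overlapDividerBitsNorm;
        }
        dnorm = (double)norm;                      // feedback to dnorm, as in the SSE2 version
        if (norm > 0) return (double)corr / sqrt((double)norm);
        return (double)corr;
    }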