Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- diff --git a/lib/rbcodec/codecs/libopus/celt/mdct.c b/lib/rbcodec/codecs/libopus/celt/mdct.c
- old mode 100644
- new mode 100755
- index 72ea180..974f2c1
- --- a/lib/rbcodec/codecs/libopus/celt/mdct.c
- +++ b/lib/rbcodec/codecs/libopus/celt/mdct.c
- @@ -210,6 +210,194 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
- }
- #endif
- +#if 1
- +/*
- + * Pre-rotation for the inverse MDCT, hard-wired to 480 iterations, using the
- + * ARMv5TE smulwb/smlawb multiplies.  It mirrors the C fallback in this file:
- + *
- + *    yr = -S_MUL(*xp2, t2[-i]) + S_MUL(*xp1, t[i]);
- + *    yi = -S_MUL(*xp2, t[i])   - S_MUL(*xp1, t2[-i]);
- + *    *yp++ = yr - S_MUL(yi,13);
- + *    *yp++ = yi + S_MUL(yr,13);
- + *
- + * out: receives 960 values (480 pairs), written sequentially.
- + * t:   twiddle table; read at t[480]..t[1] and, through t+480, at t[0]..t[479].
- + * in:  input; read forwards from in[0] and backwards from in[959], two
- + *      elements apart per iteration.
- + *
- + * The asm steps the input pointers by 8 bytes and indexes the twiddles in
- + * 2-byte units, i.e. it assumes 4-byte kiss_fft_scalar and 2-byte
- + * kiss_twiddle_scalar.
- + *
- + * NOTE(review): shift and stride are not parameters, so this only matches the
- + * generic loop for shift == 0, stride == 1 and 480 iterations.  The call site
- + * in this patch dispatches here whenever !(shift > 1), which includes
- + * shift == 1 - confirm.  It also calls pre_rotate_960_armv4(), which is not
- + * defined in the lines shown while this #if 1 branch is active.
- + * NOTE(review): the twiddle products are followed by "lsl #1" but the *13
- + * products are not; confirm this matches S_MUL's scaling.
- + * NOTE(review): the word loads from the twiddle table land on addresses that
- + * are not always multiples of 4 and depend on the core delivering the
- + * addressed half word in the low 16 bits - confirm for the targeted CPUs.
- + */
- +void pre_rotate_960_armv5(kiss_fft_scalar * out, const kiss_twiddle_scalar *t, kiss_fft_scalar *in){
- +
- +    /* Loop counter and, at the same time, byte offset into the twiddle table:
- +       deliberately not halved so it can be used directly as a half-word
- +       offset.  960 down to 2 in steps of 2 gives 480 iterations. */
- +    int N=960;
- +    kiss_fft_scalar *in2 = in+(960-1);
- +
- +    asm volatile (
- +
- +    "0:"
- +
- +    "ldr r4, [%[t1], %[n]]; "       //word at t + n bytes; low half is used as t[n/2], we count down
- +    "ldr r6, [%[xp1]]; "            //r6 = *xp1
- +    "ldr r5, [%[t2], -%[n]]; "      //word at (t+480) - n bytes; low half is used as t2[-n/2], counts up
- +    "ldr r9, [%[xp2]]; "            //r9 = *xp2
- +    "add %[xp1], %[xp1], #8;"       //xp1 += 2 elements
- +
- +    //compute yr
- +    "smulwb r7, r6, r4;"            //r7 = (*xp1 * t[i]) >> 16
- +
- +    "smulwb r10, r9, r5;"           //r10 = (*xp2 * t2[-i]) >> 16
- +    "sub %[xp2], %[xp2], #8;"       //xp2 -= 2 elements while we wait for the multiply to finish
- +
- +    "sub r8, r7, r10;"              //r7, r10 free
- +    "mov r8, r8, lsl #1;"           //r8 = yr
- +
- +
- +    //compute yi
- +    "smulwb r10, r6, r5;"           //r10 = (*xp1 * t2[-i]) >> 16; r6, r5 free
- +    "smlawb r5, r9, r4, r10;"       //r5 = ((*xp2 * t[i]) >> 16) + r10; r4, r9 free
- +    "mov r6, #13;"                  //sneak a constant load into this pipeline bubble
- +
- +    "rsb r7, r5, #0;"               //negate the sum ...
- +    "mov r7, r7, lsl #1;"           //... then shift: r7 = yi
- +
- +
- +    "smulwb r10, r7, r6;"           //r10 = (yi * 13) >> 16
- +    "smlawb r9, r8, r6, r7;"        //r9 = ((yr * 13) >> 16) + yi
- +
- +    "sub r5, r8, r10;"              //r5 = yr - r10
- +
- +    //*yp++ =...
- +    "stmia %[out]!, {r5, r9};"      //store the pair and advance out by two words
- +
- +    "subs %[n], %[n], #2;"          //decrement by 1 half word
- +    "bne 0b;"
- +
- +
- +    : [out] "+r" (out), [n] "+r" (N),[xp1] "+r" (in), [xp2] "+r" (in2)  //6 registers used
- +    : [t1] "r" (t), [t2] "r" (t+480)
- +    : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "memory", "cc"); //can use 7 more before we'll run out
- +
- +}
- +#elif 0
- +/*
- + * Pre-rotation for the inverse MDCT, hard-wired to 480 iterations, restricted
- + * to ARMv4 instructions (smull on twiddles pre-shifted into the top half word).
- + * Intended to follow the same formula as the C fallback in this file:
- + *
- + *    yr = -S_MUL(*xp2, t2[-i]) + S_MUL(*xp1, t[i]);
- + *    yi = -S_MUL(*xp2, t[i])   - S_MUL(*xp1, t2[-i]);
- + *    *yp++ = yr - S_MUL(yi,13);
- + *    *yp++ = yi + S_MUL(yr,13);
- + *
- + * out: receives 960 values (480 pairs), written sequentially.
- + * t:   twiddle table; read at t[480]..t[1] and, through t+480, at t[0]..t[479].
- + * in:  input; read forwards from in[0] and backwards from in[959], two
- + *      elements apart per iteration.
- + *
- + * The asm steps the input pointers by 8 bytes and indexes the twiddles in
- + * 2-byte units, i.e. it assumes 4-byte kiss_fft_scalar and 2-byte
- + * kiss_twiddle_scalar.
- + *
- + * NOTE(review): the two "smull ..., r6" instructions multiply by the plain
- + * constant 13 and keep only the high 32 bits of the 64-bit product, unlike the
- + * twiddle multiplies whose 16-bit factor is first shifted left by 16.  Confirm
- + * this yields S_MUL(x,13) as the comments claim.
- + * NOTE(review): "ldr r4" is a word load at offsets that are not always
- + * multiples of 4; it depends on the core delivering the addressed half word
- + * in the low 16 bits - confirm for the targeted CPUs.
- + * NOTE(review): shift and stride are not parameters; the call site in this
- + * patch dispatches here whenever !(shift > 1) - confirm shift == 1 is intended.
- + */
- +void pre_rotate_960_armv4(kiss_fft_scalar * out, const kiss_twiddle_scalar *t, kiss_fft_scalar *in){
- +
- +    /* Loop counter and, at the same time, byte offset into the twiddle table:
- +       deliberately not halved so it can be used directly as a half-word
- +       offset.  960 down to 2 in steps of 2 gives 480 iterations. */
- +    int N=960;
- +    kiss_fft_scalar *in2 = in+(960-1);
- +
- +    asm volatile (
- +
- +    "0:"
- +    //we can't use ldm for either t or xp variables
- +    "ldr r4, [%[t1], %[n]]; "       //word at t + n bytes; low half is used as t[n/2]
- +    "ldr r6, [%[xp1]]; "            //r6 = *xp1; n is in units of half words
- +    "ldrh r5, [%[t2], -%[n]]; "     //half word at (t+480) - n bytes, i.e. t2[-n/2]
- +    "ldr r9, [%[xp2]]; "            //r9 = *xp2
- +    "add %[xp1], %[xp1], #8;"       //xp1 += 2 elements
- +    "mov r4, r4, lsl #16;"          //move both twiddles into the top half word
- +    "mov r5, r5, lsl #16;"
- +
- +    //compute yr
- +    "smull r8, r7, r6, r4;"         //r7 = high word of *xp1 * t[i]; low word (r8) isn't used
- +    "sub %[xp2], %[xp2], #8;"       //xp2 -= 2 elements while we wait for the multiply to finish
- +
- +    "smull r8, r10, r9, r5;"        //r10 = high word of *xp2 * t2[-i]; 16 bit val last for early term
- +
- +
- +    "sub r8, r7, r10;"              //r7, r10 free
- +    "mov r8, r8, lsl #1;"           //r8 = yr
- +
- +
- +    //compute yi
- +    "smull r7, r10, r6, r5;"        //r10 = high word of *xp1 * t2[-i]; r6, r5 free
- +    "smull r7, r5, r9, r4;"         //r5 = high word of *xp2 * t[i]; r4, r9 free
- +    "mov r6, #13;"                  //sneak a constant load into this pipeline bubble
- +
- +    "add r7, r5, r10;"              //add both then make negative and finally shift, probably not optimal
- +    "rsb r7, r7, #0;"
- +    "mov r7, r7, lsl #1;"           //r7 = yi
- +
- +
- +    "smull r5, r10, r7, r6;"        //r10 = high word of yi * 13 (meant as S_MUL(yi,13), see NOTE)
- +    "smull r5, r4, r8, r6;"         //r4 = high word of yr * 13 (meant as S_MUL(yr,13), see NOTE)
- +
- +    "sub r5, r8, r10;"              //r5 = yr - r10
- +    "add r9, r7, r4;"               //r9 = yi + r4
- +
- +    //*yp++ =...
- +    "stmia %[out]!, {r5, r9};"      //store the pair and advance out by two words
- +
- +    "subs %[n], %[n], #2;"          //decrement by 1 half word
- +    "bne 0b;"
- +
- +
- +    : [out] "+r" (out), [n] "+r" (N),[xp1] "+r" (in), [xp2] "+r" (in2)  //6 registers used
- +    : [t1] "r" (t), [t2] "r" (t+480)
- +    : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "memory", "cc"); //can use 7 more before we'll run out
- +
- +}
- +#else
- +
- +
- + /*
- +  * Portable C version of the inverse-MDCT pre-rotation, specialised from the
- +  * generic loop in clt_mdct_backward() for 480 iterations with no shift and a
- +  * stride of 1 (the debug trace this was derived from reported
- +  * "N: 1920 overlap: 120 shift: 0 stride: 1").
- +  *
- +  * out: receives 960 values (480 yr/yi pairs), written sequentially.
- +  * t:   twiddle table; t[1]..t[480] and, through t2 = t+480, t[0]..t[479]
- +  *      are read.
- +  * in:  input; read forwards from in[0] and backwards from in[959], two
- +  *      elements apart per iteration.
- +  *
- +  * The loop counts down so that t[i] walks from t[480] to t[1] while t2[-i]
- +  * walks from t[0] to t[479]; this is the generic loop's t[N4-i] / t[i] pair
- +  * with N4 == 480.
- +  */
- + void pre_rotate_960_armv4(kiss_fft_scalar * out, const kiss_twiddle_scalar *t, kiss_fft_scalar *in){
- +    int i;
- +    /* Temp pointers to make it really clear to the compiler what we're doing */
- +    const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
- +    const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+(960-1);
- +    const kiss_twiddle_scalar *t2 = t+480;
- +    kiss_fft_scalar * OPUS_RESTRICT yp = out;
- +
- +    for(i=480;i>0;i--)
- +    {
- +       kiss_fft_scalar yr, yi;
- +       yr = -S_MUL(*xp2, t2[-i]) + S_MUL(*xp1,t[i]);
- +       yi = -S_MUL(*xp2, t[i]) - S_MUL(*xp1,t2[-i]);
- +       /* works because the cos is nearly one */
- +       *yp++ = yr - S_MUL(yi,13); /* sine==13 for N=1920 */
- +       *yp++ = yi + S_MUL(yr,13);
- +       xp1+=2;
- +       xp2-=2;
- +    }
- + }
- +#endif
- void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
- const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)
- {
- @@ -231,25 +419,27 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
- sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
- #endif
- - /* Pre-rotate */
- - {
- - /* Temp pointers to make it really clear to the compiler what we're doing */
- - const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
- - const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
- - kiss_fft_scalar * OPUS_RESTRICT yp = f2;
- - const kiss_twiddle_scalar *t = &l->trig[0];
- - for(i=0;i<N4;i++)
- - {
- - kiss_fft_scalar yr, yi;
- - yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
- - yi = -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
- - /* works because the cos is nearly one */
- - *yp++ = yr - S_MUL(yi,sine);
- - *yp++ = yi + S_MUL(yr,sine);
- - xp1+=2*stride;
- - xp2-=2*stride;
- - }
- - }
- + if(shift > 1)
- + {
- + /* Temp pointers to make it really clear to the compiler what we're doing */
- + const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
- + const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
- + kiss_fft_scalar * OPUS_RESTRICT yp = f2;
- + const kiss_twiddle_scalar *t = &l->trig[0];
- + for(i=0;i<N4;i++)
- + {
- + kiss_fft_scalar yr, yi;
- + yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
- + yi = -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
- + /* works because the cos is nearly one */
- + *yp++ = yr - S_MUL(yi,sine);
- + *yp++ = yi + S_MUL(yr,sine);
- +
- + xp1+=2*stride;
- + xp2-=2*stride;
- + }
- + }else
- + pre_rotate_960_armv4(f2, &l->trig[0], in);
- /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */
- opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1)));
- diff --git a/lib/rbcodec/codecs/libopus/config.h b/lib/rbcodec/codecs/libopus/config.h
- old mode 100644
- new mode 100755
- index 1ce75ff..5ac2f8f
- --- a/lib/rbcodec/codecs/libopus/config.h
- +++ b/lib/rbcodec/codecs/libopus/config.h
- @@ -43,7 +43,7 @@
- #if ARM_ARCH == 4
- #define OPUS_ARM_INLINE_ASM
- #elif ARM_ARCH > 4
- -#define OPUS_ARM_INLINE_EDSP
- +#define OPUS_ARM_INLINE_ASM
- #endif
- #endif
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement