Untitled

diff --git a/lib/rbcodec/codecs/libopus/celt/mdct.c b/lib/rbcodec/codecs/libopus/celt/mdct.c
old mode 100644
new mode 100755
index 72ea180..974f2c1
--- a/lib/rbcodec/codecs/libopus/celt/mdct.c
+++ b/lib/rbcodec/codecs/libopus/celt/mdct.c
@@ -210,6 +210,194 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
 }
 #endif

+#if 1
+/* The inverse-MDCT call site uses the armv4 name for every variant; map it here. */
+#define pre_rotate_960_armv4 pre_rotate_960_armv5
+
+/* Pre-rotation stage of the inverse MDCT, scheduled by hand for ARMv5E.
+ *
+ * Runs 480 iterations. Each one loads a word from the rising pointer (in,
+ * +8 bytes per step) and one from the falling pointer (in+959, -8 bytes per
+ * step), multiplies both by the low half-words loaded n bytes above t and
+ * n bytes below t+480, and stores the resulting yr/yi pair to out.
+ *
+ *  out  receives 2 words per iteration, 960 in total
+ *  t    twiddle table; presumably 16-bit entries (half-word offsets) - confirm
+ *  in   input words; presumably 32-bit entries (ldr/#8 stepping) - confirm
+ */
+void pre_rotate_960_armv5(kiss_fft_scalar * out, const kiss_twiddle_scalar *t, kiss_fft_scalar *in){
+    /* Byte offset into the twiddles and loop counter in one: deliberately not
+       halved, so 960 bytes at 2 per step gives 480 iterations. */
+    int N=960;
+    kiss_fft_scalar *in2 = in+(960-1);
+
+    /* NOTE(review): n steps by 2, so the twiddle word loads alternate between
+       word-aligned and half-word-aligned addresses - confirm the target core
+       returns the addressed half-word in the low 16 bits for the latter. */
+      asm volatile (
+
+         "0:"
+
+         "ldr r4, [%[t1], %[n]]; "     //low half holds the twiddle n bytes above t1 (walks down)
+         "ldr r6, [%[xp1]]; "          //n is a byte offset: 2 per half-word twiddle
+         "ldr r5, [%[t2], -%[n]]; "    //low half holds the twiddle n bytes below t2 (walks up)
+         "ldr r9, [%[xp2]]; "
+         "add %[xp1], %[xp1], #8;"
+
+          //compute yr
+         "smulwb r7, r6, r4;"      //r7 = (*xp1 * low16(r4)) >> 16
+
+         "smulwb r10, r9, r5;"     //r10 = (*xp2 * low16(r5)) >> 16
+         "sub %[xp2], %[xp2], #8;"     //decrement the pointer while we wait for the multiply to finish
+
+         "sub r8, r7, r10;"                //r7, r10 free
+         "mov r8, r8, lsl #1;"             //yr: lsl #1 turns the >>16 products into a >>15 scale
+
+
+         //compute yi
+         "smulwb r10, r6, r5;"     //r6, r5 free
+         "smlawb r5, r9, r4, r10;"     //r4, r9 free
+         "mov r6,  #26;"               //2*13, loaded in a pipeline bubble: smulwb shifts by 16, S_MUL by 15
+
+         "rsb r7, r5, #0;"                 //negate the sum, then shift
+         "mov r7, r7, lsl #1;"             //yi
+
+
+         "smulwb r10, r7, r6;"             //r10=S_MUL(yi,13)
+         "smlawb r9, r8, r6, r7;"          //r9=yi+S_MUL(yr,13)
+
+         "sub r5, r8, r10;"                //r5=yr-S_MUL(yi,13)
+
+         //*yp++ =...
+         "stmia %[out]!, {r5, r9};"
+
+         "subs  %[n], %[n], #2;"       //step one half-word; the loop ends when n reaches 0
+         "bne   0b;"
+
+
+          : [out] "+r" (out), [n] "+r" (N),[xp1] "+r" (in), [xp2] "+r" (in2)  //6 registers used
+          : [t1] "r" (t), [t2] "r" (t+480)
+          : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "memory", "cc");    //can use 7 more before we'll run out
+
+}
+#elif 0
+/* Pre-rotation stage of the inverse MDCT for plain ARMv4 (smull only).
+ *
+ * Runs 480 iterations. Each one loads a word from the rising pointer (in,
+ * +8 bytes per step) and one from the falling pointer (in+959, -8 bytes per
+ * step), multiplies both by twiddles shifted into the top half of a register
+ * (so the high word of smull is the product >> 16), and stores the yr/yi
+ * pair to out.
+ *
+ *  out  receives 2 words per iteration, 960 in total
+ *  t    twiddle table; presumably 16-bit entries (half-word offsets) - confirm
+ *  in   input words; presumably 32-bit entries (ldr/#8 stepping) - confirm
+ */
+void pre_rotate_960_armv4(kiss_fft_scalar * out, const kiss_twiddle_scalar *t, kiss_fft_scalar *in){
+    /* Byte offset into the twiddles and loop counter in one: deliberately not
+       halved, so 960 bytes at 2 per step gives 480 iterations. */
+    int N=960;
+    kiss_fft_scalar *in2 = in+(960-1);
+      asm volatile (
+         "0:"
+         //we can't use ldm for either t or xp variables
+         "ldr r4, [%[t1], %[n]]; "     //only the low half survives the lsl #16 below
+         "ldr r6, [%[xp1]]; "          //n is a byte offset: 2 per half-word twiddle
+         "ldrh r5, [%[t2], -%[n]]; "
+         "ldr r9, [%[xp2]]; "
+         "add %[xp1], %[xp1], #8;"
+         "mov r4, r4, lsl #16;"        //twiddle into the top half: smull's high word becomes product>>16
+         "mov r5, r5, lsl #16;"
+
+          //compute yr
+         "smull r8, r7, r6, r4;"       //the low word (r8) is discarded
+         "sub %[xp2], %[xp2], #8;"     //decrement the pointer while we wait for the multiply to finish
+
+         "smull r8, r10, r9, r5;"      //remember to put 16 bit val in last register for early term
+
+         "sub r8, r7, r10;"                //r7, r10 free
+         "mov r8, r8, lsl #1;"             //yr: lsl #1 turns the >>16 products into a >>15 scale
+
+         //compute yi
+         "smull r7, r10, r6, r5;"      //r6, r5 free
+         "smull r7, r5, r9, r4;"       //r4, r9 free
+         "mov r6,  #0x1A0000;"         //(2*13)<<16 so the high word of smull equals S_MUL(x,13)
+
+         "add r7, r5, r10;"    //add both then make negative and finally shift, probably not optimal
+         "rsb r7, r7, #0;"
+         "mov r7, r7, lsl #1;"
+
+         "smull r5, r10, r7, r6;"              //r10=S_MUL(yi,13)
+         "smull r5, r4, r8, r6;"               //r4=S_MUL(yr,13)
+
+         "sub r5, r8, r10;"
+         "add r9, r7, r4;"
+
+         //*yp++ =...
+         "stmia %[out]!, {r5, r9};"
+
+         "subs  %[n], %[n], #2;"       //step one half-word; the loop ends when n reaches 0
+         "bne   0b;"
+
+          : [out] "+r" (out), [n] "+r" (N),[xp1] "+r" (in), [xp2] "+r" (in2)  //6 registers used
+          : [t1] "r" (t), [t2] "r" (t+480)
+          : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "memory", "cc");    //can use 7 more before we'll run out
+}
+#else
+
+
+/* Portable C form of the inverse-MDCT pre-rotation, hard-coded for the
+ * N=1920 case: 480 pairs, unit stride, unshifted twiddle indices, sine == 13.
+ *
+ * For pair k = 0..479 (the loop index is i = 480 - k):
+ *    yr = -S_MUL(in[959-2k], t[k])     + S_MUL(in[2k], t[480-k])
+ *    yi = -S_MUL(in[959-2k], t[480-k]) - S_MUL(in[2k], t[k])
+ *    out[2k]   = yr - S_MUL(yi, 13)
+ *    out[2k+1] = yi + S_MUL(yr, 13)
+ *
+ *  out  receives 960 values (480 yr/yi pairs)
+ *  t    twiddle table; entries t[0]..t[480] are read
+ *  in   960 input values; even entries are read upwards, odd ones downwards
+ */
+void pre_rotate_960_armv4(kiss_fft_scalar * out, const kiss_twiddle_scalar *t, kiss_fft_scalar *in){
+     int i;
+
+     /* Temp pointers to make it really clear to the compiler what we're doing */
+     const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
+     const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+(960-1);
+     const kiss_twiddle_scalar *t2 = t+480;
+     kiss_fft_scalar * OPUS_RESTRICT yp = out;
+
+     /* i counts down so that t[i] descends while t2[-i] (== t[480-i]) ascends */
+     for(i=480 ;i>0;i--)
+     {
+        kiss_fft_scalar yr, yi;
+        yr = -S_MUL(*xp2, t2[-i]) + S_MUL(*xp1,t[i]);
+        yi =  -S_MUL(*xp2, t[i]) - S_MUL(*xp1,t2[-i]);
+        /* works because the cos is nearly one */
+        *yp++ = yr - S_MUL(yi,13); //sine==13 for N=1920
+        *yp++ = yi + S_MUL(yr,13);
+        xp1+=2;
+        xp2-=2;
+     }
+
+     /* The same computation with an ascending index, kept disabled as a
+        cross-check for the pointer arithmetic above. */
+#if 0
+     for(i=0;i<480;i++)
+     {
+        kiss_fft_scalar yr, yi;
+        yr = -S_MUL(*xp2, t[i]) + S_MUL(*xp1,t[(480-i)]);      //[0 479], [480 1]
+        yi =  -S_MUL(*xp2, t[(480-i)]) - S_MUL(*xp1,t[i]);     //[480, 1], [0 479]
+        /* works because the cos is nearly one */
+        *yp++ = yr - S_MUL(yi,13); //sine==13 for N=1920
+        *yp++ = yi + S_MUL(yr,13);
+        xp1+=2;
+        xp2-=2;
+     }
+#endif
+}
+#endif
 void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
       const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)
 {
@@ -231,25 +419,27 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
    sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
 #endif

-   /* Pre-rotate */
-   {
-      /* Temp pointers to make it really clear to the compiler what we're doing */
-      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
-      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
-      kiss_fft_scalar * OPUS_RESTRICT yp = f2;
-      const kiss_twiddle_scalar *t = &l->trig[0];
-      for(i=0;i<N4;i++)
-      {
-         kiss_fft_scalar yr, yi;
-         yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
-         yi =  -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
-         /* works because the cos is nearly one */
-         *yp++ = yr - S_MUL(yi,sine);
-         *yp++ = yi + S_MUL(yr,sine);
-         xp1+=2*stride;
-         xp2-=2*stride;
-      }
-   }
+   /* Pre-rotate: the 960-point routine only handles shift 0, stride 1, N4 == 480 */
+   if(shift != 0 || stride != 1 || N4 != 480)
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
+      kiss_fft_scalar * OPUS_RESTRICT yp = f2;
+      const kiss_twiddle_scalar *t = &l->trig[0];
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_scalar yr, yi;
+         yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
+         yi =  -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
+         /* works because the cos is nearly one */
+         *yp++ = yr - S_MUL(yi,sine);
+         *yp++ = yi + S_MUL(yr,sine);
+         xp1+=2*stride;
+         xp2-=2*stride;
+      }
+   } else
+      pre_rotate_960_armv4(f2, &l->trig[0], in);

    /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */
    opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1)));
diff --git a/lib/rbcodec/codecs/libopus/config.h b/lib/rbcodec/codecs/libopus/config.h
old mode 100644
new mode 100755
index 1ce75ff..5ac2f8f
--- a/lib/rbcodec/codecs/libopus/config.h
+++ b/lib/rbcodec/codecs/libopus/config.h
@@ -43,7 +43,7 @@
 #if ARM_ARCH == 4
 #define OPUS_ARM_INLINE_ASM
 #elif ARM_ARCH > 4
-#define OPUS_ARM_INLINE_EDSP
+#define OPUS_ARM_INLINE_ASM
 #endif
 #endif