Advertisement
Guest User

Untitled

a guest
Dec 25th, 2013
343
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
ARM 9.08 KB | None | 0 0
  1. diff --git a/lib/rbcodec/codecs/libopus/celt/mdct.c b/lib/rbcodec/codecs/libopus/celt/mdct.c
  2. old mode 100644
  3. new mode 100755
  4. index 72ea180..974f2c1
  5. --- a/lib/rbcodec/codecs/libopus/celt/mdct.c
  6. +++ b/lib/rbcodec/codecs/libopus/celt/mdct.c
  7. @@ -210,6 +210,194 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
  8.  }
  9.  #endif
  10.  
  11. +#if 1
+/* Pre-rotation stage of the inverse MDCT (clt_mdct_backward), hand-written
+ * for ARMv5E using the SMULWB/SMLAWB DSP multiplies.  Hard-coded for the
+ * Opus/CELT case N=1920 (N2=960, N4=480) with stride=1, shift=0; the
+ * constant 13 is the fixed-point "sine" value for that transform size
+ * (see the generic C loop this replaces: *yp++ = yr - S_MUL(yi,sine)).
+ *
+ *   out : destination buffer (f2); receives 480 complex (yr', yi') pairs
+ *   t   : twiddle table (&l->trig[0]), 16-bit fixed-point entries
+ *   in  : time-domain input, read from both ends (in upward, in+959 downward)
+ *
+ * NOTE(review): this #if 1 branch defines pre_rotate_960_armv5, but the
+ * caller in clt_mdct_backward below invokes pre_rotate_960_armv4 — with
+ * this branch enabled the build should fail to link.  Confirm the name.
+ */
  12. +void pre_rotate_960_armv5(kiss_fft_scalar * out, const kiss_twiddle_scalar *t, kiss_fft_scalar *in){
  13. +
  14. +    int N=960; //deliberately do not halve this so that we will have half word offsets for ldrsh
  15. +    //int N=2;
  16. +    kiss_fft_scalar *in2 = in+(960-1);
+    /* NOTE(review): 'in3' is a scalar being assigned a pointer — almost
+     * certainly meant 'kiss_fft_scalar *in3 = in;'.  It is referenced only
+     * by the commented-out debug code at the bottom, so it is dead, but it
+     * should not compile without a warning/error. */
  17. +    kiss_fft_scalar in3=in;
  18. +               int i, num=960;
  19. +   //memset(out, 0, 960*4);
  20. +   kiss_fft_scalar * temp=out;
+     /* NOTE(review): drops the const qualifier from 't'.  'i', 'num',
+      * 'temp', 'in3' and 't2' are used only by disabled debug code and
+      * could be removed once this routine is trusted. */
  21. +     kiss_twiddle_scalar *t2 = t+480;
  22. +     //DEBUGF("initial values: xp1:  %u,  xp2:  %u, out:  %u t1:  %u\n", (int)in, (int)in2,(int) out, (int)t2);
+      /* Loop runs 480 times: n starts at 960 (bytes) and drops by 2 per
+       * iteration.  Each pass loads one word from each end of 'in'
+       * (advancing xp1 by 8 / retreating xp2 by 8 == 2*stride elements),
+       * plus twiddles at [t1 + n] counting down and [t2 - n] counting up;
+       * only the bottom 16 bits of r4/r5 feed SMULWB.
+       * NOTE(review): the 'mov ... lsl #1' steps presumably compensate for
+       * SMULWB's >>16 versus S_MUL's shift — confirm against the S_MUL
+       * definition in this build.  Also confirm the assembler accepts the
+       * negative register offset form "[%[t2], -%[n]]". */
  23. +      asm volatile (
  24. +
  25. +         "0:"
  26. +
  27. +         "ldr r4, [%[t1], %[n]]; "     //bottom of this contains t1[n], unfortunately we count down
  28. +         "ldr r6, [%[xp1]]; "          //n is in units of half words
  29. +         "ldr r5, [%[t2], -%[n]]; "    //contains t1[n], t1[n+1] because we count up
  30. +         "ldr r9, [%[xp2]]; "
  31. +         "add %[xp1], %[xp1], #8;"
  32. +
  33. +          //compute yr
  34. +         "smulwb r7, r6, r4;"      //r8 isn't used, stupid armv4 ops
  35. +
  36. +         "smulwb r10, r9, r5;"     //remember to put 16 bit val in last register for early term
  37. +         "sub %[xp2], %[xp2], #8;"     //decrement the pointer while we wait for the multiply to finish
  38. +
  39. +         "sub r8, r7, r10;"                //r7, r10 free
  40. +         "mov r8, r8, lsl #1;"
  41. +
  42. +
  43. +         //compute yi
  44. +         "smulwb r10, r6, r5;"     //r6, r5 free
  45. +         "smlawb r5, r9, r4, r10;"     //r4, r9 free
  46. +         "mov r6,  #13;"               //sneak a constant load into this pipeline buble
  47. +
  48. +         //"add r7, r5, r10;"  //add both then make negative and finally shift, probably not optimal
  49. +         "rsb r7, r5, #0;"
  50. +         "mov r7, r7, lsl #1;"
  51. +
  52. +
  53. +         "smulwb r10, r7, r6;"             //r10=S_MUL(yi,13)
  54. +         "smlawb r9, r8, r6, r7;"              //r4=S_MUL(yr,13)
  55. +
  56. +         "sub r5, r8, r10;"
  57. +         //"add r9, r7, r4;"
  58. +
  59. +         //*yp++ =...
+         // stmia stores lowest-numbered register first: r5 (yr') then r9 (yi')
  60. +         "stmia %[out]!, {r5, r9};"
  61. +
  62. +         "subs  %[n], %[n], #2;"       //decrement by 1 half word
  63. +         "bne   0b;"
  64. +
  65. +
  66. +          : [out] "+r" (out), [n] "+r" (N),[xp1] "+r" (in), [xp2] "+r" (in2)  //6 registers used
  67. +          : [t1] "r" (t), [t2] "r" (t+480)
  68. +          : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "memory", "cc");    //can use 7 more before we'll run out
  69. +
  70. +/*
  71. +          DEBUGF("final values:   xp1:  %u,  xp2:  %u, out:  %u\n", (int)in, (int)in2,(int) out);
  72. +          DEBUGF("final values:   xp1:  %u (%d) for N: %d\n", (int)in, ((int)in)-((int)in3), N);
  73. +              DEBUGF("out values 1->%d\n", num);
  74. +              for(i=0; i<num;i++)
  75. +               DEBUGF("%u ", temp[i]);
  76. +           DEBUGF("\n");
  77. + exit(0);
  78. + */
  79. +
  80. +}
  81. +#elif 0
+/* Plain-ARMv4 variant of the pre-rotation: emulates the DSP 16x32
+ * multiplies by shifting each 16-bit twiddle into the top half-word and
+ * taking the high word of a 32x32 SMULL.  Same hard-coded parameters as
+ * the ARMv5E version above: N=1920 (480 iterations), stride=1, shift=0,
+ * sine constant 13.
+ *
+ * NOTE(review): this is an '#elif 0' branch — it is never compiled and is
+ * kept only as a work-in-progress reference.
+ * NOTE(review): the twiddles get 'lsl #16' before SMULL, but the constant
+ * 13 in r6 is used un-shifted, so the high words taken as S_MUL(yi,13) /
+ * S_MUL(yr,13) look like they would come out ~0 — confirm before enabling.
+ * NOTE(review): r4 is loaded with a full 'ldr' while r5 uses 'ldrh';
+ * after 'lsl #16' only the low half-word of r4 survives, so the asymmetry
+ * may be intentional, but verify the t1 indexing.  The DEBUGF below is
+ * live (not commented out) and prints once per call. */
  82. +void pre_rotate_960_armv4(kiss_fft_scalar * out, const kiss_twiddle_scalar *t, kiss_fft_scalar *in){
  83. +
  84. +    int N=960; //deliberately do not halve this so that we will have half word offsets for ldrsh
  85. +    //int N=2;
  86. +    kiss_fft_scalar *in2 = in+(960-1);
+    /* NOTE(review): pointer assigned to a scalar — meant
+     * 'kiss_fft_scalar *in3 = in;' (same latent bug as the armv5 branch). */
  87. +    kiss_fft_scalar in3=in;
  88. +               int i, num=960;
  89. +   //memset(out, 0, 960*4);
  90. +   kiss_fft_scalar * temp=out;
  91. +     kiss_twiddle_scalar *t2 = t+480;
  92. +     DEBUGF("initial values: xp1:  %u,  xp2:  %u, out:  %u t1:  %u t2:  %u\n", (int)in, (int)in2,(int) out,(int)t, (int)t2);
  93. +      asm volatile (
  94. +
  95. +         "0:"
  96. +         //we can't use ldm for either t or xp variables
  97. +         "ldr r4, [%[t1], %[n]]; "
  98. +         "ldr r6, [%[xp1]]; "          //n is in units of half words
  99. +         "ldrh r5, [%[t2], -%[n]]; "
  100. +         "ldr r9, [%[xp2]]; "
  101. +         "add %[xp1], %[xp1], #8;"
+         // promote 16-bit twiddles to the top half so SMULL's high word ≈ (x*t)>>16
  102. +         "mov r4, r4, lsl #16;"
  103. +         "mov r5, r5, lsl #16;"
  104. +
  105. +          //compute yr
  106. +         "smull r8, r7, r6, r4;"       //r8 isn't used, stupid armv4 ops
  107. +         "sub %[xp2], %[xp2], #8;"     //decrement the pointer while we wait for the multiply to finish
  108. +
  109. +         "smull r8, r10, r9, r5;"      //remember to put 16 bit val in last register for early term
  110. +
  111. +
  112. +         "sub r8, r7, r10;"                //r7, r10 free
  113. +         "mov r8, r8, lsl #1;"
  114. +
  115. +
  116. +         //compute yi
  117. +         "smull r7, r10, r6, r5;"      //r6, r5 free
  118. +         "smull r7, r5, r9, r4;"       //r4, r9 free
  119. +         "mov r6,  #13;"               //sneak a constant load into this pipeline buble
  120. +
  121. +         "add r7, r5, r10;"    //add both then make negative and finally shift, probably not optimal
  122. +         "rsb r7, r7, #0;"
  123. +         "mov r7, r7, lsl #1;"
  124. +
  125. +
  126. +         "smull r5, r10, r7, r6;"              //r10=S_MUL(yi,13)
  127. +         "smull r5, r4, r8, r6;"               //r4=S_MUL(yr,13)
  128. +
  129. +         "sub r5, r8, r10;"
  130. +         "add r9, r7, r4;"
  131. +
  132. +         //*yp++ =...
+         // stmia stores lowest register first: r5 (yr') then r9 (yi')
  133. +         "stmia %[out]!, {r5, r9};"
  134. +
  135. +         "subs  %[n], %[n], #2;"       //decrement by 1 half word
  136. +         "bne   0b;"
  137. +
  138. +
  139. +          : [out] "+r" (out), [n] "+r" (N),[xp1] "+r" (in), [xp2] "+r" (in2)  //6 registers used
  140. +          : [t1] "r" (t), [t2] "r" (t+480)
  141. +          : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "memory", "cc");    //can use 7 more before we'll run out
  142. +
  143. +}
  144. +#else
  145. +
  146. +
+/* Portable C reference for the inverse-MDCT pre-rotation, specialized for
+ * the CELT N=1920 case (N2=960, N4=480, stride=1, shift=0).  This is the
+ * loop lifted out of the generic clt_mdct_backward pre-rotate block, with
+ * the index reversed: the generic code iterates i'=0..479 using t[i'] and
+ * t[480-i']; here i runs 480..1 so t[i] plays the t[480-i'] role and
+ * t2[-i] (== t[480-i]) plays the t[i'] role — same 480 (yr, yi) pairs in
+ * the same order.  The constant 13 is the fixed-point 'sine' for N=1920.
+ *
+ *   out : destination buffer (f2); receives interleaved yr/yi pairs
+ *   t   : twiddle table (&l->trig[0])
+ *   in  : time-domain input, consumed from both ends
+ *
+ * Fix: removed a leftover DEBUGF + exit(0) inside the loop — it printed
+ * the first pair and then terminated the whole process on iteration one,
+ * making this (the branch the caller actually selects when the asm paths
+ * are disabled) unusable. */
  147. + void pre_rotate_960_armv4(kiss_fft_scalar * out, const kiss_twiddle_scalar *t, kiss_fft_scalar *in){
  148. +   //DEBUGF("N: %d overlap: %d shift: %d stride: %d\n", N, overlap, shift, stride);    //N: 1920 overlap: 120 shift: 0 stride: 1
  149. +   int i;//memset(out, 0, 960*4);
  150. +     /* Temp pointers to make it really clear to the compiler what we're doing */
  151. +     const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
  152. +     const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+(960-1);
  153. +     const kiss_twiddle_scalar *t2 = t+480;
  154. +     kiss_fft_scalar * OPUS_RESTRICT yp = out;
  155. +
  156. +   //      int ii, num=960;
  157. +
  158. +
  159. +          //DEBUGF("final values:   xp1:  %u,  xp2:  %u, out:  %u\n", (int)in, (int)in2,(int) out);
  160. +          //DEBUGF("final values:   xp1:  %u (%d) for N: %d\n", (int)in, ((int)in)-((int)in3), N);
  161. +
  162. +     //const kiss_twiddle_scalar *t = &l->trig[0];
  163. +
+     /* 480 iterations: t[i] walks 480->1 while t2[-i] walks t[0]->t[479]. */
  164. +     for(i=480 ;i>0;i--)
  165. +     {
  166. +        kiss_fft_scalar yr, yi;
  167. +        yr = -S_MUL(*xp2, t2[-i]) + S_MUL(*xp1,t[i]);
  168. +        yi =  -S_MUL(*xp2, t[i]) - S_MUL(*xp1,t2[-i]);
  169. +
  172. +        /* works because the cos is nearly one */
  173. +        *yp++ = yr - S_MUL(yi,13); //sine==13 for N=1920
  174. +        *yp++ = yi + S_MUL(yr,13);
  175. +        xp1+=2;
  176. +        xp2-=2;
  177. +     }
  178. +
  179. +/*        DEBUGF("out values 1->%d\n", num);
  180. +          for(ii=0; ii<num;ii++)
  181. +           DEBUGF("%u ", out[ii]);
  182. +           DEBUGF("\n");
  183. + exit(0);*/
+/* Dead alternative with the forward index; kept for reference only. */
  184. +#if 0
  185. +     for(i=0;i<480;i++)
  186. +     {
  187. +        kiss_fft_scalar yr, yi;
  188. +        yr = -S_MUL(*xp2, t[i]) + S_MUL(*xp1,t[(480-i)]);      //[0 479], [480 1]
  189. +        yi =  -S_MUL(*xp2, t[(480-i)]) - S_MUL(*xp1,t[i]);     //[480, 1], [0 479]
  190. +        /* works because the cos is nearly one */
  191. +        *yp++ = yr - S_MUL(yi,13); //sine==13 for N=1920
  192. +        *yp++ = yi + S_MUL(yr,13);
  193. +        xp1+=2;
  194. +        xp2-=2;
  195. +     }
  196. +#endif
  197. +}
  198. +#endif
  199. void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
  200.       const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)
  201. {
  202. @@ -231,25 +419,27 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
  203.    sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
  204. #endif
  205.  
  206. -   /* Pre-rotate */
  207. -   {
  208. -      /* Temp pointers to make it really clear to the compiler what we're doing */
  209. -      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
  210. -      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
  211. -      kiss_fft_scalar * OPUS_RESTRICT yp = f2;
  212. -      const kiss_twiddle_scalar *t = &l->trig[0];
  213. -      for(i=0;i<N4;i++)
  214. -      {
  215. -         kiss_fft_scalar yr, yi;
  216. -         yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
  217. -         yi =  -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
  218. -         /* works because the cos is nearly one */
  219. -         *yp++ = yr - S_MUL(yi,sine);
  220. -         *yp++ = yi + S_MUL(yr,sine);
  221. -         xp1+=2*stride;
  222. -         xp2-=2*stride;
  223. -      }
  224. -   }
  225. +   if(shift > 1)
  226. +      {
  227. +         /* Temp pointers to make it really clear to the compiler what we're doing */
  228. +         const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
  229. +         const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
  230. +         kiss_fft_scalar * OPUS_RESTRICT yp = f2;
  231. +         const kiss_twiddle_scalar *t = &l->trig[0];
  232. +         for(i=0;i<N4;i++)
  233. +         {
  234. +            kiss_fft_scalar yr, yi;
  235. +            yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
  236. +            yi =  -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
  237. +            /* works because the cos is nearly one */
  238. +            *yp++ = yr - S_MUL(yi,sine);
  239. +            *yp++ = yi + S_MUL(yr,sine);
  240. +
  241. +            xp1+=2*stride;
  242. +            xp2-=2*stride;
  243. +         }
  244. +   }else
  245. +       pre_rotate_960_armv4(f2, &l->trig[0], in);
  246.  
  247.    /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */
  248.    opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1)));
  249. diff --git a/lib/rbcodec/codecs/libopus/config.h b/lib/rbcodec/codecs/libopus/config.h
  250. old mode 100644
  251. new mode 100755
  252. index 1ce75ff..5ac2f8f
  253. --- a/lib/rbcodec/codecs/libopus/config.h
  254. +++ b/lib/rbcodec/codecs/libopus/config.h
  255. @@ -43,7 +43,7 @@
  256. #if ARM_ARCH == 4
  257. #define OPUS_ARM_INLINE_ASM
  258. #elif ARM_ARCH > 4
  259. -#define OPUS_ARM_INLINE_EDSP
  260. +#define OPUS_ARM_INLINE_ASM
  261. #endif
  262. #endif
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement