/*
 * Cedarx framework.
 * Copyright (c) 2008-2015 Allwinner Technology Co. Ltd.
 * Copyright (c) 2014 Ning Fang <fangning@allwinnertech.com>
 * Copyright (c) 2016 Andreas Baierl <ichgeh@imkreisrum.de>
 *
 * This file is part of Cedarx.
 *
 * Cedarx is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This program is distributed "as is" WITHOUT ANY WARRANTY of any
 * kind, whether express or implied; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 */

#include <stdio.h>  /* printf */
#include <string.h> /* memcpy */

static void map32x32_to_yuv_Y(unsigned char* srcY,
                              unsigned char* tarY,
                              unsigned int coded_width,
                              unsigned int coded_height)
{
    unsigned int i, j, l, m, n;
    unsigned int mb_width, mb_height;
    unsigned int twomb_line, twomb_width;
    unsigned long offset;
    unsigned char *ptr;
#ifdef USE_NEON
    unsigned char *dst_asm, *src_asm;
#endif
    unsigned vdecbuf_width, vdecbuf_height;
    int nWidthMatchFlag;
    int nLeftValidLine; // in the bottom macroblock(32*32), the valid line is < 32.
    ptr = srcY;

    mb_width =  ((coded_width  + 31) & (~31)) >> 4;
    mb_height = ((coded_height + 31) & (~31)) >> 4;
    twomb_line =  (mb_height + 1) >> 1;
    twomb_width = (mb_width  + 1) >> 1;

    if(twomb_line < 1 || twomb_width < 1) {
        printf("fatal error! twomb_line=%d, twomb_width=%d", twomb_line, twomb_width);
    }

    vdecbuf_width = twomb_width * 32;
    vdecbuf_height = twomb_line * 32;

    if(vdecbuf_width > coded_width) {
        nWidthMatchFlag = 0;
        if((vdecbuf_width - coded_width) != 16) {
            printf("fatal error! vdecbuf_width=%d, gpubuf_width=%d,  the program will crash!", vdecbuf_width, coded_width);
        } else {
            // printf("(f:%s, l:%d) Be careful! vdecbuf_width=%d, gpubuf_width=%d", __FUNCTION__, __LINE__, vdecbuf_width, coded_width);
        }
    } else if(vdecbuf_width == coded_width) {
        nWidthMatchFlag = 1;
    } else {
        printf("fatal error! vdecbuf_width=%d <= gpubuf_width=%d, the program will crash!", vdecbuf_width, coded_width);
        nWidthMatchFlag = 0;
    }

    /* process every macroblock line except the last one */
    for(i = 0; i < twomb_line - 1; i++) {
        /* process every macroblock in line except the last one */
        for(j = 0; j < twomb_width - 1; j++) {
            /* process every line within a macroblock */
            for(l = 0; l < 32; l++) {
                m = i * 32 + l;
                n = j * 32;
                offset = m * coded_width + n;
#ifdef USE_NEON
                dst_asm = tarY + offset;
                src_asm = ptr;
                asm volatile (
                        "vld1.8         {d0 - d3}, [%[src_asm]]              \n\t"
                        "vst1.8         {d0 - d3}, [%[dst_asm]]              \n\t"
                        : [dst_asm] "+r" (dst_asm), [src_asm] "+r" (src_asm)
                        :  // [srcY] "r" (srcY)
                        : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                        );
#else
                memcpy(tarY + offset, ptr, 32);
#endif
                ptr += 32; // 32 byte in one process.
            }
        }

        /* process last macroblock of one line, gpu buf must be 16byte align or 32 byte align */
        /* j == twomb_width - 1 */
        for(l = 0; l < 32; l++) {
            m = i * 32 + l;
            n = j * 32;
            offset = m * coded_width + n;
#ifdef USE_NEON
            dst_asm = tarY + offset;
            src_asm = ptr;
            if(nWidthMatchFlag) {
                asm volatile (
                    "vld1.8         {d0 - d3}, [%[src_asm]]              \n\t"
                    "vst1.8         {d0 - d3}, [%[dst_asm]]              \n\t"
                    : [dst_asm] "+r" (dst_asm), [src_asm] "+r" (src_asm)
                    :  // [srcY] "r" (srcY)
                    : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                );
            } else {
                asm volatile (
                    "vld1.8         {d0,d1}, [%[src_asm]]              \n\t"
                    "vst1.8         {d0,d1}, [%[dst_asm]]              \n\t"
                    : [dst_asm] "+r" (dst_asm), [src_asm] "+r" (src_asm)
                    :  // [srcY] "r" (srcY)
                    : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                );
            }
#else
            memcpy(tarY + offset, ptr, 32);
#endif
            ptr += 32; // 32 byte in one process.
        }
    }

    /* process last macroblock line */
    /* i == twomb_line - 1 */
    nLeftValidLine = coded_height - ((twomb_line - 1) * 32);
    /* process every macroblock in last line except the last one */
    for(j = 0; j < twomb_width - 1; j++) {
        /* process every line within a macroblock */
        for(l = 0; l < nLeftValidLine; l++) {
            m = i * 32 + l;
            n = j * 32;
            offset = m * coded_width + n;
#ifdef USE_NEON
            dst_asm = tarY + offset;
            src_asm = ptr;
            asm volatile (
                "vld1.8         {d0 - d3}, [%[src_asm]]              \n\t"
                "vst1.8         {d0 - d3}, [%[dst_asm]]              \n\t"
                : [dst_asm] "+r" (dst_asm), [src_asm] "+r" (src_asm)
                : // [srcY] "r" (srcY)
                : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
            );
#else
            memcpy(tarY + offset, ptr, 32);
#endif
            ptr += 32; // 32 byte in one process.
        }
        ptr += (32 - nLeftValidLine) * 32;
    }

    /* process last macroblock of last line, gpu buf must be 16byte align or 32 byte align */
    /* j == twomb_width - 1 */
    /* process every line within last macroblock in line */
    for(l = 0; l < nLeftValidLine; l++) {
        m = i * 32 + l;
        n = j * 32;
        offset = m * coded_width + n;
#ifdef USE_NEON
        dst_asm = tarY + offset;
        src_asm = ptr;
        if(nWidthMatchFlag) {
            asm volatile (
                "vld1.8         {d0 - d3}, [%[src_asm]]              \n\t"
                "vst1.8         {d0 - d3}, [%[dst_asm]]              \n\t"
                : [dst_asm] "+r" (dst_asm), [src_asm] "+r" (src_asm)
                : // [srcY] "r" (srcY)
                : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
            );
        } else {
            asm volatile (
                "vld1.8         {d0,d1}, [%[src_asm]]              \n\t"
                "vst1.8         {d0,d1}, [%[dst_asm]]              \n\t"
                : [dst_asm] "+r" (dst_asm), [src_asm] "+r" (src_asm)
                : // [srcY] "r" (srcY)
                : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
            );
        }
#else
        memcpy(tarY + offset, ptr, 32);
#endif
        ptr += 32; // 32 byte in one process.
    }
    ptr += (32 - nLeftValidLine) * 32;
}

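/*
 * Illustration (not from the original Cedarx source): the de-tiling above assumes
 * the decoder writes the luma plane as 32x32 tiles, i.e. padded up to a multiple of
 * 32 in both directions, which is where the ((dim + 31) & ~31) arithmetic comes from.
 * A minimal sketch of that size calculation; the helper names are hypothetical.
 */
static unsigned int mb32_align(unsigned int dim)
{
    /* round up to the next multiple of 32 */
    return (dim + 31) & (~31u);
}

static unsigned long mb32_luma_size(unsigned int coded_width, unsigned int coded_height)
{
    /* byte size of the tiled luma buffer that map32x32_to_yuv_Y reads through */
    return (unsigned long)mb32_align(coded_width) * mb32_align(coded_height);
}
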
static void map32x32_to_yuv_UV( // int mode,
                              unsigned char* srcUV,
                              unsigned char* tarU,
                              unsigned char* tarV,
                              unsigned char* tarUV,
                              unsigned int coded_width,
                              unsigned int coded_height)
{
    unsigned int i, j, l, m, n, k;
    unsigned int mb_width, mb_height;
    unsigned int twomb_line, twomb_width;
    unsigned long offset;
    unsigned char *ptr;
    unsigned char *dst0_asm, *dst1_asm, *dst2_asm, *src_asm;
    unsigned vdecbuf_width, vdecbuf_height; // in pixel
    int nWidthMatchFlag;
    int nLeftValidLine; // in the bottom macroblock(32*32), the valid line is < 32.
    unsigned char line[16];

//  int dst_stride = mode == 0 ? (coded_width + 15) & (~15) : coded_width;
    int dst_stride = coded_width;

    ptr = srcUV;

    mb_width =  ((coded_width + 15) & (~15)) >> 4;   //vdec's uvBuf is 32byte align, so uBuf and vBuf is 16byte align!
    mb_height = ((coded_height + 31) & (~31)) >> 4;
    twomb_line = (mb_height + 1) >> 1;
    twomb_width = mb_width; //vdec mb32 is uv interleave, so uv_32 byte == u_16byte

    if(twomb_line < 1 || twomb_width < 1) {
        printf("map32x32_to_yuv_C() fatal error! twomb_line=%d, twomb_width=%d", twomb_line, twomb_width);
    }

    // vdec mb32 uvBuf, one vdec_macro_block, extract u component, u's width and height.
    vdecbuf_width = twomb_width * 16;
    vdecbuf_height = twomb_line * 32;

    if(vdecbuf_width > coded_width) {
        nWidthMatchFlag = 0;
        if((vdecbuf_width - coded_width) != 8) {
            printf("fatal error! vdec_UVbuf_width=%d, gpu_UVbuf_width=%d,  the program will crash!", vdecbuf_width, coded_width);
        } else {
            // printf("(f:%s, l:%d) vdec_UVbuf_width=%d, gpu_UVbuf_width=%d, not match, gpu_uvBuf is 8byte align?", __FUNCTION__, __LINE__, vdecbuf_width, coded_width);
        }
    } else if(vdecbuf_width == coded_width) {
        nWidthMatchFlag = 1;
    } else {
        printf("fatal error! vdec_UVbuf_width=%d <= gpu_UVbuf_width=%d, the program will crash!", vdecbuf_width, coded_width);
        nWidthMatchFlag = 0;
    }

    /* process every macroblock line except the last one */
    for(i = 0; i < twomb_line - 1; i++) {
        /* process every macroblock in line except the last one */
        for(j = 0; j < twomb_width - 1; j++) {
            /* process every line within a macroblock */
            for(l = 0; l < 32; l++) {
                m = i * 32 + l;
                n = j * 16;

#ifdef USE_NEON
                /* U + V separate */
                offset = m * dst_stride + n;
                dst0_asm = tarU + offset;
                dst1_asm = tarV + offset;
                src_asm = ptr;
//              for(k = 0; k < 16; k++) {
//                  dst0_asm[k] = src_asm[2 * k];
//                  dst1_asm[k] = src_asm[2 * k + 1];
//              }
                asm volatile (
                    "vld2.8         {d0 - d3}, [%[src_asm]]              \n\t"
                    "vst1.8         {d0,d1}, [%[dst0_asm]]              \n\t"
                    "vst1.8         {d2,d3}, [%[dst1_asm]]              \n\t"
                     : [dst0_asm] "+r" (dst0_asm), [dst1_asm] "+r" (dst1_asm), [src_asm] "+r" (src_asm)
                     :  //[srcY] "r" (srcY)
                     : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                 );
                /* UV interleaved */
                offset = m * dst_stride + n * 2;
                dst2_asm = tarUV + offset;
                src_asm = ptr;
                asm volatile (
                    "vld1.8         {d0 - d3}, [%[src_asm]]              \n\t"
                    "vst1.8         {d0 - d3}, [%[dst2_asm]]              \n\t"
                    : [dst2_asm] "+r" (dst2_asm), [src_asm] "+r" (src_asm)
                    :  // [srcY] "r" (srcY)
                    : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                );
#else
                /* U + V separate */
                offset = m * dst_stride + n;
                for(k = 0; k < 16; k++) {
                    tarU[offset + k] = ptr[2 * k];
                    tarV[offset + k] = ptr[2 * k + 1];
                }
                /* UV interleaved */
                offset = m * dst_stride + n * 2;
                memcpy(tarUV + offset, ptr, 32);
#endif
                ptr += 32;
            }
        }

        /* process last macroblock of one line, gpu buf must be 16byte align or 32 byte align */
        /* j == twomb_width - 1 */
        for(l = 0; l < 32; l++) {
            m = i * 32 + l;
            n = j * 16;

#ifdef USE_NEON
            /* U + V separate */
            offset = m * dst_stride + n;
            dst0_asm = tarU + offset;
            dst1_asm = tarV + offset;
            src_asm = ptr;
//          for(k = 0; k < 16; k++) {
//              dst0_asm[k] = src_asm[2 * k];
//              dst1_asm[k] = src_asm[2 * k + 1];
//          }

            if(nWidthMatchFlag) {
                asm volatile (
                    "vld2.8         {d0 - d3}, [%[src_asm]]              \n\t"
                    "vst1.8         {d0,d1}, [%[dst0_asm]]              \n\t"
                    "vst1.8         {d2,d3}, [%[dst1_asm]]              \n\t"
                     : [dst0_asm] "+r" (dst0_asm), [dst1_asm] "+r" (dst1_asm), [src_asm] "+r" (src_asm)
                     :  //[srcY] "r" (srcY)
                     : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                 );
            } else {
                asm volatile (
                    "vld2.8         {d0,d1}, [%[src_asm]]              \n\t"
                    "vst1.8         {d0}, [%[dst0_asm]]              \n\t"
                    "vst1.8         {d1}, [%[dst1_asm]]              \n\t"
                     : [dst0_asm] "+r" (dst0_asm), [dst1_asm] "+r" (dst1_asm), [src_asm] "+r" (src_asm)
                     :  //[srcY] "r" (srcY)
                     : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                 );
            }

            /* UV interleaved */
            offset = m * dst_stride + n * 2;
            dst2_asm = tarUV + offset;
            src_asm = ptr;
            if(nWidthMatchFlag) {
                asm volatile (
                    "vld1.8         {d0 - d3}, [%[src_asm]]              \n\t"
                    "vst1.8         {d0 - d3}, [%[dst2_asm]]              \n\t"
                    : [dst2_asm] "+r" (dst2_asm), [src_asm] "+r" (src_asm)
                    :  // [srcY] "r" (srcY)
                    : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                );
            } else {
                asm volatile (
                    "vld1.8         {d0,d1}, [%[src_asm]]              \n\t"
                    "vst1.8         {d0,d1}, [%[dst2_asm]]              \n\t"
                    : [dst2_asm] "+r" (dst2_asm), [src_asm] "+r" (src_asm)
                    :  // [srcY] "r" (srcY)
                    : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                );
            }
#else
            /* U + V separate */
            offset = m * dst_stride + n;
            for(k = 0; k < 16; k++) {
                tarU[offset + k] = ptr[2 * k];
                tarV[offset + k] = ptr[2 * k + 1];
            }
            /* UV interleaved */
            offset = m * dst_stride + n * 2;
            memcpy(tarUV + offset, ptr, 32);
#endif
            ptr += 32;
        }
    }

    /* process last macroblock line */
    /* i == twomb_line - 1 */
    nLeftValidLine = coded_height - ((twomb_line - 1) * 32);
    /* process every macroblock in last line except the last one */
    for(j = 0; j < twomb_width - 1; j++) {
        /* process every line within a macroblock */
        for(l = 0; l < nLeftValidLine; l++) {
            m = i * 32 + l;
            n = j * 16;

#ifdef USE_NEON
            /* U + V separate */
            offset = m * dst_stride + n;
            dst0_asm = tarU + offset;
            dst1_asm = tarV + offset;
            src_asm = ptr;
            asm volatile (
                "vld2.8         {d0 - d3}, [%[src_asm]]              \n\t"
                "vst1.8         {d0,d1}, [%[dst0_asm]]              \n\t"
                "vst1.8         {d2,d3}, [%[dst1_asm]]              \n\t"
                 : [dst0_asm] "+r" (dst0_asm), [dst1_asm] "+r" (dst1_asm), [src_asm] "+r" (src_asm)
                 :  //[srcY] "r" (srcY)
                 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
             );

            /* UV interleaved */
            offset = m * dst_stride + n * 2;
            dst2_asm = tarUV + offset;
            src_asm = ptr;
            asm volatile (
                "vld1.8         {d0 - d3}, [%[src_asm]]              \n\t"
                "vst1.8         {d0 - d3}, [%[dst2_asm]]              \n\t"
                : [dst2_asm] "+r" (dst2_asm), [src_asm] "+r" (src_asm)
                : // [srcY] "r" (srcY)
                : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
            );
#else
            /* U + V separate */
            offset = m * dst_stride + n;
            for(k = 0; k < 16; k++) {
                tarU[offset + k] = ptr[2 * k];
                tarV[offset + k] = ptr[2 * k + 1];
            }
            /* UV interleaved */
            offset = m * dst_stride + n * 2;
            memcpy(tarUV + offset, ptr, 32);
#endif
            ptr += 32;  //32 byte in one process.
        }
        ptr += (32 - nLeftValidLine) * 32;
    }

    /* process last macroblock of last line, gpu buf must be 16byte align or 32 byte align */
    /* j == twomb_width - 1 */
    /* process every line within last macroblock in line */
    for(l = 0; l < nLeftValidLine; l++) {
        m = i * 32 + l;
        n = j * 16;

#ifdef USE_NEON
        /* U + V separate */
        offset = m * dst_stride + n;
        dst0_asm = tarU + offset;
        dst1_asm = tarV + offset;
        src_asm = ptr;
        if(nWidthMatchFlag) {
            asm volatile (
                "vld2.8         {d0 - d3}, [%[src_asm]]              \n\t"
                "vst1.8         {d0,d1}, [%[dst0_asm]]              \n\t"
                "vst1.8         {d2,d3}, [%[dst1_asm]]              \n\t"
                 : [dst0_asm] "+r" (dst0_asm), [dst1_asm] "+r" (dst1_asm), [src_asm] "+r" (src_asm)
                 :  //[srcY] "r" (srcY)
                 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
             );
        } else {
            asm volatile (
                "vld2.8         {d0,d1}, [%[src_asm]]              \n\t"
                "vst1.8         {d0}, [%[dst0_asm]]              \n\t"
                "vst1.8         {d1}, [%[dst1_asm]]              \n\t"
                 : [dst0_asm] "+r" (dst0_asm), [dst1_asm] "+r" (dst1_asm), [src_asm] "+r" (src_asm)
                 :  //[srcY] "r" (srcY)
                 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
             );
        }

        /* UV interleaved */
        offset = m * dst_stride + n * 2;
        dst2_asm = tarUV + offset;
        src_asm = ptr;
        if(nWidthMatchFlag) {
            asm volatile (
                "vld1.8         {d0 - d3}, [%[src_asm]]              \n\t"
  462.                 "vst1.8         {d0 - d3}, [%[dst_asm]]              \n\t"
                : [dst2_asm] "+r" (dst2_asm), [src_asm] "+r" (src_asm)
                : // [srcY] "r" (srcY)
                : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
            );
        } else {
            asm volatile (
                "vld1.8         {d0,d1}, [%[src_asm]]              \n\t"
  470.                 "vst1.8         {d0,d1}, [%[dst_asm]]              \n\t"
                : [dst2_asm] "+r" (dst2_asm), [src_asm] "+r" (src_asm)
                : // [srcY] "r" (srcY)
                : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
            );
        }
#else
        /* U + V separate */
        offset = m * dst_stride + n;
        for(k = 0; k < 16; k++) {
            tarU[offset + k] = ptr[2 * k];
            tarV[offset + k] = ptr[2 * k + 1];
        }

        /* UV interleaved */
        offset = m * dst_stride + n * 2;
        memcpy(tarUV + offset, ptr, 32);
#endif
        ptr += 32;
    }
    ptr += (32 - nLeftValidLine) * 32;
}