Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /*
- * Cedarx framework.
- * Copyright (c) 2008-2015 Allwinner Technology Co. Ltd.
- * Copyright (c) 2014 Ning Fang <fangning@allwinnertech.com>
- * Copyright (c) 2016 Andreas Baierl <ichgeh@imkreisrum.de>
- *
- * This file is part of Cedarx.
- *
- * Cedarx is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether express or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Lesser General Public License for more details.
- */
/*
 * map32x32_to_yuv_Y() - untile a Cedar VE 32x32-tiled luma buffer into a
 * linear Y plane.
 *
 * The decoder emits luma as a grid of 32x32-byte tiles stored back-to-back:
 * each tile is 32 consecutive rows of 32 bytes, tiles ordered left-to-right,
 * then top-to-bottom (demonstrated by the sequential `ptr` advancement below).
 * The tile grid is padded up to a multiple of 32 in both directions, so the
 * rightmost tile column may hold 16 padding bytes per row (tile width minus
 * coded width == 16) and the bottom tile row may hold fewer than 32 valid
 * lines.
 *
 * srcY:         tiled source buffer produced by the decoder.
 * tarY:         destination linear plane, coded_width bytes per row.
 * coded_width:  destination row length in bytes.
 * coded_height: number of destination rows.
 */
static void map32x32_to_yuv_Y(unsigned char* srcY,
                              unsigned char* tarY,
                              unsigned int coded_width,
                              unsigned int coded_height)
{
    unsigned int i, j, l, m, n;
    unsigned int mb_width, mb_height;
    unsigned int twomb_line, twomb_width;
    unsigned long offset;
    unsigned char *ptr;
#ifdef USE_NEON
    unsigned char *dst_asm, *src_asm;
#endif
    unsigned vdecbuf_width;
    int nWidthMatchFlag;
    int nLeftValidLine; /* valid lines (<= 32) in the bottom tile row */

    ptr = srcY;
    /* 16x16 macroblock counts, rounded up to the 32-pixel tile grid. */
    mb_width = ((coded_width + 31) & (~31)) >> 4;
    mb_height = ((coded_height + 31) & (~31)) >> 4;
    /* 32x32 tile counts: two macroblocks per tile in each direction. */
    twomb_line = (mb_height + 1) >> 1;
    twomb_width = (mb_width + 1) >> 1;
    if (twomb_line < 1 || twomb_width < 1) {
        printf("fatal error! twomb_line=%u, twomb_width=%u\n",
               twomb_line, twomb_width);
    }

    vdecbuf_width = twomb_width * 32;
    if (vdecbuf_width > coded_width) {
        nWidthMatchFlag = 0;
        /* Only a 16-byte mismatch is expected (decoder pads to 32,
         * destination buffer to 16). */
        if ((vdecbuf_width - coded_width) != 16) {
            printf("fatal error! vdecbuf_width=%u, gpubuf_width=%u, the program will crash!\n",
                   vdecbuf_width, coded_width);
        }
    } else if (vdecbuf_width == coded_width) {
        nWidthMatchFlag = 1;
    } else {
        printf("fatal error! vdecbuf_width=%u <= gpubuf_width=%u, the program will crash!\n",
               vdecbuf_width, coded_width);
        nWidthMatchFlag = 0;
    }

    /* Every tile row except the last one, which may be partial. */
    for (i = 0; i < twomb_line - 1; i++) {
        /* Every tile in this row except the rightmost. */
        for (j = 0; j < twomb_width - 1; j++) {
            /* Every line within the tile. */
            for (l = 0; l < 32; l++) {
                m = i * 32 + l;
                n = j * 32;
                offset = m * coded_width + n;
#ifdef USE_NEON
                dst_asm = tarY + offset;
                src_asm = ptr;
                asm volatile (
                    "vld1.8 {d0 - d3}, [%[src_asm]] \n\t"
                    "vst1.8 {d0 - d3}, [%[dst_asm]] \n\t"
                    : [dst_asm] "+r" (dst_asm), [src_asm] "+r" (src_asm)
                    :
                    : "cc", "memory", "d0", "d1", "d2", "d3"
                );
#else
                memcpy(tarY + offset, ptr, 32);
#endif
                ptr += 32; /* one 32-byte tile row consumed */
            }
        }
        /* Rightmost tile (j == twomb_width - 1): when the widths do not
         * match, only the first 16 of its 32 bytes per row are valid. */
        for (l = 0; l < 32; l++) {
            m = i * 32 + l;
            n = j * 32;
            offset = m * coded_width + n;
#ifdef USE_NEON
            dst_asm = tarY + offset;
            src_asm = ptr;
            if (nWidthMatchFlag) {
                asm volatile (
                    "vld1.8 {d0 - d3}, [%[src_asm]] \n\t"
                    "vst1.8 {d0 - d3}, [%[dst_asm]] \n\t"
                    : [dst_asm] "+r" (dst_asm), [src_asm] "+r" (src_asm)
                    :
                    : "cc", "memory", "d0", "d1", "d2", "d3"
                );
            } else {
                asm volatile (
                    "vld1.8 {d0,d1}, [%[src_asm]] \n\t"
                    "vst1.8 {d0,d1}, [%[dst_asm]] \n\t"
                    : [dst_asm] "+r" (dst_asm), [src_asm] "+r" (src_asm)
                    :
                    : "cc", "memory", "d0", "d1"
                );
            }
#else
            /* BUGFIX: the fallback used to copy 32 bytes unconditionally,
             * overflowing the destination row by 16 bytes whenever
             * nWidthMatchFlag == 0; mirror the NEON path instead. */
            memcpy(tarY + offset, ptr, nWidthMatchFlag ? 32 : 16);
#endif
            ptr += 32; /* source tile row is always 32 bytes, padding included */
        }
    }

    /* Last tile row (i == twomb_line - 1): only nLeftValidLine lines valid. */
    nLeftValidLine = coded_height - ((twomb_line - 1) * 32);
    /* Every tile in the last row except the rightmost. */
    for (j = 0; j < twomb_width - 1; j++) {
        for (l = 0; l < nLeftValidLine; l++) {
            m = i * 32 + l;
            n = j * 32;
            offset = m * coded_width + n;
#ifdef USE_NEON
            dst_asm = tarY + offset;
            src_asm = ptr;
            asm volatile (
                "vld1.8 {d0 - d3}, [%[src_asm]] \n\t"
                "vst1.8 {d0 - d3}, [%[dst_asm]] \n\t"
                : [dst_asm] "+r" (dst_asm), [src_asm] "+r" (src_asm)
                :
                : "cc", "memory", "d0", "d1", "d2", "d3"
            );
#else
            memcpy(tarY + offset, ptr, 32);
#endif
            ptr += 32;
        }
        /* Skip this tile's padding lines in the source. */
        ptr += (32 - nLeftValidLine) * 32;
    }
    /* Bottom-right tile (j == twomb_width - 1). */
    for (l = 0; l < nLeftValidLine; l++) {
        m = i * 32 + l;
        n = j * 32;
        offset = m * coded_width + n;
#ifdef USE_NEON
        dst_asm = tarY + offset;
        src_asm = ptr;
        if (nWidthMatchFlag) {
            asm volatile (
                "vld1.8 {d0 - d3}, [%[src_asm]] \n\t"
                "vst1.8 {d0 - d3}, [%[dst_asm]] \n\t"
                : [dst_asm] "+r" (dst_asm), [src_asm] "+r" (src_asm)
                :
                : "cc", "memory", "d0", "d1", "d2", "d3"
            );
        } else {
            asm volatile (
                "vld1.8 {d0,d1}, [%[src_asm]] \n\t"
                "vst1.8 {d0,d1}, [%[dst_asm]] \n\t"
                : [dst_asm] "+r" (dst_asm), [src_asm] "+r" (src_asm)
                :
                : "cc", "memory", "d0", "d1"
            );
        }
#else
        /* BUGFIX: honor nWidthMatchFlag here as well (see above). */
        memcpy(tarY + offset, ptr, nWidthMatchFlag ? 32 : 16);
#endif
        ptr += 32;
    }
    ptr += (32 - nLeftValidLine) * 32;
}
/*
 * map32x32_to_yuv_UV() - untile a Cedar VE 32x32-tiled interleaved chroma
 * buffer, producing both split U/V planes and an interleaved UV plane.
 *
 * Source layout: 32x32-byte tiles as in map32x32_to_yuv_Y(), but bytes are
 * U,V pairs, so each 32-byte tile row carries 16 chroma pixels.
 *
 * srcUV:        tiled, UV-interleaved source buffer.
 * tarU, tarV:   destination split chroma planes, dst_stride bytes per row.
 * tarUV:        destination interleaved UV plane.
 * coded_width:  split chroma plane width in bytes; also used as dst_stride.
 * coded_height: chroma plane height in lines.
 *
 * NOTE(review): dst_stride == coded_width is applied to BOTH the split
 * planes and the interleaved plane, although an interleaved row is twice as
 * wide; callers presumably consume only one of the two output sets per call
 * and size coded_width accordingly — TODO confirm against the call sites.
 */
static void map32x32_to_yuv_UV( // int mode,
                               unsigned char* srcUV,
                               unsigned char* tarU,
                               unsigned char* tarV,
                               unsigned char* tarUV,
                               unsigned int coded_width,
                               unsigned int coded_height)
{
    unsigned int i, j, l, m, n, k;
    unsigned int mb_width, mb_height;
    unsigned int twomb_line, twomb_width;
    unsigned long offset;
    unsigned char *ptr;
#ifdef USE_NEON
    unsigned char *dst0_asm, *dst1_asm, *dst2_asm, *src_asm;
#endif
    unsigned vdecbuf_width;
    int nWidthMatchFlag;
    int nLeftValidLine; /* valid lines (<= 32) in the bottom tile row */
    // int dst_stride = mode == 0 ? (coded_width + 15) & (~15) : coded_width;
    int dst_stride = coded_width;

    ptr = srcUV;
    /* vdec's uvBuf is 32-byte aligned, so each split plane is 16-byte aligned. */
    mb_width = ((coded_width + 15) & (~15)) >> 4;
    mb_height = ((coded_height + 31) & (~31)) >> 4;
    twomb_line = (mb_height + 1) >> 1;
    /* Tiles are UV-interleaved: 32 source bytes == 16 split-plane bytes. */
    twomb_width = mb_width;
    if (twomb_line < 1 || twomb_width < 1) {
        printf("map32x32_to_yuv_C() fatal error! twomb_line=%u, twomb_width=%u\n",
               twomb_line, twomb_width);
    }

    /* Width of one split chroma plane as produced by the decoder. */
    vdecbuf_width = twomb_width * 16;
    if (vdecbuf_width > coded_width) {
        nWidthMatchFlag = 0;
        /* Only an 8-byte mismatch is expected. */
        if ((vdecbuf_width - coded_width) != 8) {
            printf("fatal error! vdec_UVbuf_width=%u, gpu_UVbuf_width=%u, the program will crash!\n",
                   vdecbuf_width, coded_width);
        }
    } else if (vdecbuf_width == coded_width) {
        nWidthMatchFlag = 1;
    } else {
        printf("fatal error! vdec_UVbuf_width=%u <= gpu_UVbuf_width=%u, the program will crash!\n",
               vdecbuf_width, coded_width);
        nWidthMatchFlag = 0;
    }

    /* Every tile row except the last one, which may be partial. */
    for (i = 0; i < twomb_line - 1; i++) {
        /* Every tile in this row except the rightmost. */
        for (j = 0; j < twomb_width - 1; j++) {
            /* Every line within the tile. */
            for (l = 0; l < 32; l++) {
                m = i * 32 + l;
                n = j * 16;
#ifdef USE_NEON
                /* Split U and V: vld2 de-interleaves even bytes into d0,d1
                 * (U) and odd bytes into d2,d3 (V). */
                offset = m * dst_stride + n;
                dst0_asm = tarU + offset;
                dst1_asm = tarV + offset;
                src_asm = ptr;
                asm volatile (
                    "vld2.8 {d0 - d3}, [%[src_asm]] \n\t"
                    "vst1.8 {d0,d1}, [%[dst0_asm]] \n\t"
                    "vst1.8 {d2,d3}, [%[dst1_asm]] \n\t"
                    : [dst0_asm] "+r" (dst0_asm), [dst1_asm] "+r" (dst1_asm), [src_asm] "+r" (src_asm)
                    :
                    : "cc", "memory", "d0", "d1", "d2", "d3"
                );
                /* Interleaved UV: straight copy. */
                offset = m * dst_stride + n * 2;
                dst2_asm = tarUV + offset;
                src_asm = ptr;
                asm volatile (
                    "vld1.8 {d0 - d3}, [%[src_asm]] \n\t"
                    "vst1.8 {d0 - d3}, [%[dst2_asm]] \n\t"
                    : [dst2_asm] "+r" (dst2_asm), [src_asm] "+r" (src_asm)
                    :
                    : "cc", "memory", "d0", "d1", "d2", "d3"
                );
#else
                /* Split U and V (BUGFIX: the fallback previously wrote
                 * through an undefined 'tarY' pointer with a bogus stride;
                 * mirror the NEON vld2 de-interleave instead). */
                offset = m * dst_stride + n;
                for (k = 0; k < 16; k++) {
                    tarU[offset + k] = ptr[2 * k];
                    tarV[offset + k] = ptr[2 * k + 1];
                }
                /* Interleaved UV: straight copy. */
                offset = m * dst_stride + n * 2;
                memcpy(tarUV + offset, ptr, 32);
#endif
                ptr += 32; /* one 32-byte tile row consumed */
            }
        }
        /* Rightmost tile (j == twomb_width - 1): when the widths do not
         * match, only 8 of the 16 chroma pixels per row are valid. */
        for (l = 0; l < 32; l++) {
            m = i * 32 + l;
            n = j * 16;
#ifdef USE_NEON
            /* Split U and V. */
            offset = m * dst_stride + n;
            dst0_asm = tarU + offset;
            dst1_asm = tarV + offset;
            src_asm = ptr;
            if (nWidthMatchFlag) {
                asm volatile (
                    "vld2.8 {d0 - d3}, [%[src_asm]] \n\t"
                    "vst1.8 {d0,d1}, [%[dst0_asm]] \n\t"
                    "vst1.8 {d2,d3}, [%[dst1_asm]] \n\t"
                    : [dst0_asm] "+r" (dst0_asm), [dst1_asm] "+r" (dst1_asm), [src_asm] "+r" (src_asm)
                    :
                    : "cc", "memory", "d0", "d1", "d2", "d3"
                );
            } else {
                asm volatile (
                    "vld2.8 {d0,d1}, [%[src_asm]] \n\t"
                    "vst1.8 {d0}, [%[dst0_asm]] \n\t"
                    "vst1.8 {d1}, [%[dst1_asm]] \n\t"
                    : [dst0_asm] "+r" (dst0_asm), [dst1_asm] "+r" (dst1_asm), [src_asm] "+r" (src_asm)
                    :
                    : "cc", "memory", "d0", "d1"
                );
            }
            /* Interleaved UV (BUGFIX: destination was computed from tarU). */
            offset = m * dst_stride + n * 2;
            dst2_asm = tarUV + offset;
            src_asm = ptr;
            if (nWidthMatchFlag) {
                asm volatile (
                    "vld1.8 {d0 - d3}, [%[src_asm]] \n\t"
                    "vst1.8 {d0 - d3}, [%[dst2_asm]] \n\t"
                    : [dst2_asm] "+r" (dst2_asm), [src_asm] "+r" (src_asm)
                    :
                    : "cc", "memory", "d0", "d1", "d2", "d3"
                );
            } else {
                asm volatile (
                    "vld1.8 {d0,d1}, [%[src_asm]] \n\t"
                    "vst1.8 {d0,d1}, [%[dst2_asm]] \n\t"
                    : [dst2_asm] "+r" (dst2_asm), [src_asm] "+r" (src_asm)
                    :
                    : "cc", "memory", "d0", "d1"
                );
            }
#else
            /* Split U and V; honor nWidthMatchFlag like the NEON path. */
            offset = m * dst_stride + n;
            for (k = 0; k < (nWidthMatchFlag ? 16u : 8u); k++) {
                tarU[offset + k] = ptr[2 * k];
                tarV[offset + k] = ptr[2 * k + 1];
            }
            /* Interleaved UV. */
            offset = m * dst_stride + n * 2;
            memcpy(tarUV + offset, ptr, nWidthMatchFlag ? 32 : 16);
#endif
            ptr += 32; /* source tile row is always 32 bytes, padding included */
        }
    }

    /* Last tile row (i == twomb_line - 1): only nLeftValidLine lines valid. */
    nLeftValidLine = coded_height - ((twomb_line - 1) * 32);
    /* Every tile in the last row except the rightmost. */
    for (j = 0; j < twomb_width - 1; j++) {
        for (l = 0; l < nLeftValidLine; l++) {
            m = i * 32 + l;
            n = j * 16;
#ifdef USE_NEON
            /* Split U and V. */
            offset = m * dst_stride + n;
            dst0_asm = tarU + offset;
            dst1_asm = tarV + offset;
            src_asm = ptr;
            asm volatile (
                "vld2.8 {d0 - d3}, [%[src_asm]] \n\t"
                "vst1.8 {d0,d1}, [%[dst0_asm]] \n\t"
                "vst1.8 {d2,d3}, [%[dst1_asm]] \n\t"
                : [dst0_asm] "+r" (dst0_asm), [dst1_asm] "+r" (dst1_asm), [src_asm] "+r" (src_asm)
                :
                : "cc", "memory", "d0", "d1", "d2", "d3"
            );
            /* Interleaved UV. */
            offset = m * dst_stride + n * 2;
            dst2_asm = tarUV + offset;
            src_asm = ptr;
            asm volatile (
                "vld1.8 {d0 - d3}, [%[src_asm]] \n\t"
                "vst1.8 {d0 - d3}, [%[dst2_asm]] \n\t"
                : [dst2_asm] "+r" (dst2_asm), [src_asm] "+r" (src_asm)
                :
                : "cc", "memory", "d0", "d1", "d2", "d3"
            );
#else
            /* Split U and V. */
            offset = m * dst_stride + n;
            for (k = 0; k < 16; k++) {
                tarU[offset + k] = ptr[2 * k];
                tarV[offset + k] = ptr[2 * k + 1];
            }
            /* Interleaved UV. */
            offset = m * dst_stride + n * 2;
            memcpy(tarUV + offset, ptr, 32);
#endif
            ptr += 32;
        }
        /* Skip this tile's padding lines in the source. */
        ptr += (32 - nLeftValidLine) * 32;
    }
    /* Bottom-right tile (j == twomb_width - 1). */
    for (l = 0; l < nLeftValidLine; l++) {
        m = i * 32 + l;
        n = j * 16;
#ifdef USE_NEON
        /* Split U and V. */
        offset = m * dst_stride + n;
        dst0_asm = tarU + offset;
        dst1_asm = tarV + offset;
        src_asm = ptr;
        if (nWidthMatchFlag) {
            asm volatile (
                "vld2.8 {d0 - d3}, [%[src_asm]] \n\t"
                "vst1.8 {d0,d1}, [%[dst0_asm]] \n\t"
                "vst1.8 {d2,d3}, [%[dst1_asm]] \n\t"
                : [dst0_asm] "+r" (dst0_asm), [dst1_asm] "+r" (dst1_asm), [src_asm] "+r" (src_asm)
                :
                : "cc", "memory", "d0", "d1", "d2", "d3"
            );
        } else {
            asm volatile (
                "vld2.8 {d0,d1}, [%[src_asm]] \n\t"
                "vst1.8 {d0}, [%[dst0_asm]] \n\t"
                "vst1.8 {d1}, [%[dst1_asm]] \n\t"
                : [dst0_asm] "+r" (dst0_asm), [dst1_asm] "+r" (dst1_asm), [src_asm] "+r" (src_asm)
                :
                : "cc", "memory", "d0", "d1"
            );
        }
        /* Interleaved UV (BUGFIX: asm templates referenced %[dst_asm] while
         * the operand was named dst2_asm — a compile error). */
        offset = m * dst_stride + n * 2;
        dst2_asm = tarUV + offset;
        src_asm = ptr;
        if (nWidthMatchFlag) {
            asm volatile (
                "vld1.8 {d0 - d3}, [%[src_asm]] \n\t"
                "vst1.8 {d0 - d3}, [%[dst2_asm]] \n\t"
                : [dst2_asm] "+r" (dst2_asm), [src_asm] "+r" (src_asm)
                :
                : "cc", "memory", "d0", "d1", "d2", "d3"
            );
        } else {
            asm volatile (
                "vld1.8 {d0,d1}, [%[src_asm]] \n\t"
                "vst1.8 {d0,d1}, [%[dst2_asm]] \n\t"
                : [dst2_asm] "+r" (dst2_asm), [src_asm] "+r" (src_asm)
                :
                : "cc", "memory", "d0", "d1"
            );
        }
#else
        /* Split U and V; honor nWidthMatchFlag like the NEON path. */
        offset = m * dst_stride + n;
        for (k = 0; k < (nWidthMatchFlag ? 16u : 8u); k++) {
            tarU[offset + k] = ptr[2 * k];
            tarV[offset + k] = ptr[2 * k + 1];
        }
        /* Interleaved UV. */
        offset = m * dst_stride + n * 2;
        memcpy(tarUV + offset, ptr, nWidthMatchFlag ? 32 : 16);
#endif
        ptr += 32;
    }
    ptr += (32 - nLeftValidLine) * 32;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement