Untitled

void InvertHorizontalRow16s8sTo16sBuffered(PIXEL *lowpass,              // Row of horizontal lowpass coefficients
                                           int lowpass_quantization,    // lowpass quantization factor
                                           PIXEL8S *highpass_data,      // Row of horizontal highpass coefficients
                                           int highpass_quantization,   // highpass quantization factor
                                           PIXEL *output,               // Row of reconstructed results
                                           int width,                   // Length of each row of horizontal coefficients
                                           PIXEL *buffer)               // Buffer to hold the dequantized values
{
#if _DECODE_LOWPASS_16S
    const int column_step = 8;
    const int last_column = width - 1;
    int post_column = last_column - (last_column % column_step);
    int column;

    PIXEL *highline = buffer;

    __m128i low1_epi16;     // Lowpass coefficients
    __m128i low2_epi16;
    __m128i high1_epi16;    // Current eight highpass coefficients
    __m128i high2_epi16;    // Next eight highpass coefficients

    __m128i half_epi16 = _mm_set1_epi16(4);

#if _UNALIGNED
    // The fast loop computes output points starting at the third column
    __m128i *outptr = (__m128i *)&output[2];
#else
    // The fast loop merges values from different phases to allow aligned stores
    __m128i *outptr = (__m128i *)&output[0];

    // Two 16-bit coefficients from the previous loop iteration
    //short remainder[2];
#endif

    PIXEL *colptr;

    int32_t even;
    int32_t odd;
    //int32_t lsb;

    PIXEL *highpass = (PIXEL *)highpass_data;

    // Adjust the end of the fast loop if necessary
    if (post_column == last_column)
        post_column -= column_step;

    // Undo quantization for the highpass row
#if _DEQUANTIZE_IN_FSM
    highline = highpass;
#else
    DequantizeBandRow16s(highpass, width, highpass_quantization, highline);
#endif
#if (0)
    int x;
    printf("highline \n");
    for (x = 0; x < width/2; x++){
        printf("%d ,", highline[x]);
    }
    printf("\n");

#endif
    // Start processing at the beginning of the row
    column = 0;

    // Process the first two output points with special filters for the left border
    even = 0;
    odd = 0;

    // Apply the even reconstruction filter to the lowpass band
    even += 11 * lowpass[column + 0];
    even -=  4 * lowpass[column + 1];
    even +=  1 * lowpass[column + 2];
    even += 4;
    even >>= 3;

    // Add the highpass correction
    even += highline[column];
    even = DivideByShift(even, 1);

#if _UNALIGNED
    printf("hello amigo\n");
    // Place the even result in the even column
    output[0] = SATURATE(even);
#else
    // The output value will be stored later
    //remainder[0] = SATURATE(even);
#endif

    // Apply the odd reconstruction filter to the lowpass band
    odd += 5 * lowpass[column + 0];
    odd += 4 * lowpass[column + 1];
    odd -= 1 * lowpass[column + 2];
    odd += 4;
    odd >>= 3;

    // Subtract the highpass correction
    odd -= highline[column];
    odd = DivideByShift(odd, 1);

#if _UNALIGNED
    // Place the odd result in the odd column
    output[1] = SATURATE(odd);
#else
    // The output value will be stored later
    //remainder[1] = SATURATE(odd);
#endif

/*
    colptr = &output[2];
    for (column = 2; column < 8; column++)
    {
        int32_t even = 0;       // Result of convolution with even filter
        int32_t odd = 0;        // Result of convolution with odd filter

        // Apply the even reconstruction filter to the lowpass band
        even += lowpass[column - 1];
        even -= lowpass[column + 1];
        even += ROUNDING(even,8);
        even = DivideByShift(even, 3);
        even += lowpass[column + 0];

        // Add the highpass correction
        even += highline[column];
        even = DivideByShift(even, 1);

        // Place the even result in the even column
        *(colptr++) = SATURATE(even);

        // Apply the odd reconstruction filter to the lowpass band
        odd -= lowpass[column - 1];
        odd += lowpass[column + 1];
        odd += ROUNDING(odd,8);
        odd = DivideByShift(odd, 3);
        odd += lowpass[column + 0];

        // Subtract the highpass correction
        odd -= highline[column];
        odd = DivideByShift(odd, 1);

        // Place the odd result in the odd column
        *(colptr++) = SATURATE(odd);
    }
*/

#if (0 && _FASTLOOP && XMMOPT)
    printf("XMMOPT");
    // Preload the first eight lowpass coefficients
    low1_epi16 = _mm_load_si128((__m128i *)&lowpass[column]);
//  low1_epi16 = _mm_adds_epi16(low1_epi16, overflowprotect_epi16);

    // Preload the first eight highpass coefficients
    high1_epi16 = _mm_load_si128((__m128i *)&highline[column]);

    // The reconstruction filters use pixels starting at the first column
    for (; column < post_column; column += column_step)
    {
        __m128i even_epi16;     // Result of convolution with even filter
        __m128i odd_epi16;      // Result of convolution with odd filter
        __m128i temp_epi16;
        __m128i out_epi16;      // Reconstructed data
        //__m128i high_epi16;
        uint32_t temp;      // Temporary register for last two values


        // Preload the next eight lowpass coefficients
        low2_epi16 = _mm_load_si128((__m128i *)&lowpass[column+8]);

//      low2_epi16 = _mm_adds_epi16(low2_epi16, overflowprotect_epi16);

        // Compute the first two even and two odd output points //

        // Apply the even reconstruction filter to the lowpass band
/*      even_epi16 = low1_epi16;
        temp_epi16 = _mm_slli_epi16(low1_epi16, 3);
        temp_epi16 = _mm_srli_si128(temp_epi16, 1*2);
        even_epi16 = _mm_adds_epi16(even_epi16, temp_epi16);
        temp_epi16 = _mm_srli_si128(low1_epi16, 2*2);
        even_epi16 = _mm_subs_epi16(even_epi16, temp_epi16);

#if 1
        // Apply the rounding adjustment
        even_epi16 = _mm_adds_epi16(even_epi16, _mm_set1_epi16(4));
#endif
        // Divide by eight
        even_epi16 = _mm_srai_epi16(even_epi16, 3);
*/
        // better math.
        even_epi16 = low1_epi16;
        temp_epi16 = _mm_srli_si128(even_epi16, 2*2);
        even_epi16 = _mm_subs_epi16(even_epi16, temp_epi16);
        even_epi16 = _mm_adds_epi16(even_epi16, half_epi16);
        even_epi16 = _mm_srai_epi16(even_epi16, 3);
        temp_epi16 = _mm_srli_si128(low1_epi16, 1*2);
        even_epi16 = _mm_adds_epi16(even_epi16, temp_epi16);

        // Shift the highpass correction by one column
        high1_epi16 = _mm_srli_si128(high1_epi16, 1*2);

        // Prescale for 8bit output - DAN 4/5/02
        //high_epi16 = _mm_slli_epi16(high1_epi16, prescale);

        // Add the highpass correction and divide by two
        even_epi16 = _mm_adds_epi16(even_epi16, high1_epi16);
        even_epi16 = _mm_srai_epi16(even_epi16, 1);

        // Apply the odd reconstruction filter to the lowpass band
/*      odd_epi16 = _mm_slli_epi16(low1_epi16, 3);
        odd_epi16 = _mm_srli_si128(odd_epi16, 1*2);
        temp_epi16 = _mm_srli_si128(low1_epi16, 2*2);
        odd_epi16 = _mm_adds_epi16(odd_epi16, temp_epi16);
        odd_epi16 = _mm_subs_epi16(odd_epi16, low1_epi16);

#if 1
        // Apply the rounding adjustment
        odd_epi16 = _mm_adds_epi16(odd_epi16, _mm_set1_epi16(4));
#endif
        // Divide by eight
        odd_epi16 = _mm_srai_epi16(odd_epi16, 3);
*/
        // Apply the odd reconstruction filter to the lowpass band
        // better math.
        odd_epi16 = _mm_srli_si128(low1_epi16, 2*2);
        temp_epi16 = low1_epi16;
        odd_epi16 = _mm_subs_epi16(odd_epi16, temp_epi16);
        odd_epi16 = _mm_adds_epi16(odd_epi16, half_epi16);
        odd_epi16 = _mm_srai_epi16(odd_epi16, 3);
        temp_epi16 = _mm_srli_si128(low1_epi16, 1*2);
        odd_epi16 = _mm_adds_epi16(odd_epi16, temp_epi16);

        // Subtract the highpass correction and divide by two
        odd_epi16 = _mm_subs_epi16(odd_epi16, high1_epi16);
        odd_epi16 = _mm_srai_epi16(odd_epi16, 1);

        // Interleave the first four even and odd results
        out_epi16 = _mm_unpacklo_epi16(even_epi16, odd_epi16);
        //out_epi16 = _mm_max_epi16(out_epi16, _mm_setzero_si64());

#if _UNALIGNED
        // Store the first eight output values
        _mm_storeu_si128(outptr++, out_epi16);
#else
        // Combine the new output values with the two values from the previous phase
        out_epi16 = _mm_shuffle_epi32(out_epi16, _MM_SHUFFLE(2, 1, 0, 3));
        temp = _mm_cvtsi128_si32(out_epi16);
        out_epi16 = _mm_insert_epi16(out_epi16, even, 0);
        out_epi16 = _mm_insert_epi16(out_epi16, odd, 1);

        // Store eight output values
        _mm_store_si128(outptr++, out_epi16);

        // Save the remaining two output values
        //*((int *)remainder) = temp;
        even = (short)temp;
        odd = (short)(temp >> 16);
#endif

        // Compute the second four even and four odd output points //

        // Preload the highpass correction
        high2_epi16 = _mm_load_si128((__m128i *)&highline[column+8]);

        // Shift in the new pixels for the next stage of the loop
        low1_epi16 = _mm_srli_si128(low1_epi16, 4*2);
        temp_epi16 = _mm_slli_si128(low2_epi16, 4*2);
        low1_epi16 = _mm_or_si128(low1_epi16, temp_epi16);

/*      // Apply the even reconstruction filter to the lowpass band
        even_epi16 = low1_epi16;
        temp_epi16 = _mm_slli_epi16(low1_epi16, 3);
        temp_epi16 = _mm_srli_si128(temp_epi16, 1*2);
        even_epi16 = _mm_adds_epi16(even_epi16, temp_epi16);
        temp_epi16 = _mm_srli_si128(low1_epi16, 2*2);
        even_epi16 = _mm_subs_epi16(even_epi16, temp_epi16);

#if 1
        // Apply the rounding adjustment
        even_epi16 = _mm_adds_epi16(even_epi16, _mm_set1_epi16(4));
#endif
        // Divide by eight
        even_epi16 = _mm_srai_epi16(even_epi16, 3);
*/
        // better math.
        even_epi16 = low1_epi16;
        temp_epi16 = _mm_srli_si128(even_epi16, 2*2);
        even_epi16 = _mm_subs_epi16(even_epi16, temp_epi16);
        even_epi16 = _mm_adds_epi16(even_epi16, half_epi16);
        even_epi16 = _mm_srai_epi16(even_epi16, 3);
        temp_epi16 = _mm_srli_si128(low1_epi16, 1*2);
        even_epi16 = _mm_adds_epi16(even_epi16, temp_epi16);


        // Shift in the next four highpass coefficients
        high1_epi16 = _mm_srli_si128(high1_epi16, 4*2);
        temp_epi16 = _mm_slli_si128(high2_epi16, 3*2);
        high1_epi16 = _mm_or_si128(high1_epi16, temp_epi16);

        // Prescale for 8bit output - DAN 4/5/02
        //high_epi16 = _mm_slli_epi16(high1_epi16, prescale);

        // Add the highpass correction and divide by two
        even_epi16 = _mm_adds_epi16(even_epi16, high1_epi16);
        even_epi16 = _mm_srai_epi16(even_epi16, 1);

        // Apply the odd reconstruction filter to the lowpass band
/*      odd_epi16 = _mm_slli_epi16(low1_epi16, 3);
        odd_epi16 = _mm_srli_si128(odd_epi16, 1*2);
        temp_epi16 = _mm_srli_si128(low1_epi16, 2*2);
        odd_epi16 = _mm_adds_epi16(odd_epi16, temp_epi16);
        odd_epi16 = _mm_subs_epi16(odd_epi16, low1_epi16);

#if 1
        // Apply the rounding adjustment
        odd_epi16 = _mm_adds_epi16(odd_epi16, _mm_set1_epi16(4));
#endif
        // Divide by eight
        odd_epi16 = _mm_srai_epi16(odd_epi16, 3);
*/

        // Apply the odd reconstruction filter to the lowpass band
        // better math.
        odd_epi16 = _mm_srli_si128(low1_epi16, 2*2);
        temp_epi16 = low1_epi16;
        odd_epi16 = _mm_subs_epi16(odd_epi16, temp_epi16);
        odd_epi16 = _mm_adds_epi16(odd_epi16, half_epi16);
        odd_epi16 = _mm_srai_epi16(odd_epi16, 3);
        temp_epi16 = _mm_srli_si128(low1_epi16, 1*2);

        odd_epi16 = _mm_adds_epi16(odd_epi16, temp_epi16);
        // Subtract the highpass correction and divide by two
        odd_epi16 = _mm_subs_epi16(odd_epi16, high1_epi16);
        odd_epi16 = _mm_srai_epi16(odd_epi16, 1);

        // Interleave the second four even and odd results
        out_epi16 = _mm_unpacklo_epi16(even_epi16, odd_epi16);
        //out_epi16 = _mm_max_epi16(out_epi16, _mm_setzero_si64());

#if _UNALIGNED
        // Store the first eight output values
        _mm_storeu_si128(outptr++, out_epi16);
#else
        // Combine the new output values with the two values from the previous phase
        out_epi16 = _mm_shuffle_epi32(out_epi16, _MM_SHUFFLE(2, 1, 0, 3));
        temp = _mm_cvtsi128_si32(out_epi16);
        out_epi16 = _mm_insert_epi16(out_epi16, even, 0);
        out_epi16 = _mm_insert_epi16(out_epi16, odd, 1);

        // Store eight output values
        _mm_store_si128(outptr++, out_epi16);

        // Save the remaining two output values
        even = (short)temp;
        odd = (short)(temp >> 16);
#endif

        // Prepare for the next loop iteration //

        // The second eight lowpass coefficients will be the current values
        low1_epi16 = low2_epi16;

        // The second eight highpass coefficients will be the current values
        high1_epi16 = high2_epi16;
    }

    // Should have exited the loop with the column equal to the post processing column
    assert(column == post_column);

#endif

    // The fast processing loop is one column behind the actual column
    column++;

    // Get the pointer to the next output value
    colptr = (PIXEL *)outptr;

#if _UNALIGNED
    // The last two output points have already been stored
#else
    // Store the last two output points produced by the loop
    *(colptr++) = SATURATE(even);
    *(colptr++) = SATURATE(odd);
#endif
    // Process the rest of the columns up to the last column in the row
    for (; column < last_column; column++)
    {
        int32_t even = 0;       // Result of convolution with even filter
        int32_t odd = 0;        // Result of convolution with odd filter

        // Apply the even reconstruction filter to the lowpass band
        even = lowpass[column - 1];
        even -= lowpass[column + 1];
        even += 4; //DAN20050921
        even >>= 3;
        even += lowpass[column + 0];

        // Add the highpass correction
        even += highpass[column];
        even = DivideByShift(even, 1);

        // Place the even result in the even column
        //even >>= _INVERSE_TEMPORAL_PRESCALE;
        *(colptr++) = SATURATE(even);

        // Apply the odd reconstruction filter to the lowpass band
        odd = -lowpass[column - 1];
        odd += lowpass[column + 1];
        odd += 4; //DAN20050921
        odd >>= 3;
        odd += lowpass[column + 0];

        // Subtract the highpass correction
        odd -= highpass[column];
        odd = DivideByShift(odd, 1);


        // Place the odd result in the odd column
        //odd >>= _INVERSE_TEMPORAL_PRESCALE;
        *(colptr++) = SATURATE(odd);
    }

    // Should have exited the loop at the column for right border processing
    assert(column == last_column);

    // Process the last two output points with special filters for the right border
    even = 0;
    odd = 0;

    // Apply the even reconstruction filter to the lowpass band
    even += 5 * lowpass[column + 0];
    even += 4 * lowpass[column - 1];
    even -= 1 * lowpass[column - 2];
    even += 4;
    even >>= 3;

    // Add the highpass correction
    even += highline[column];
    even = DivideByShift(even, 1);

    // Place the even result in the even column
    *(colptr++) = SATURATE(even);

    // Apply the odd reconstruction filter to the lowpass band
    odd += 11 * lowpass[column + 0];
    odd -=  4 * lowpass[column - 1];
    odd +=  1 * lowpass[column - 2];
    odd += 4;
    odd >>= 3;

    // Subtract the highpass correction
    odd -= highline[column];
    odd = DivideByShift(odd, 1);

    // Place the odd result in the odd column
    *(colptr++) = SATURATE(odd);

#else
#error Have not implemented 8-bit lowpass coefficients
#endif
}

#endif