Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- void InvertHorizontalRow16s8sTo16sBuffered(PIXEL *lowpass, // Row of horizontal lowpass coefficients
- int lowpass_quantization, // lowpass quantization factor
- PIXEL8S *highpass_data, // Row of horizontal highpass coefficients
- int highpass_quantization, // highpass quantization factor
- PIXEL *output, // Row of reconstructed results
- int width, // Length of each row of horizontal coefficients
- PIXEL *buffer) // Buffer to hold the dequantized values
- {
- #if _DECODE_LOWPASS_16S
- const int column_step = 8;
- const int last_column = width - 1;
- int post_column = last_column - (last_column % column_step);
- int column;
- PIXEL *highline = buffer;
- __m128i low1_epi16; // Lowpass coefficients
- __m128i low2_epi16;
- __m128i high1_epi16; // Current eight highpass coefficients
- __m128i high2_epi16; // Next eight highpass coefficients
- __m128i half_epi16 = _mm_set1_epi16(4);
- #if _UNALIGNED
- // The fast loop computes output points starting at the third column
- __m128i *outptr = (__m128i *)&output[2];
- #else
- // The fast loop merges values from different phases to allow aligned stores
- __m128i *outptr = (__m128i *)&output[0];
- // Two 16-bit coefficients from the previous loop iteration
- //short remainder[2];
- #endif
- PIXEL *colptr;
- int32_t even;
- int32_t odd;
- //int32_t lsb;
- PIXEL *highpass = (PIXEL *)highpass_data;
- // Adjust the end of the fast loop if necessary
- if (post_column == last_column)
- post_column -= column_step;
- // Undo quantization for the highpass row
- #if _DEQUANTIZE_IN_FSM
- highline = highpass;
- #else
- DequantizeBandRow16s(highpass, width, highpass_quantization, highline);
- #endif
- #if (0)
- int x;
- printf("highline \n");
- for (x = 0; x < width/2; x++){
- printf("%d ,", highline[x]);
- }
- printf("\n");
- #endif
- // Start processing at the beginning of the row
- column = 0;
- // Process the first two output points with special filters for the left border
- even = 0;
- odd = 0;
- // Apply the even reconstruction filter to the lowpass band
- even += 11 * lowpass[column + 0];
- even -= 4 * lowpass[column + 1];
- even += 1 * lowpass[column + 2];
- even += 4;
- even >>= 3;
- // Add the highpass correction
- even += highline[column];
- even = DivideByShift(even, 1);
- #if _UNALIGNED
- printf("hello amigo\n");
- // Place the even result in the even column
- output[0] = SATURATE(even);
- #else
- // The output value will be stored later
- //remainder[0] = SATURATE(even);
- #endif
- // Apply the odd reconstruction filter to the lowpass band
- odd += 5 * lowpass[column + 0];
- odd += 4 * lowpass[column + 1];
- odd -= 1 * lowpass[column + 2];
- odd += 4;
- odd >>= 3;
- // Subtract the highpass correction
- odd -= highline[column];
- odd = DivideByShift(odd, 1);
- #if _UNALIGNED
- // Place the odd result in the odd column
- output[1] = SATURATE(odd);
- #else
- // The output value will be stored later
- //remainder[1] = SATURATE(odd);
- #endif
- /*
- colptr = &output[2];
- for (column = 2; column < 8; column++)
- {
- int32_t even = 0; // Result of convolution with even filter
- int32_t odd = 0; // Result of convolution with odd filter
- // Apply the even reconstruction filter to the lowpass band
- even += lowpass[column - 1];
- even -= lowpass[column + 1];
- even += ROUNDING(even,8);
- even = DivideByShift(even, 3);
- even += lowpass[column + 0];
- // Add the highpass correction
- even += highline[column];
- even = DivideByShift(even, 1);
- // Place the even result in the even column
- *(colptr++) = SATURATE(even);
- // Apply the odd reconstruction filter to the lowpass band
- odd -= lowpass[column - 1];
- odd += lowpass[column + 1];
- odd += ROUNDING(odd,8);
- odd = DivideByShift(odd, 3);
- odd += lowpass[column + 0];
- // Subtract the highpass correction
- odd -= highline[column];
- odd = DivideByShift(odd, 1);
- // Place the odd result in the odd column
- *(colptr++) = SATURATE(odd);
- }
- */
- #if (0 && _FASTLOOP && XMMOPT)
- printf("XMMOPT");
- // Preload the first eight lowpass coefficients
- low1_epi16 = _mm_load_si128((__m128i *)&lowpass[column]);
- // low1_epi16 = _mm_adds_epi16(low1_epi16, overflowprotect_epi16);
- // Preload the first eight highpass coefficients
- high1_epi16 = _mm_load_si128((__m128i *)&highline[column]);
- // The reconstruction filters use pixels starting at the first column
- for (; column < post_column; column += column_step)
- {
- __m128i even_epi16; // Result of convolution with even filter
- __m128i odd_epi16; // Result of convolution with odd filter
- __m128i temp_epi16;
- __m128i out_epi16; // Reconstructed data
- //__m128i high_epi16;
- uint32_t temp; // Temporary register for last two values
- // Preload the next eight lowpass coefficients
- low2_epi16 = _mm_load_si128((__m128i *)&lowpass[column+8]);
- // low2_epi16 = _mm_adds_epi16(low2_epi16, overflowprotect_epi16);
- // Compute the first two even and two odd output points //
- // Apply the even reconstruction filter to the lowpass band
- /* even_epi16 = low1_epi16;
- temp_epi16 = _mm_slli_epi16(low1_epi16, 3);
- temp_epi16 = _mm_srli_si128(temp_epi16, 1*2);
- even_epi16 = _mm_adds_epi16(even_epi16, temp_epi16);
- temp_epi16 = _mm_srli_si128(low1_epi16, 2*2);
- even_epi16 = _mm_subs_epi16(even_epi16, temp_epi16);
- #if 1
- // Apply the rounding adjustment
- even_epi16 = _mm_adds_epi16(even_epi16, _mm_set1_epi16(4));
- #endif
- // Divide by eight
- even_epi16 = _mm_srai_epi16(even_epi16, 3);
- */
- // better math.
- even_epi16 = low1_epi16;
- temp_epi16 = _mm_srli_si128(even_epi16, 2*2);
- even_epi16 = _mm_subs_epi16(even_epi16, temp_epi16);
- even_epi16 = _mm_adds_epi16(even_epi16, half_epi16);
- even_epi16 = _mm_srai_epi16(even_epi16, 3);
- temp_epi16 = _mm_srli_si128(low1_epi16, 1*2);
- even_epi16 = _mm_adds_epi16(even_epi16, temp_epi16);
- // Shift the highpass correction by one column
- high1_epi16 = _mm_srli_si128(high1_epi16, 1*2);
- // Prescale for 8bit output - DAN 4/5/02
- //high_epi16 = _mm_slli_epi16(high1_epi16, prescale);
- // Add the highpass correction and divide by two
- even_epi16 = _mm_adds_epi16(even_epi16, high1_epi16);
- even_epi16 = _mm_srai_epi16(even_epi16, 1);
- // Apply the odd reconstruction filter to the lowpass band
- /* odd_epi16 = _mm_slli_epi16(low1_epi16, 3);
- odd_epi16 = _mm_srli_si128(odd_epi16, 1*2);
- temp_epi16 = _mm_srli_si128(low1_epi16, 2*2);
- odd_epi16 = _mm_adds_epi16(odd_epi16, temp_epi16);
- odd_epi16 = _mm_subs_epi16(odd_epi16, low1_epi16);
- #if 1
- // Apply the rounding adjustment
- odd_epi16 = _mm_adds_epi16(odd_epi16, _mm_set1_epi16(4));
- #endif
- // Divide by eight
- odd_epi16 = _mm_srai_epi16(odd_epi16, 3);
- */
- // Apply the odd reconstruction filter to the lowpass band
- // better math.
- odd_epi16 = _mm_srli_si128(low1_epi16, 2*2);
- temp_epi16 = low1_epi16;
- odd_epi16 = _mm_subs_epi16(odd_epi16, temp_epi16);
- odd_epi16 = _mm_adds_epi16(odd_epi16, half_epi16);
- odd_epi16 = _mm_srai_epi16(odd_epi16, 3);
- temp_epi16 = _mm_srli_si128(low1_epi16, 1*2);
- odd_epi16 = _mm_adds_epi16(odd_epi16, temp_epi16);
- // Subtract the highpass correction and divide by two
- odd_epi16 = _mm_subs_epi16(odd_epi16, high1_epi16);
- odd_epi16 = _mm_srai_epi16(odd_epi16, 1);
- // Interleave the first four even and odd results
- out_epi16 = _mm_unpacklo_epi16(even_epi16, odd_epi16);
- //out_epi16 = _mm_max_epi16(out_epi16, _mm_setzero_si64());
- #if _UNALIGNED
- // Store the first eight output values
- _mm_storeu_si128(outptr++, out_epi16);
- #else
- // Combine the new output values with the two values from the previous phase
- out_epi16 = _mm_shuffle_epi32(out_epi16, _MM_SHUFFLE(2, 1, 0, 3));
- temp = _mm_cvtsi128_si32(out_epi16);
- out_epi16 = _mm_insert_epi16(out_epi16, even, 0);
- out_epi16 = _mm_insert_epi16(out_epi16, odd, 1);
- // Store eight output values
- _mm_store_si128(outptr++, out_epi16);
- // Save the remaining two output values
- //*((int *)remainder) = temp;
- even = (short)temp;
- odd = (short)(temp >> 16);
- #endif
- // Compute the second four even and four odd output points //
- // Preload the highpass correction
- high2_epi16 = _mm_load_si128((__m128i *)&highline[column+8]);
- // Shift in the new pixels for the next stage of the loop
- low1_epi16 = _mm_srli_si128(low1_epi16, 4*2);
- temp_epi16 = _mm_slli_si128(low2_epi16, 4*2);
- low1_epi16 = _mm_or_si128(low1_epi16, temp_epi16);
- /* // Apply the even reconstruction filter to the lowpass band
- even_epi16 = low1_epi16;
- temp_epi16 = _mm_slli_epi16(low1_epi16, 3);
- temp_epi16 = _mm_srli_si128(temp_epi16, 1*2);
- even_epi16 = _mm_adds_epi16(even_epi16, temp_epi16);
- temp_epi16 = _mm_srli_si128(low1_epi16, 2*2);
- even_epi16 = _mm_subs_epi16(even_epi16, temp_epi16);
- #if 1
- // Apply the rounding adjustment
- even_epi16 = _mm_adds_epi16(even_epi16, _mm_set1_epi16(4));
- #endif
- // Divide by eight
- even_epi16 = _mm_srai_epi16(even_epi16, 3);
- */
- // better math.
- even_epi16 = low1_epi16;
- temp_epi16 = _mm_srli_si128(even_epi16, 2*2);
- even_epi16 = _mm_subs_epi16(even_epi16, temp_epi16);
- even_epi16 = _mm_adds_epi16(even_epi16, half_epi16);
- even_epi16 = _mm_srai_epi16(even_epi16, 3);
- temp_epi16 = _mm_srli_si128(low1_epi16, 1*2);
- even_epi16 = _mm_adds_epi16(even_epi16, temp_epi16);
- // Shift in the next four highpass coefficients
- high1_epi16 = _mm_srli_si128(high1_epi16, 4*2);
- temp_epi16 = _mm_slli_si128(high2_epi16, 3*2);
- high1_epi16 = _mm_or_si128(high1_epi16, temp_epi16);
- // Prescale for 8bit output - DAN 4/5/02
- //high_epi16 = _mm_slli_epi16(high1_epi16, prescale);
- // Add the highpass correction and divide by two
- even_epi16 = _mm_adds_epi16(even_epi16, high1_epi16);
- even_epi16 = _mm_srai_epi16(even_epi16, 1);
- // Apply the odd reconstruction filter to the lowpass band
- /* odd_epi16 = _mm_slli_epi16(low1_epi16, 3);
- odd_epi16 = _mm_srli_si128(odd_epi16, 1*2);
- temp_epi16 = _mm_srli_si128(low1_epi16, 2*2);
- odd_epi16 = _mm_adds_epi16(odd_epi16, temp_epi16);
- odd_epi16 = _mm_subs_epi16(odd_epi16, low1_epi16);
- #if 1
- // Apply the rounding adjustment
- odd_epi16 = _mm_adds_epi16(odd_epi16, _mm_set1_epi16(4));
- #endif
- // Divide by eight
- odd_epi16 = _mm_srai_epi16(odd_epi16, 3);
- */
- // Apply the odd reconstruction filter to the lowpass band
- // better math.
- odd_epi16 = _mm_srli_si128(low1_epi16, 2*2);
- temp_epi16 = low1_epi16;
- odd_epi16 = _mm_subs_epi16(odd_epi16, temp_epi16);
- odd_epi16 = _mm_adds_epi16(odd_epi16, half_epi16);
- odd_epi16 = _mm_srai_epi16(odd_epi16, 3);
- temp_epi16 = _mm_srli_si128(low1_epi16, 1*2);
- odd_epi16 = _mm_adds_epi16(odd_epi16, temp_epi16);
- // Subtract the highpass correction and divide by two
- odd_epi16 = _mm_subs_epi16(odd_epi16, high1_epi16);
- odd_epi16 = _mm_srai_epi16(odd_epi16, 1);
- // Interleave the second four even and odd results
- out_epi16 = _mm_unpacklo_epi16(even_epi16, odd_epi16);
- //out_epi16 = _mm_max_epi16(out_epi16, _mm_setzero_si64());
- #if _UNALIGNED
- // Store the first eight output values
- _mm_storeu_si128(outptr++, out_epi16);
- #else
- // Combine the new output values with the two values from the previous phase
- out_epi16 = _mm_shuffle_epi32(out_epi16, _MM_SHUFFLE(2, 1, 0, 3));
- temp = _mm_cvtsi128_si32(out_epi16);
- out_epi16 = _mm_insert_epi16(out_epi16, even, 0);
- out_epi16 = _mm_insert_epi16(out_epi16, odd, 1);
- // Store eight output values
- _mm_store_si128(outptr++, out_epi16);
- // Save the remaining two output values
- even = (short)temp;
- odd = (short)(temp >> 16);
- #endif
- // Prepare for the next loop iteration //
- // The second eight lowpass coefficients will be the current values
- low1_epi16 = low2_epi16;
- // The second eight highpass coefficients will be the current values
- high1_epi16 = high2_epi16;
- }
- // Should have exited the loop with the column equal to the post processing column
- assert(column == post_column);
- #endif
- // The fast processing loop is one column behind the actual column
- column++;
- // Get the pointer to the next output value
- colptr = (PIXEL *)outptr;
- #if _UNALIGNED
- // The last two output points have already been stored
- #else
- // Store the last two output points produced by the loop
- *(colptr++) = SATURATE(even);
- *(colptr++) = SATURATE(odd);
- #endif
- // Process the rest of the columns up to the last column in the row
- for (; column < last_column; column++)
- {
- int32_t even = 0; // Result of convolution with even filter
- int32_t odd = 0; // Result of convolution with odd filter
- // Apply the even reconstruction filter to the lowpass band
- even = lowpass[column - 1];
- even -= lowpass[column + 1];
- even += 4; //DAN20050921
- even >>= 3;
- even += lowpass[column + 0];
- // Add the highpass correction
- even += highpass[column];
- even = DivideByShift(even, 1);
- // Place the even result in the even column
- //even >>= _INVERSE_TEMPORAL_PRESCALE;
- *(colptr++) = SATURATE(even);
- // Apply the odd reconstruction filter to the lowpass band
- odd = -lowpass[column - 1];
- odd += lowpass[column + 1];
- odd += 4; //DAN20050921
- odd >>= 3;
- odd += lowpass[column + 0];
- // Subtract the highpass correction
- odd -= highpass[column];
- odd = DivideByShift(odd, 1);
- // Place the odd result in the odd column
- //odd >>= _INVERSE_TEMPORAL_PRESCALE;
- *(colptr++) = SATURATE(odd);
- }
- // Should have exited the loop at the column for right border processing
- assert(column == last_column);
- // Process the last two output points with special filters for the right border
- even = 0;
- odd = 0;
- // Apply the even reconstruction filter to the lowpass band
- even += 5 * lowpass[column + 0];
- even += 4 * lowpass[column - 1];
- even -= 1 * lowpass[column - 2];
- even += 4;
- even >>= 3;
- // Add the highpass correction
- even += highline[column];
- even = DivideByShift(even, 1);
- // Place the even result in the even column
- *(colptr++) = SATURATE(even);
- // Apply the odd reconstruction filter to the lowpass band
- odd += 11 * lowpass[column + 0];
- odd -= 4 * lowpass[column - 1];
- odd += 1 * lowpass[column - 2];
- odd += 4;
- odd >>= 3;
- // Subtract the highpass correction
- odd -= highline[column];
- odd = DivideByShift(odd, 1);
- // Place the odd result in the odd column
- *(colptr++) = SATURATE(odd);
- #else
- #error Have not implemented 8-bit lowpass coefficients
- #endif
- }
- #endif
Add Comment
Please, Sign In to add comment