Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <windows.h>
#include <avisynth.h>
#include <cstdint>
#include <immintrin.h>
// Pointer to the active invert implementation (C / SSE / AVX2), selected in InvertNeg's constructor.
// Signature: (srcp, dstp, src_pitch, dst_pitch, height, row_size, bits, threads) — pitches and
// row_size are in pixel components, not bytes (GetFrame divides by ComponentSize()).
// NOTE(review): this is a single process-wide global shared by all filter instances; two InvertNeg
// instances created with different `cpu` settings will overwrite each other's selection. Consider
// storing the pointer per instance instead.
void (*CoreFilterPtr)(const unsigned char*, unsigned char*, int, int, int, int, int, int);
// Plain C reference implementation of the per-plane inversion.
// 32-bit float planes are inverted as 1.0 - x; integer planes are XORed with the
// all-ones mask for the given bit depth (equivalent to max_pixel - x for in-range pixels).
// Pitches and row_size are expressed in pixel components. Rows are distributed
// across `threads` OpenMP threads.
void Invert(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads)
{
    if (bits == 32)
    {
        const float* srcp = reinterpret_cast<const float*>(_srcp);
        float* dstp = reinterpret_cast<float*>(_dstp);
#pragma omp parallel for num_threads(threads)
        for (int row = 0; row < height; row++)
        {
            const float* src_row = srcp + row * src_pitch;
            float* dst_row = dstp + row * dst_pitch;
            for (int col = 0; col < row_size; col++)
                dst_row[col] = 1.0f - src_row[col];
        }
    }
    else if (bits == 16 || bits == 14 || bits == 12 || bits == 10)
    {
        // All-ones mask for the active bit depth, e.g. 0x03FF for 10-bit.
        const uint16_t max_pixel = (uint16_t)((1 << bits) - 1);
        const uint16_t* srcp = reinterpret_cast<const uint16_t*>(_srcp);
        uint16_t* dstp = reinterpret_cast<uint16_t*>(_dstp);
#pragma omp parallel for num_threads(threads)
        for (int row = 0; row < height; row++)
        {
            const uint16_t* src_row = srcp + row * src_pitch;
            uint16_t* dst_row = dstp + row * dst_pitch;
            for (int col = 0; col < row_size; col++)
                dst_row[col] = (uint16_t)(src_row[col] ^ max_pixel);
        }
    }
    else
    {
        // 8-bit path: invert by XOR with 0xFF.
        const uint8_t* srcp = reinterpret_cast<const uint8_t*>(_srcp);
        uint8_t* dstp = reinterpret_cast<uint8_t*>(_dstp);
#pragma omp parallel for num_threads(threads)
        for (int row = 0; row < height; row++)
        {
            const uint8_t* src_row = srcp + row * src_pitch;
            uint8_t* dst_row = dstp + row * dst_pitch;
            for (int col = 0; col < row_size; col++)
                dst_row[col] = (uint8_t)(src_row[col] ^ 255);
        }
    }
}
// AVX2 implementation of the per-plane inversion. Behavior matches Invert().
//
// FIX: the previous hand-unrolled version loaded/stored a 15th vector at the
// full stride offset of each chunk (e.g. at +112 floats while advancing by 112),
// so every iteration touched one vector past its chunk; on the last chunk that
// read AND wrote past row_size_mod — an out-of-bounds access when the row ends
// near the buffer edge. Each loop now processes exactly one vector per step and
// a scalar tail handles the remainder. It also fixed a bogus (uint16_t) cast on
// the 8-bit scalar tail.
void Invert_AVX2(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads)
{
    if (bits == 32)
    {
        float* dstp = reinterpret_cast<float*>(_dstp);
        const float* srcp = reinterpret_cast<const float*>(_srcp);
        const __m256 vector_max = _mm256_set1_ps(1.0f);       // hoisted: loop-invariant
        const int row_size_mod = row_size - (row_size % 8);   // 8 floats per __m256
#pragma omp parallel for num_threads(threads)
        for (int y = 0; y < height; y++)
        {
            float* local_dstp = dstp + y * dst_pitch;
            const float* local_srcp = srcp + y * src_pitch;
            for (int column = 0; column < row_size_mod; column += 8)
            {
                const __m256 v = _mm256_loadu_ps(local_srcp + column);
                _mm256_storeu_ps(local_dstp + column, _mm256_sub_ps(vector_max, v));
            }
            for (int column = row_size_mod; column < row_size; column++)
                local_dstp[column] = 1.0f - local_srcp[column];
        }
    }
    else if (bits == 16 || bits == 14 || bits == 12 || bits == 10)
    {
        // All-ones mask for the active bit depth; max_pixel - x == max_pixel ^ x for in-range pixels.
        const uint16_t max_pixel = (uint16_t)((1 << bits) - 1);
        uint16_t* dstp = reinterpret_cast<uint16_t*>(_dstp);
        const uint16_t* srcp = reinterpret_cast<const uint16_t*>(_srcp);
        const __m256i vector_max = _mm256_set1_epi16((short)max_pixel);
        const int row_size_mod = row_size - (row_size % 16);  // 16 words per __m256i
#pragma omp parallel for num_threads(threads)
        for (int y = 0; y < height; y++)
        {
            uint16_t* local_dstp = dstp + y * dst_pitch;
            const uint16_t* local_srcp = srcp + y * src_pitch;
            for (int column = 0; column < row_size_mod; column += 16)
            {
                const __m256i v = _mm256_loadu_si256((const __m256i*)(local_srcp + column));
                _mm256_storeu_si256((__m256i*)(local_dstp + column), _mm256_sub_epi16(vector_max, v));
            }
            for (int column = row_size_mod; column < row_size; column++)
                local_dstp[column] = (uint16_t)(local_srcp[column] ^ max_pixel);
        }
    }
    else
    {
        uint8_t* dstp = reinterpret_cast<uint8_t*>(_dstp);
        const uint8_t* srcp = reinterpret_cast<const uint8_t*>(_srcp);
        const __m256i vector_max = _mm256_set1_epi8((char)0xFF);
        const int row_size_mod = row_size - (row_size % 32);  // 32 bytes per __m256i
#pragma omp parallel for num_threads(threads)
        for (int y = 0; y < height; y++)
        {
            uint8_t* local_dstp = dstp + y * dst_pitch;
            const uint8_t* local_srcp = srcp + y * src_pitch;
            for (int column = 0; column < row_size_mod; column += 32)
            {
                const __m256i v = _mm256_loadu_si256((const __m256i*)(local_srcp + column));
                _mm256_storeu_si256((__m256i*)(local_dstp + column), _mm256_sub_epi8(vector_max, v));
            }
            for (int column = row_size_mod; column < row_size; column++)
                local_dstp[column] = (uint8_t)(local_srcp[column] ^ 255);
        }
    }
}
// SSE implementation of the per-plane inversion (only SSE2-level intrinsics are
// actually used). Behavior matches Invert().
//
// FIX: the previous hand-unrolled version loaded/stored a 15th vector at the
// full stride offset of each chunk (e.g. at +56 floats while advancing by 56),
// so every iteration touched one vector past its chunk; on the last chunk that
// read AND wrote past row_size_mod — an out-of-bounds access when the row ends
// near the buffer edge. Each loop now processes exactly one vector per step and
// a scalar tail handles the remainder. It also fixed a bogus (uint16_t) cast on
// the 8-bit scalar tail.
void Invert_SSE4(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads)
{
    if (bits == 32)
    {
        float* dstp = reinterpret_cast<float*>(_dstp);
        const float* srcp = reinterpret_cast<const float*>(_srcp);
        const __m128 vector_max = _mm_set1_ps(1.0f);          // hoisted: loop-invariant
        const int row_size_mod = row_size - (row_size % 4);   // 4 floats per __m128
#pragma omp parallel for num_threads(threads)
        for (int y = 0; y < height; y++)
        {
            float* local_dstp = dstp + y * dst_pitch;
            const float* local_srcp = srcp + y * src_pitch;
            for (int column = 0; column < row_size_mod; column += 4)
            {
                const __m128 v = _mm_loadu_ps(local_srcp + column);
                _mm_storeu_ps(local_dstp + column, _mm_sub_ps(vector_max, v));
            }
            for (int column = row_size_mod; column < row_size; column++)
                local_dstp[column] = 1.0f - local_srcp[column];
        }
    }
    else if (bits == 16 || bits == 14 || bits == 12 || bits == 10)
    {
        // All-ones mask for the active bit depth; max_pixel - x == max_pixel ^ x for in-range pixels.
        const uint16_t max_pixel = (uint16_t)((1 << bits) - 1);
        uint16_t* dstp = reinterpret_cast<uint16_t*>(_dstp);
        const uint16_t* srcp = reinterpret_cast<const uint16_t*>(_srcp);
        const __m128i vector_max = _mm_set1_epi16((short)max_pixel);
        const int row_size_mod = row_size - (row_size % 8);   // 8 words per __m128i
#pragma omp parallel for num_threads(threads)
        for (int y = 0; y < height; y++)
        {
            uint16_t* local_dstp = dstp + y * dst_pitch;
            const uint16_t* local_srcp = srcp + y * src_pitch;
            for (int column = 0; column < row_size_mod; column += 8)
            {
                const __m128i v = _mm_loadu_si128((const __m128i*)(local_srcp + column));
                _mm_storeu_si128((__m128i*)(local_dstp + column), _mm_sub_epi16(vector_max, v));
            }
            for (int column = row_size_mod; column < row_size; column++)
                local_dstp[column] = (uint16_t)(local_srcp[column] ^ max_pixel);
        }
    }
    else
    {
        uint8_t* dstp = reinterpret_cast<uint8_t*>(_dstp);
        const uint8_t* srcp = reinterpret_cast<const uint8_t*>(_srcp);
        const __m128i vector_max = _mm_set1_epi8((char)0xFF);
        const int row_size_mod = row_size - (row_size % 16);  // 16 bytes per __m128i
#pragma omp parallel for num_threads(threads)
        for (int y = 0; y < height; y++)
        {
            uint8_t* local_dstp = dstp + y * dst_pitch;
            const uint8_t* local_srcp = srcp + y * src_pitch;
            for (int column = 0; column < row_size_mod; column += 16)
            {
                const __m128i v = _mm_loadu_si128((const __m128i*)(local_srcp + column));
                _mm_storeu_si128((__m128i*)(local_dstp + column), _mm_sub_epi8(vector_max, v));
            }
            for (int column = row_size_mod; column < row_size; column++)
                local_dstp[column] = (uint8_t)(local_srcp[column] ^ 255);
        }
    }
}
- class InvertNeg : public GenericVideoFilter
- {
- int cpu;
- int threads;
- int cpuFlags;
- public:
- InvertNeg(PClip _child, int _cpu, int _threads, IScriptEnvironment* env) : GenericVideoFilter(_child), cpu(_cpu), threads(_threads)
- {
- if (!vi.IsY()) env->ThrowError("InvertNeg: Only Y8 input, sorry!");
- if (cpu < 0 || cpu > 2) env->ThrowError("InvertNeg: cpu must be 0, 1, 2!");
- if (threads < 1) env->ThrowError("InvertNeg: threads must be >= 1!");
- switch (cpu)
- {
- case 1: CoreFilterPtr = Invert_SSE4; break;
- case 2: CoreFilterPtr = Invert_AVX2; break;
- default: CoreFilterPtr = Invert; break;
- }
- }
- PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
- {
- auto dst = env->NewVideoFrame(vi);
- auto dstp = dst->GetWritePtr(PLANAR_Y);
- auto dst_pitch = dst->GetPitch(PLANAR_Y) / vi.ComponentSize();
- auto src = child->GetFrame(n, env);
- auto srcp = src->GetReadPtr(PLANAR_Y);
- auto height = src->GetHeight(PLANAR_Y);
- auto row_size = src->GetRowSize(PLANAR_Y) / vi.ComponentSize();
- auto src_pitch = src->GetPitch(PLANAR_Y) / vi.ComponentSize();
- CoreFilterPtr(srcp, dstp, src_pitch, dst_pitch, height, row_size, vi.BitsPerComponent(), threads);
- return dst;
- }
- };
- AVSValue __cdecl Create_InvertNeg(AVSValue args, void* user_data, IScriptEnvironment* env)
- {
- return new InvertNeg(args[0].AsClip(),args[1].AsInt(0),args[2].AsInt(1), env);
- }
- const AVS_Linkage* AVS_linkage = 0;
- extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit3(IScriptEnvironment * env, const AVS_Linkage* const vectors)
- {
- AVS_linkage = vectors;
- env->AddFunction("InvertNeg", "c[cpu]i[threads]i", Create_InvertNeg, 0);
- return "InvertNeg sample plugin";
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement