Advertisement
Not a member of Pastebin yet?
Sign up;
it unlocks many cool features!
- void Invert_AVX512F(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads)
- {
- if (bits == 32)
- {
- #pragma omp parallel for num_threads(threads)
- for (auto y = 0; y < height; y++)
- {
- float* local_dstp = (float*)(reinterpret_cast<float*>(_dstp) + y * dst_pitch);
- const float* local_srcp = (const float*)(reinterpret_cast<const float*>(_srcp) + y * src_pitch);
- auto n = 8;
- auto row_size_rst = row_size % (n*30);
- auto row_size_mod = row_size - row_size_rst;
- __m512 vector_max = _mm512_set1_ps(1.0f);
- for (auto column = 0; column < row_size_mod; column += (n * 30))
- {
- __m512 vector_src_00 = _mm512_loadu_ps(local_srcp + n * 0);
- __m512 vector_src_01 = _mm512_loadu_ps(local_srcp + n * 1);
- __m512 vector_src_02 = _mm512_loadu_ps(local_srcp + n * 2);
- __m512 vector_src_03 = _mm512_loadu_ps(local_srcp + n * 3);
- __m512 vector_src_04 = _mm512_loadu_ps(local_srcp + n * 4);
- __m512 vector_src_05 = _mm512_loadu_ps(local_srcp + n * 5);
- __m512 vector_src_06 = _mm512_loadu_ps(local_srcp + n * 6);
- __m512 vector_src_07 = _mm512_loadu_ps(local_srcp + n * 7);
- __m512 vector_src_08 = _mm512_loadu_ps(local_srcp + n * 8);
- __m512 vector_src_09 = _mm512_loadu_ps(local_srcp + n * 9);
- __m512 vector_src_10 = _mm512_loadu_ps(local_srcp + n * 10);
- __m512 vector_src_11 = _mm512_loadu_ps(local_srcp + n * 11);
- __m512 vector_src_12 = _mm512_loadu_ps(local_srcp + n * 12);
- __m512 vector_src_13 = _mm512_loadu_ps(local_srcp + n * 13);
- __m512 vector_src_14 = _mm512_loadu_ps(local_srcp + n * 14);
- __m512 vector_src_15 = _mm512_loadu_ps(local_srcp + n * 15);
- __m512 vector_src_16 = _mm512_loadu_ps(local_srcp + n * 16);
- __m512 vector_src_17 = _mm512_loadu_ps(local_srcp + n * 17);
- __m512 vector_src_18 = _mm512_loadu_ps(local_srcp + n * 18);
- __m512 vector_src_19 = _mm512_loadu_ps(local_srcp + n * 19);
- __m512 vector_src_20 = _mm512_loadu_ps(local_srcp + n * 20);
- __m512 vector_src_21 = _mm512_loadu_ps(local_srcp + n * 21);
- __m512 vector_src_22 = _mm512_loadu_ps(local_srcp + n * 22);
- __m512 vector_src_23 = _mm512_loadu_ps(local_srcp + n * 23);
- __m512 vector_src_24 = _mm512_loadu_ps(local_srcp + n * 24);
- __m512 vector_src_25 = _mm512_loadu_ps(local_srcp + n * 25);
- __m512 vector_src_26 = _mm512_loadu_ps(local_srcp + n * 26);
- __m512 vector_src_27 = _mm512_loadu_ps(local_srcp + n * 27);
- __m512 vector_src_28 = _mm512_loadu_ps(local_srcp + n * 28);
- __m512 vector_src_29 = _mm512_loadu_ps(local_srcp + n * 29);
- __m512 vector_src_30 = _mm512_loadu_ps(local_srcp + n * 30);
- vector_src_00 = _mm512_sub_ps(vector_max, vector_src_00);
- vector_src_01 = _mm512_sub_ps(vector_max, vector_src_01);
- vector_src_02 = _mm512_sub_ps(vector_max, vector_src_02);
- vector_src_03 = _mm512_sub_ps(vector_max, vector_src_03);
- vector_src_04 = _mm512_sub_ps(vector_max, vector_src_04);
- vector_src_05 = _mm512_sub_ps(vector_max, vector_src_05);
- vector_src_06 = _mm512_sub_ps(vector_max, vector_src_06);
- vector_src_07 = _mm512_sub_ps(vector_max, vector_src_07);
- vector_src_08 = _mm512_sub_ps(vector_max, vector_src_08);
- vector_src_09 = _mm512_sub_ps(vector_max, vector_src_09);
- vector_src_10 = _mm512_sub_ps(vector_max, vector_src_10);
- vector_src_11 = _mm512_sub_ps(vector_max, vector_src_11);
- vector_src_12 = _mm512_sub_ps(vector_max, vector_src_12);
- vector_src_13 = _mm512_sub_ps(vector_max, vector_src_13);
- vector_src_14 = _mm512_sub_ps(vector_max, vector_src_14);
- vector_src_15 = _mm512_sub_ps(vector_max, vector_src_15);
- vector_src_16 = _mm512_sub_ps(vector_max, vector_src_16);
- vector_src_17 = _mm512_sub_ps(vector_max, vector_src_17);
- vector_src_18 = _mm512_sub_ps(vector_max, vector_src_18);
- vector_src_19 = _mm512_sub_ps(vector_max, vector_src_19);
- vector_src_20 = _mm512_sub_ps(vector_max, vector_src_20);
- vector_src_21 = _mm512_sub_ps(vector_max, vector_src_21);
- vector_src_22 = _mm512_sub_ps(vector_max, vector_src_22);
- vector_src_23 = _mm512_sub_ps(vector_max, vector_src_23);
- vector_src_24 = _mm512_sub_ps(vector_max, vector_src_24);
- vector_src_25 = _mm512_sub_ps(vector_max, vector_src_25);
- vector_src_26 = _mm512_sub_ps(vector_max, vector_src_26);
- vector_src_27 = _mm512_sub_ps(vector_max, vector_src_27);
- vector_src_28 = _mm512_sub_ps(vector_max, vector_src_28);
- vector_src_29 = _mm512_sub_ps(vector_max, vector_src_29);
- vector_src_30 = _mm512_sub_ps(vector_max, vector_src_30);
- _mm512_storeu_ps(local_dstp + n * 0, vector_src_00);
- _mm512_storeu_ps(local_dstp + n * 1, vector_src_01);
- _mm512_storeu_ps(local_dstp + n * 2, vector_src_02);
- _mm512_storeu_ps(local_dstp + n * 3, vector_src_03);
- _mm512_storeu_ps(local_dstp + n * 4, vector_src_04);
- _mm512_storeu_ps(local_dstp + n * 5, vector_src_05);
- _mm512_storeu_ps(local_dstp + n * 6, vector_src_06);
- _mm512_storeu_ps(local_dstp + n * 7, vector_src_07);
- _mm512_storeu_ps(local_dstp + n * 8, vector_src_08);
- _mm512_storeu_ps(local_dstp + n * 9, vector_src_09);
- _mm512_storeu_ps(local_dstp + n * 10, vector_src_10);
- _mm512_storeu_ps(local_dstp + n * 11, vector_src_11);
- _mm512_storeu_ps(local_dstp + n * 12, vector_src_12);
- _mm512_storeu_ps(local_dstp + n * 13, vector_src_13);
- _mm512_storeu_ps(local_dstp + n * 14, vector_src_14);
- _mm512_storeu_ps(local_dstp + n * 15, vector_src_15);
- _mm512_storeu_ps(local_dstp + n * 16, vector_src_16);
- _mm512_storeu_ps(local_dstp + n * 17, vector_src_17);
- _mm512_storeu_ps(local_dstp + n * 18, vector_src_18);
- _mm512_storeu_ps(local_dstp + n * 19, vector_src_19);
- _mm512_storeu_ps(local_dstp + n * 20, vector_src_20);
- _mm512_storeu_ps(local_dstp + n * 21, vector_src_21);
- _mm512_storeu_ps(local_dstp + n * 22, vector_src_22);
- _mm512_storeu_ps(local_dstp + n * 23, vector_src_23);
- _mm512_storeu_ps(local_dstp + n * 24, vector_src_24);
- _mm512_storeu_ps(local_dstp + n * 25, vector_src_25);
- _mm512_storeu_ps(local_dstp + n * 26, vector_src_26);
- _mm512_storeu_ps(local_dstp + n * 27, vector_src_27);
- _mm512_storeu_ps(local_dstp + n * 28, vector_src_28);
- _mm512_storeu_ps(local_dstp + n * 29, vector_src_29);
- _mm512_storeu_ps(local_dstp + n * 30, vector_src_30);
- local_srcp += (n * 30);
- local_dstp += (n * 30);
- }
- row_size_mod = row_size_rst - (row_size_rst % (n * 15));
- row_size_rst = row_size_rst % (n * 15);
- for (auto column = 0; column < row_size_mod; column += (n * 15))
- {
- __m512 vector_src_00 = _mm512_loadu_ps(local_srcp + n * 0);
- __m512 vector_src_01 = _mm512_loadu_ps(local_srcp + n * 1);
- __m512 vector_src_02 = _mm512_loadu_ps(local_srcp + n * 2);
- __m512 vector_src_03 = _mm512_loadu_ps(local_srcp + n * 3);
- __m512 vector_src_04 = _mm512_loadu_ps(local_srcp + n * 4);
- __m512 vector_src_05 = _mm512_loadu_ps(local_srcp + n * 5);
- __m512 vector_src_06 = _mm512_loadu_ps(local_srcp + n * 6);
- __m512 vector_src_07 = _mm512_loadu_ps(local_srcp + n * 7);
- __m512 vector_src_08 = _mm512_loadu_ps(local_srcp + n * 8);
- __m512 vector_src_09 = _mm512_loadu_ps(local_srcp + n * 9);
- __m512 vector_src_10 = _mm512_loadu_ps(local_srcp + n * 10);
- __m512 vector_src_11 = _mm512_loadu_ps(local_srcp + n * 11);
- __m512 vector_src_12 = _mm512_loadu_ps(local_srcp + n * 12);
- __m512 vector_src_13 = _mm512_loadu_ps(local_srcp + n * 13);
- __m512 vector_src_14 = _mm512_loadu_ps(local_srcp + n * 14);
- __m512 vector_src_15 = _mm512_loadu_ps(local_srcp + n * 15);
- vector_src_00 = _mm512_sub_ps(vector_max, vector_src_00);
- vector_src_01 = _mm512_sub_ps(vector_max, vector_src_01);
- vector_src_02 = _mm512_sub_ps(vector_max, vector_src_02);
- vector_src_03 = _mm512_sub_ps(vector_max, vector_src_03);
- vector_src_04 = _mm512_sub_ps(vector_max, vector_src_04);
- vector_src_05 = _mm512_sub_ps(vector_max, vector_src_05);
- vector_src_06 = _mm512_sub_ps(vector_max, vector_src_06);
- vector_src_07 = _mm512_sub_ps(vector_max, vector_src_07);
- vector_src_08 = _mm512_sub_ps(vector_max, vector_src_08);
- vector_src_09 = _mm512_sub_ps(vector_max, vector_src_09);
- vector_src_10 = _mm512_sub_ps(vector_max, vector_src_10);
- vector_src_11 = _mm512_sub_ps(vector_max, vector_src_11);
- vector_src_12 = _mm512_sub_ps(vector_max, vector_src_12);
- vector_src_13 = _mm512_sub_ps(vector_max, vector_src_13);
- vector_src_14 = _mm512_sub_ps(vector_max, vector_src_14);
- vector_src_15 = _mm512_sub_ps(vector_max, vector_src_15);
- _mm512_storeu_ps(local_dstp + n * 0, vector_src_00);
- _mm512_storeu_ps(local_dstp + n * 1, vector_src_01);
- _mm512_storeu_ps(local_dstp + n * 2, vector_src_02);
- _mm512_storeu_ps(local_dstp + n * 3, vector_src_03);
- _mm512_storeu_ps(local_dstp + n * 4, vector_src_04);
- _mm512_storeu_ps(local_dstp + n * 5, vector_src_05);
- _mm512_storeu_ps(local_dstp + n * 6, vector_src_06);
- _mm512_storeu_ps(local_dstp + n * 7, vector_src_07);
- _mm512_storeu_ps(local_dstp + n * 8, vector_src_08);
- _mm512_storeu_ps(local_dstp + n * 9, vector_src_09);
- _mm512_storeu_ps(local_dstp + n * 10, vector_src_10);
- _mm512_storeu_ps(local_dstp + n * 11, vector_src_11);
- _mm512_storeu_ps(local_dstp + n * 12, vector_src_12);
- _mm512_storeu_ps(local_dstp + n * 13, vector_src_13);
- _mm512_storeu_ps(local_dstp + n * 14, vector_src_14);
- _mm512_storeu_ps(local_dstp + n * 15, vector_src_15);
- local_srcp += (n * 15);
- local_dstp += (n * 15);
- }
- row_size_mod = row_size_rst - (row_size_rst % (n * 7));
- row_size_rst = row_size_rst % (n * 7);
- for (auto column = 0; column < row_size_mod; column += (n * 7))
- {
- __m512 vector_src_00 = _mm512_loadu_ps(local_srcp + n * 0);
- __m512 vector_src_01 = _mm512_loadu_ps(local_srcp + n * 1);
- __m512 vector_src_02 = _mm512_loadu_ps(local_srcp + n * 2);
- __m512 vector_src_03 = _mm512_loadu_ps(local_srcp + n * 3);
- __m512 vector_src_04 = _mm512_loadu_ps(local_srcp + n * 4);
- __m512 vector_src_05 = _mm512_loadu_ps(local_srcp + n * 5);
- __m512 vector_src_06 = _mm512_loadu_ps(local_srcp + n * 6);
- __m512 vector_src_07 = _mm512_loadu_ps(local_srcp + n * 7);
- vector_src_00 = _mm512_sub_ps(vector_max, vector_src_00);
- vector_src_01 = _mm512_sub_ps(vector_max, vector_src_01);
- vector_src_02 = _mm512_sub_ps(vector_max, vector_src_02);
- vector_src_03 = _mm512_sub_ps(vector_max, vector_src_03);
- vector_src_04 = _mm512_sub_ps(vector_max, vector_src_04);
- vector_src_05 = _mm512_sub_ps(vector_max, vector_src_05);
- vector_src_06 = _mm512_sub_ps(vector_max, vector_src_06);
- vector_src_07 = _mm512_sub_ps(vector_max, vector_src_07);
- _mm512_storeu_ps(local_dstp + n * 0, vector_src_00);
- _mm512_storeu_ps(local_dstp + n * 1, vector_src_01);
- _mm512_storeu_ps(local_dstp + n * 2, vector_src_02);
- _mm512_storeu_ps(local_dstp + n * 3, vector_src_03);
- _mm512_storeu_ps(local_dstp + n * 4, vector_src_04);
- _mm512_storeu_ps(local_dstp + n * 5, vector_src_05);
- _mm512_storeu_ps(local_dstp + n * 6, vector_src_06);
- _mm512_storeu_ps(local_dstp + n * 7, vector_src_07);
- local_srcp += (n * 7);
- local_dstp += (n * 7);
- }
- row_size_mod = row_size_rst - (row_size_rst % (n * 3));
- row_size_rst = row_size_rst % (n * 3);
- for (auto column = 0; column < row_size_mod; column += (n * 3))
- {
- __m512 vector_src_00 = _mm512_loadu_ps(local_srcp + n * 0);
- __m512 vector_src_01 = _mm512_loadu_ps(local_srcp + n * 1);
- __m512 vector_src_02 = _mm512_loadu_ps(local_srcp + n * 2);
- __m512 vector_src_03 = _mm512_loadu_ps(local_srcp + n * 3);
- vector_src_00 = _mm512_sub_ps(vector_max, vector_src_00);
- vector_src_01 = _mm512_sub_ps(vector_max, vector_src_01);
- vector_src_02 = _mm512_sub_ps(vector_max, vector_src_02);
- vector_src_03 = _mm512_sub_ps(vector_max, vector_src_03);
- _mm512_storeu_ps(local_dstp + n * 0, vector_src_00);
- _mm512_storeu_ps(local_dstp + n * 1, vector_src_01);
- _mm512_storeu_ps(local_dstp + n * 2, vector_src_02);
- _mm512_storeu_ps(local_dstp + n * 3, vector_src_03);
- local_srcp += (n * 3);
- local_dstp += (n * 3);
- }
- for (auto column = row_size_mod; column < row_size; column++)
- {
- *local_dstp = (float)(1.0f - *local_srcp);
- local_dstp++;
- local_srcp++;
- }
- }
- }
- else if (bits == 16 || bits == 14 || bits == 12 || bits == 10)
- {
- uint16_t max_pixel = (1 << bits) - 1;
- #pragma omp parallel for num_threads(threads)
- for (auto y = 0; y < height; y++)
- {
- uint16_t* local_dstp = (uint16_t*)(reinterpret_cast<uint16_t*>(_dstp) + y * dst_pitch);
- const uint16_t* local_srcp = (const uint16_t*)(reinterpret_cast<const uint16_t*>(_srcp) + y * src_pitch);
- auto n = 16;
- auto row_size_rst = row_size % (n * 30);
- auto row_size_mod = row_size - row_size_rst;
- __m512i vector_max = _mm512_set1_epi16(max_pixel);
- for (auto column = 0; column < row_size_mod; column += (n * 30))
- {
- __m512i vector_src_00 = _mm512_loadu_epi16(local_srcp + n * 0);
- __m512i vector_src_01 = _mm512_loadu_epi16(local_srcp + n * 1);
- __m512i vector_src_02 = _mm512_loadu_epi16(local_srcp + n * 2);
- __m512i vector_src_03 = _mm512_loadu_epi16(local_srcp + n * 3);
- __m512i vector_src_04 = _mm512_loadu_epi16(local_srcp + n * 4);
- __m512i vector_src_05 = _mm512_loadu_epi16(local_srcp + n * 5);
- __m512i vector_src_06 = _mm512_loadu_epi16(local_srcp + n * 6);
- __m512i vector_src_07 = _mm512_loadu_epi16(local_srcp + n * 7);
- __m512i vector_src_08 = _mm512_loadu_epi16(local_srcp + n * 8);
- __m512i vector_src_09 = _mm512_loadu_epi16(local_srcp + n * 9);
- __m512i vector_src_10 = _mm512_loadu_epi16(local_srcp + n * 10);
- __m512i vector_src_11 = _mm512_loadu_epi16(local_srcp + n * 11);
- __m512i vector_src_12 = _mm512_loadu_epi16(local_srcp + n * 12);
- __m512i vector_src_13 = _mm512_loadu_epi16(local_srcp + n * 13);
- __m512i vector_src_14 = _mm512_loadu_epi16(local_srcp + n * 14);
- __m512i vector_src_15 = _mm512_loadu_epi16(local_srcp + n * 15);
- __m512i vector_src_16 = _mm512_loadu_epi16(local_srcp + n * 16);
- __m512i vector_src_17 = _mm512_loadu_epi16(local_srcp + n * 17);
- __m512i vector_src_18 = _mm512_loadu_epi16(local_srcp + n * 18);
- __m512i vector_src_19 = _mm512_loadu_epi16(local_srcp + n * 19);
- __m512i vector_src_20 = _mm512_loadu_epi16(local_srcp + n * 20);
- __m512i vector_src_21 = _mm512_loadu_epi16(local_srcp + n * 21);
- __m512i vector_src_22 = _mm512_loadu_epi16(local_srcp + n * 22);
- __m512i vector_src_23 = _mm512_loadu_epi16(local_srcp + n * 23);
- __m512i vector_src_24 = _mm512_loadu_epi16(local_srcp + n * 24);
- __m512i vector_src_25 = _mm512_loadu_epi16(local_srcp + n * 25);
- __m512i vector_src_26 = _mm512_loadu_epi16(local_srcp + n * 26);
- __m512i vector_src_27 = _mm512_loadu_epi16(local_srcp + n * 27);
- __m512i vector_src_28 = _mm512_loadu_epi16(local_srcp + n * 28);
- __m512i vector_src_29 = _mm512_loadu_epi16(local_srcp + n * 29);
- __m512i vector_src_30 = _mm512_loadu_epi16(local_srcp + n * 30);
- vector_src_00 = _mm512_sub_epi16(vector_max, vector_src_00);
- vector_src_01 = _mm512_sub_epi16(vector_max, vector_src_01);
- vector_src_02 = _mm512_sub_epi16(vector_max, vector_src_02);
- vector_src_03 = _mm512_sub_epi16(vector_max, vector_src_03);
- vector_src_04 = _mm512_sub_epi16(vector_max, vector_src_04);
- vector_src_05 = _mm512_sub_epi16(vector_max, vector_src_05);
- vector_src_06 = _mm512_sub_epi16(vector_max, vector_src_06);
- vector_src_07 = _mm512_sub_epi16(vector_max, vector_src_07);
- vector_src_08 = _mm512_sub_epi16(vector_max, vector_src_08);
- vector_src_09 = _mm512_sub_epi16(vector_max, vector_src_09);
- vector_src_10 = _mm512_sub_epi16(vector_max, vector_src_10);
- vector_src_11 = _mm512_sub_epi16(vector_max, vector_src_11);
- vector_src_12 = _mm512_sub_epi16(vector_max, vector_src_12);
- vector_src_13 = _mm512_sub_epi16(vector_max, vector_src_13);
- vector_src_14 = _mm512_sub_epi16(vector_max, vector_src_14);
- vector_src_15 = _mm512_sub_epi16(vector_max, vector_src_15);
- vector_src_16 = _mm512_sub_epi16(vector_max, vector_src_16);
- vector_src_17 = _mm512_sub_epi16(vector_max, vector_src_17);
- vector_src_18 = _mm512_sub_epi16(vector_max, vector_src_18);
- vector_src_19 = _mm512_sub_epi16(vector_max, vector_src_19);
- vector_src_20 = _mm512_sub_epi16(vector_max, vector_src_20);
- vector_src_21 = _mm512_sub_epi16(vector_max, vector_src_21);
- vector_src_22 = _mm512_sub_epi16(vector_max, vector_src_22);
- vector_src_23 = _mm512_sub_epi16(vector_max, vector_src_23);
- vector_src_24 = _mm512_sub_epi16(vector_max, vector_src_24);
- vector_src_25 = _mm512_sub_epi16(vector_max, vector_src_25);
- vector_src_26 = _mm512_sub_epi16(vector_max, vector_src_26);
- vector_src_27 = _mm512_sub_epi16(vector_max, vector_src_27);
- vector_src_28 = _mm512_sub_epi16(vector_max, vector_src_28);
- vector_src_29 = _mm512_sub_epi16(vector_max, vector_src_29);
- vector_src_30 = _mm512_sub_epi16(vector_max, vector_src_30);
- _mm512_storeu_epi16(local_dstp + n * 0, vector_src_00);
- _mm512_storeu_epi16(local_dstp + n * 1, vector_src_01);
- _mm512_storeu_epi16(local_dstp + n * 2, vector_src_02);
- _mm512_storeu_epi16(local_dstp + n * 3, vector_src_03);
- _mm512_storeu_epi16(local_dstp + n * 4, vector_src_04);
- _mm512_storeu_epi16(local_dstp + n * 5, vector_src_05);
- _mm512_storeu_epi16(local_dstp + n * 6, vector_src_06);
- _mm512_storeu_epi16(local_dstp + n * 7, vector_src_07);
- _mm512_storeu_epi16(local_dstp + n * 8, vector_src_08);
- _mm512_storeu_epi16(local_dstp + n * 9, vector_src_09);
- _mm512_storeu_epi16(local_dstp + n * 10, vector_src_10);
- _mm512_storeu_epi16(local_dstp + n * 11, vector_src_11);
- _mm512_storeu_epi16(local_dstp + n * 12, vector_src_12);
- _mm512_storeu_epi16(local_dstp + n * 13, vector_src_13);
- _mm512_storeu_epi16(local_dstp + n * 14, vector_src_14);
- _mm512_storeu_epi16(local_dstp + n * 15, vector_src_15);
- _mm512_storeu_epi16(local_dstp + n * 16, vector_src_16);
- _mm512_storeu_epi16(local_dstp + n * 17, vector_src_17);
- _mm512_storeu_epi16(local_dstp + n * 18, vector_src_18);
- _mm512_storeu_epi16(local_dstp + n * 19, vector_src_19);
- _mm512_storeu_epi16(local_dstp + n * 20, vector_src_20);
- _mm512_storeu_epi16(local_dstp + n * 21, vector_src_21);
- _mm512_storeu_epi16(local_dstp + n * 22, vector_src_22);
- _mm512_storeu_epi16(local_dstp + n * 23, vector_src_23);
- _mm512_storeu_epi16(local_dstp + n * 24, vector_src_24);
- _mm512_storeu_epi16(local_dstp + n * 25, vector_src_25);
- _mm512_storeu_epi16(local_dstp + n * 26, vector_src_26);
- _mm512_storeu_epi16(local_dstp + n * 27, vector_src_27);
- _mm512_storeu_epi16(local_dstp + n * 28, vector_src_28);
- _mm512_storeu_epi16(local_dstp + n * 29, vector_src_29);
- _mm512_storeu_epi16(local_dstp + n * 30, vector_src_30);
- local_srcp += (n * 30);
- local_dstp += (n * 30);
- }
- row_size_mod = row_size_rst - (row_size_rst % (n * 15));
- row_size_rst = row_size_rst % (n * 15);
- for (auto column = 0; column < row_size_mod; column += (n * 15))
- {
- __m512i vector_src_00 = _mm512_loadu_epi16(local_srcp + n * 0);
- __m512i vector_src_01 = _mm512_loadu_epi16(local_srcp + n * 1);
- __m512i vector_src_02 = _mm512_loadu_epi16(local_srcp + n * 2);
- __m512i vector_src_03 = _mm512_loadu_epi16(local_srcp + n * 3);
- __m512i vector_src_04 = _mm512_loadu_epi16(local_srcp + n * 4);
- __m512i vector_src_05 = _mm512_loadu_epi16(local_srcp + n * 5);
- __m512i vector_src_06 = _mm512_loadu_epi16(local_srcp + n * 6);
- __m512i vector_src_07 = _mm512_loadu_epi16(local_srcp + n * 7);
- __m512i vector_src_08 = _mm512_loadu_epi16(local_srcp + n * 8);
- __m512i vector_src_09 = _mm512_loadu_epi16(local_srcp + n * 9);
- __m512i vector_src_10 = _mm512_loadu_epi16(local_srcp + n * 10);
- __m512i vector_src_11 = _mm512_loadu_epi16(local_srcp + n * 11);
- __m512i vector_src_12 = _mm512_loadu_epi16(local_srcp + n * 12);
- __m512i vector_src_13 = _mm512_loadu_epi16(local_srcp + n * 13);
- __m512i vector_src_14 = _mm512_loadu_epi16(local_srcp + n * 14);
- __m512i vector_src_15 = _mm512_loadu_epi16(local_srcp + n * 15);
- vector_src_00 = _mm512_sub_epi16(vector_max, vector_src_00);
- vector_src_01 = _mm512_sub_epi16(vector_max, vector_src_01);
- vector_src_02 = _mm512_sub_epi16(vector_max, vector_src_02);
- vector_src_03 = _mm512_sub_epi16(vector_max, vector_src_03);
- vector_src_04 = _mm512_sub_epi16(vector_max, vector_src_04);
- vector_src_05 = _mm512_sub_epi16(vector_max, vector_src_05);
- vector_src_06 = _mm512_sub_epi16(vector_max, vector_src_06);
- vector_src_07 = _mm512_sub_epi16(vector_max, vector_src_07);
- vector_src_08 = _mm512_sub_epi16(vector_max, vector_src_08);
- vector_src_09 = _mm512_sub_epi16(vector_max, vector_src_09);
- vector_src_10 = _mm512_sub_epi16(vector_max, vector_src_10);
- vector_src_11 = _mm512_sub_epi16(vector_max, vector_src_11);
- vector_src_12 = _mm512_sub_epi16(vector_max, vector_src_12);
- vector_src_13 = _mm512_sub_epi16(vector_max, vector_src_13);
- vector_src_14 = _mm512_sub_epi16(vector_max, vector_src_14);
- vector_src_15 = _mm512_sub_epi16(vector_max, vector_src_15);
- _mm512_storeu_epi16(local_dstp + n * 0, vector_src_00);
- _mm512_storeu_epi16(local_dstp + n * 1, vector_src_01);
- _mm512_storeu_epi16(local_dstp + n * 2, vector_src_02);
- _mm512_storeu_epi16(local_dstp + n * 3, vector_src_03);
- _mm512_storeu_epi16(local_dstp + n * 4, vector_src_04);
- _mm512_storeu_epi16(local_dstp + n * 5, vector_src_05);
- _mm512_storeu_epi16(local_dstp + n * 6, vector_src_06);
- _mm512_storeu_epi16(local_dstp + n * 7, vector_src_07);
- _mm512_storeu_epi16(local_dstp + n * 8, vector_src_08);
- _mm512_storeu_epi16(local_dstp + n * 9, vector_src_09);
- _mm512_storeu_epi16(local_dstp + n * 10, vector_src_10);
- _mm512_storeu_epi16(local_dstp + n * 11, vector_src_11);
- _mm512_storeu_epi16(local_dstp + n * 12, vector_src_12);
- _mm512_storeu_epi16(local_dstp + n * 13, vector_src_13);
- _mm512_storeu_epi16(local_dstp + n * 14, vector_src_14);
- _mm512_storeu_epi16(local_dstp + n * 15, vector_src_15);
- local_srcp += (n * 15);
- local_dstp += (n * 15);
- }
- row_size_mod = row_size_rst - (row_size_rst % (n * 7));
- row_size_rst = row_size_rst % (n * 7);
- for (auto column = 0; column < row_size_mod; column += (n * 7))
- {
- __m512i vector_src_00 = _mm512_loadu_epi16(local_srcp + n * 0);
- __m512i vector_src_01 = _mm512_loadu_epi16(local_srcp + n * 1);
- __m512i vector_src_02 = _mm512_loadu_epi16(local_srcp + n * 2);
- __m512i vector_src_03 = _mm512_loadu_epi16(local_srcp + n * 3);
- __m512i vector_src_04 = _mm512_loadu_epi16(local_srcp + n * 4);
- __m512i vector_src_05 = _mm512_loadu_epi16(local_srcp + n * 5);
- __m512i vector_src_06 = _mm512_loadu_epi16(local_srcp + n * 6);
- __m512i vector_src_07 = _mm512_loadu_epi16(local_srcp + n * 7);
- vector_src_00 = _mm512_sub_epi16(vector_max, vector_src_00);
- vector_src_01 = _mm512_sub_epi16(vector_max, vector_src_01);
- vector_src_02 = _mm512_sub_epi16(vector_max, vector_src_02);
- vector_src_03 = _mm512_sub_epi16(vector_max, vector_src_03);
- vector_src_04 = _mm512_sub_epi16(vector_max, vector_src_04);
- vector_src_05 = _mm512_sub_epi16(vector_max, vector_src_05);
- vector_src_06 = _mm512_sub_epi16(vector_max, vector_src_06);
- vector_src_07 = _mm512_sub_epi16(vector_max, vector_src_07);
- _mm512_storeu_epi16(local_dstp + n * 0, vector_src_00);
- _mm512_storeu_epi16(local_dstp + n * 1, vector_src_01);
- _mm512_storeu_epi16(local_dstp + n * 2, vector_src_02);
- _mm512_storeu_epi16(local_dstp + n * 3, vector_src_03);
- _mm512_storeu_epi16(local_dstp + n * 4, vector_src_04);
- _mm512_storeu_epi16(local_dstp + n * 5, vector_src_05);
- _mm512_storeu_epi16(local_dstp + n * 6, vector_src_06);
- _mm512_storeu_epi16(local_dstp + n * 7, vector_src_07);
- local_srcp += (n * 7);
- local_dstp += (n * 7);
- }
- row_size_mod = row_size_rst - (row_size_rst % (n * 3));
- row_size_rst = row_size_rst % (n * 3);
- for (auto column = 0; column < row_size_mod; column += (n * 3))
- {
- __m512i vector_src_00 = _mm512_loadu_epi16(local_srcp + n * 0);
- __m512i vector_src_01 = _mm512_loadu_epi16(local_srcp + n * 1);
- __m512i vector_src_02 = _mm512_loadu_epi16(local_srcp + n * 2);
- __m512i vector_src_03 = _mm512_loadu_epi16(local_srcp + n * 3);
- vector_src_00 = _mm512_sub_epi16(vector_max, vector_src_00);
- vector_src_01 = _mm512_sub_epi16(vector_max, vector_src_01);
- vector_src_02 = _mm512_sub_epi16(vector_max, vector_src_02);
- vector_src_03 = _mm512_sub_epi16(vector_max, vector_src_03);
- _mm512_storeu_epi16(local_dstp + n * 0, vector_src_00);
- _mm512_storeu_epi16(local_dstp + n * 1, vector_src_01);
- _mm512_storeu_epi16(local_dstp + n * 2, vector_src_02);
- _mm512_storeu_epi16(local_dstp + n * 3, vector_src_03);
- local_srcp += (n * 3);
- local_dstp += (n * 3);
- }
- for (auto column = row_size_mod; column < row_size; column++)
- {
- *local_dstp = (uint16_t)(*local_srcp ^ max_pixel);
- local_dstp++;
- local_srcp++;
- }
- }
- }
- else
- {
- #pragma omp parallel for num_threads(threads)
- for (auto y = 0; y < height; y++)
- {
- uint8_t* local_dstp = (uint8_t*)(reinterpret_cast<uint8_t*>(_dstp) + y * dst_pitch);
- const uint8_t* local_srcp = (const uint8_t*)(reinterpret_cast<const uint8_t*>(_srcp) + y * src_pitch);
- auto n = 32;
- auto row_size_rst = row_size % (n * 30);
- auto row_size_mod = row_size - row_size_rst;
- __m512i vector_max = _mm512_set1_epi8(255);
- for (auto column = 0; column < row_size_mod; column += (n * 30))
- {
- __m512i vector_src_00 = _mm512_loadu_epi8(local_srcp + n * 0);
- __m512i vector_src_01 = _mm512_loadu_epi8(local_srcp + n * 1);
- __m512i vector_src_02 = _mm512_loadu_epi8(local_srcp + n * 2);
- __m512i vector_src_03 = _mm512_loadu_epi8(local_srcp + n * 3);
- __m512i vector_src_04 = _mm512_loadu_epi8(local_srcp + n * 4);
- __m512i vector_src_05 = _mm512_loadu_epi8(local_srcp + n * 5);
- __m512i vector_src_06 = _mm512_loadu_epi8(local_srcp + n * 6);
- __m512i vector_src_07 = _mm512_loadu_epi8(local_srcp + n * 7);
- __m512i vector_src_08 = _mm512_loadu_epi8(local_srcp + n * 8);
- __m512i vector_src_09 = _mm512_loadu_epi8(local_srcp + n * 9);
- __m512i vector_src_10 = _mm512_loadu_epi8(local_srcp + n * 10);
- __m512i vector_src_11 = _mm512_loadu_epi8(local_srcp + n * 11);
- __m512i vector_src_12 = _mm512_loadu_epi8(local_srcp + n * 12);
- __m512i vector_src_13 = _mm512_loadu_epi8(local_srcp + n * 13);
- __m512i vector_src_14 = _mm512_loadu_epi8(local_srcp + n * 14);
- __m512i vector_src_15 = _mm512_loadu_epi8(local_srcp + n * 15);
- __m512i vector_src_16 = _mm512_loadu_epi8(local_srcp + n * 16);
- __m512i vector_src_17 = _mm512_loadu_epi8(local_srcp + n * 17);
- __m512i vector_src_18 = _mm512_loadu_epi8(local_srcp + n * 18);
- __m512i vector_src_19 = _mm512_loadu_epi8(local_srcp + n * 19);
- __m512i vector_src_20 = _mm512_loadu_epi8(local_srcp + n * 20);
- __m512i vector_src_21 = _mm512_loadu_epi8(local_srcp + n * 21);
- __m512i vector_src_22 = _mm512_loadu_epi8(local_srcp + n * 22);
- __m512i vector_src_23 = _mm512_loadu_epi8(local_srcp + n * 23);
- __m512i vector_src_24 = _mm512_loadu_epi8(local_srcp + n * 24);
- __m512i vector_src_25 = _mm512_loadu_epi8(local_srcp + n * 25);
- __m512i vector_src_26 = _mm512_loadu_epi8(local_srcp + n * 26);
- __m512i vector_src_27 = _mm512_loadu_epi8(local_srcp + n * 27);
- __m512i vector_src_28 = _mm512_loadu_epi8(local_srcp + n * 28);
- __m512i vector_src_29 = _mm512_loadu_epi8(local_srcp + n * 29);
- __m512i vector_src_30 = _mm512_loadu_epi8(local_srcp + n * 30);
- vector_src_00 = _mm512_sub_epi8(vector_max, vector_src_00);
- vector_src_01 = _mm512_sub_epi8(vector_max, vector_src_01);
- vector_src_02 = _mm512_sub_epi8(vector_max, vector_src_02);
- vector_src_03 = _mm512_sub_epi8(vector_max, vector_src_03);
- vector_src_04 = _mm512_sub_epi8(vector_max, vector_src_04);
- vector_src_05 = _mm512_sub_epi8(vector_max, vector_src_05);
- vector_src_06 = _mm512_sub_epi8(vector_max, vector_src_06);
- vector_src_07 = _mm512_sub_epi8(vector_max, vector_src_07);
- vector_src_08 = _mm512_sub_epi8(vector_max, vector_src_08);
- vector_src_09 = _mm512_sub_epi8(vector_max, vector_src_09);
- vector_src_10 = _mm512_sub_epi8(vector_max, vector_src_10);
- vector_src_11 = _mm512_sub_epi8(vector_max, vector_src_11);
- vector_src_12 = _mm512_sub_epi8(vector_max, vector_src_12);
- vector_src_13 = _mm512_sub_epi8(vector_max, vector_src_13);
- vector_src_14 = _mm512_sub_epi8(vector_max, vector_src_14);
- vector_src_15 = _mm512_sub_epi8(vector_max, vector_src_15);
- vector_src_16 = _mm512_sub_epi8(vector_max, vector_src_16);
- vector_src_17 = _mm512_sub_epi8(vector_max, vector_src_17);
- vector_src_18 = _mm512_sub_epi8(vector_max, vector_src_18);
- vector_src_19 = _mm512_sub_epi8(vector_max, vector_src_19);
- vector_src_20 = _mm512_sub_epi8(vector_max, vector_src_20);
- vector_src_21 = _mm512_sub_epi8(vector_max, vector_src_21);
- vector_src_22 = _mm512_sub_epi8(vector_max, vector_src_22);
- vector_src_23 = _mm512_sub_epi8(vector_max, vector_src_23);
- vector_src_24 = _mm512_sub_epi8(vector_max, vector_src_24);
- vector_src_25 = _mm512_sub_epi8(vector_max, vector_src_25);
- vector_src_26 = _mm512_sub_epi8(vector_max, vector_src_26);
- vector_src_27 = _mm512_sub_epi8(vector_max, vector_src_27);
- vector_src_28 = _mm512_sub_epi8(vector_max, vector_src_28);
- vector_src_29 = _mm512_sub_epi8(vector_max, vector_src_29);
- vector_src_30 = _mm512_sub_epi8(vector_max, vector_src_30);
- _mm512_storeu_epi8(local_dstp + n * 0, vector_src_00);
- _mm512_storeu_epi8(local_dstp + n * 1, vector_src_01);
- _mm512_storeu_epi8(local_dstp + n * 2, vector_src_02);
- _mm512_storeu_epi8(local_dstp + n * 3, vector_src_03);
- _mm512_storeu_epi8(local_dstp + n * 4, vector_src_04);
- _mm512_storeu_epi8(local_dstp + n * 5, vector_src_05);
- _mm512_storeu_epi8(local_dstp + n * 6, vector_src_06);
- _mm512_storeu_epi8(local_dstp + n * 7, vector_src_07);
- _mm512_storeu_epi8(local_dstp + n * 8, vector_src_08);
- _mm512_storeu_epi8(local_dstp + n * 9, vector_src_09);
- _mm512_storeu_epi8(local_dstp + n * 10, vector_src_10);
- _mm512_storeu_epi8(local_dstp + n * 11, vector_src_11);
- _mm512_storeu_epi8(local_dstp + n * 12, vector_src_12);
- _mm512_storeu_epi8(local_dstp + n * 13, vector_src_13);
- _mm512_storeu_epi8(local_dstp + n * 14, vector_src_14);
- _mm512_storeu_epi8(local_dstp + n * 15, vector_src_15);
- _mm512_storeu_epi8(local_dstp + n * 16, vector_src_16);
- _mm512_storeu_epi8(local_dstp + n * 17, vector_src_17);
- _mm512_storeu_epi8(local_dstp + n * 18, vector_src_18);
- _mm512_storeu_epi8(local_dstp + n * 19, vector_src_19);
- _mm512_storeu_epi8(local_dstp + n * 20, vector_src_20);
- _mm512_storeu_epi8(local_dstp + n * 21, vector_src_21);
- _mm512_storeu_epi8(local_dstp + n * 22, vector_src_22);
- _mm512_storeu_epi8(local_dstp + n * 23, vector_src_23);
- _mm512_storeu_epi8(local_dstp + n * 24, vector_src_24);
- _mm512_storeu_epi8(local_dstp + n * 25, vector_src_25);
- _mm512_storeu_epi8(local_dstp + n * 26, vector_src_26);
- _mm512_storeu_epi8(local_dstp + n * 27, vector_src_27);
- _mm512_storeu_epi8(local_dstp + n * 28, vector_src_28);
- _mm512_storeu_epi8(local_dstp + n * 29, vector_src_29);
- _mm512_storeu_epi8(local_dstp + n * 30, vector_src_30);
- local_srcp += (n * 30);
- local_dstp += (n * 30);
- }
- row_size_mod = row_size_rst - (row_size_rst % (n * 15));
- row_size_rst = row_size_rst % (n * 15);
- for (auto column = 0; column < row_size_mod; column += (n * 15))
- {
- __m512i vector_src_00 = _mm512_loadu_epi8(local_srcp + n * 0);
- __m512i vector_src_01 = _mm512_loadu_epi8(local_srcp + n * 1);
- __m512i vector_src_02 = _mm512_loadu_epi8(local_srcp + n * 2);
- __m512i vector_src_03 = _mm512_loadu_epi8(local_srcp + n * 3);
- __m512i vector_src_04 = _mm512_loadu_epi8(local_srcp + n * 4);
- __m512i vector_src_05 = _mm512_loadu_epi8(local_srcp + n * 5);
- __m512i vector_src_06 = _mm512_loadu_epi8(local_srcp + n * 6);
- __m512i vector_src_07 = _mm512_loadu_epi8(local_srcp + n * 7);
- __m512i vector_src_08 = _mm512_loadu_epi8(local_srcp + n * 8);
- __m512i vector_src_09 = _mm512_loadu_epi8(local_srcp + n * 9);
- __m512i vector_src_10 = _mm512_loadu_epi8(local_srcp + n * 10);
- __m512i vector_src_11 = _mm512_loadu_epi8(local_srcp + n * 11);
- __m512i vector_src_12 = _mm512_loadu_epi8(local_srcp + n * 12);
- __m512i vector_src_13 = _mm512_loadu_epi8(local_srcp + n * 13);
- __m512i vector_src_14 = _mm512_loadu_epi8(local_srcp + n * 14);
- __m512i vector_src_15 = _mm512_loadu_epi8(local_srcp + n * 15);
- vector_src_00 = _mm512_sub_epi8(vector_max, vector_src_00);
- vector_src_01 = _mm512_sub_epi8(vector_max, vector_src_01);
- vector_src_02 = _mm512_sub_epi8(vector_max, vector_src_02);
- vector_src_03 = _mm512_sub_epi8(vector_max, vector_src_03);
- vector_src_04 = _mm512_sub_epi8(vector_max, vector_src_04);
- vector_src_05 = _mm512_sub_epi8(vector_max, vector_src_05);
- vector_src_06 = _mm512_sub_epi8(vector_max, vector_src_06);
- vector_src_07 = _mm512_sub_epi8(vector_max, vector_src_07);
- vector_src_08 = _mm512_sub_epi8(vector_max, vector_src_08);
- vector_src_09 = _mm512_sub_epi8(vector_max, vector_src_09);
- vector_src_10 = _mm512_sub_epi8(vector_max, vector_src_10);
- vector_src_11 = _mm512_sub_epi8(vector_max, vector_src_11);
- vector_src_12 = _mm512_sub_epi8(vector_max, vector_src_12);
- vector_src_13 = _mm512_sub_epi8(vector_max, vector_src_13);
- vector_src_14 = _mm512_sub_epi8(vector_max, vector_src_14);
- vector_src_15 = _mm512_sub_epi8(vector_max, vector_src_15);
- _mm512_storeu_epi8(local_dstp + n * 0, vector_src_00);
- _mm512_storeu_epi8(local_dstp + n * 1, vector_src_01);
- _mm512_storeu_epi8(local_dstp + n * 2, vector_src_02);
- _mm512_storeu_epi8(local_dstp + n * 3, vector_src_03);
- _mm512_storeu_epi8(local_dstp + n * 4, vector_src_04);
- _mm512_storeu_epi8(local_dstp + n * 5, vector_src_05);
- _mm512_storeu_epi8(local_dstp + n * 6, vector_src_06);
- _mm512_storeu_epi8(local_dstp + n * 7, vector_src_07);
- _mm512_storeu_epi8(local_dstp + n * 8, vector_src_08);
- _mm512_storeu_epi8(local_dstp + n * 9, vector_src_09);
- _mm512_storeu_epi8(local_dstp + n * 10, vector_src_10);
- _mm512_storeu_epi8(local_dstp + n * 11, vector_src_11);
- _mm512_storeu_epi8(local_dstp + n * 12, vector_src_12);
- _mm512_storeu_epi8(local_dstp + n * 13, vector_src_13);
- _mm512_storeu_epi8(local_dstp + n * 14, vector_src_14);
- _mm512_storeu_epi8(local_dstp + n * 15, vector_src_15);
- local_srcp += (n * 15);
- local_dstp += (n * 15);
- }
- row_size_mod = row_size_rst - (row_size_rst % (n * 7));
- row_size_rst = row_size_rst % (n * 7);
- for (auto column = 0; column < row_size_mod; column += (n * 7))
- {
- __m512i vector_src_00 = _mm512_loadu_epi8(local_srcp + n * 0);
- __m512i vector_src_01 = _mm512_loadu_epi8(local_srcp + n * 1);
- __m512i vector_src_02 = _mm512_loadu_epi8(local_srcp + n * 2);
- __m512i vector_src_03 = _mm512_loadu_epi8(local_srcp + n * 3);
- __m512i vector_src_04 = _mm512_loadu_epi8(local_srcp + n * 4);
- __m512i vector_src_05 = _mm512_loadu_epi8(local_srcp + n * 5);
- __m512i vector_src_06 = _mm512_loadu_epi8(local_srcp + n * 6);
- __m512i vector_src_07 = _mm512_loadu_epi8(local_srcp + n * 7);
- vector_src_00 = _mm512_sub_epi8(vector_max, vector_src_00);
- vector_src_01 = _mm512_sub_epi8(vector_max, vector_src_01);
- vector_src_02 = _mm512_sub_epi8(vector_max, vector_src_02);
- vector_src_03 = _mm512_sub_epi8(vector_max, vector_src_03);
- vector_src_04 = _mm512_sub_epi8(vector_max, vector_src_04);
- vector_src_05 = _mm512_sub_epi8(vector_max, vector_src_05);
- vector_src_06 = _mm512_sub_epi8(vector_max, vector_src_06);
- vector_src_07 = _mm512_sub_epi8(vector_max, vector_src_07);
- _mm512_storeu_epi8(local_dstp + n * 0, vector_src_00);
- _mm512_storeu_epi8(local_dstp + n * 1, vector_src_01);
- _mm512_storeu_epi8(local_dstp + n * 2, vector_src_02);
- _mm512_storeu_epi8(local_dstp + n * 3, vector_src_03);
- _mm512_storeu_epi8(local_dstp + n * 4, vector_src_04);
- _mm512_storeu_epi8(local_dstp + n * 5, vector_src_05);
- _mm512_storeu_epi8(local_dstp + n * 6, vector_src_06);
- _mm512_storeu_epi8(local_dstp + n * 7, vector_src_07);
- local_srcp += (n * 7);
- local_dstp += (n * 7);
- }
- row_size_mod = row_size_rst - (row_size_rst % (n * 3));
- row_size_rst = row_size_rst % (n * 3);
- for (auto column = 0; column < row_size_mod; column += (n * 3))
- {
- __m512i vector_src_00 = _mm512_loadu_epi8(local_srcp + n * 0);
- __m512i vector_src_01 = _mm512_loadu_epi8(local_srcp + n * 1);
- __m512i vector_src_02 = _mm512_loadu_epi8(local_srcp + n * 2);
- __m512i vector_src_03 = _mm512_loadu_epi8(local_srcp + n * 3);
- vector_src_00 = _mm512_sub_epi8(vector_max, vector_src_00);
- vector_src_01 = _mm512_sub_epi8(vector_max, vector_src_01);
- vector_src_02 = _mm512_sub_epi8(vector_max, vector_src_02);
- vector_src_03 = _mm512_sub_epi8(vector_max, vector_src_03);
- _mm512_storeu_epi8(local_dstp + n * 0, vector_src_00);
- _mm512_storeu_epi8(local_dstp + n * 1, vector_src_01);
- _mm512_storeu_epi8(local_dstp + n * 2, vector_src_02);
- _mm512_storeu_epi8(local_dstp + n * 3, vector_src_03);
- local_srcp += (n * 3);
- local_dstp += (n * 3);
- }
- for (auto column = row_size_mod; column < row_size; column++)
- {
- *local_dstp = (uint8_t)(*local_srcp ^ 255);
- local_dstp++;
- local_srcp++;
- }
- }
- }
- }
Advertisement
Add Comment
Please sign in to add a comment
Advertisement