Advertisement
Guest User

Untitled

a guest
Jan 29th, 2023
64
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 45.50 KB | None | 0 0
  1.  
  2. void Invert_AVX512F(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads)
  3. {
  4. if (bits == 32)
  5. {
  6. #pragma omp parallel for num_threads(threads)
  7. for (auto y = 0; y < height; y++)
  8. {
  9. float* local_dstp = (float*)(reinterpret_cast<float*>(_dstp) + y * dst_pitch);
  10. const float* local_srcp = (const float*)(reinterpret_cast<const float*>(_srcp) + y * src_pitch);
  11.  
  12. auto n = 8;
  13. auto row_size_rst = row_size % (n*30);
  14. auto row_size_mod = row_size - row_size_rst;
  15.  
  16. __m512 vector_max = _mm512_set1_ps(1.0f);
  17.  
  18. for (auto column = 0; column < row_size_mod; column += (n * 30))
  19. {
  20. __m512 vector_src_00 = _mm512_loadu_ps(local_srcp + n * 0);
  21. __m512 vector_src_01 = _mm512_loadu_ps(local_srcp + n * 1);
  22. __m512 vector_src_02 = _mm512_loadu_ps(local_srcp + n * 2);
  23. __m512 vector_src_03 = _mm512_loadu_ps(local_srcp + n * 3);
  24. __m512 vector_src_04 = _mm512_loadu_ps(local_srcp + n * 4);
  25. __m512 vector_src_05 = _mm512_loadu_ps(local_srcp + n * 5);
  26. __m512 vector_src_06 = _mm512_loadu_ps(local_srcp + n * 6);
  27. __m512 vector_src_07 = _mm512_loadu_ps(local_srcp + n * 7);
  28. __m512 vector_src_08 = _mm512_loadu_ps(local_srcp + n * 8);
  29. __m512 vector_src_09 = _mm512_loadu_ps(local_srcp + n * 9);
  30. __m512 vector_src_10 = _mm512_loadu_ps(local_srcp + n * 10);
  31. __m512 vector_src_11 = _mm512_loadu_ps(local_srcp + n * 11);
  32. __m512 vector_src_12 = _mm512_loadu_ps(local_srcp + n * 12);
  33. __m512 vector_src_13 = _mm512_loadu_ps(local_srcp + n * 13);
  34. __m512 vector_src_14 = _mm512_loadu_ps(local_srcp + n * 14);
  35. __m512 vector_src_15 = _mm512_loadu_ps(local_srcp + n * 15);
  36. __m512 vector_src_16 = _mm512_loadu_ps(local_srcp + n * 16);
  37. __m512 vector_src_17 = _mm512_loadu_ps(local_srcp + n * 17);
  38. __m512 vector_src_18 = _mm512_loadu_ps(local_srcp + n * 18);
  39. __m512 vector_src_19 = _mm512_loadu_ps(local_srcp + n * 19);
  40. __m512 vector_src_20 = _mm512_loadu_ps(local_srcp + n * 20);
  41. __m512 vector_src_21 = _mm512_loadu_ps(local_srcp + n * 21);
  42. __m512 vector_src_22 = _mm512_loadu_ps(local_srcp + n * 22);
  43. __m512 vector_src_23 = _mm512_loadu_ps(local_srcp + n * 23);
  44. __m512 vector_src_24 = _mm512_loadu_ps(local_srcp + n * 24);
  45. __m512 vector_src_25 = _mm512_loadu_ps(local_srcp + n * 25);
  46. __m512 vector_src_26 = _mm512_loadu_ps(local_srcp + n * 26);
  47. __m512 vector_src_27 = _mm512_loadu_ps(local_srcp + n * 27);
  48. __m512 vector_src_28 = _mm512_loadu_ps(local_srcp + n * 28);
  49. __m512 vector_src_29 = _mm512_loadu_ps(local_srcp + n * 29);
  50. __m512 vector_src_30 = _mm512_loadu_ps(local_srcp + n * 30);
  51.  
  52. vector_src_00 = _mm512_sub_ps(vector_max, vector_src_00);
  53. vector_src_01 = _mm512_sub_ps(vector_max, vector_src_01);
  54. vector_src_02 = _mm512_sub_ps(vector_max, vector_src_02);
  55. vector_src_03 = _mm512_sub_ps(vector_max, vector_src_03);
  56. vector_src_04 = _mm512_sub_ps(vector_max, vector_src_04);
  57. vector_src_05 = _mm512_sub_ps(vector_max, vector_src_05);
  58. vector_src_06 = _mm512_sub_ps(vector_max, vector_src_06);
  59. vector_src_07 = _mm512_sub_ps(vector_max, vector_src_07);
  60. vector_src_08 = _mm512_sub_ps(vector_max, vector_src_08);
  61. vector_src_09 = _mm512_sub_ps(vector_max, vector_src_09);
  62. vector_src_10 = _mm512_sub_ps(vector_max, vector_src_10);
  63. vector_src_11 = _mm512_sub_ps(vector_max, vector_src_11);
  64. vector_src_12 = _mm512_sub_ps(vector_max, vector_src_12);
  65. vector_src_13 = _mm512_sub_ps(vector_max, vector_src_13);
  66. vector_src_14 = _mm512_sub_ps(vector_max, vector_src_14);
  67. vector_src_15 = _mm512_sub_ps(vector_max, vector_src_15);
  68. vector_src_16 = _mm512_sub_ps(vector_max, vector_src_16);
  69. vector_src_17 = _mm512_sub_ps(vector_max, vector_src_17);
  70. vector_src_18 = _mm512_sub_ps(vector_max, vector_src_18);
  71. vector_src_19 = _mm512_sub_ps(vector_max, vector_src_19);
  72. vector_src_20 = _mm512_sub_ps(vector_max, vector_src_20);
  73. vector_src_21 = _mm512_sub_ps(vector_max, vector_src_21);
  74. vector_src_22 = _mm512_sub_ps(vector_max, vector_src_22);
  75. vector_src_23 = _mm512_sub_ps(vector_max, vector_src_23);
  76. vector_src_24 = _mm512_sub_ps(vector_max, vector_src_24);
  77. vector_src_25 = _mm512_sub_ps(vector_max, vector_src_25);
  78. vector_src_26 = _mm512_sub_ps(vector_max, vector_src_26);
  79. vector_src_27 = _mm512_sub_ps(vector_max, vector_src_27);
  80. vector_src_28 = _mm512_sub_ps(vector_max, vector_src_28);
  81. vector_src_29 = _mm512_sub_ps(vector_max, vector_src_29);
  82. vector_src_30 = _mm512_sub_ps(vector_max, vector_src_30);
  83.  
  84. _mm512_storeu_ps(local_dstp + n * 0, vector_src_00);
  85. _mm512_storeu_ps(local_dstp + n * 1, vector_src_01);
  86. _mm512_storeu_ps(local_dstp + n * 2, vector_src_02);
  87. _mm512_storeu_ps(local_dstp + n * 3, vector_src_03);
  88. _mm512_storeu_ps(local_dstp + n * 4, vector_src_04);
  89. _mm512_storeu_ps(local_dstp + n * 5, vector_src_05);
  90. _mm512_storeu_ps(local_dstp + n * 6, vector_src_06);
  91. _mm512_storeu_ps(local_dstp + n * 7, vector_src_07);
  92. _mm512_storeu_ps(local_dstp + n * 8, vector_src_08);
  93. _mm512_storeu_ps(local_dstp + n * 9, vector_src_09);
  94. _mm512_storeu_ps(local_dstp + n * 10, vector_src_10);
  95. _mm512_storeu_ps(local_dstp + n * 11, vector_src_11);
  96. _mm512_storeu_ps(local_dstp + n * 12, vector_src_12);
  97. _mm512_storeu_ps(local_dstp + n * 13, vector_src_13);
  98. _mm512_storeu_ps(local_dstp + n * 14, vector_src_14);
  99. _mm512_storeu_ps(local_dstp + n * 15, vector_src_15);
  100. _mm512_storeu_ps(local_dstp + n * 16, vector_src_16);
  101. _mm512_storeu_ps(local_dstp + n * 17, vector_src_17);
  102. _mm512_storeu_ps(local_dstp + n * 18, vector_src_18);
  103. _mm512_storeu_ps(local_dstp + n * 19, vector_src_19);
  104. _mm512_storeu_ps(local_dstp + n * 20, vector_src_20);
  105. _mm512_storeu_ps(local_dstp + n * 21, vector_src_21);
  106. _mm512_storeu_ps(local_dstp + n * 22, vector_src_22);
  107. _mm512_storeu_ps(local_dstp + n * 23, vector_src_23);
  108. _mm512_storeu_ps(local_dstp + n * 24, vector_src_24);
  109. _mm512_storeu_ps(local_dstp + n * 25, vector_src_25);
  110. _mm512_storeu_ps(local_dstp + n * 26, vector_src_26);
  111. _mm512_storeu_ps(local_dstp + n * 27, vector_src_27);
  112. _mm512_storeu_ps(local_dstp + n * 28, vector_src_28);
  113. _mm512_storeu_ps(local_dstp + n * 29, vector_src_29);
  114. _mm512_storeu_ps(local_dstp + n * 30, vector_src_30);
  115.  
  116. local_srcp += (n * 30);
  117. local_dstp += (n * 30);
  118. }
  119.  
  120. row_size_mod = row_size_rst - (row_size_rst % (n * 15));
  121. row_size_rst = row_size_rst % (n * 15);
  122.  
  123. for (auto column = 0; column < row_size_mod; column += (n * 15))
  124. {
  125. __m512 vector_src_00 = _mm512_loadu_ps(local_srcp + n * 0);
  126. __m512 vector_src_01 = _mm512_loadu_ps(local_srcp + n * 1);
  127. __m512 vector_src_02 = _mm512_loadu_ps(local_srcp + n * 2);
  128. __m512 vector_src_03 = _mm512_loadu_ps(local_srcp + n * 3);
  129. __m512 vector_src_04 = _mm512_loadu_ps(local_srcp + n * 4);
  130. __m512 vector_src_05 = _mm512_loadu_ps(local_srcp + n * 5);
  131. __m512 vector_src_06 = _mm512_loadu_ps(local_srcp + n * 6);
  132. __m512 vector_src_07 = _mm512_loadu_ps(local_srcp + n * 7);
  133. __m512 vector_src_08 = _mm512_loadu_ps(local_srcp + n * 8);
  134. __m512 vector_src_09 = _mm512_loadu_ps(local_srcp + n * 9);
  135. __m512 vector_src_10 = _mm512_loadu_ps(local_srcp + n * 10);
  136. __m512 vector_src_11 = _mm512_loadu_ps(local_srcp + n * 11);
  137. __m512 vector_src_12 = _mm512_loadu_ps(local_srcp + n * 12);
  138. __m512 vector_src_13 = _mm512_loadu_ps(local_srcp + n * 13);
  139. __m512 vector_src_14 = _mm512_loadu_ps(local_srcp + n * 14);
  140. __m512 vector_src_15 = _mm512_loadu_ps(local_srcp + n * 15);
  141.  
  142. vector_src_00 = _mm512_sub_ps(vector_max, vector_src_00);
  143. vector_src_01 = _mm512_sub_ps(vector_max, vector_src_01);
  144. vector_src_02 = _mm512_sub_ps(vector_max, vector_src_02);
  145. vector_src_03 = _mm512_sub_ps(vector_max, vector_src_03);
  146. vector_src_04 = _mm512_sub_ps(vector_max, vector_src_04);
  147. vector_src_05 = _mm512_sub_ps(vector_max, vector_src_05);
  148. vector_src_06 = _mm512_sub_ps(vector_max, vector_src_06);
  149. vector_src_07 = _mm512_sub_ps(vector_max, vector_src_07);
  150. vector_src_08 = _mm512_sub_ps(vector_max, vector_src_08);
  151. vector_src_09 = _mm512_sub_ps(vector_max, vector_src_09);
  152. vector_src_10 = _mm512_sub_ps(vector_max, vector_src_10);
  153. vector_src_11 = _mm512_sub_ps(vector_max, vector_src_11);
  154. vector_src_12 = _mm512_sub_ps(vector_max, vector_src_12);
  155. vector_src_13 = _mm512_sub_ps(vector_max, vector_src_13);
  156. vector_src_14 = _mm512_sub_ps(vector_max, vector_src_14);
  157. vector_src_15 = _mm512_sub_ps(vector_max, vector_src_15);
  158.  
  159. _mm512_storeu_ps(local_dstp + n * 0, vector_src_00);
  160. _mm512_storeu_ps(local_dstp + n * 1, vector_src_01);
  161. _mm512_storeu_ps(local_dstp + n * 2, vector_src_02);
  162. _mm512_storeu_ps(local_dstp + n * 3, vector_src_03);
  163. _mm512_storeu_ps(local_dstp + n * 4, vector_src_04);
  164. _mm512_storeu_ps(local_dstp + n * 5, vector_src_05);
  165. _mm512_storeu_ps(local_dstp + n * 6, vector_src_06);
  166. _mm512_storeu_ps(local_dstp + n * 7, vector_src_07);
  167. _mm512_storeu_ps(local_dstp + n * 8, vector_src_08);
  168. _mm512_storeu_ps(local_dstp + n * 9, vector_src_09);
  169. _mm512_storeu_ps(local_dstp + n * 10, vector_src_10);
  170. _mm512_storeu_ps(local_dstp + n * 11, vector_src_11);
  171. _mm512_storeu_ps(local_dstp + n * 12, vector_src_12);
  172. _mm512_storeu_ps(local_dstp + n * 13, vector_src_13);
  173. _mm512_storeu_ps(local_dstp + n * 14, vector_src_14);
  174. _mm512_storeu_ps(local_dstp + n * 15, vector_src_15);
  175.  
  176. local_srcp += (n * 15);
  177. local_dstp += (n * 15);
  178. }
  179.  
  180. row_size_mod = row_size_rst - (row_size_rst % (n * 7));
  181. row_size_rst = row_size_rst % (n * 7);
  182.  
  183. for (auto column = 0; column < row_size_mod; column += (n * 7))
  184. {
  185. __m512 vector_src_00 = _mm512_loadu_ps(local_srcp + n * 0);
  186. __m512 vector_src_01 = _mm512_loadu_ps(local_srcp + n * 1);
  187. __m512 vector_src_02 = _mm512_loadu_ps(local_srcp + n * 2);
  188. __m512 vector_src_03 = _mm512_loadu_ps(local_srcp + n * 3);
  189. __m512 vector_src_04 = _mm512_loadu_ps(local_srcp + n * 4);
  190. __m512 vector_src_05 = _mm512_loadu_ps(local_srcp + n * 5);
  191. __m512 vector_src_06 = _mm512_loadu_ps(local_srcp + n * 6);
  192. __m512 vector_src_07 = _mm512_loadu_ps(local_srcp + n * 7);
  193.  
  194. vector_src_00 = _mm512_sub_ps(vector_max, vector_src_00);
  195. vector_src_01 = _mm512_sub_ps(vector_max, vector_src_01);
  196. vector_src_02 = _mm512_sub_ps(vector_max, vector_src_02);
  197. vector_src_03 = _mm512_sub_ps(vector_max, vector_src_03);
  198. vector_src_04 = _mm512_sub_ps(vector_max, vector_src_04);
  199. vector_src_05 = _mm512_sub_ps(vector_max, vector_src_05);
  200. vector_src_06 = _mm512_sub_ps(vector_max, vector_src_06);
  201. vector_src_07 = _mm512_sub_ps(vector_max, vector_src_07);
  202.  
  203. _mm512_storeu_ps(local_dstp + n * 0, vector_src_00);
  204. _mm512_storeu_ps(local_dstp + n * 1, vector_src_01);
  205. _mm512_storeu_ps(local_dstp + n * 2, vector_src_02);
  206. _mm512_storeu_ps(local_dstp + n * 3, vector_src_03);
  207. _mm512_storeu_ps(local_dstp + n * 4, vector_src_04);
  208. _mm512_storeu_ps(local_dstp + n * 5, vector_src_05);
  209. _mm512_storeu_ps(local_dstp + n * 6, vector_src_06);
  210. _mm512_storeu_ps(local_dstp + n * 7, vector_src_07);
  211.  
  212. local_srcp += (n * 7);
  213. local_dstp += (n * 7);
  214. }
  215.  
  216. row_size_mod = row_size_rst - (row_size_rst % (n * 3));
  217. row_size_rst = row_size_rst % (n * 3);
  218.  
  219. for (auto column = 0; column < row_size_mod; column += (n * 3))
  220. {
  221. __m512 vector_src_00 = _mm512_loadu_ps(local_srcp + n * 0);
  222. __m512 vector_src_01 = _mm512_loadu_ps(local_srcp + n * 1);
  223. __m512 vector_src_02 = _mm512_loadu_ps(local_srcp + n * 2);
  224. __m512 vector_src_03 = _mm512_loadu_ps(local_srcp + n * 3);
  225.  
  226. vector_src_00 = _mm512_sub_ps(vector_max, vector_src_00);
  227. vector_src_01 = _mm512_sub_ps(vector_max, vector_src_01);
  228. vector_src_02 = _mm512_sub_ps(vector_max, vector_src_02);
  229. vector_src_03 = _mm512_sub_ps(vector_max, vector_src_03);
  230.  
  231. _mm512_storeu_ps(local_dstp + n * 0, vector_src_00);
  232. _mm512_storeu_ps(local_dstp + n * 1, vector_src_01);
  233. _mm512_storeu_ps(local_dstp + n * 2, vector_src_02);
  234. _mm512_storeu_ps(local_dstp + n * 3, vector_src_03);
  235.  
  236. local_srcp += (n * 3);
  237. local_dstp += (n * 3);
  238. }
  239. for (auto column = row_size_mod; column < row_size; column++)
  240. {
  241. *local_dstp = (float)(1.0f - *local_srcp);
  242. local_dstp++;
  243. local_srcp++;
  244. }
  245. }
  246. }
  247. else if (bits == 16 || bits == 14 || bits == 12 || bits == 10)
  248. {
  249. uint16_t max_pixel = (1 << bits) - 1;
  250. #pragma omp parallel for num_threads(threads)
  251. for (auto y = 0; y < height; y++)
  252. {
  253. uint16_t* local_dstp = (uint16_t*)(reinterpret_cast<uint16_t*>(_dstp) + y * dst_pitch);
  254. const uint16_t* local_srcp = (const uint16_t*)(reinterpret_cast<const uint16_t*>(_srcp) + y * src_pitch);
  255.  
  256. auto n = 16;
  257. auto row_size_rst = row_size % (n * 30);
  258. auto row_size_mod = row_size - row_size_rst;
  259.  
  260. __m512i vector_max = _mm512_set1_epi16(max_pixel);
  261.  
  262. for (auto column = 0; column < row_size_mod; column += (n * 30))
  263. {
  264. __m512i vector_src_00 = _mm512_loadu_epi16(local_srcp + n * 0);
  265. __m512i vector_src_01 = _mm512_loadu_epi16(local_srcp + n * 1);
  266. __m512i vector_src_02 = _mm512_loadu_epi16(local_srcp + n * 2);
  267. __m512i vector_src_03 = _mm512_loadu_epi16(local_srcp + n * 3);
  268. __m512i vector_src_04 = _mm512_loadu_epi16(local_srcp + n * 4);
  269. __m512i vector_src_05 = _mm512_loadu_epi16(local_srcp + n * 5);
  270. __m512i vector_src_06 = _mm512_loadu_epi16(local_srcp + n * 6);
  271. __m512i vector_src_07 = _mm512_loadu_epi16(local_srcp + n * 7);
  272. __m512i vector_src_08 = _mm512_loadu_epi16(local_srcp + n * 8);
  273. __m512i vector_src_09 = _mm512_loadu_epi16(local_srcp + n * 9);
  274. __m512i vector_src_10 = _mm512_loadu_epi16(local_srcp + n * 10);
  275. __m512i vector_src_11 = _mm512_loadu_epi16(local_srcp + n * 11);
  276. __m512i vector_src_12 = _mm512_loadu_epi16(local_srcp + n * 12);
  277. __m512i vector_src_13 = _mm512_loadu_epi16(local_srcp + n * 13);
  278. __m512i vector_src_14 = _mm512_loadu_epi16(local_srcp + n * 14);
  279. __m512i vector_src_15 = _mm512_loadu_epi16(local_srcp + n * 15);
  280. __m512i vector_src_16 = _mm512_loadu_epi16(local_srcp + n * 16);
  281. __m512i vector_src_17 = _mm512_loadu_epi16(local_srcp + n * 17);
  282. __m512i vector_src_18 = _mm512_loadu_epi16(local_srcp + n * 18);
  283. __m512i vector_src_19 = _mm512_loadu_epi16(local_srcp + n * 19);
  284. __m512i vector_src_20 = _mm512_loadu_epi16(local_srcp + n * 20);
  285. __m512i vector_src_21 = _mm512_loadu_epi16(local_srcp + n * 21);
  286. __m512i vector_src_22 = _mm512_loadu_epi16(local_srcp + n * 22);
  287. __m512i vector_src_23 = _mm512_loadu_epi16(local_srcp + n * 23);
  288. __m512i vector_src_24 = _mm512_loadu_epi16(local_srcp + n * 24);
  289. __m512i vector_src_25 = _mm512_loadu_epi16(local_srcp + n * 25);
  290. __m512i vector_src_26 = _mm512_loadu_epi16(local_srcp + n * 26);
  291. __m512i vector_src_27 = _mm512_loadu_epi16(local_srcp + n * 27);
  292. __m512i vector_src_28 = _mm512_loadu_epi16(local_srcp + n * 28);
  293. __m512i vector_src_29 = _mm512_loadu_epi16(local_srcp + n * 29);
  294. __m512i vector_src_30 = _mm512_loadu_epi16(local_srcp + n * 30);
  295.  
  296. vector_src_00 = _mm512_sub_epi16(vector_max, vector_src_00);
  297. vector_src_01 = _mm512_sub_epi16(vector_max, vector_src_01);
  298. vector_src_02 = _mm512_sub_epi16(vector_max, vector_src_02);
  299. vector_src_03 = _mm512_sub_epi16(vector_max, vector_src_03);
  300. vector_src_04 = _mm512_sub_epi16(vector_max, vector_src_04);
  301. vector_src_05 = _mm512_sub_epi16(vector_max, vector_src_05);
  302. vector_src_06 = _mm512_sub_epi16(vector_max, vector_src_06);
  303. vector_src_07 = _mm512_sub_epi16(vector_max, vector_src_07);
  304. vector_src_08 = _mm512_sub_epi16(vector_max, vector_src_08);
  305. vector_src_09 = _mm512_sub_epi16(vector_max, vector_src_09);
  306. vector_src_10 = _mm512_sub_epi16(vector_max, vector_src_10);
  307. vector_src_11 = _mm512_sub_epi16(vector_max, vector_src_11);
  308. vector_src_12 = _mm512_sub_epi16(vector_max, vector_src_12);
  309. vector_src_13 = _mm512_sub_epi16(vector_max, vector_src_13);
  310. vector_src_14 = _mm512_sub_epi16(vector_max, vector_src_14);
  311. vector_src_15 = _mm512_sub_epi16(vector_max, vector_src_15);
  312. vector_src_16 = _mm512_sub_epi16(vector_max, vector_src_16);
  313. vector_src_17 = _mm512_sub_epi16(vector_max, vector_src_17);
  314. vector_src_18 = _mm512_sub_epi16(vector_max, vector_src_18);
  315. vector_src_19 = _mm512_sub_epi16(vector_max, vector_src_19);
  316. vector_src_20 = _mm512_sub_epi16(vector_max, vector_src_20);
  317. vector_src_21 = _mm512_sub_epi16(vector_max, vector_src_21);
  318. vector_src_22 = _mm512_sub_epi16(vector_max, vector_src_22);
  319. vector_src_23 = _mm512_sub_epi16(vector_max, vector_src_23);
  320. vector_src_24 = _mm512_sub_epi16(vector_max, vector_src_24);
  321. vector_src_25 = _mm512_sub_epi16(vector_max, vector_src_25);
  322. vector_src_26 = _mm512_sub_epi16(vector_max, vector_src_26);
  323. vector_src_27 = _mm512_sub_epi16(vector_max, vector_src_27);
  324. vector_src_28 = _mm512_sub_epi16(vector_max, vector_src_28);
  325. vector_src_29 = _mm512_sub_epi16(vector_max, vector_src_29);
  326. vector_src_30 = _mm512_sub_epi16(vector_max, vector_src_30);
  327.  
  328. _mm512_storeu_epi16(local_dstp + n * 0, vector_src_00);
  329. _mm512_storeu_epi16(local_dstp + n * 1, vector_src_01);
  330. _mm512_storeu_epi16(local_dstp + n * 2, vector_src_02);
  331. _mm512_storeu_epi16(local_dstp + n * 3, vector_src_03);
  332. _mm512_storeu_epi16(local_dstp + n * 4, vector_src_04);
  333. _mm512_storeu_epi16(local_dstp + n * 5, vector_src_05);
  334. _mm512_storeu_epi16(local_dstp + n * 6, vector_src_06);
  335. _mm512_storeu_epi16(local_dstp + n * 7, vector_src_07);
  336. _mm512_storeu_epi16(local_dstp + n * 8, vector_src_08);
  337. _mm512_storeu_epi16(local_dstp + n * 9, vector_src_09);
  338. _mm512_storeu_epi16(local_dstp + n * 10, vector_src_10);
  339. _mm512_storeu_epi16(local_dstp + n * 11, vector_src_11);
  340. _mm512_storeu_epi16(local_dstp + n * 12, vector_src_12);
  341. _mm512_storeu_epi16(local_dstp + n * 13, vector_src_13);
  342. _mm512_storeu_epi16(local_dstp + n * 14, vector_src_14);
  343. _mm512_storeu_epi16(local_dstp + n * 15, vector_src_15);
  344. _mm512_storeu_epi16(local_dstp + n * 16, vector_src_16);
  345. _mm512_storeu_epi16(local_dstp + n * 17, vector_src_17);
  346. _mm512_storeu_epi16(local_dstp + n * 18, vector_src_18);
  347. _mm512_storeu_epi16(local_dstp + n * 19, vector_src_19);
  348. _mm512_storeu_epi16(local_dstp + n * 20, vector_src_20);
  349. _mm512_storeu_epi16(local_dstp + n * 21, vector_src_21);
  350. _mm512_storeu_epi16(local_dstp + n * 22, vector_src_22);
  351. _mm512_storeu_epi16(local_dstp + n * 23, vector_src_23);
  352. _mm512_storeu_epi16(local_dstp + n * 24, vector_src_24);
  353. _mm512_storeu_epi16(local_dstp + n * 25, vector_src_25);
  354. _mm512_storeu_epi16(local_dstp + n * 26, vector_src_26);
  355. _mm512_storeu_epi16(local_dstp + n * 27, vector_src_27);
  356. _mm512_storeu_epi16(local_dstp + n * 28, vector_src_28);
  357. _mm512_storeu_epi16(local_dstp + n * 29, vector_src_29);
  358. _mm512_storeu_epi16(local_dstp + n * 30, vector_src_30);
  359.  
  360. local_srcp += (n * 30);
  361. local_dstp += (n * 30);
  362. }
  363.  
  364. row_size_mod = row_size_rst - (row_size_rst % (n * 15));
  365. row_size_rst = row_size_rst % (n * 15);
  366.  
  367. for (auto column = 0; column < row_size_mod; column += (n * 15))
  368. {
  369. __m512i vector_src_00 = _mm512_loadu_epi16(local_srcp + n * 0);
  370. __m512i vector_src_01 = _mm512_loadu_epi16(local_srcp + n * 1);
  371. __m512i vector_src_02 = _mm512_loadu_epi16(local_srcp + n * 2);
  372. __m512i vector_src_03 = _mm512_loadu_epi16(local_srcp + n * 3);
  373. __m512i vector_src_04 = _mm512_loadu_epi16(local_srcp + n * 4);
  374. __m512i vector_src_05 = _mm512_loadu_epi16(local_srcp + n * 5);
  375. __m512i vector_src_06 = _mm512_loadu_epi16(local_srcp + n * 6);
  376. __m512i vector_src_07 = _mm512_loadu_epi16(local_srcp + n * 7);
  377. __m512i vector_src_08 = _mm512_loadu_epi16(local_srcp + n * 8);
  378. __m512i vector_src_09 = _mm512_loadu_epi16(local_srcp + n * 9);
  379. __m512i vector_src_10 = _mm512_loadu_epi16(local_srcp + n * 10);
  380. __m512i vector_src_11 = _mm512_loadu_epi16(local_srcp + n * 11);
  381. __m512i vector_src_12 = _mm512_loadu_epi16(local_srcp + n * 12);
  382. __m512i vector_src_13 = _mm512_loadu_epi16(local_srcp + n * 13);
  383. __m512i vector_src_14 = _mm512_loadu_epi16(local_srcp + n * 14);
  384. __m512i vector_src_15 = _mm512_loadu_epi16(local_srcp + n * 15);
  385.  
  386. vector_src_00 = _mm512_sub_epi16(vector_max, vector_src_00);
  387. vector_src_01 = _mm512_sub_epi16(vector_max, vector_src_01);
  388. vector_src_02 = _mm512_sub_epi16(vector_max, vector_src_02);
  389. vector_src_03 = _mm512_sub_epi16(vector_max, vector_src_03);
  390. vector_src_04 = _mm512_sub_epi16(vector_max, vector_src_04);
  391. vector_src_05 = _mm512_sub_epi16(vector_max, vector_src_05);
  392. vector_src_06 = _mm512_sub_epi16(vector_max, vector_src_06);
  393. vector_src_07 = _mm512_sub_epi16(vector_max, vector_src_07);
  394. vector_src_08 = _mm512_sub_epi16(vector_max, vector_src_08);
  395. vector_src_09 = _mm512_sub_epi16(vector_max, vector_src_09);
  396. vector_src_10 = _mm512_sub_epi16(vector_max, vector_src_10);
  397. vector_src_11 = _mm512_sub_epi16(vector_max, vector_src_11);
  398. vector_src_12 = _mm512_sub_epi16(vector_max, vector_src_12);
  399. vector_src_13 = _mm512_sub_epi16(vector_max, vector_src_13);
  400. vector_src_14 = _mm512_sub_epi16(vector_max, vector_src_14);
  401. vector_src_15 = _mm512_sub_epi16(vector_max, vector_src_15);
  402.  
  403. _mm512_storeu_epi16(local_dstp + n * 0, vector_src_00);
  404. _mm512_storeu_epi16(local_dstp + n * 1, vector_src_01);
  405. _mm512_storeu_epi16(local_dstp + n * 2, vector_src_02);
  406. _mm512_storeu_epi16(local_dstp + n * 3, vector_src_03);
  407. _mm512_storeu_epi16(local_dstp + n * 4, vector_src_04);
  408. _mm512_storeu_epi16(local_dstp + n * 5, vector_src_05);
  409. _mm512_storeu_epi16(local_dstp + n * 6, vector_src_06);
  410. _mm512_storeu_epi16(local_dstp + n * 7, vector_src_07);
  411. _mm512_storeu_epi16(local_dstp + n * 8, vector_src_08);
  412. _mm512_storeu_epi16(local_dstp + n * 9, vector_src_09);
  413. _mm512_storeu_epi16(local_dstp + n * 10, vector_src_10);
  414. _mm512_storeu_epi16(local_dstp + n * 11, vector_src_11);
  415. _mm512_storeu_epi16(local_dstp + n * 12, vector_src_12);
  416. _mm512_storeu_epi16(local_dstp + n * 13, vector_src_13);
  417. _mm512_storeu_epi16(local_dstp + n * 14, vector_src_14);
  418. _mm512_storeu_epi16(local_dstp + n * 15, vector_src_15);
  419.  
  420. local_srcp += (n * 15);
  421. local_dstp += (n * 15);
  422. }
  423.  
  424. row_size_mod = row_size_rst - (row_size_rst % (n * 7));
  425. row_size_rst = row_size_rst % (n * 7);
  426.  
  427. for (auto column = 0; column < row_size_mod; column += (n * 7))
  428. {
  429. __m512i vector_src_00 = _mm512_loadu_epi16(local_srcp + n * 0);
  430. __m512i vector_src_01 = _mm512_loadu_epi16(local_srcp + n * 1);
  431. __m512i vector_src_02 = _mm512_loadu_epi16(local_srcp + n * 2);
  432. __m512i vector_src_03 = _mm512_loadu_epi16(local_srcp + n * 3);
  433. __m512i vector_src_04 = _mm512_loadu_epi16(local_srcp + n * 4);
  434. __m512i vector_src_05 = _mm512_loadu_epi16(local_srcp + n * 5);
  435. __m512i vector_src_06 = _mm512_loadu_epi16(local_srcp + n * 6);
  436. __m512i vector_src_07 = _mm512_loadu_epi16(local_srcp + n * 7);
  437.  
  438. vector_src_00 = _mm512_sub_epi16(vector_max, vector_src_00);
  439. vector_src_01 = _mm512_sub_epi16(vector_max, vector_src_01);
  440. vector_src_02 = _mm512_sub_epi16(vector_max, vector_src_02);
  441. vector_src_03 = _mm512_sub_epi16(vector_max, vector_src_03);
  442. vector_src_04 = _mm512_sub_epi16(vector_max, vector_src_04);
  443. vector_src_05 = _mm512_sub_epi16(vector_max, vector_src_05);
  444. vector_src_06 = _mm512_sub_epi16(vector_max, vector_src_06);
  445. vector_src_07 = _mm512_sub_epi16(vector_max, vector_src_07);
  446.  
  447. _mm512_storeu_epi16(local_dstp + n * 0, vector_src_00);
  448. _mm512_storeu_epi16(local_dstp + n * 1, vector_src_01);
  449. _mm512_storeu_epi16(local_dstp + n * 2, vector_src_02);
  450. _mm512_storeu_epi16(local_dstp + n * 3, vector_src_03);
  451. _mm512_storeu_epi16(local_dstp + n * 4, vector_src_04);
  452. _mm512_storeu_epi16(local_dstp + n * 5, vector_src_05);
  453. _mm512_storeu_epi16(local_dstp + n * 6, vector_src_06);
  454. _mm512_storeu_epi16(local_dstp + n * 7, vector_src_07);
  455.  
  456. local_srcp += (n * 7);
  457. local_dstp += (n * 7);
  458. }
  459.  
  460. row_size_mod = row_size_rst - (row_size_rst % (n * 3));
  461. row_size_rst = row_size_rst % (n * 3);
  462.  
  463. for (auto column = 0; column < row_size_mod; column += (n * 3))
  464. {
  465. __m512i vector_src_00 = _mm512_loadu_epi16(local_srcp + n * 0);
  466. __m512i vector_src_01 = _mm512_loadu_epi16(local_srcp + n * 1);
  467. __m512i vector_src_02 = _mm512_loadu_epi16(local_srcp + n * 2);
  468. __m512i vector_src_03 = _mm512_loadu_epi16(local_srcp + n * 3);
  469.  
  470. vector_src_00 = _mm512_sub_epi16(vector_max, vector_src_00);
  471. vector_src_01 = _mm512_sub_epi16(vector_max, vector_src_01);
  472. vector_src_02 = _mm512_sub_epi16(vector_max, vector_src_02);
  473. vector_src_03 = _mm512_sub_epi16(vector_max, vector_src_03);
  474.  
  475. _mm512_storeu_epi16(local_dstp + n * 0, vector_src_00);
  476. _mm512_storeu_epi16(local_dstp + n * 1, vector_src_01);
  477. _mm512_storeu_epi16(local_dstp + n * 2, vector_src_02);
  478. _mm512_storeu_epi16(local_dstp + n * 3, vector_src_03);
  479.  
  480. local_srcp += (n * 3);
  481. local_dstp += (n * 3);
  482. }
  483.  
  484. for (auto column = row_size_mod; column < row_size; column++)
  485. {
  486. *local_dstp = (uint16_t)(*local_srcp ^ max_pixel);
  487. local_dstp++;
  488. local_srcp++;
  489. }
  490. }
  491. }
  492. else
  493. {
  494. #pragma omp parallel for num_threads(threads)
  495. for (auto y = 0; y < height; y++)
  496. {
  497. uint8_t* local_dstp = (uint8_t*)(reinterpret_cast<uint8_t*>(_dstp) + y * dst_pitch);
  498. const uint8_t* local_srcp = (const uint8_t*)(reinterpret_cast<const uint8_t*>(_srcp) + y * src_pitch);
  499.  
  500. auto n = 32;
  501. auto row_size_rst = row_size % (n * 30);
  502. auto row_size_mod = row_size - row_size_rst;
  503.  
  504. __m512i vector_max = _mm512_set1_epi8(255);
  505.  
  506. for (auto column = 0; column < row_size_mod; column += (n * 30))
  507. {
  508. __m512i vector_src_00 = _mm512_loadu_epi8(local_srcp + n * 0);
  509. __m512i vector_src_01 = _mm512_loadu_epi8(local_srcp + n * 1);
  510. __m512i vector_src_02 = _mm512_loadu_epi8(local_srcp + n * 2);
  511. __m512i vector_src_03 = _mm512_loadu_epi8(local_srcp + n * 3);
  512. __m512i vector_src_04 = _mm512_loadu_epi8(local_srcp + n * 4);
  513. __m512i vector_src_05 = _mm512_loadu_epi8(local_srcp + n * 5);
  514. __m512i vector_src_06 = _mm512_loadu_epi8(local_srcp + n * 6);
  515. __m512i vector_src_07 = _mm512_loadu_epi8(local_srcp + n * 7);
  516. __m512i vector_src_08 = _mm512_loadu_epi8(local_srcp + n * 8);
  517. __m512i vector_src_09 = _mm512_loadu_epi8(local_srcp + n * 9);
  518. __m512i vector_src_10 = _mm512_loadu_epi8(local_srcp + n * 10);
  519. __m512i vector_src_11 = _mm512_loadu_epi8(local_srcp + n * 11);
  520. __m512i vector_src_12 = _mm512_loadu_epi8(local_srcp + n * 12);
  521. __m512i vector_src_13 = _mm512_loadu_epi8(local_srcp + n * 13);
  522. __m512i vector_src_14 = _mm512_loadu_epi8(local_srcp + n * 14);
  523. __m512i vector_src_15 = _mm512_loadu_epi8(local_srcp + n * 15);
  524. __m512i vector_src_16 = _mm512_loadu_epi8(local_srcp + n * 16);
  525. __m512i vector_src_17 = _mm512_loadu_epi8(local_srcp + n * 17);
  526. __m512i vector_src_18 = _mm512_loadu_epi8(local_srcp + n * 18);
  527. __m512i vector_src_19 = _mm512_loadu_epi8(local_srcp + n * 19);
  528. __m512i vector_src_20 = _mm512_loadu_epi8(local_srcp + n * 20);
  529. __m512i vector_src_21 = _mm512_loadu_epi8(local_srcp + n * 21);
  530. __m512i vector_src_22 = _mm512_loadu_epi8(local_srcp + n * 22);
  531. __m512i vector_src_23 = _mm512_loadu_epi8(local_srcp + n * 23);
  532. __m512i vector_src_24 = _mm512_loadu_epi8(local_srcp + n * 24);
  533. __m512i vector_src_25 = _mm512_loadu_epi8(local_srcp + n * 25);
  534. __m512i vector_src_26 = _mm512_loadu_epi8(local_srcp + n * 26);
  535. __m512i vector_src_27 = _mm512_loadu_epi8(local_srcp + n * 27);
  536. __m512i vector_src_28 = _mm512_loadu_epi8(local_srcp + n * 28);
  537. __m512i vector_src_29 = _mm512_loadu_epi8(local_srcp + n * 29);
  538. __m512i vector_src_30 = _mm512_loadu_epi8(local_srcp + n * 30);
  539.  
  540. vector_src_00 = _mm512_sub_epi8(vector_max, vector_src_00);
  541. vector_src_01 = _mm512_sub_epi8(vector_max, vector_src_01);
  542. vector_src_02 = _mm512_sub_epi8(vector_max, vector_src_02);
  543. vector_src_03 = _mm512_sub_epi8(vector_max, vector_src_03);
  544. vector_src_04 = _mm512_sub_epi8(vector_max, vector_src_04);
  545. vector_src_05 = _mm512_sub_epi8(vector_max, vector_src_05);
  546. vector_src_06 = _mm512_sub_epi8(vector_max, vector_src_06);
  547. vector_src_07 = _mm512_sub_epi8(vector_max, vector_src_07);
  548. vector_src_08 = _mm512_sub_epi8(vector_max, vector_src_08);
  549. vector_src_09 = _mm512_sub_epi8(vector_max, vector_src_09);
  550. vector_src_10 = _mm512_sub_epi8(vector_max, vector_src_10);
  551. vector_src_11 = _mm512_sub_epi8(vector_max, vector_src_11);
  552. vector_src_12 = _mm512_sub_epi8(vector_max, vector_src_12);
  553. vector_src_13 = _mm512_sub_epi8(vector_max, vector_src_13);
  554. vector_src_14 = _mm512_sub_epi8(vector_max, vector_src_14);
  555. vector_src_15 = _mm512_sub_epi8(vector_max, vector_src_15);
  556. vector_src_16 = _mm512_sub_epi8(vector_max, vector_src_16);
  557. vector_src_17 = _mm512_sub_epi8(vector_max, vector_src_17);
  558. vector_src_18 = _mm512_sub_epi8(vector_max, vector_src_18);
  559. vector_src_19 = _mm512_sub_epi8(vector_max, vector_src_19);
  560. vector_src_20 = _mm512_sub_epi8(vector_max, vector_src_20);
  561. vector_src_21 = _mm512_sub_epi8(vector_max, vector_src_21);
  562. vector_src_22 = _mm512_sub_epi8(vector_max, vector_src_22);
  563. vector_src_23 = _mm512_sub_epi8(vector_max, vector_src_23);
  564. vector_src_24 = _mm512_sub_epi8(vector_max, vector_src_24);
  565. vector_src_25 = _mm512_sub_epi8(vector_max, vector_src_25);
  566. vector_src_26 = _mm512_sub_epi8(vector_max, vector_src_26);
  567. vector_src_27 = _mm512_sub_epi8(vector_max, vector_src_27);
  568. vector_src_28 = _mm512_sub_epi8(vector_max, vector_src_28);
  569. vector_src_29 = _mm512_sub_epi8(vector_max, vector_src_29);
  570. vector_src_30 = _mm512_sub_epi8(vector_max, vector_src_30);
  571.  
  572. _mm512_storeu_epi8(local_dstp + n * 0, vector_src_00);
  573. _mm512_storeu_epi8(local_dstp + n * 1, vector_src_01);
  574. _mm512_storeu_epi8(local_dstp + n * 2, vector_src_02);
  575. _mm512_storeu_epi8(local_dstp + n * 3, vector_src_03);
  576. _mm512_storeu_epi8(local_dstp + n * 4, vector_src_04);
  577. _mm512_storeu_epi8(local_dstp + n * 5, vector_src_05);
  578. _mm512_storeu_epi8(local_dstp + n * 6, vector_src_06);
  579. _mm512_storeu_epi8(local_dstp + n * 7, vector_src_07);
  580. _mm512_storeu_epi8(local_dstp + n * 8, vector_src_08);
  581. _mm512_storeu_epi8(local_dstp + n * 9, vector_src_09);
  582. _mm512_storeu_epi8(local_dstp + n * 10, vector_src_10);
  583. _mm512_storeu_epi8(local_dstp + n * 11, vector_src_11);
  584. _mm512_storeu_epi8(local_dstp + n * 12, vector_src_12);
  585. _mm512_storeu_epi8(local_dstp + n * 13, vector_src_13);
  586. _mm512_storeu_epi8(local_dstp + n * 14, vector_src_14);
  587. _mm512_storeu_epi8(local_dstp + n * 15, vector_src_15);
  588. _mm512_storeu_epi8(local_dstp + n * 16, vector_src_16);
  589. _mm512_storeu_epi8(local_dstp + n * 17, vector_src_17);
  590. _mm512_storeu_epi8(local_dstp + n * 18, vector_src_18);
  591. _mm512_storeu_epi8(local_dstp + n * 19, vector_src_19);
  592. _mm512_storeu_epi8(local_dstp + n * 20, vector_src_20);
  593. _mm512_storeu_epi8(local_dstp + n * 21, vector_src_21);
  594. _mm512_storeu_epi8(local_dstp + n * 22, vector_src_22);
  595. _mm512_storeu_epi8(local_dstp + n * 23, vector_src_23);
  596. _mm512_storeu_epi8(local_dstp + n * 24, vector_src_24);
  597. _mm512_storeu_epi8(local_dstp + n * 25, vector_src_25);
  598. _mm512_storeu_epi8(local_dstp + n * 26, vector_src_26);
  599. _mm512_storeu_epi8(local_dstp + n * 27, vector_src_27);
  600. _mm512_storeu_epi8(local_dstp + n * 28, vector_src_28);
  601. _mm512_storeu_epi8(local_dstp + n * 29, vector_src_29);
  602. _mm512_storeu_epi8(local_dstp + n * 30, vector_src_30);
  603.  
  604. local_srcp += (n * 30);
  605. local_dstp += (n * 30);
  606. }
  607.  
  608. row_size_mod = row_size_rst - (row_size_rst % (n * 15));
  609. row_size_rst = row_size_rst % (n * 15);
  610.  
  611. for (auto column = 0; column < row_size_mod; column += (n * 15))
  612. {
  613. __m512i vector_src_00 = _mm512_loadu_epi8(local_srcp + n * 0);
  614. __m512i vector_src_01 = _mm512_loadu_epi8(local_srcp + n * 1);
  615. __m512i vector_src_02 = _mm512_loadu_epi8(local_srcp + n * 2);
  616. __m512i vector_src_03 = _mm512_loadu_epi8(local_srcp + n * 3);
  617. __m512i vector_src_04 = _mm512_loadu_epi8(local_srcp + n * 4);
  618. __m512i vector_src_05 = _mm512_loadu_epi8(local_srcp + n * 5);
  619. __m512i vector_src_06 = _mm512_loadu_epi8(local_srcp + n * 6);
  620. __m512i vector_src_07 = _mm512_loadu_epi8(local_srcp + n * 7);
  621. __m512i vector_src_08 = _mm512_loadu_epi8(local_srcp + n * 8);
  622. __m512i vector_src_09 = _mm512_loadu_epi8(local_srcp + n * 9);
  623. __m512i vector_src_10 = _mm512_loadu_epi8(local_srcp + n * 10);
  624. __m512i vector_src_11 = _mm512_loadu_epi8(local_srcp + n * 11);
  625. __m512i vector_src_12 = _mm512_loadu_epi8(local_srcp + n * 12);
  626. __m512i vector_src_13 = _mm512_loadu_epi8(local_srcp + n * 13);
  627. __m512i vector_src_14 = _mm512_loadu_epi8(local_srcp + n * 14);
  628. __m512i vector_src_15 = _mm512_loadu_epi8(local_srcp + n * 15);
  629.  
  630. vector_src_00 = _mm512_sub_epi8(vector_max, vector_src_00);
  631. vector_src_01 = _mm512_sub_epi8(vector_max, vector_src_01);
  632. vector_src_02 = _mm512_sub_epi8(vector_max, vector_src_02);
  633. vector_src_03 = _mm512_sub_epi8(vector_max, vector_src_03);
  634. vector_src_04 = _mm512_sub_epi8(vector_max, vector_src_04);
  635. vector_src_05 = _mm512_sub_epi8(vector_max, vector_src_05);
  636. vector_src_06 = _mm512_sub_epi8(vector_max, vector_src_06);
  637. vector_src_07 = _mm512_sub_epi8(vector_max, vector_src_07);
  638. vector_src_08 = _mm512_sub_epi8(vector_max, vector_src_08);
  639. vector_src_09 = _mm512_sub_epi8(vector_max, vector_src_09);
  640. vector_src_10 = _mm512_sub_epi8(vector_max, vector_src_10);
  641. vector_src_11 = _mm512_sub_epi8(vector_max, vector_src_11);
  642. vector_src_12 = _mm512_sub_epi8(vector_max, vector_src_12);
  643. vector_src_13 = _mm512_sub_epi8(vector_max, vector_src_13);
  644. vector_src_14 = _mm512_sub_epi8(vector_max, vector_src_14);
  645. vector_src_15 = _mm512_sub_epi8(vector_max, vector_src_15);
  646.  
  647. _mm512_storeu_epi8(local_dstp + n * 0, vector_src_00);
  648. _mm512_storeu_epi8(local_dstp + n * 1, vector_src_01);
  649. _mm512_storeu_epi8(local_dstp + n * 2, vector_src_02);
  650. _mm512_storeu_epi8(local_dstp + n * 3, vector_src_03);
  651. _mm512_storeu_epi8(local_dstp + n * 4, vector_src_04);
  652. _mm512_storeu_epi8(local_dstp + n * 5, vector_src_05);
  653. _mm512_storeu_epi8(local_dstp + n * 6, vector_src_06);
  654. _mm512_storeu_epi8(local_dstp + n * 7, vector_src_07);
  655. _mm512_storeu_epi8(local_dstp + n * 8, vector_src_08);
  656. _mm512_storeu_epi8(local_dstp + n * 9, vector_src_09);
  657. _mm512_storeu_epi8(local_dstp + n * 10, vector_src_10);
  658. _mm512_storeu_epi8(local_dstp + n * 11, vector_src_11);
  659. _mm512_storeu_epi8(local_dstp + n * 12, vector_src_12);
  660. _mm512_storeu_epi8(local_dstp + n * 13, vector_src_13);
  661. _mm512_storeu_epi8(local_dstp + n * 14, vector_src_14);
  662. _mm512_storeu_epi8(local_dstp + n * 15, vector_src_15);
  663.  
  664. local_srcp += (n * 15);
  665. local_dstp += (n * 15);
  666. }
  667.  
  668. row_size_mod = row_size_rst - (row_size_rst % (n * 7));
  669. row_size_rst = row_size_rst % (n * 7);
  670.  
  671. for (auto column = 0; column < row_size_mod; column += (n * 7))
  672. {
  673. __m512i vector_src_00 = _mm512_loadu_epi8(local_srcp + n * 0);
  674. __m512i vector_src_01 = _mm512_loadu_epi8(local_srcp + n * 1);
  675. __m512i vector_src_02 = _mm512_loadu_epi8(local_srcp + n * 2);
  676. __m512i vector_src_03 = _mm512_loadu_epi8(local_srcp + n * 3);
  677. __m512i vector_src_04 = _mm512_loadu_epi8(local_srcp + n * 4);
  678. __m512i vector_src_05 = _mm512_loadu_epi8(local_srcp + n * 5);
  679. __m512i vector_src_06 = _mm512_loadu_epi8(local_srcp + n * 6);
  680. __m512i vector_src_07 = _mm512_loadu_epi8(local_srcp + n * 7);
  681.  
  682. vector_src_00 = _mm512_sub_epi8(vector_max, vector_src_00);
  683. vector_src_01 = _mm512_sub_epi8(vector_max, vector_src_01);
  684. vector_src_02 = _mm512_sub_epi8(vector_max, vector_src_02);
  685. vector_src_03 = _mm512_sub_epi8(vector_max, vector_src_03);
  686. vector_src_04 = _mm512_sub_epi8(vector_max, vector_src_04);
  687. vector_src_05 = _mm512_sub_epi8(vector_max, vector_src_05);
  688. vector_src_06 = _mm512_sub_epi8(vector_max, vector_src_06);
  689. vector_src_07 = _mm512_sub_epi8(vector_max, vector_src_07);
  690.  
  691. _mm512_storeu_epi8(local_dstp + n * 0, vector_src_00);
  692. _mm512_storeu_epi8(local_dstp + n * 1, vector_src_01);
  693. _mm512_storeu_epi8(local_dstp + n * 2, vector_src_02);
  694. _mm512_storeu_epi8(local_dstp + n * 3, vector_src_03);
  695. _mm512_storeu_epi8(local_dstp + n * 4, vector_src_04);
  696. _mm512_storeu_epi8(local_dstp + n * 5, vector_src_05);
  697. _mm512_storeu_epi8(local_dstp + n * 6, vector_src_06);
  698. _mm512_storeu_epi8(local_dstp + n * 7, vector_src_07);
  699.  
  700. local_srcp += (n * 7);
  701. local_dstp += (n * 7);
  702. }
  703.  
  704. row_size_mod = row_size_rst - (row_size_rst % (n * 3));
  705. row_size_rst = row_size_rst % (n * 3);
  706.  
  707. for (auto column = 0; column < row_size_mod; column += (n * 3))
  708. {
  709. __m512i vector_src_00 = _mm512_loadu_epi8(local_srcp + n * 0);
  710. __m512i vector_src_01 = _mm512_loadu_epi8(local_srcp + n * 1);
  711. __m512i vector_src_02 = _mm512_loadu_epi8(local_srcp + n * 2);
  712. __m512i vector_src_03 = _mm512_loadu_epi8(local_srcp + n * 3);
  713.  
  714. vector_src_00 = _mm512_sub_epi8(vector_max, vector_src_00);
  715. vector_src_01 = _mm512_sub_epi8(vector_max, vector_src_01);
  716. vector_src_02 = _mm512_sub_epi8(vector_max, vector_src_02);
  717. vector_src_03 = _mm512_sub_epi8(vector_max, vector_src_03);
  718.  
  719. _mm512_storeu_epi8(local_dstp + n * 0, vector_src_00);
  720. _mm512_storeu_epi8(local_dstp + n * 1, vector_src_01);
  721. _mm512_storeu_epi8(local_dstp + n * 2, vector_src_02);
  722. _mm512_storeu_epi8(local_dstp + n * 3, vector_src_03);
  723.  
  724. local_srcp += (n * 3);
  725. local_dstp += (n * 3);
  726. }
  727.  
  728. for (auto column = row_size_mod; column < row_size; column++)
  729. {
  730. *local_dstp = (uint8_t)(*local_srcp ^ 255);
  731. local_dstp++;
  732. local_srcp++;
  733. }
  734. }
  735. }
  736. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement