#include <windows.h>
#include <avisynth.h>
#include <immintrin.h>
#include <cstdint>

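// InvertNeg: AviSynth+ sample filter that inverts (negates) the Y plane of a
// greyscale clip. Three interchangeable implementations of the per-pixel invert
// follow: a plain C++ path (Invert), a 128-bit SSE path (Invert_SSE4) and a
// 256-bit AVX2 path (Invert_AVX2). All of them split the rows across OpenMP
// threads ("threads" parameter) and handle 8-bit, 10/12/14/16-bit and
// 32-bit float samples.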
void Invert(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads)
{
    if (bits == 32)
    {
        float* dstp = reinterpret_cast<float*>(_dstp);
        const float* srcp = reinterpret_cast<const float*>(_srcp);

        #pragma omp parallel for num_threads(threads)
        for (auto y = 0; y < height; y++)
        {
            float* local_dstp = dstp + y * dst_pitch;
            const float* local_srcp = srcp + y * src_pitch;

            for (auto x = 0; x < row_size; x++)
            {
                local_dstp[x] = (float)(1.0f - local_srcp[x]);
            }
        }
    }
    else if (bits == 16 || bits == 14 || bits == 12 || bits == 10)
    {
        uint16_t max_pixel = (1 << bits) - 1;
        uint16_t* dstp = reinterpret_cast<uint16_t*>(_dstp);
        const uint16_t* srcp = reinterpret_cast<const uint16_t*>(_srcp);

        #pragma omp parallel for num_threads(threads)
        for (auto y = 0; y < height; y++)
        {
            uint16_t* local_dstp = dstp + y * dst_pitch;
            const uint16_t* local_srcp = srcp + y * src_pitch;

            for (auto x = 0; x < row_size; x++)
            {
                local_dstp[x] = (uint16_t)(local_srcp[x] ^ max_pixel);
            }
        }
    }
    else
    {
        uint8_t* dstp = reinterpret_cast<uint8_t*>(_dstp);
        const uint8_t* srcp = reinterpret_cast<const uint8_t*>(_srcp);

        #pragma omp parallel for num_threads(threads)
        for (auto y = 0; y < height; y++)
        {
            uint8_t* local_dstp = dstp + y * dst_pitch;
            const uint8_t* local_srcp = srcp + y * src_pitch;

            for (auto x = 0; x < row_size; x++)
            {
                local_dstp[x] = (uint8_t)(local_srcp[x] ^ 255);
            }
        }
    }
}

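// AVX2 path: each row is processed in unrolled blocks of fourteen 256-bit
// vectors (448 bytes), with a scalar tail loop for the leftover pixels.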
void Invert_AVX2(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads)
{
    if (bits == 32)
    {
        float* dstp = reinterpret_cast<float*>(_dstp);
        const float* srcp = reinterpret_cast<const float*>(_srcp);

        #pragma omp parallel for num_threads(threads)
        for (auto y = 0; y < height; y++)
        {
            float* local_dstp = dstp + y * dst_pitch;
            const float* local_srcp = srcp + y * src_pitch;

            __m256 vector_max = _mm256_set1_ps(1.0f);
            auto row_size_mod = row_size - (row_size % 112);

            // Unrolled main loop: 14 x 8 = 112 floats per iteration.
            for (auto column = 0; column < row_size_mod; column += 112)
            {
                __m256 vector_src_01 = _mm256_loadu_ps(local_srcp);
                __m256 vector_src_02 = _mm256_loadu_ps(local_srcp + 8);
                __m256 vector_src_03 = _mm256_loadu_ps(local_srcp + 16);
                __m256 vector_src_04 = _mm256_loadu_ps(local_srcp + 24);
                __m256 vector_src_05 = _mm256_loadu_ps(local_srcp + 32);
                __m256 vector_src_06 = _mm256_loadu_ps(local_srcp + 40);
                __m256 vector_src_07 = _mm256_loadu_ps(local_srcp + 48);
                __m256 vector_src_08 = _mm256_loadu_ps(local_srcp + 56);
                __m256 vector_src_09 = _mm256_loadu_ps(local_srcp + 64);
                __m256 vector_src_10 = _mm256_loadu_ps(local_srcp + 72);
                __m256 vector_src_11 = _mm256_loadu_ps(local_srcp + 80);
                __m256 vector_src_12 = _mm256_loadu_ps(local_srcp + 88);
                __m256 vector_src_13 = _mm256_loadu_ps(local_srcp + 96);
                __m256 vector_src_14 = _mm256_loadu_ps(local_srcp + 104);

                vector_src_01 = _mm256_sub_ps(vector_max, vector_src_01);
                vector_src_02 = _mm256_sub_ps(vector_max, vector_src_02);
                vector_src_03 = _mm256_sub_ps(vector_max, vector_src_03);
                vector_src_04 = _mm256_sub_ps(vector_max, vector_src_04);
                vector_src_05 = _mm256_sub_ps(vector_max, vector_src_05);
                vector_src_06 = _mm256_sub_ps(vector_max, vector_src_06);
                vector_src_07 = _mm256_sub_ps(vector_max, vector_src_07);
                vector_src_08 = _mm256_sub_ps(vector_max, vector_src_08);
                vector_src_09 = _mm256_sub_ps(vector_max, vector_src_09);
                vector_src_10 = _mm256_sub_ps(vector_max, vector_src_10);
                vector_src_11 = _mm256_sub_ps(vector_max, vector_src_11);
                vector_src_12 = _mm256_sub_ps(vector_max, vector_src_12);
                vector_src_13 = _mm256_sub_ps(vector_max, vector_src_13);
                vector_src_14 = _mm256_sub_ps(vector_max, vector_src_14);

                _mm256_storeu_ps(local_dstp, vector_src_01);
                _mm256_storeu_ps(local_dstp + 8, vector_src_02);
                _mm256_storeu_ps(local_dstp + 16, vector_src_03);
                _mm256_storeu_ps(local_dstp + 24, vector_src_04);
                _mm256_storeu_ps(local_dstp + 32, vector_src_05);
                _mm256_storeu_ps(local_dstp + 40, vector_src_06);
                _mm256_storeu_ps(local_dstp + 48, vector_src_07);
                _mm256_storeu_ps(local_dstp + 56, vector_src_08);
                _mm256_storeu_ps(local_dstp + 64, vector_src_09);
                _mm256_storeu_ps(local_dstp + 72, vector_src_10);
                _mm256_storeu_ps(local_dstp + 80, vector_src_11);
                _mm256_storeu_ps(local_dstp + 88, vector_src_12);
                _mm256_storeu_ps(local_dstp + 96, vector_src_13);
                _mm256_storeu_ps(local_dstp + 104, vector_src_14);

                local_srcp += 112;
                local_dstp += 112;
            }
            // Scalar tail for the pixels that do not fill a whole 112-float block.
            for (auto column = row_size_mod; column < row_size; column++)
            {
                *local_dstp = (float)(1.0f - *local_srcp);
                local_dstp++;
                local_srcp++;
            }
        }
    }
    else if (bits == 16 || bits == 14 || bits == 12 || bits == 10)
    {
        uint16_t max_pixel = (1 << bits) - 1;
        uint16_t* dstp = reinterpret_cast<uint16_t*>(_dstp);
        const uint16_t* srcp = reinterpret_cast<const uint16_t*>(_srcp);

        #pragma omp parallel for num_threads(threads)
        for (auto y = 0; y < height; y++)
        {
            uint16_t* local_dstp = dstp + y * dst_pitch;
            const uint16_t* local_srcp = srcp + y * src_pitch;

            __m256i vector_max = _mm256_set1_epi16(max_pixel);
            auto row_size_mod = row_size - (row_size % 224);

            // Unrolled main loop: 14 x 16 = 224 samples per iteration.
            for (auto column = 0; column < row_size_mod; column += 224)
            {
                __m256i vector_src_01 = _mm256_loadu_si256((const __m256i*)(local_srcp));
                __m256i vector_src_02 = _mm256_loadu_si256((const __m256i*)(local_srcp + 16));
                __m256i vector_src_03 = _mm256_loadu_si256((const __m256i*)(local_srcp + 32));
                __m256i vector_src_04 = _mm256_loadu_si256((const __m256i*)(local_srcp + 48));
                __m256i vector_src_05 = _mm256_loadu_si256((const __m256i*)(local_srcp + 64));
                __m256i vector_src_06 = _mm256_loadu_si256((const __m256i*)(local_srcp + 80));
                __m256i vector_src_07 = _mm256_loadu_si256((const __m256i*)(local_srcp + 96));
                __m256i vector_src_08 = _mm256_loadu_si256((const __m256i*)(local_srcp + 112));
                __m256i vector_src_09 = _mm256_loadu_si256((const __m256i*)(local_srcp + 128));
                __m256i vector_src_10 = _mm256_loadu_si256((const __m256i*)(local_srcp + 144));
                __m256i vector_src_11 = _mm256_loadu_si256((const __m256i*)(local_srcp + 160));
                __m256i vector_src_12 = _mm256_loadu_si256((const __m256i*)(local_srcp + 176));
                __m256i vector_src_13 = _mm256_loadu_si256((const __m256i*)(local_srcp + 192));
                __m256i vector_src_14 = _mm256_loadu_si256((const __m256i*)(local_srcp + 208));

                vector_src_01 = _mm256_sub_epi16(vector_max, vector_src_01);
                vector_src_02 = _mm256_sub_epi16(vector_max, vector_src_02);
                vector_src_03 = _mm256_sub_epi16(vector_max, vector_src_03);
                vector_src_04 = _mm256_sub_epi16(vector_max, vector_src_04);
                vector_src_05 = _mm256_sub_epi16(vector_max, vector_src_05);
                vector_src_06 = _mm256_sub_epi16(vector_max, vector_src_06);
                vector_src_07 = _mm256_sub_epi16(vector_max, vector_src_07);
                vector_src_08 = _mm256_sub_epi16(vector_max, vector_src_08);
                vector_src_09 = _mm256_sub_epi16(vector_max, vector_src_09);
                vector_src_10 = _mm256_sub_epi16(vector_max, vector_src_10);
                vector_src_11 = _mm256_sub_epi16(vector_max, vector_src_11);
                vector_src_12 = _mm256_sub_epi16(vector_max, vector_src_12);
                vector_src_13 = _mm256_sub_epi16(vector_max, vector_src_13);
                vector_src_14 = _mm256_sub_epi16(vector_max, vector_src_14);

                _mm256_storeu_si256((__m256i*)(local_dstp), vector_src_01);
                _mm256_storeu_si256((__m256i*)(local_dstp + 16), vector_src_02);
                _mm256_storeu_si256((__m256i*)(local_dstp + 32), vector_src_03);
                _mm256_storeu_si256((__m256i*)(local_dstp + 48), vector_src_04);
                _mm256_storeu_si256((__m256i*)(local_dstp + 64), vector_src_05);
                _mm256_storeu_si256((__m256i*)(local_dstp + 80), vector_src_06);
                _mm256_storeu_si256((__m256i*)(local_dstp + 96), vector_src_07);
                _mm256_storeu_si256((__m256i*)(local_dstp + 112), vector_src_08);
                _mm256_storeu_si256((__m256i*)(local_dstp + 128), vector_src_09);
                _mm256_storeu_si256((__m256i*)(local_dstp + 144), vector_src_10);
                _mm256_storeu_si256((__m256i*)(local_dstp + 160), vector_src_11);
                _mm256_storeu_si256((__m256i*)(local_dstp + 176), vector_src_12);
                _mm256_storeu_si256((__m256i*)(local_dstp + 192), vector_src_13);
                _mm256_storeu_si256((__m256i*)(local_dstp + 208), vector_src_14);

                local_srcp += 224;
                local_dstp += 224;
            }
            // Scalar tail for the remaining samples.
            for (auto column = row_size_mod; column < row_size; column++)
            {
                *local_dstp = (uint16_t)(*local_srcp ^ max_pixel);
                local_dstp++;
                local_srcp++;
            }
        }
    }
    else
    {
        uint8_t* dstp = reinterpret_cast<uint8_t*>(_dstp);
        const uint8_t* srcp = reinterpret_cast<const uint8_t*>(_srcp);

        #pragma omp parallel for num_threads(threads)
        for (auto y = 0; y < height; y++)
        {
            uint8_t* local_dstp = dstp + y * dst_pitch;
            const uint8_t* local_srcp = srcp + y * src_pitch;

            __m256i vector_max = _mm256_set1_epi8((char)255);
            auto row_size_mod = row_size - (row_size % 448);

            // Unrolled main loop: 14 x 32 = 448 samples per iteration.
            for (auto column = 0; column < row_size_mod; column += 448)
            {
                __m256i vector_src_01 = _mm256_loadu_si256((const __m256i*)(local_srcp));
                __m256i vector_src_02 = _mm256_loadu_si256((const __m256i*)(local_srcp + 32));
                __m256i vector_src_03 = _mm256_loadu_si256((const __m256i*)(local_srcp + 64));
                __m256i vector_src_04 = _mm256_loadu_si256((const __m256i*)(local_srcp + 96));
                __m256i vector_src_05 = _mm256_loadu_si256((const __m256i*)(local_srcp + 128));
                __m256i vector_src_06 = _mm256_loadu_si256((const __m256i*)(local_srcp + 160));
                __m256i vector_src_07 = _mm256_loadu_si256((const __m256i*)(local_srcp + 192));
                __m256i vector_src_08 = _mm256_loadu_si256((const __m256i*)(local_srcp + 224));
                __m256i vector_src_09 = _mm256_loadu_si256((const __m256i*)(local_srcp + 256));
                __m256i vector_src_10 = _mm256_loadu_si256((const __m256i*)(local_srcp + 288));
                __m256i vector_src_11 = _mm256_loadu_si256((const __m256i*)(local_srcp + 320));
                __m256i vector_src_12 = _mm256_loadu_si256((const __m256i*)(local_srcp + 352));
                __m256i vector_src_13 = _mm256_loadu_si256((const __m256i*)(local_srcp + 384));
                __m256i vector_src_14 = _mm256_loadu_si256((const __m256i*)(local_srcp + 416));

                vector_src_01 = _mm256_sub_epi8(vector_max, vector_src_01);
                vector_src_02 = _mm256_sub_epi8(vector_max, vector_src_02);
                vector_src_03 = _mm256_sub_epi8(vector_max, vector_src_03);
                vector_src_04 = _mm256_sub_epi8(vector_max, vector_src_04);
                vector_src_05 = _mm256_sub_epi8(vector_max, vector_src_05);
                vector_src_06 = _mm256_sub_epi8(vector_max, vector_src_06);
                vector_src_07 = _mm256_sub_epi8(vector_max, vector_src_07);
                vector_src_08 = _mm256_sub_epi8(vector_max, vector_src_08);
                vector_src_09 = _mm256_sub_epi8(vector_max, vector_src_09);
                vector_src_10 = _mm256_sub_epi8(vector_max, vector_src_10);
                vector_src_11 = _mm256_sub_epi8(vector_max, vector_src_11);
                vector_src_12 = _mm256_sub_epi8(vector_max, vector_src_12);
                vector_src_13 = _mm256_sub_epi8(vector_max, vector_src_13);
                vector_src_14 = _mm256_sub_epi8(vector_max, vector_src_14);

                _mm256_storeu_si256((__m256i*)(local_dstp), vector_src_01);
                _mm256_storeu_si256((__m256i*)(local_dstp + 32), vector_src_02);
                _mm256_storeu_si256((__m256i*)(local_dstp + 64), vector_src_03);
                _mm256_storeu_si256((__m256i*)(local_dstp + 96), vector_src_04);
                _mm256_storeu_si256((__m256i*)(local_dstp + 128), vector_src_05);
                _mm256_storeu_si256((__m256i*)(local_dstp + 160), vector_src_06);
                _mm256_storeu_si256((__m256i*)(local_dstp + 192), vector_src_07);
                _mm256_storeu_si256((__m256i*)(local_dstp + 224), vector_src_08);
                _mm256_storeu_si256((__m256i*)(local_dstp + 256), vector_src_09);
                _mm256_storeu_si256((__m256i*)(local_dstp + 288), vector_src_10);
                _mm256_storeu_si256((__m256i*)(local_dstp + 320), vector_src_11);
                _mm256_storeu_si256((__m256i*)(local_dstp + 352), vector_src_12);
                _mm256_storeu_si256((__m256i*)(local_dstp + 384), vector_src_13);
                _mm256_storeu_si256((__m256i*)(local_dstp + 416), vector_src_14);

                local_srcp += 448;
                local_dstp += 448;
            }
            // Scalar tail for the remaining samples.
            for (auto column = row_size_mod; column < row_size; column++)
            {
                *local_dstp = (uint8_t)(*local_srcp ^ 255);
                local_dstp++;
                local_srcp++;
            }
        }
    }
}

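// SSE path: same structure as the AVX2 version, but with 128-bit vectors, so
// each unrolled block covers fourteen XMM registers (224 bytes) per iteration.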
void Invert_SSE4(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads)
{
    if (bits == 32)
    {
        float* dstp = reinterpret_cast<float*>(_dstp);
        const float* srcp = reinterpret_cast<const float*>(_srcp);

        #pragma omp parallel for num_threads(threads)
        for (auto y = 0; y < height; y++)
        {
            float* local_dstp = dstp + y * dst_pitch;
            const float* local_srcp = srcp + y * src_pitch;

            __m128 vector_max = _mm_set1_ps(1.0f);
            auto row_size_mod = row_size - (row_size % 56);

            // Unrolled main loop: 14 x 4 = 56 floats per iteration.
            for (auto column = 0; column < row_size_mod; column += 56)
            {
                __m128 vector_src_01 = _mm_loadu_ps(local_srcp);
                __m128 vector_src_02 = _mm_loadu_ps(local_srcp + 4);
                __m128 vector_src_03 = _mm_loadu_ps(local_srcp + 8);
                __m128 vector_src_04 = _mm_loadu_ps(local_srcp + 12);
                __m128 vector_src_05 = _mm_loadu_ps(local_srcp + 16);
                __m128 vector_src_06 = _mm_loadu_ps(local_srcp + 20);
                __m128 vector_src_07 = _mm_loadu_ps(local_srcp + 24);
                __m128 vector_src_08 = _mm_loadu_ps(local_srcp + 28);
                __m128 vector_src_09 = _mm_loadu_ps(local_srcp + 32);
                __m128 vector_src_10 = _mm_loadu_ps(local_srcp + 36);
                __m128 vector_src_11 = _mm_loadu_ps(local_srcp + 40);
                __m128 vector_src_12 = _mm_loadu_ps(local_srcp + 44);
                __m128 vector_src_13 = _mm_loadu_ps(local_srcp + 48);
                __m128 vector_src_14 = _mm_loadu_ps(local_srcp + 52);

                vector_src_01 = _mm_sub_ps(vector_max, vector_src_01);
                vector_src_02 = _mm_sub_ps(vector_max, vector_src_02);
                vector_src_03 = _mm_sub_ps(vector_max, vector_src_03);
                vector_src_04 = _mm_sub_ps(vector_max, vector_src_04);
                vector_src_05 = _mm_sub_ps(vector_max, vector_src_05);
                vector_src_06 = _mm_sub_ps(vector_max, vector_src_06);
                vector_src_07 = _mm_sub_ps(vector_max, vector_src_07);
                vector_src_08 = _mm_sub_ps(vector_max, vector_src_08);
                vector_src_09 = _mm_sub_ps(vector_max, vector_src_09);
                vector_src_10 = _mm_sub_ps(vector_max, vector_src_10);
                vector_src_11 = _mm_sub_ps(vector_max, vector_src_11);
                vector_src_12 = _mm_sub_ps(vector_max, vector_src_12);
                vector_src_13 = _mm_sub_ps(vector_max, vector_src_13);
                vector_src_14 = _mm_sub_ps(vector_max, vector_src_14);

                _mm_storeu_ps(local_dstp, vector_src_01);
                _mm_storeu_ps(local_dstp + 4, vector_src_02);
                _mm_storeu_ps(local_dstp + 8, vector_src_03);
                _mm_storeu_ps(local_dstp + 12, vector_src_04);
                _mm_storeu_ps(local_dstp + 16, vector_src_05);
                _mm_storeu_ps(local_dstp + 20, vector_src_06);
                _mm_storeu_ps(local_dstp + 24, vector_src_07);
                _mm_storeu_ps(local_dstp + 28, vector_src_08);
                _mm_storeu_ps(local_dstp + 32, vector_src_09);
                _mm_storeu_ps(local_dstp + 36, vector_src_10);
                _mm_storeu_ps(local_dstp + 40, vector_src_11);
                _mm_storeu_ps(local_dstp + 44, vector_src_12);
                _mm_storeu_ps(local_dstp + 48, vector_src_13);
                _mm_storeu_ps(local_dstp + 52, vector_src_14);

                local_srcp += 56;
                local_dstp += 56;
            }
            // Scalar tail for the pixels that do not fill a whole 56-float block.
            for (auto column = row_size_mod; column < row_size; column++)
            {
                *local_dstp = (float)(1.0f - *local_srcp);
                local_dstp++;
                local_srcp++;
            }
        }
    }
    else if (bits == 16 || bits == 14 || bits == 12 || bits == 10)
    {
        uint16_t max_pixel = (1 << bits) - 1;
        uint16_t* dstp = reinterpret_cast<uint16_t*>(_dstp);
        const uint16_t* srcp = reinterpret_cast<const uint16_t*>(_srcp);

        #pragma omp parallel for num_threads(threads)
        for (auto y = 0; y < height; y++)
        {
            uint16_t* local_dstp = dstp + y * dst_pitch;
            const uint16_t* local_srcp = srcp + y * src_pitch;

            __m128i vector_max = _mm_set1_epi16(max_pixel);
            auto row_size_mod = row_size - (row_size % 112);

            // Unrolled main loop: 14 x 8 = 112 samples per iteration.
            for (auto column = 0; column < row_size_mod; column += 112)
            {
                __m128i vector_src_01 = _mm_loadu_si128((const __m128i*)(local_srcp));
                __m128i vector_src_02 = _mm_loadu_si128((const __m128i*)(local_srcp + 8));
                __m128i vector_src_03 = _mm_loadu_si128((const __m128i*)(local_srcp + 16));
                __m128i vector_src_04 = _mm_loadu_si128((const __m128i*)(local_srcp + 24));
                __m128i vector_src_05 = _mm_loadu_si128((const __m128i*)(local_srcp + 32));
                __m128i vector_src_06 = _mm_loadu_si128((const __m128i*)(local_srcp + 40));
                __m128i vector_src_07 = _mm_loadu_si128((const __m128i*)(local_srcp + 48));
                __m128i vector_src_08 = _mm_loadu_si128((const __m128i*)(local_srcp + 56));
                __m128i vector_src_09 = _mm_loadu_si128((const __m128i*)(local_srcp + 64));
                __m128i vector_src_10 = _mm_loadu_si128((const __m128i*)(local_srcp + 72));
                __m128i vector_src_11 = _mm_loadu_si128((const __m128i*)(local_srcp + 80));
                __m128i vector_src_12 = _mm_loadu_si128((const __m128i*)(local_srcp + 88));
                __m128i vector_src_13 = _mm_loadu_si128((const __m128i*)(local_srcp + 96));
                __m128i vector_src_14 = _mm_loadu_si128((const __m128i*)(local_srcp + 104));

                vector_src_01 = _mm_sub_epi16(vector_max, vector_src_01);
                vector_src_02 = _mm_sub_epi16(vector_max, vector_src_02);
                vector_src_03 = _mm_sub_epi16(vector_max, vector_src_03);
                vector_src_04 = _mm_sub_epi16(vector_max, vector_src_04);
                vector_src_05 = _mm_sub_epi16(vector_max, vector_src_05);
                vector_src_06 = _mm_sub_epi16(vector_max, vector_src_06);
                vector_src_07 = _mm_sub_epi16(vector_max, vector_src_07);
                vector_src_08 = _mm_sub_epi16(vector_max, vector_src_08);
                vector_src_09 = _mm_sub_epi16(vector_max, vector_src_09);
                vector_src_10 = _mm_sub_epi16(vector_max, vector_src_10);
                vector_src_11 = _mm_sub_epi16(vector_max, vector_src_11);
                vector_src_12 = _mm_sub_epi16(vector_max, vector_src_12);
                vector_src_13 = _mm_sub_epi16(vector_max, vector_src_13);
                vector_src_14 = _mm_sub_epi16(vector_max, vector_src_14);

                _mm_storeu_si128((__m128i*)(local_dstp), vector_src_01);
                _mm_storeu_si128((__m128i*)(local_dstp + 8), vector_src_02);
                _mm_storeu_si128((__m128i*)(local_dstp + 16), vector_src_03);
                _mm_storeu_si128((__m128i*)(local_dstp + 24), vector_src_04);
                _mm_storeu_si128((__m128i*)(local_dstp + 32), vector_src_05);
                _mm_storeu_si128((__m128i*)(local_dstp + 40), vector_src_06);
                _mm_storeu_si128((__m128i*)(local_dstp + 48), vector_src_07);
                _mm_storeu_si128((__m128i*)(local_dstp + 56), vector_src_08);
                _mm_storeu_si128((__m128i*)(local_dstp + 64), vector_src_09);
                _mm_storeu_si128((__m128i*)(local_dstp + 72), vector_src_10);
                _mm_storeu_si128((__m128i*)(local_dstp + 80), vector_src_11);
                _mm_storeu_si128((__m128i*)(local_dstp + 88), vector_src_12);
                _mm_storeu_si128((__m128i*)(local_dstp + 96), vector_src_13);
                _mm_storeu_si128((__m128i*)(local_dstp + 104), vector_src_14);

                local_srcp += 112;
                local_dstp += 112;
            }
            // Scalar tail for the remaining samples.
            for (auto column = row_size_mod; column < row_size; column++)
            {
                *local_dstp = (uint16_t)(*local_srcp ^ max_pixel);
                local_dstp++;
                local_srcp++;
            }
        }
    }
    else
    {
        uint8_t* dstp = reinterpret_cast<uint8_t*>(_dstp);
        const uint8_t* srcp = reinterpret_cast<const uint8_t*>(_srcp);

        #pragma omp parallel for num_threads(threads)
        for (auto y = 0; y < height; y++)
        {
            uint8_t* local_dstp = dstp + y * dst_pitch;
            const uint8_t* local_srcp = srcp + y * src_pitch;

            __m128i vector_max = _mm_set1_epi8((char)255);
            auto row_size_mod = row_size - (row_size % 224);

            // Unrolled main loop: 14 x 16 = 224 samples per iteration.
            for (auto column = 0; column < row_size_mod; column += 224)
            {
                __m128i vector_src_01 = _mm_loadu_si128((const __m128i*)(local_srcp));
                __m128i vector_src_02 = _mm_loadu_si128((const __m128i*)(local_srcp + 16));
                __m128i vector_src_03 = _mm_loadu_si128((const __m128i*)(local_srcp + 32));
                __m128i vector_src_04 = _mm_loadu_si128((const __m128i*)(local_srcp + 48));
                __m128i vector_src_05 = _mm_loadu_si128((const __m128i*)(local_srcp + 64));
                __m128i vector_src_06 = _mm_loadu_si128((const __m128i*)(local_srcp + 80));
                __m128i vector_src_07 = _mm_loadu_si128((const __m128i*)(local_srcp + 96));
                __m128i vector_src_08 = _mm_loadu_si128((const __m128i*)(local_srcp + 112));
                __m128i vector_src_09 = _mm_loadu_si128((const __m128i*)(local_srcp + 128));
                __m128i vector_src_10 = _mm_loadu_si128((const __m128i*)(local_srcp + 144));
                __m128i vector_src_11 = _mm_loadu_si128((const __m128i*)(local_srcp + 160));
                __m128i vector_src_12 = _mm_loadu_si128((const __m128i*)(local_srcp + 176));
                __m128i vector_src_13 = _mm_loadu_si128((const __m128i*)(local_srcp + 192));
                __m128i vector_src_14 = _mm_loadu_si128((const __m128i*)(local_srcp + 208));

                vector_src_01 = _mm_sub_epi8(vector_max, vector_src_01);
                vector_src_02 = _mm_sub_epi8(vector_max, vector_src_02);
                vector_src_03 = _mm_sub_epi8(vector_max, vector_src_03);
                vector_src_04 = _mm_sub_epi8(vector_max, vector_src_04);
                vector_src_05 = _mm_sub_epi8(vector_max, vector_src_05);
                vector_src_06 = _mm_sub_epi8(vector_max, vector_src_06);
                vector_src_07 = _mm_sub_epi8(vector_max, vector_src_07);
                vector_src_08 = _mm_sub_epi8(vector_max, vector_src_08);
                vector_src_09 = _mm_sub_epi8(vector_max, vector_src_09);
                vector_src_10 = _mm_sub_epi8(vector_max, vector_src_10);
                vector_src_11 = _mm_sub_epi8(vector_max, vector_src_11);
                vector_src_12 = _mm_sub_epi8(vector_max, vector_src_12);
                vector_src_13 = _mm_sub_epi8(vector_max, vector_src_13);
                vector_src_14 = _mm_sub_epi8(vector_max, vector_src_14);

                _mm_storeu_si128((__m128i*)(local_dstp), vector_src_01);
                _mm_storeu_si128((__m128i*)(local_dstp + 16), vector_src_02);
                _mm_storeu_si128((__m128i*)(local_dstp + 32), vector_src_03);
                _mm_storeu_si128((__m128i*)(local_dstp + 48), vector_src_04);
                _mm_storeu_si128((__m128i*)(local_dstp + 64), vector_src_05);
                _mm_storeu_si128((__m128i*)(local_dstp + 80), vector_src_06);
                _mm_storeu_si128((__m128i*)(local_dstp + 96), vector_src_07);
                _mm_storeu_si128((__m128i*)(local_dstp + 112), vector_src_08);
                _mm_storeu_si128((__m128i*)(local_dstp + 128), vector_src_09);
                _mm_storeu_si128((__m128i*)(local_dstp + 144), vector_src_10);
                _mm_storeu_si128((__m128i*)(local_dstp + 160), vector_src_11);
                _mm_storeu_si128((__m128i*)(local_dstp + 176), vector_src_12);
                _mm_storeu_si128((__m128i*)(local_dstp + 192), vector_src_13);
                _mm_storeu_si128((__m128i*)(local_dstp + 208), vector_src_14);

                local_srcp += 224;
                local_dstp += 224;
            }
            // Scalar tail for the remaining samples.
            for (auto column = row_size_mod; column < row_size; column++)
            {
                *local_dstp = (uint8_t)(*local_srcp ^ 255);
                local_dstp++;
                local_srcp++;
            }
        }
    }
}

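// Filter wrapper: validates the arguments, selects one of the three
// implementations in the constructor and applies it to the Y plane of each frame.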
class InvertNeg : public GenericVideoFilter
{
    int cpu;
    int threads;
    // Per-instance dispatch target, selected once in the constructor.
    void (*CoreFilterPtr)(const unsigned char*, unsigned char*, int, int, int, int, int, int);
public:
    InvertNeg(PClip _child, int _cpu, int _threads, IScriptEnvironment* env) : GenericVideoFilter(_child), cpu(_cpu), threads(_threads)
    {
        if (!vi.IsY()) env->ThrowError("InvertNeg: only planar greyscale (Y) input, sorry!");
        if (cpu < 0 || cpu > 2) env->ThrowError("InvertNeg: cpu must be 0 (C), 1 (SSE4) or 2 (AVX2)!");
        if (threads < 1) env->ThrowError("InvertNeg: threads must be >= 1!");

        switch (cpu)
        {
        case 1: CoreFilterPtr = Invert_SSE4; break;
        case 2: CoreFilterPtr = Invert_AVX2; break;
        default: CoreFilterPtr = Invert; break;
        }
    }
    PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
    {
        auto dst = env->NewVideoFrame(vi);
        auto dstp = dst->GetWritePtr(PLANAR_Y);
        auto dst_pitch = dst->GetPitch(PLANAR_Y) / vi.ComponentSize();
        auto src = child->GetFrame(n, env);
        auto srcp = src->GetReadPtr(PLANAR_Y);
        auto height = src->GetHeight(PLANAR_Y);
        auto row_size = src->GetRowSize(PLANAR_Y) / vi.ComponentSize();
        auto src_pitch = src->GetPitch(PLANAR_Y) / vi.ComponentSize();

        CoreFilterPtr(srcp, dstp, src_pitch, dst_pitch, height, row_size, vi.BitsPerComponent(), threads);

        return dst;
    }
};

AVSValue __cdecl Create_InvertNeg(AVSValue args, void* user_data, IScriptEnvironment* env)
{
    return new InvertNeg(args[0].AsClip(), args[1].AsInt(0), args[2].AsInt(1), env);
}

const AVS_Linkage* AVS_linkage = 0;

extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit3(IScriptEnvironment* env, const AVS_Linkage* const vectors)
{
    AVS_linkage = vectors;
    env->AddFunction("InvertNeg", "c[cpu]i[threads]i", Create_InvertNeg, 0);
    return "InvertNeg sample plugin";
}
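
// Example AviSynth script usage (a minimal sketch; the DLL name, source clip and
// parameter values below are illustrative and not defined by this file):
//
//   LoadPlugin("InvertNeg.dll")
//   ColorBars(width=640, height=480, pixel_type="YV24")
//   ConvertToY()                  # the filter only accepts greyscale (Y) clips
//   InvertNeg(cpu=2, threads=4)   # cpu: 0 = C, 1 = SSE4, 2 = AVX2; threads >= 1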