Advertisement
Guest User

Untitled

a guest
Jan 24th, 2017
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.46 KB | None | 0 0
  1. // public domain
  2. #include <cstdio>
  3. #include <random>
  4. #include <xmmintrin.h>
  5. #include <immintrin.h>
  6. #include <iostream>
  7. #include <fstream>
  8. #include <algorithm>
  9. #include <thread>
  10. #include <list>
  11.  
  12. using namespace std;
  13.  
  14. constexpr int num_iter = 100000000;
  15.  
  16. constexpr int image_size_log2 = 9;
  17. constexpr int image_size = 1 << image_size_log2;
  18.  
  19. static const float coefs[] = {
  20. 0.856f, 0.0414f, 0.07f,
  21. -0.0205f, 0.858f, 0.147f,
  22.  
  23. 0.244f, -0.385f, 0.393f,
  24. 0.176f, 0.224f, -0.102f,
  25.  
  26. -0.144f, 0.39f, 0.527f,
  27. 0.181f, 0.259f, -0.014f,
  28.  
  29. 0.0f, 0.0f, 0.486f,
  30. 0.355f, 0.216f, 0.05f
  31. };
  32.  
  33. constexpr int chance_map_size = 32;
  34.  
  35. static int32_t chance_map[chance_map_size];
  36.  
  37. int main() {
  38. list<thread> threads;
  39. vector<uint32_t *> images;
  40.  
  41. {
  42. int i = 0;
  43. for (; i < 73 * chance_map_size / 100; ++i) {
  44. chance_map[i] = 0;
  45. }
  46. for (; i < 86 * chance_map_size / 100; ++i) {
  47. chance_map[i] = 1;
  48. }
  49. for (; i < 99 * chance_map_size / 100; ++i) {
  50. chance_map[i] = 2;
  51. }
  52. for (; i < 100 * chance_map_size / 100; ++i) {
  53. chance_map[i] = 3;
  54. }
  55. }
  56.  
  57. int num_threads = thread::hardware_concurrency() / 2;
  58.  
  59. cout << "plotting " << num_iter * num_threads << " points" << endl;
  60.  
  61. for (unsigned i = 0; i < num_threads; ++i) {
  62. uint32_t *local_image = new uint32_t[image_size * image_size];
  63. images.push_back(local_image);
  64. threads.emplace_back([local_image] {
  65. fill(local_image, local_image + image_size * image_size, 0);
  66.  
  67. __m256 M[6];
  68. for (int i = 0; i < 6; ++i) {
  69. M[i] = _mm256_set_ps(coefs[i + 18], coefs[i + 12], coefs[i + 6], coefs[i],
  70. coefs[i + 18], coefs[i + 12], coefs[i + 6], coefs[i]);
  71. }
  72.  
  73. __m256i min_coord = _mm256_set1_epi32(0);
  74. __m256i max_coord = _mm256_set1_epi32(image_size - 1);
  75.  
  76. __m256i xst;
  77. {
  78. random_device rd;
  79. xst = _mm256_set_epi32(rd()|1, rd()|1, rd()|1, rd()|1, rd()|1, rd()|1, rd()|1, rd()|1);
  80. }
  81.  
  82. __m256i cur_x, cur_y;
  83. cur_x = _mm256_set1_ps(1.0f);
  84. cur_y = _mm256_set1_ps(1.0f);
  85.  
  86. for (int i = 0; i < num_iter; ++i) {
  87.  
  88. __m256i index = xst;
  89. if (false) {
  90. index = _mm256_and_si256(index, _mm256_set1_epi32(chance_map_size - 1));
  91. index = _mm256_i32gather_epi32(chance_map, index, 4);
  92. } else {
  93. index = _mm256_and_si256(index, _mm256_set1_epi32(0xffff));
  94. __m256i compare_result1 = _mm256_cmpgt_epi32(index, _mm256_set1_epi32(73 * 0x10000 / 100));
  95. __m256i compare_result2 = _mm256_cmpgt_epi32(index, _mm256_set1_epi32(99 * 0x10000 / 100));
  96. __m256i compare_result3 = _mm256_cmpgt_epi32(index, _mm256_set1_epi32(86 * 0x10000 / 100));
  97.  
  98. index = _mm256_castps_si256(_mm256_blendv_ps(
  99. _mm256_castsi256_ps(compare_result1),
  100. _mm256_castsi256_ps(compare_result2),
  101. _mm256_castsi256_ps(compare_result3)
  102. ));
  103. index = _mm256_srli_epi32(index, 31);
  104. index = _mm256_or_si256(index, _mm256_slli_epi32(compare_result3, 1));
  105. }
  106.  
  107. __m256 m00 = _mm256_permutevar_ps(M[0], index);
  108. __m256 m01 = _mm256_permutevar_ps(M[1], index);
  109. __m256 m02 = _mm256_permutevar_ps(M[2], index);
  110. __m256 m10 = _mm256_permutevar_ps(M[3], index);
  111. __m256 m11 = _mm256_permutevar_ps(M[4], index);
  112. __m256 m12 = _mm256_permutevar_ps(M[5], index);
  113.  
  114. __m256 new_x = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(cur_x, m00), _mm256_mul_ps(cur_y, m01)), m02);
  115. __m256 new_y = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(cur_x, m10), _mm256_mul_ps(cur_y, m11)), m12);
  116. cur_x = new_x; cur_y = new_y;
  117.  
  118. __m256i coord_x_f = cur_x;
  119. __m256i coord_y_f = cur_y;
  120. coord_x_f = _mm256_mul_ps(coord_x_f, _mm256_set1_ps(350.f));
  121. coord_y_f = _mm256_mul_ps(coord_y_f, _mm256_set1_ps(-350.f));
  122. coord_x_f = _mm256_add_ps(coord_x_f, _mm256_set1_ps(80.f));
  123. coord_y_f = _mm256_add_ps(coord_y_f, _mm256_set1_ps(400.f));
  124.  
  125. __m256i coord_x = _mm256_cvtps_epi32(coord_x_f);
  126. __m256i coord_y = _mm256_cvtps_epi32(coord_y_f);
  127. coord_x = _mm256_max_epi32(coord_x, min_coord);
  128. coord_y = _mm256_max_epi32(coord_y, min_coord);
  129. coord_x = _mm256_min_epi32(coord_x, max_coord);
  130. coord_y = _mm256_min_epi32(coord_y, max_coord);
  131.  
  132. __m256i address = _mm256_slli_epi32(coord_y, image_size_log2);
  133. address = _mm256_add_epi32(address, coord_x);
  134.  
  135. ++local_image[_mm256_extract_epi32(address, 0)];
  136. ++local_image[_mm256_extract_epi32(address, 1)];
  137. ++local_image[_mm256_extract_epi32(address, 2)];
  138. ++local_image[_mm256_extract_epi32(address, 3)];
  139. ++local_image[_mm256_extract_epi32(address, 4)];
  140. ++local_image[_mm256_extract_epi32(address, 5)];
  141. ++local_image[_mm256_extract_epi32(address, 6)];
  142. ++local_image[_mm256_extract_epi32(address, 7)];
  143.  
  144. xst = _mm256_xor_si256(xst, _mm256_slli_epi32(xst, 13));
  145. xst = _mm256_xor_si256(xst, _mm256_srli_epi32(xst, 17));
  146. xst = _mm256_xor_si256(xst, _mm256_slli_epi32(xst, 5));
  147. }
  148. });
  149. }
  150.  
  151. for (auto &thread: threads) {
  152. thread.join();
  153. }
  154.  
  155. uint32_t *final_image = images[0];
  156. for (size_t i = 1; i < images.size(); ++i) {
  157. uint32_t *other_image = images[i];
  158. for (int k = 0; k < image_size * image_size; k += 8) {
  159. _mm256_store_si256((__m256i *)(final_image + k),
  160. _mm256_add_epi32(_mm256_load_si256((__m256i *)(final_image + k)),
  161. _mm256_load_si256((__m256i *)(other_image + k))));
  162. }
  163. }
  164.  
  165. for (int k = 0; k < image_size * image_size; ++k) {
  166. auto &e = final_image[k];
  167. e = min<int>(e >> 9, 255);
  168. e *= 0x10101;
  169. e |= 0xff000000;
  170. }
  171.  
  172. {
  173. std::fstream s{"output.raw", ios::binary|ios::out|ios::trunc};
  174. s.write((char *)final_image, image_size * image_size * 4);
  175. }
  176. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement