Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // public domain
- #include <cstdio>
- #include <random>
- #include <xmmintrin.h>
- #include <immintrin.h>
- #include <iostream>
- #include <fstream>
- #include <algorithm>
- #include <thread>
- #include <list>
- using namespace std;
- constexpr int num_iter = 100000000;
- constexpr int image_size_log2 = 9;
- constexpr int image_size = 1 << image_size_log2;
- static const float coefs[] = {
- 0.856f, 0.0414f, 0.07f,
- -0.0205f, 0.858f, 0.147f,
- 0.244f, -0.385f, 0.393f,
- 0.176f, 0.224f, -0.102f,
- -0.144f, 0.39f, 0.527f,
- 0.181f, 0.259f, -0.014f,
- 0.0f, 0.0f, 0.486f,
- 0.355f, 0.216f, 0.05f
- };
- constexpr int chance_map_size = 32;
- static int32_t chance_map[chance_map_size];
- int main() {
- list<thread> threads;
- vector<uint32_t *> images;
- {
- int i = 0;
- for (; i < 73 * chance_map_size / 100; ++i) {
- chance_map[i] = 0;
- }
- for (; i < 86 * chance_map_size / 100; ++i) {
- chance_map[i] = 1;
- }
- for (; i < 99 * chance_map_size / 100; ++i) {
- chance_map[i] = 2;
- }
- for (; i < 100 * chance_map_size / 100; ++i) {
- chance_map[i] = 3;
- }
- }
- int num_threads = thread::hardware_concurrency() / 2;
- cout << "plotting " << num_iter * num_threads << " points" << endl;
- for (unsigned i = 0; i < num_threads; ++i) {
- uint32_t *local_image = new uint32_t[image_size * image_size];
- images.push_back(local_image);
- threads.emplace_back([local_image] {
- fill(local_image, local_image + image_size * image_size, 0);
- __m256 M[6];
- for (int i = 0; i < 6; ++i) {
- M[i] = _mm256_set_ps(coefs[i + 18], coefs[i + 12], coefs[i + 6], coefs[i],
- coefs[i + 18], coefs[i + 12], coefs[i + 6], coefs[i]);
- }
- __m256i min_coord = _mm256_set1_epi32(0);
- __m256i max_coord = _mm256_set1_epi32(image_size - 1);
- __m256i xst;
- {
- random_device rd;
- xst = _mm256_set_epi32(rd()|1, rd()|1, rd()|1, rd()|1, rd()|1, rd()|1, rd()|1, rd()|1);
- }
- __m256i cur_x, cur_y;
- cur_x = _mm256_set1_ps(1.0f);
- cur_y = _mm256_set1_ps(1.0f);
- for (int i = 0; i < num_iter; ++i) {
- __m256i index = xst;
- if (false) {
- index = _mm256_and_si256(index, _mm256_set1_epi32(chance_map_size - 1));
- index = _mm256_i32gather_epi32(chance_map, index, 4);
- } else {
- index = _mm256_and_si256(index, _mm256_set1_epi32(0xffff));
- __m256i compare_result1 = _mm256_cmpgt_epi32(index, _mm256_set1_epi32(73 * 0x10000 / 100));
- __m256i compare_result2 = _mm256_cmpgt_epi32(index, _mm256_set1_epi32(99 * 0x10000 / 100));
- __m256i compare_result3 = _mm256_cmpgt_epi32(index, _mm256_set1_epi32(86 * 0x10000 / 100));
- index = _mm256_castps_si256(_mm256_blendv_ps(
- _mm256_castsi256_ps(compare_result1),
- _mm256_castsi256_ps(compare_result2),
- _mm256_castsi256_ps(compare_result3)
- ));
- index = _mm256_srli_epi32(index, 31);
- index = _mm256_or_si256(index, _mm256_slli_epi32(compare_result3, 1));
- }
- __m256 m00 = _mm256_permutevar_ps(M[0], index);
- __m256 m01 = _mm256_permutevar_ps(M[1], index);
- __m256 m02 = _mm256_permutevar_ps(M[2], index);
- __m256 m10 = _mm256_permutevar_ps(M[3], index);
- __m256 m11 = _mm256_permutevar_ps(M[4], index);
- __m256 m12 = _mm256_permutevar_ps(M[5], index);
- __m256 new_x = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(cur_x, m00), _mm256_mul_ps(cur_y, m01)), m02);
- __m256 new_y = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(cur_x, m10), _mm256_mul_ps(cur_y, m11)), m12);
- cur_x = new_x; cur_y = new_y;
- __m256i coord_x_f = cur_x;
- __m256i coord_y_f = cur_y;
- coord_x_f = _mm256_mul_ps(coord_x_f, _mm256_set1_ps(350.f));
- coord_y_f = _mm256_mul_ps(coord_y_f, _mm256_set1_ps(-350.f));
- coord_x_f = _mm256_add_ps(coord_x_f, _mm256_set1_ps(80.f));
- coord_y_f = _mm256_add_ps(coord_y_f, _mm256_set1_ps(400.f));
- __m256i coord_x = _mm256_cvtps_epi32(coord_x_f);
- __m256i coord_y = _mm256_cvtps_epi32(coord_y_f);
- coord_x = _mm256_max_epi32(coord_x, min_coord);
- coord_y = _mm256_max_epi32(coord_y, min_coord);
- coord_x = _mm256_min_epi32(coord_x, max_coord);
- coord_y = _mm256_min_epi32(coord_y, max_coord);
- __m256i address = _mm256_slli_epi32(coord_y, image_size_log2);
- address = _mm256_add_epi32(address, coord_x);
- ++local_image[_mm256_extract_epi32(address, 0)];
- ++local_image[_mm256_extract_epi32(address, 1)];
- ++local_image[_mm256_extract_epi32(address, 2)];
- ++local_image[_mm256_extract_epi32(address, 3)];
- ++local_image[_mm256_extract_epi32(address, 4)];
- ++local_image[_mm256_extract_epi32(address, 5)];
- ++local_image[_mm256_extract_epi32(address, 6)];
- ++local_image[_mm256_extract_epi32(address, 7)];
- xst = _mm256_xor_si256(xst, _mm256_slli_epi32(xst, 13));
- xst = _mm256_xor_si256(xst, _mm256_srli_epi32(xst, 17));
- xst = _mm256_xor_si256(xst, _mm256_slli_epi32(xst, 5));
- }
- });
- }
- for (auto &thread: threads) {
- thread.join();
- }
- uint32_t *final_image = images[0];
- for (size_t i = 1; i < images.size(); ++i) {
- uint32_t *other_image = images[i];
- for (int k = 0; k < image_size * image_size; k += 8) {
- _mm256_store_si256((__m256i *)(final_image + k),
- _mm256_add_epi32(_mm256_load_si256((__m256i *)(final_image + k)),
- _mm256_load_si256((__m256i *)(other_image + k))));
- }
- }
- for (int k = 0; k < image_size * image_size; ++k) {
- auto &e = final_image[k];
- e = min<int>(e >> 9, 255);
- e *= 0x10101;
- e |= 0xff000000;
- }
- {
- std::fstream s{"output.raw", ios::binary|ios::out|ios::trunc};
- s.write((char *)final_image, image_size * image_size * 4);
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement