Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #define _GNU_SOURCE
- #include <immintrin.h>
- #include <limits.h>
- #include <omp.h>
- #include <pthread.h>
- #include <stdint.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <unistd.h>
/* Vector type moved by a single load, and the scalar element type it packs. */
typedef __m256d vec_rwc_type;
typedef double scal_rwc_type;

/* Alignment, in bytes, requested from _mm_malloc for every buffer. */
#define VEC_TYPE_ALIGNMENT 64

/*
 * Number of bytes read by all threads together (2 GiB).
 * Every factor is unsigned: 2^31 does not fit in a signed 32-bit int.
 * The expansion is parenthesized so the macro is safe inside any expression.
 */
#define ARRAY_SIZE_IN_BYTE (2u * 1024u * 1024u * 1024u)

/* The same amount expressed in whole vectors. */
#define ARRAY_SIZE (ARRAY_SIZE_IN_BYTE / sizeof(vec_rwc_type))

/* Number of scalar elements packed in one vector. */
#define SCAL_IN_VEC (sizeof(vec_rwc_type) / sizeof(scal_rwc_type))

/* Vector loads issued per iteration of the read loop. */
#define UNROLL_COF 2

/* Number of timed sweeps; the fastest one is reported. */
#define ALL_ITER 50

/* Pins the calling thread to a CPU; defined after main(). */
void bind_thread_to_core(void);
- int main(int argc, const char* argv[]) {
- puts("Before atoi");
- int number_of_thread = atoi(argv[1]);
- puts("Before parallel section");
- size_t array_size_per_vec_type = ARRAY_SIZE_IN_BYTE / sizeof(vec_rwc_type);
- array_size_per_vec_type /= UNROLL_COF;
- size_t iteration_step = SCAL_IN_VEC * UNROLL_COF;
- const uint32_t second_cof = SCAL_IN_VEC;
- omp_set_num_threads(number_of_thread);
- puts("Before parallel section");
- #pragma omp parallel
- {
- bind_thread_to_core();
- int thread_self_number = omp_get_thread_num();
- printf("Thread[%d]: affter bind\n", thread_self_number);
- int thread_array_size = ARRAY_SIZE_IN_BYTE / number_of_thread;
- vec_rwc_type dummy_val = _mm256_set1_pd(0.0f);
- scal_rwc_type* rwc_array =
- (scal_rwc_type*)_mm_malloc(thread_array_size, VEC_TYPE_ALIGNMENT);
- scal_rwc_type* dummy_upack_vec =
- _mm_malloc(SCAL_IN_VEC * sizeof(scal_rwc_type), VEC_TYPE_ALIGNMENT);
- memset(dummy_upack_vec, 0, SCAL_IN_VEC * sizeof(scal_rwc_type));
- memset(rwc_array, 0xFF, thread_array_size);
- scal_rwc_type* array_end =
- rwc_array + thread_array_size / sizeof(scal_rwc_type);
- double start_time = 0.0;
- double end_time = 0.0;
- double time = 10.0;
- printf("Thread[%d]: before barrier\n", thread_self_number);
- #pragma omp barrier
- for (register scal_rwc_type* iter_ptr = rwc_array; iter_ptr < array_end;
- iter_ptr += iteration_step) {
- dummy_val += _mm256_load_pd(iter_ptr);
- dummy_val += _mm256_load_pd(iter_ptr + second_cof);
- }
- for (uint_fast32_t iter = 0; iter < ALL_ITER; ++iter) {
- start_time = omp_get_wtime();
- for (register scal_rwc_type* iter_ptr = rwc_array;
- iter_ptr < array_end; iter_ptr += iteration_step) {
- dummy_val += _mm256_load_pd(iter_ptr);
- dummy_val += _mm256_load_pd(iter_ptr + second_cof);
- }
- end_time = omp_get_wtime();
- if (time > end_time - start_time) {
- time = end_time - start_time;
- }
- }
- for (register scal_rwc_type* iter_ptr = rwc_array; iter_ptr < array_end;
- iter_ptr += iteration_step) {
- dummy_val += _mm256_load_pd(iter_ptr);
- dummy_val += _mm256_load_pd(iter_ptr + second_cof);
- }
- _mm256_store_pd(dummy_upack_vec, dummy_val);
- printf("Dummy print:\n");
- for (int i = 0; i < SCAL_IN_VEC; ++i) {
- printf("[%d]%f\n", i, dummy_upack_vec[i]);
- }
- printf("Thread[%d] read time: %f sec\n", thread_self_number, time);
- _mm_free(rwc_array);
- _mm_free(dummy_upack_vec);
- }
- return 0;
- }
/*
 * Pin the calling thread to the logical CPU whose index equals the thread's
 * OpenMP thread number (thread 0 -> CPU 0, thread 1 -> CPU 1, ...).
 *
 * Binding is best effort: when pthread_setaffinity_np rejects the request,
 * a warning with the error number goes to stderr and the thread keeps its
 * previous affinity. Timings from an unpinned thread should be treated with
 * suspicion.
 */
void bind_thread_to_core(void) {
  const int thread_num = omp_get_thread_num();

  cpu_set_t affinity_masks;
  CPU_ZERO(&affinity_masks);
  CPU_SET(thread_num, &affinity_masks);

  /* Returns an error number directly; it does not set errno. */
  const int rc = pthread_setaffinity_np(pthread_self(), sizeof(affinity_masks),
                                        &affinity_masks);
  if (rc != 0) {
    fprintf(stderr, "Thread[%d]: cannot bind to CPU %d (error %d)\n",
            thread_num, thread_num, rc);
  }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement