/*
 * Multi-threaded memory read benchmark.
 *
 * Every OpenMP thread binds itself to the CPU whose index equals its thread
 * number, allocates a private slice of ARRAY_SIZE_IN_BYTE, and repeatedly
 * sweeps it with aligned 256-bit loads. The fastest of ALL_ITER timed sweeps
 * is printed per thread.
 *
 * Usage: <program> <number_of_threads>
 */

/* Must precede every #include so that cpu_set_t, the CPU_* macros and
 * pthread_setaffinity_np are declared. */
#define _GNU_SOURCE

/* NOTE(review): the header names were missing from the text under review;
 * this list is rebuilt from the symbols used below -- confirm against the
 * original file. */
#include <errno.h>
#include <immintrin.h>
#include <limits.h>
#include <omp.h>
#include <pthread.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef __m256d vec_rwc_type;
typedef double scal_rwc_type;

/* Alignment, in bytes, requested for the benchmark buffers. */
#define VEC_TYPE_ALIGNMENT 64
/* Total bytes split between all threads. Fully parenthesized and typed as
 * size_t so that dividing by the thread count divides the whole product. */
#define ARRAY_SIZE_IN_BYTE ((size_t)2 * 1024 * 1024 * 1024)
/* The same amount expressed in vector elements. */
#define ARRAY_SIZE (ARRAY_SIZE_IN_BYTE / sizeof(vec_rwc_type))
/* Number of scalars held by one vector. */
#define SCAL_IN_VEC (sizeof(vec_rwc_type) / sizeof(scal_rwc_type))
/* Vector loads issued per loop iteration (read_pass is written for 2). */
#define UNROLL_COF 2
/* Number of timed sweeps; the minimum is reported. */
#define ALL_ITER 50

void bind_thread_to_core(void);

/*
 * Sweep [begin, end) once with aligned vector loads, two per iteration, and
 * add everything into acc so the loads cannot be discarded.
 *
 * begin must satisfy the alignment of _mm256_load_pd and (end - begin) must
 * be a multiple of SCAL_IN_VEC * UNROLL_COF; otherwise the second load of the
 * last iteration would touch memory past end.
 *
 * Returns the updated accumulator.
 */
static inline vec_rwc_type read_pass(const scal_rwc_type *begin,
                                     const scal_rwc_type *end,
                                     vec_rwc_type acc)
{
    const size_t step = SCAL_IN_VEC * UNROLL_COF;

    for (const scal_rwc_type *p = begin; p < end; p += step) {
        acc = _mm256_add_pd(acc, _mm256_load_pd(p));
        acc = _mm256_add_pd(acc, _mm256_load_pd(p + SCAL_IN_VEC));
    }
    return acc;
}

/*
 * Entry point. argv[1] is the number of OpenMP threads (a positive decimal
 * integer). Returns EXIT_SUCCESS when every thread finished its measurement,
 * EXIT_FAILURE on a bad argument or when any thread could not allocate.
 */
int main(int argc, const char* argv[])
{
    puts("Before atoi");

    if (argc < 2) {
        fprintf(stderr, "Usage: %s <number_of_threads>\n",
                argc > 0 ? argv[0] : "program");
        return EXIT_FAILURE;
    }

    /* strtol instead of atoi: a zero or garbage thread count would otherwise
     * end up as a divisor below. */
    errno = 0;
    char *parse_end = NULL;
    const long requested = strtol(argv[1], &parse_end, 10);
    if (parse_end == argv[1] || *parse_end != '\0' || errno == ERANGE ||
        requested < 1 || requested > INT_MAX) {
        fprintf(stderr, "Invalid number of threads: '%s'\n", argv[1]);
        return EXIT_FAILURE;
    }
    const int number_of_thread = (int)requested;

    puts("Before parallel section");

    /* Round each thread's slice down to a whole number of loop strides so
     * the unrolled aligned loads never run past the buffer. */
    const size_t stride_bytes =
        SCAL_IN_VEC * UNROLL_COF * sizeof(scal_rwc_type);
    const size_t thread_array_size =
        (ARRAY_SIZE_IN_BYTE / (size_t)number_of_thread) / stride_bytes *
        stride_bytes;
    if (thread_array_size == 0) {
        fprintf(stderr, "Too many threads: per-thread slice would be empty\n");
        return EXIT_FAILURE;
    }

    omp_set_num_threads(number_of_thread);

    puts("Before parallel section");

    /* Set by any thread whose allocation fails; read only after the barrier. */
    int alloc_failed = 0;

#pragma omp parallel
    {
        bind_thread_to_core();

        const int thread_self_number = omp_get_thread_num();
        printf("Thread[%d]: affter bind\n", thread_self_number);

        scal_rwc_type *rwc_array =
            _mm_malloc(thread_array_size, VEC_TYPE_ALIGNMENT);
        if (rwc_array == NULL) {
            fprintf(stderr, "Thread[%d]: failed to allocate %zu bytes\n",
                    thread_self_number, thread_array_size);
#pragma omp atomic write
            alloc_failed = 1;
        } else {
            /* Write every byte of the buffer before it is measured. */
            memset(rwc_array, 0xFF, thread_array_size);
        }

        printf("Thread[%d]: before barrier\n", thread_self_number);

        /* Every thread must reach the barrier, including one whose
         * allocation failed, so the failure is acted upon only afterwards. */
#pragma omp barrier

        if (!alloc_failed) {
            const scal_rwc_type *array_end =
                rwc_array + thread_array_size / sizeof(scal_rwc_type);
            vec_rwc_type dummy_val = _mm256_setzero_pd();

            /* Untimed sweep before the measurements. */
            dummy_val = read_pass(rwc_array, array_end, dummy_val);

            /* Keep the fastest sweep; seeded from the first one so that
             * sweeps of any duration are reported correctly. */
            double best_time = 0.0;
            for (int iter = 0; iter < ALL_ITER; ++iter) {
                const double start_time = omp_get_wtime();
                dummy_val = read_pass(rwc_array, array_end, dummy_val);
                const double elapsed = omp_get_wtime() - start_time;

                if (iter == 0 || elapsed < best_time) {
                    best_time = elapsed;
                }
            }

            /* Untimed sweep after the measurements, as in the original flow. */
            dummy_val = read_pass(rwc_array, array_end, dummy_val);

            /* Print the accumulator so the loads have an observable use. */
            scal_rwc_type lanes[SCAL_IN_VEC];
            _mm256_storeu_pd(lanes, dummy_val);

            printf("Dummy print:\n");
            for (size_t lane = 0; lane < SCAL_IN_VEC; ++lane) {
                printf("[%zu]%f\n", lane, lanes[lane]);
            }

            printf("Thread[%d] read time: %f sec\n", thread_self_number,
                   best_time);
        }

        if (rwc_array != NULL) {
            _mm_free(rwc_array);
        }
    }

    return alloc_failed ? EXIT_FAILURE : EXIT_SUCCESS;
}

/*
 * Restrict the calling thread to the single CPU whose index equals the
 * thread's OpenMP number. A failure is reported on stderr and the thread
 * keeps running with its previous affinity.
 */
void bind_thread_to_core(void)
{
    const int thread_num = omp_get_thread_num();

    cpu_set_t affinity_masks;
    CPU_ZERO(&affinity_masks);
    CPU_SET(thread_num, &affinity_masks);

    /* pthread_setaffinity_np returns an error number directly (not errno). */
    const int rc = pthread_setaffinity_np(pthread_self(),
                                          sizeof(affinity_masks),
                                          &affinity_masks);
    if (rc != 0) {
        fprintf(stderr, "Thread[%d]: cannot bind to core %d (error %d)\n",
                thread_num, thread_num, rc);
    }
}