Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <stdlib.h>
- #include <immintrin.h> // Include SIMD intrinsics
- #include <time.h>
- #define N 1024 // Size of the matrix (N x N)
- #define SIM_WIDTH 8 // Number of elements processed at a time (AVX2)
// Fill a size x size matrix, stored row-major in one contiguous buffer,
// with pseudo-random values.
//
// matrix: destination buffer; must have room for size * size floats.
// size:   number of rows (and columns). A value <= 0 writes nothing.
//
// Each element is rand() divided by RAND_MAX, so it lies in [0, 1].
// The sequence depends on the current rand() state; call srand() first
// to choose the seed.
void initialize_matrix(float* matrix, int size)
{
    if (size <= 0) {
        return;
    }

    // Count elements in size_t: with a 32-bit int, size * size would
    // overflow (undefined behaviour) once size exceeds 46340.
    const size_t count = (size_t)size * (size_t)size;

    // Row-major storage is contiguous, so a single linear pass visits the
    // elements in the same order as a nested row/column loop.
    for (size_t idx = 0; idx < count; idx++) {
        matrix[idx] = (float)rand() / RAND_MAX;
    }
}
// Multiply two size x size row-major float matrices with AVX: C = A * B.
//
// A, B: input matrices, size * size floats each (read only).
// C:    output matrix, size * size floats; every element is overwritten.
// size: number of rows (and columns). A value <= 0 does nothing.
//
// For each row i, eight adjacent output columns are computed at once:
// A[i][k] is broadcast to all lanes and multiplied with eight consecutive
// elements of row k of B. Columns left over when size is not a multiple of
// the vector width are finished with a scalar loop, so any size is valid.
// Unaligned loads/stores are used, so the buffers need no special alignment.
//
// The target attribute lets GCC/Clang compile the intrinsics even when the
// translation unit is not built with -mavx; the CPU must still support AVX.
#if (defined(__GNUC__) || defined(__clang__)) && \
    (defined(__x86_64__) || defined(__i386__))
__attribute__((target("avx")))
#endif
void matrix_multiply_simd(float* A, float* B, float* C, int size)
{
    if (size <= 0) {
        return;
    }

    // Index in size_t so row * n + col cannot overflow int for large sizes.
    const size_t n = (size_t)size;
    // The step must equal the number of floats in a __m256 (8), so derive it
    // from the vector type instead of a separately maintained constant.
    const size_t lanes = sizeof(__m256) / sizeof(float);
    // First column that no longer fits a whole vector.
    const size_t vec_end = n - (n % lanes);

    for (size_t i = 0; i < n; i++) {
        const float* a_row = &A[i * n];
        float* c_row = &C[i * n];

        // Vector part: columns [0, vec_end), `lanes` columns per iteration.
        for (size_t j = 0; j < vec_end; j += lanes) {
            __m256 acc = _mm256_setzero_ps();
            for (size_t k = 0; k < n; k++) {
                const __m256 a = _mm256_set1_ps(a_row[k]);
                const __m256 b = _mm256_loadu_ps(&B[k * n + j]);
                // Separate multiply and add (not FMA) so each lane rounds
                // exactly like the scalar expression sum += a * b.
                acc = _mm256_add_ps(acc, _mm256_mul_ps(a, b));
            }
            _mm256_storeu_ps(&c_row[j], acc);
        }

        // Scalar tail: remaining columns [vec_end, n). A full-width vector
        // access here would run past the end of the row.
        for (size_t j = vec_end; j < n; j++) {
            float sum = 0.0f;
            for (size_t k = 0; k < n; k++) {
                sum += a_row[k] * B[k * n + j];
            }
            c_row[j] = sum;
        }
    }
}
- // // Perform matrix multiplication without SIMD (for comparison)
- // void matrix_multiply_scalar(float* A, float* B, float* C, int size)
- // {
- // for (int i = 0; i < size; i++) {
- // for (int j = 0; j < size; j++) {
- // float sum = 0.0f;
- // for (int k = 0; k < size; k++) {
- // sum += A[i * size + k] * B[k * size + j];
- // }
- // C[i * size + j] = sum;
- // }
- // }
- // }
- int main()
- {
- // Allocate memory for matrices
- float* A = (float*)aligned_alloc(32, N * N * sizeof(float));
- float* B = (float*)aligned_alloc(32, N * N * sizeof(float));
- float* C_simd = (float*)aligned_alloc(32, N * N * sizeof(float));
- // float* C_scalar = (float*)aligned_alloc(32, N * N * sizeof(float));
- // Initialize matrices with random values
- srand(time(NULL));
- initialize_matrix(A, N);
- initialize_matrix(B, N);
- // Benchmark SIMD matrix multiplication
- clock_t start = clock();
- matrix_multiply_simd(A, B, C_simd, N);
- clock_t end = clock();
- double simd_time = (double)(end - start) / CLOCKS_PER_SEC * 1000;
- printf("SIMD Matrix Multiplication Time: %.2f ms\n", simd_time);
- // Benchmark scalar matrix multiplication
- // start = clock();
- // matrix_multiply_scalar(A, B, C_scalar, N);
- // end = clock();
- // double scalar_time = (double)(end - start) / CLOCKS_PER_SEC * 1000;
- // printf("Scalar Matrix Multiplication Time: %.2f ms\n", scalar_time);
- //
- // // Verify correctness
- // for (int i = 0; i < N; i++) {
- // for (int j = 0; j < N; j++) {
- // if (fabs(C_simd[i * N + j] - C_scalar[i * N + j]) > 1e-5) {
- // printf("Mismatch at (%d, %d): SIMD=%f, Scalar=%f\n", i, j, C_simd[i * N + j], C_scalar[i * N + j]);
- // return 1;
- // }
- // }
- // }
- // printf("Results match!\n");
- // Free allocated memory
- free(A);
- free(B);
- free(C_simd);
- // free(C_scalar);
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment