Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <complex.h>
- #include <xmmintrin.h>
- #include <pmmintrin.h>
- __m128 complex_multiplication(__m128 v1, __m128 v2) {
- __m128 t1 = _mm_moveldup_ps(v1);
- __m128 t2 = _mm_mul_ps(t1, v2);
- v2 = _mm_shuffle_ps(v2, v2, 0xb1);
- t1 = _mm_movehdup_ps(v1);
- t1 = _mm_mul_ps(t1, v2);
- return _mm_addsub_ps(t2, t1);
- }
- void chemm(complex float* A,
- complex float* B,
- complex float* C,
- int m,
- int n,
- complex float alpha,
- complex float beta){
- float* alpha_array = (float*)α
- float* beta_array = (float*)β
- float* c_complex_array;
- float* y_array_1;
- float* y_array_2;
- // Create two vectors with the repeated values of the 'alpha' and 'beta' constants
- __m128 alpha_vector = _mm_set_ps(alpha_array[1], alpha_array[0], alpha_array[1], alpha_array[0]);
- __m128 beta_vector = _mm_set_ps(beta_array[1], beta_array[0], beta_array[1], beta_array[0]);
- // Defining vectors used later
- __m128 x_vector, y_vector, c_vector, c_double_upper_vector, c_lower_sum_vector;
- for (int x = 0; x < n; x++) {
- for (int y = 0; y < m; y++) {
- // Load one complex number twice from C
- c_complex_array = (float*)&C[y*n + x];
- c_vector = _mm_castpd_ps(_mm_loaddup_pd((double*)c_complex_array));
- // Multiply with beta
- c_vector = complex_multiplication(c_vector, beta_vector);
- // Work with two and two complex multiplications in parallel
- for (int z = 0; z < m; z+=2) {
- // Load two and two complex numbers from A and B
- x_vector = _mm_loadu_ps((float*)&A[y*m + z]);
- y_array_1 = (float*)&B[z*n + x];
- y_array_2 = (float*)&B[(z+1)*n + x];
- y_vector = _mm_set_ps(y_array_2[1], y_array_2[0], y_array_1[1], y_array_1[0]);
- // Get multiplication results
- x_vector = complex_multiplication(x_vector, y_vector);
- // Also multiply with alpha
- x_vector = complex_multiplication(x_vector, alpha_vector);
- // Add result to C's vector
- c_vector = _mm_add_ps(c_vector, x_vector);
- }
- // The following stores the answers from the above loop
- // and stores them in C
- // Copy the upper 64 bits of C's result vector to both upper and lower 64 bits
- c_double_upper_vector = _mm_movehl_ps(c_vector, c_vector);
- // Add vectors. This makes the lower 64 the correct sum
- c_lower_sum_vector = _mm_add_ps(c_vector, c_double_upper_vector);
- // Store the lower 64 bits in memory (C matrix)
- _mm_store_sd((double*)c_complex_array, _mm_castps_pd(c_lower_sum_vector));
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment