Matrix multiplication with intrinsics

#include <complex.h>
#include <xmmintrin.h>
#include <pmmintrin.h>


__m128 complex_multiplication(__m128 v1, __m128 v2) {
  __m128 t1 = _mm_moveldup_ps(v1);
  __m128 t2 = _mm_mul_ps(t1, v2);
  v2 = _mm_shuffle_ps(v2, v2, 0xb1);
  t1 = _mm_movehdup_ps(v1);
  t1 = _mm_mul_ps(t1, v2);
  return _mm_addsub_ps(t2, t1);
}

void chemm(complex float* A,
        complex float* B,
        complex float* C,
        int m,
        int n,
        complex float alpha,
        complex float beta){

  float* alpha_array = (float*)&alpha;
  float* beta_array = (float*)&beta;
  float* c_complex_array;
  float* y_array_1;
  float* y_array_2;


  // Create two vectors with the repeated values of the 'alpha' and 'beta' constants
  __m128 alpha_vector = _mm_set_ps(alpha_array[1], alpha_array[0], alpha_array[1], alpha_array[0]);
  __m128 beta_vector = _mm_set_ps(beta_array[1], beta_array[0], beta_array[1], beta_array[0]);

  // Defining vectors used later
  __m128 x_vector, y_vector, c_vector, c_double_upper_vector, c_lower_sum_vector;

  for (int x = 0; x < n; x++) {
    for (int y = 0; y < m; y++) {
      // Load one complex number twice from C
      c_complex_array = (float*)&C[y*n + x];
      c_vector =  _mm_castpd_ps(_mm_loaddup_pd((double*)c_complex_array));

      // Multiply with beta
      c_vector = complex_multiplication(c_vector, beta_vector);

      // Work with two and two complex multiplications in parallel
      for (int z = 0; z < m; z+=2) {
        // Load two and two complex numbers from A and B
        x_vector = _mm_loadu_ps((float*)&A[y*m + z]);
        y_array_1 = (float*)&B[z*n + x];
        y_array_2 = (float*)&B[(z+1)*n + x];
        y_vector = _mm_set_ps(y_array_2[1], y_array_2[0], y_array_1[1], y_array_1[0]);

        // Get multiplication results
        x_vector = complex_multiplication(x_vector, y_vector);

        // Also multiply with alpha
        x_vector = complex_multiplication(x_vector, alpha_vector);

        // Add result to C's vector
        c_vector = _mm_add_ps(c_vector, x_vector);
      }

      // The following stores the answers from the above loop
      //  and stores them in C

      // Copy the upper 64 bits of C's result vector to both upper and lower 64 bits
      c_double_upper_vector = _mm_movehl_ps(c_vector, c_vector);
      // Add vectors. This makes the lower 64 the correct sum
      c_lower_sum_vector = _mm_add_ps(c_vector, c_double_upper_vector);
      // Store the lower 64 bits in memory (C matrix)
      _mm_store_sd((double*)c_complex_array, _mm_castps_pd(c_lower_sum_vector));
    }
  }
}