Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <x86intrin.h>
- #include <stdint.h>
- #include <omp.h>
- #include <stdio.h>
/* Four packed single-precision floats (GCC/Clang vector extension). */
typedef float float_v4_t __attribute__((vector_size(16)));
/* Four packed 32-bit unsigned lanes; used for comparison masks and sign bits. */
typedef uint32_t uint32_v4_t __attribute__((vector_size(16)));

/*
 * Advance four lanes of one axis by a single step.
 *
 * For every lane whose position is <= lo or >= hi, the sign bit of the
 * matching direction lane is toggled. The updated direction is stored back
 * and then added to the position.
 *
 * The sign is toggled, not forced: a lane that is still outside [lo, hi] on
 * the next call has its direction flipped again. Comparisons against NaN are
 * false, so NaN positions never trigger a flip.
 */
static inline void bounce_step_v4(float_v4_t *pos, float_v4_t *dir, float lo, float hi)
{
    const float_v4_t vlo = {lo, lo, lo, lo};
    const float_v4_t vhi = {hi, hi, hi, hi};
    /* Unsigned literal: `1 << 31` would shift into the sign bit of an int. */
    const uint32_v4_t sign_bit = {0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u};

    const float_v4_t p = *pos;
    /* Vector comparisons produce all-ones in lanes where the test holds. */
    const uint32_v4_t outside = (uint32_v4_t)(p <= vlo) | (uint32_v4_t)(p >= vhi);
    /* XOR with the sign bit negates the direction only in the flagged lanes. */
    const float_v4_t d = (float_v4_t)((uint32_v4_t)(*dir) ^ (outside & sign_bit));

    *dir = d;
    *pos = p + d;
}

/* Scalar counterpart of bounce_step_v4 for the elements left over after the
 * last full vector. Negation flips the sign bit, matching the vector path. */
static inline void bounce_step_scalar(float *pos, float *dir, float lo, float hi)
{
    if (*pos <= lo || *pos >= hi) {
        *dir = -*dir;
    }
    *pos += *dir;
}

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Advance `count` positions by one step along both axes, reversing the
 * direction of any element that has reached or passed a bound.
 *
 * x, dirx  positions and per-step deltas for the first axis, bounds [0, 640]
 * y, diry  positions and per-step deltas for the second axis, bounds [0, 24]
 * count    number of float elements (not vectors) in each of the four arrays
 *
 * All four arrays are updated in place and must be suitably aligned for
 * float_v4_t. Any value of count is accepted, including 0 and values that are
 * not a multiple of the vector width; the previous implementation only
 * terminated correctly when count was a positive multiple of 32.
 */
void process(float_v4_t *x, float_v4_t *dirx, float_v4_t *y, float_v4_t *diry, uint64_t count)
{
    const float x_lo = 0.0f;
    const float x_hi = 640.0f;
    const float y_lo = 0.0f;
    const float y_hi = 24.0f;
    const uint64_t lanes = 4;
    const uint64_t full_vectors = count / lanes;

    /* Plain loop over whole vectors; unrolling is left to the optimizer. */
    for (uint64_t i = 0; i < full_vectors; ++i) {
        bounce_step_v4(&x[i], &dirx[i], x_lo, x_hi);
        bounce_step_v4(&y[i], &diry[i], y_lo, y_hi);
    }

    /* Remaining 0..3 elements that do not fill a whole vector. */
    float *xs = (float *)x;
    float *dxs = (float *)dirx;
    float *ys = (float *)y;
    float *dys = (float *)diry;
    for (uint64_t i = full_vectors * lanes; i < count; ++i) {
        bounce_step_scalar(&xs[i], &dxs[i], x_lo, x_hi);
        bounce_step_scalar(&ys[i], &dys[i], y_lo, y_hi);
    }
}

#ifdef __cplusplus
}
#endif
Advertisement
Add Comment
Please, Sign In to add comment