Not a member of Pastebin yet?
Sign Up —
it unlocks many cool features!
- // ----------------------- do multi-threaded versions here ------------------
- /* GLOBALS */
- kvp* gsrc;
- kvp* gdst;
- int gdim;
- /* Thread stuff */
- pthread_barrier_t b1;
- pthread_barrier_t b2;
- /* Other stuff */
- int nbits = 8;
- int nbuckets = 1 << nbits; // 256
- int nsize;
- int numThreads = 4;
- /* idk */
- int tid;
- int mask = 0xFF;
- unsigned long long gbuckets[256][tid];
- // Should this be 2D?
- unsigned long gsum[256];
- /* Peer thread routine */
- void thread(void *id_ptr) {
- int thread_id = *(int)id_ptr;
- int i, k, bucketIndex, nthread, shift;
- for (nthread = 0; nthread < numThreads; nthread++) {
- pthread_barrier_wait(&b1);
- //the main is clearing the global stuff here <-- WTF mate
- pthread_barrier_wait(&b1);
- // `fill buckets, this is the same as before without index increment'
- // ^^ Doesn't make sense to me:
- // This is how we generated the bucket count in naive_singlethread (L89-93):
- /*
- for(int i = 0; i < dim; ++i) {
- int index = gen_shift(src[i].key,iter*log_radix,
- (bucket_size(log_radix)-1))+1;
- buckets[index][iter]++;
- }
- */
- // But here `iter' is the array of thread ids. I think the peer thread should
- // only generate the bucket count for the elements in that row, right?
- for (i = 0; i < dim; ++i) {
- k = gsrc[i].key;
- // nbits == log_radix => 8
- bucketIndex = ((k >> nthread*nbits) & mask); /* ((k >> 8n) & 0xFF)) */
- gbuckets[i][bucketIndex]++;
- }
- // new phase : Need to create an offset using buckets and sum
- // you basically add the global buckets to the global sum -1 */
- for (bucketIndex = 0; bucketIndex < nbuckets; ++bucketIndex) {
- gbuckets[bucketIndex][nthread] += sum[bucketIndex-1];
- }
- // you will have to traverse each segment backwards
- // in order to place them in the right order.
- // But this basically works the same as single thread. But you decrement
- // instead of increment, and you decrement before you set it to out_index
- for (i = dim; i > 0; --i) {
- // I'm lost bro.
- }
- }
- return NULL;
- }
- /*
- * Shitty Multithread - I'm dying inside.
- * Three Phases:
- * 1. Generate bucket count for each part - Parallel
- * 2. Generate Local Prefix Sum for each part - Parallel .
- * 3. Generate Global Sum - Sequential
- * 4. Move data.
- */
- char shitty_multithread_descr[] = "shitty_multithread: last thing you'll see before I sefault.";
- void shitty_multithread(int dim, kvp *src, kvp *dst)
- {
- pthread_t tids[numThreads];
- int short_ids[numThreads];
- pthread_barrier_init(&b1, NULL, numThreads);
- /* pthread_barrier_init(&b2, NULL, numThreads); */
- gdim = dim;
- gdst = dst;
- gsrc = src;
- nsize = dim/numThreads;
- for (i = 0; i < numThreads; i++) {
- tid = i;
- pthread_create(&tids[i], NULL, thread, &tids[i]);
- }
- pthread_barrier_wait(&b1);
- memset(*gbuckets, 0, sizeof(unsigned long long)*nbuckets*(numThreads+1));
- memset(*gsum, 0, sizeof(unsigned long)*nbuckets);
- int i , j;
- // Criticial phase
- for (i = 1; i < numThreads; i++) {
- for (j = 0 ; j < 256; j++) {
- gbuckets[j][i] += gbuckets[j][i-1];
- }
- }
- for (i = 0; i < numThreads; i++) {
- pthread_join(tid[i], NULL);
- }
- pthread_destroy(&b1);
- }
Add Comment
Please sign in to add a comment.