Advertisement
Guest User

Untitled

a guest
Dec 26th, 2022
41
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 5.59 KB | None | 0 0
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <assert.h>
  4. #include <string.h>
  5.  
  6. #define MAX(x, y) (((x) > (y)) ? (x) : (y))
  7. #define MIN(x, y) (((x) < (y)) ? (x) : (y))
  8.  
  9. int main (int argc, char *argv[]) {
  10.    FILE *fp;
  11.    int flsz;
  12.    int bfsz;
  13.  
  14.    #define workernum 256
  15.  
  16.    if (argc != 2) {
  17.      puts("Excpects one argument. A path to a file mathcing Advent of code 2022 Problem 1 conventions");
  18.      return 1;
  19.    }
  20.    fp = fopen(argv[1],"r");
  21.    
  22.    if (fp == NULL) {
  23.      puts("Error opening file.");
  24.      return 1;
  25.    }
  26.  
  27.    fseek( fp, 0L, SEEK_END );
  28.    flsz = ftell(fp);
  29.    rewind(fp);
  30.  
  31.    printf("Filesize: %d\n", flsz);
  32.    long remainder = flsz % (8*workernum);
  33.    bfsz = flsz + MIN(1,remainder)*((8*workernum)-remainder);
  34.    printf("Buffersize: %d\n", bfsz);
  35.    char buffer[bfsz];
  36.    fread(buffer, sizeof(char), flsz, fp);
  37.    fclose(fp);
  38.  
  39.    for (long i=flsz; i<bfsz; i++) {
  40.      buffer[i] = '\n';
  41.    }
  42.  
  43.    u_int64_t * vmwords = calloc(bfsz, sizeof(char));
  44.  
  45.    memcpy(vmwords, buffer, bfsz);
  46.  
  47.    assert(((bfsz*sizeof(char)) % sizeof(u_int64_t)) == 0);
  48.  
  49.    int entries = (bfsz*sizeof(char)/sizeof(u_int64_t));
  50.  
  51.    for (long i=0; i<entries;i++) {
  52.      putchar((((vmwords[i]<<24) )>>24));
  53.      putchar((((vmwords[i]<<16) )>>24));
  54.      putchar( ((vmwords[i]<<8 )>>24));
  55.      putchar(((vmwords[i])>>24));
  56.  
  57.    }
  58.    fflush(stdout);
  59.  
  60.    u_int64_t partial_sums[256];
  61.    u_int64_t incomplete sum[256]
  62.    u_int64_t vector_mask[256];
  63.  
  64.    return(0);
  65. }
  66.  
  67. void process256 (u_int64_t * vmwords, u_int64_t offset, u_int64_t stride, u_int64_t * sums, u_int64_t * vectormask, u_int64_t * incomplete){
  68.     u_int64_t innercounter = 0;
  69.     stride *= 8;
  70.     offset += (u_int64_t)vmwords;
  71.     u_int64_t rounds = 0;
  72.     u_int64_t overhand = 0;
  73.     const u_int64_t zero = 0;
  74.  
  75.     asm("vxor %v11,  %v11, %v11 \n\t"
  76.         "vxor %v12,  %v12, %v12 \n\t"
  77.         "vxor %v13,  %v13, %v13 \n\t"
  78.         "vxor %v14,  %v14, %v14 \n\t"
  79.         "xorm %vm1, %vm1, %vm1 \n\t"
  80.         "xorm %vm3, %vm3, %vm3 \n\t"
  81.         "cmov.l.at %s5, %s5, %[strid] \n\t"
  82.         "loaddata:\n\t"
  83.         "adds.l %s5, -8, %s5 \n\t"
  84.         "cmov.l.at %[innercounter], %[innercounter], 8 \n\t"
  85.         "vld %v10, %[strid], %[offst] \n\t"
  86.         "adds.l %[offst], 8, %[offst] \n\t"
  87.         "processchar:\n\t"
  88.         "vsrl %v9, %v10, 56 \n\t"
  89.         "vadds.l %v9, -48, %v9 \n\t"
  90.         "vfmk.l.ge %vm2, %v9 \n\t"
  91.         "vmuls.l %v11, 10, %v11, %vm2\n\t"
  92.         "vadds.l %v11, %v9, %v11, %vm2\n\t"
  93.         "negm %vm2, %vm2\n\t"
  94.         "vadds.l %v12, %v11, %v12,%vm2\n\t"
  95.         "vxor %v11, %v11, %v11, %vm2 \n\t"
  96.         "orm %vm3, %vm3, %vm2 \n\t"
  97.         "negm %vm4, %vm3 \n\t"
  98.         "vmv ,$vm4\n\t"
  99.         "vmaxs.l %v13, %v12, %v13, %vm3 \n\t"
  100.         "vxor %v12, %v12, %v12, %vm3 \n\t"
  101.         "vfmk.l.at %vm3, %vm2 \n\t"
  102.         "orm %vm1, %vm3, %vm1 \n\t"
  103.         "adds.l %[innercounter], -1, %[innercounter] \n\t"
  104.         "beq.l %[innercounter], processchar \n\t"
  105.         "vsll %v10, %v10, 8 \n\t"
  106.         "bgt.l %s5, loaddata \n\t"
  107.         "cmov.l.at %[innercounter], %[innercounter], 8 \n\t"
  108.         "vbr %9, 1, %vm3 \n\t"
  109.         "vst %v13, 8, %[outvect] \n\t"
  110.         "vst %v9, 8, %[outmask] \n\t"
  111.         "vst %v14, 8, %[incomplete] \n\t"
  112.         :[round] "=r" (rounds),
  113.          [innercounter]    "=r" (innercounter)
  114.         :[strid] "r" (stride),
  115.          [offst] "r" (offset),
  116.          [zero] "r" (zero),
  117.          [outvect] "r" (sums),
  118.          [outmask] "r" (vectormask),
  119.          [outpart] "r" (incomplete)
  120.         :"%s5", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14" "%vm1", "%vm2", "%vm3", "%vm4");
  121.  
  122. // we are aggregating digits and numbers, to calculate the number that was divided we need to know that partial sum and the number of digits that made it up
  123. // we futhermore need to restore the partial sums, howver no weighting is needed there, so the number of sums so dar is not important
  124. // (incoming incomplete number and its digit count, maximum(sum of completed numbers till the change of elf) over elfs in chunk, last incomplete number, outgoing incomplete number  
  125. // there is possibility to do encode incoming incomplete number and its digit count into one number (61 bit number, 5 bit counter)
  126. // since leading zeroes in the incomming number don't change the size of an imcomming number we only need to do this for the incomplete number which finishes
  127. // to fix up the partial sums multiply incomming by 10^{number of digits} and some with incomplete number if bloack has no number of it's own we need to do a
  128. // pass through in theory, one 64 bit word could be up to 26.575 binary digits, if we consider atleast a leading and following digit we are at 10 digits which
  129. // is enough, if we only claim correctness ofr 32 bit sized numbers that we can perform all fixing ups in parallel. Should we want 64 bit we need to consider 3
  130. // block at once, which seems a bit more complicated.
  131.  
  132. // After we correclty attributed the missing digits we need to fix up the incomplete number (calories per elf) by summing the contributions from both sides of the cut.
  133. // this requires care as those could be distributed through many slice so ti requires an O(n) scan, by parelleling and bailing early we can get that down to O(n/p + longest chain).
  134. // should we start in the longest chain multiple times that is no concer as we assume that all numbers are postive ->  incompletet options become irrelvant under the max operation.
  135. // handling nagtive numbers the same algorithm to  abs(x) and relu(x) should reveal the presence of an error and can fix the error if the max reduciont hasn't hapnnend yet.
  136.  
  137.  
  138. }
  139.  
  140.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement