Untitled

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>

#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#define MIN(x, y) (((x) < (y)) ? (x) : (y))

int main (int argc, char *argv[]) {
   FILE *fp;
   int flsz;
   int bfsz;

   #define workernum 256

   if (argc != 2) {
     puts("Excpects one argument. A path to a file mathcing Advent of code 2022 Problem 1 conventions");
     return 1;
   }
   fp = fopen(argv[1],"r");

   if (fp == NULL) {
     puts("Error opening file.");
     return 1;
   }

   fseek( fp, 0L, SEEK_END );
   flsz = ftell(fp);
   rewind(fp);

   printf("Filesize: %d\n", flsz);
   long remainder = flsz % (8*workernum);
   bfsz = flsz + MIN(1,remainder)*((8*workernum)-remainder);
   printf("Buffersize: %d\n", bfsz);
   char buffer[bfsz];
   fread(buffer, sizeof(char), flsz, fp);
   fclose(fp);

   for (long i=flsz; i<bfsz; i++) {
     buffer[i] = '\n';
   }

   u_int64_t * vmwords = calloc(bfsz, sizeof(char));

   memcpy(vmwords, buffer, bfsz);

   assert(((bfsz*sizeof(char)) % sizeof(u_int64_t)) == 0);

   int entries = (bfsz*sizeof(char)/sizeof(u_int64_t));

   for (long i=0; i<entries;i++) {
     putchar((((vmwords[i]<<24) )>>24));
     putchar((((vmwords[i]<<16) )>>24));
     putchar( ((vmwords[i]<<8 )>>24));
     putchar(((vmwords[i])>>24));

   }
   fflush(stdout);

   u_int64_t partial_sums[256];
   u_int64_t incomplete sum[256]
   u_int64_t vector_mask[256];

   return(0);
}

void process256 (u_int64_t * vmwords, u_int64_t offset, u_int64_t stride, u_int64_t * sums, u_int64_t * vectormask, u_int64_t * incomplete){
    u_int64_t innercounter = 0;
    stride *= 8;
    offset += (u_int64_t)vmwords;
    u_int64_t rounds = 0;
    u_int64_t overhand = 0;
    const u_int64_t zero = 0;

    asm("vxor %v11,  %v11, %v11 \n\t"
        "vxor %v12,  %v12, %v12 \n\t"
        "vxor %v13,  %v13, %v13 \n\t"
        "vxor %v14,  %v14, %v14 \n\t"
        "xorm %vm1, %vm1, %vm1 \n\t"
        "xorm %vm3, %vm3, %vm3 \n\t"
        "cmov.l.at %s5, %s5, %[strid] \n\t"
        "loaddata:\n\t"
        "adds.l %s5, -8, %s5 \n\t"
        "cmov.l.at %[innercounter], %[innercounter], 8 \n\t"
        "vld %v10, %[strid], %[offst] \n\t"
        "adds.l %[offst], 8, %[offst] \n\t"
        "processchar:\n\t"
        "vsrl %v9, %v10, 56 \n\t"
        "vadds.l %v9, -48, %v9 \n\t"
        "vfmk.l.ge %vm2, %v9 \n\t"
        "vmuls.l %v11, 10, %v11, %vm2\n\t"
        "vadds.l %v11, %v9, %v11, %vm2\n\t"
        "negm %vm2, %vm2\n\t"
        "vadds.l %v12, %v11, %v12,%vm2\n\t"
        "vxor %v11, %v11, %v11, %vm2 \n\t"
        "orm %vm3, %vm3, %vm2 \n\t"
        "negm %vm4, %vm3 \n\t"
        "vmv ,$vm4\n\t"
        "vmaxs.l %v13, %v12, %v13, %vm3 \n\t"
        "vxor %v12, %v12, %v12, %vm3 \n\t"
        "vfmk.l.at %vm3, %vm2 \n\t"
        "orm %vm1, %vm3, %vm1 \n\t"
        "adds.l %[innercounter], -1, %[innercounter] \n\t"
        "beq.l %[innercounter], processchar \n\t"
        "vsll %v10, %v10, 8 \n\t"
        "bgt.l %s5, loaddata \n\t"
        "cmov.l.at %[innercounter], %[innercounter], 8 \n\t"
        "vbr %9, 1, %vm3 \n\t"
        "vst %v13, 8, %[outvect] \n\t"
        "vst %v9, 8, %[outmask] \n\t"
        "vst %v14, 8, %[incomplete] \n\t"
        :[round] "=r" (rounds),
         [innercounter]    "=r" (innercounter)
        :[strid] "r" (stride),
         [offst] "r" (offset),
         [zero] "r" (zero),
         [outvect] "r" (sums),
         [outmask] "r" (vectormask),
         [outpart] "r" (incomplete)
        :"%s5", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14" "%vm1", "%vm2", "%vm3", "%vm4");

// we are aggregating digits and numbers, to calculate the number that was divided we need to know that partial sum and the number of digits that made it up
// we futhermore need to restore the partial sums, howver no weighting is needed there, so the number of sums so dar is not important
// (incoming incomplete number and its digit count, maximum(sum of completed numbers till the change of elf) over elfs in chunk, last incomplete number, outgoing incomplete number
// there is possibility to do encode incoming incomplete number and its digit count into one number (61 bit number, 5 bit counter)
// since leading zeroes in the incomming number don't change the size of an imcomming number we only need to do this for the incomplete number which finishes
// to fix up the partial sums multiply incomming by 10^{number of digits} and some with incomplete number if bloack has no number of it's own we need to do a
// pass through in theory, one 64 bit word could be up to 26.575 binary digits, if we consider atleast a leading and following digit we are at 10 digits which
// is enough, if we only claim correctness ofr 32 bit sized numbers that we can perform all fixing ups in parallel. Should we want 64 bit we need to consider 3
// block at once, which seems a bit more complicated.

// After we correclty attributed the missing digits we need to fix up the incomplete number (calories per elf) by summing the contributions from both sides of the cut.
// this requires care as those could be distributed through many slice so ti requires an O(n) scan, by parelleling and bailing early we can get that down to O(n/p + longest chain).
// should we start in the longest chain multiple times that is no concer as we assume that all numbers are postive ->  incompletet options become irrelvant under the max operation.
// handling nagtive numbers the same algorithm to  abs(x) and relu(x) should reveal the presence of an error and can fix the error if the max reduciont hasn't hapnnend yet.


}