Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <stdlib.h>
- #include <assert.h>
- #include <string.h>
- #define MAX(x, y) (((x) > (y)) ? (x) : (y))
- #define MIN(x, y) (((x) < (y)) ? (x) : (y))
- int main (int argc, char *argv[]) {
- FILE *fp;
- int flsz;
- int bfsz;
- #define workernum 256
- if (argc != 2) {
- puts("Excpects one argument. A path to a file mathcing Advent of code 2022 Problem 1 conventions");
- return 1;
- }
- fp = fopen(argv[1],"r");
- if (fp == NULL) {
- puts("Error opening file.");
- return 1;
- }
- fseek( fp, 0L, SEEK_END );
- flsz = ftell(fp);
- rewind(fp);
- printf("Filesize: %d\n", flsz);
- long remainder = flsz % (8*workernum);
- bfsz = flsz + MIN(1,remainder)*((8*workernum)-remainder);
- printf("Buffersize: %d\n", bfsz);
- char buffer[bfsz];
- fread(buffer, sizeof(char), flsz, fp);
- fclose(fp);
- for (long i=flsz; i<bfsz; i++) {
- buffer[i] = '\n';
- }
- u_int64_t * vmwords = calloc(bfsz, sizeof(char));
- memcpy(vmwords, buffer, bfsz);
- assert(((bfsz*sizeof(char)) % sizeof(u_int64_t)) == 0);
- int entries = (bfsz*sizeof(char)/sizeof(u_int64_t));
- for (long i=0; i<entries;i++) {
- putchar((((vmwords[i]<<24) )>>24));
- putchar((((vmwords[i]<<16) )>>24));
- putchar( ((vmwords[i]<<8 )>>24));
- putchar(((vmwords[i])>>24));
- }
- fflush(stdout);
- u_int64_t partial_sums[256];
- u_int64_t incomplete sum[256]
- u_int64_t vector_mask[256];
- return(0);
- }
- void process256 (u_int64_t * vmwords, u_int64_t offset, u_int64_t stride, u_int64_t * sums, u_int64_t * vectormask, u_int64_t * incomplete){
- u_int64_t innercounter = 0;
- stride *= 8;
- offset += (u_int64_t)vmwords;
- u_int64_t rounds = 0;
- u_int64_t overhand = 0;
- const u_int64_t zero = 0;
- asm("vxor %v11, %v11, %v11 \n\t"
- "vxor %v12, %v12, %v12 \n\t"
- "vxor %v13, %v13, %v13 \n\t"
- "vxor %v14, %v14, %v14 \n\t"
- "xorm %vm1, %vm1, %vm1 \n\t"
- "xorm %vm3, %vm3, %vm3 \n\t"
- "cmov.l.at %s5, %s5, %[strid] \n\t"
- "loaddata:\n\t"
- "adds.l %s5, -8, %s5 \n\t"
- "cmov.l.at %[innercounter], %[innercounter], 8 \n\t"
- "vld %v10, %[strid], %[offst] \n\t"
- "adds.l %[offst], 8, %[offst] \n\t"
- "processchar:\n\t"
- "vsrl %v9, %v10, 56 \n\t"
- "vadds.l %v9, -48, %v9 \n\t"
- "vfmk.l.ge %vm2, %v9 \n\t"
- "vmuls.l %v11, 10, %v11, %vm2\n\t"
- "vadds.l %v11, %v9, %v11, %vm2\n\t"
- "negm %vm2, %vm2\n\t"
- "vadds.l %v12, %v11, %v12,%vm2\n\t"
- "vxor %v11, %v11, %v11, %vm2 \n\t"
- "orm %vm3, %vm3, %vm2 \n\t"
- "negm %vm4, %vm3 \n\t"
- "vmv ,$vm4\n\t"
- "vmaxs.l %v13, %v12, %v13, %vm3 \n\t"
- "vxor %v12, %v12, %v12, %vm3 \n\t"
- "vfmk.l.at %vm3, %vm2 \n\t"
- "orm %vm1, %vm3, %vm1 \n\t"
- "adds.l %[innercounter], -1, %[innercounter] \n\t"
- "beq.l %[innercounter], processchar \n\t"
- "vsll %v10, %v10, 8 \n\t"
- "bgt.l %s5, loaddata \n\t"
- "cmov.l.at %[innercounter], %[innercounter], 8 \n\t"
- "vbr %9, 1, %vm3 \n\t"
- "vst %v13, 8, %[outvect] \n\t"
- "vst %v9, 8, %[outmask] \n\t"
- "vst %v14, 8, %[incomplete] \n\t"
- :[round] "=r" (rounds),
- [innercounter] "=r" (innercounter)
- :[strid] "r" (stride),
- [offst] "r" (offset),
- [zero] "r" (zero),
- [outvect] "r" (sums),
- [outmask] "r" (vectormask),
- [outpart] "r" (incomplete)
- :"%s5", "%v9", "%v10", "%v11", "%v12", "%v13", "%v14" "%vm1", "%vm2", "%vm3", "%vm4");
- // we are aggregating digits and numbers, to calculate the number that was divided we need to know that partial sum and the number of digits that made it up
- // we futhermore need to restore the partial sums, howver no weighting is needed there, so the number of sums so dar is not important
- // (incoming incomplete number and its digit count, maximum(sum of completed numbers till the change of elf) over elfs in chunk, last incomplete number, outgoing incomplete number
- // there is possibility to do encode incoming incomplete number and its digit count into one number (61 bit number, 5 bit counter)
- // since leading zeroes in the incomming number don't change the size of an imcomming number we only need to do this for the incomplete number which finishes
- // to fix up the partial sums multiply incomming by 10^{number of digits} and some with incomplete number if bloack has no number of it's own we need to do a
- // pass through in theory, one 64 bit word could be up to 26.575 binary digits, if we consider atleast a leading and following digit we are at 10 digits which
- // is enough, if we only claim correctness ofr 32 bit sized numbers that we can perform all fixing ups in parallel. Should we want 64 bit we need to consider 3
- // block at once, which seems a bit more complicated.
- // After we correclty attributed the missing digits we need to fix up the incomplete number (calories per elf) by summing the contributions from both sides of the cut.
- // this requires care as those could be distributed through many slice so ti requires an O(n) scan, by parelleling and bailing early we can get that down to O(n/p + longest chain).
- // should we start in the longest chain multiple times that is no concer as we assume that all numbers are postive -> incompletet options become irrelvant under the max operation.
- // handling nagtive numbers the same algorithm to abs(x) and relu(x) should reveal the presence of an error and can fix the error if the max reduciont hasn't hapnnend yet.
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement