osht - v. 0.0.2

import re, sys
re1 = re.compile(rb"(([^=]|\\=)*)(=([^=]|\\=)*)?")
import pprint

def unescape(i):
    ret = b""
    esc = 0
    for c in i:
        if esc and c == b"\\":
            ret+= b"\\"
            esc = 0
        elif esc and c == b"=":
            ret+= b"="
            esc = 0
        elif esc and c == b"n":
            ret+= b"\n"
            esc = 0
        elif esc:
            raise ValueError(f"Parser error: got character {c} after \\")
        elif c == b"\\":
            esc = 1
        else:
            ret+= bytes([c])
    if esc:
        raise ValueError(f"Parser error: got EOL/EOF after \\")
    return ret


def parse(i):
    terminators = set()
    rules = set()
    lines = i.split(b"\n")
    for l in lines:
        m = re1.match(l)
        if m.group(3):
            rules|= {(unescape(m.group(1)), unescape(m.group(3)[1:]))}
        else:
            terminators|= {unescape(m.group(1))}
    return terminators, rules

def compile_f(c):
    pp = pprint.PrettyPrinter(indent=4)
    # Group based on the cardinality of elements in a multiset
    # Track information about order, but do not use it for grouping
    terms, rules = parse(c)
    states = {}
    for t in terms:
        k,s = lit_symm(t)
        if k in states:
            states[k] = merge_symm(s, states[k])
        else:
            states[k] = s
    while 1:
        #print("len states:", len(states))
        #pp.pprint(states)
        for k in states:
            v = states[k]
            #xprint(k,v)
        new_states = {}
        for chars in states:
            for a,b in rules:
                if charcounts_comp(b, chars):
                    k,v = trans_symm(chars, states[chars], a, b)
                    #print("\tRule",a,"->",b," got k:",k,", v:",v)
                    if k in new_states:
                        new_states[k] = merge_symm(new_states[k], v)
                    elif k in states:
                        new_states[k] = merge_symm(states[k], v)
                    else:
                        new_states[k] = v
        ns = states | new_states
        if states == ns:break
        states = ns
    #print("EWAFDSDFG")
    #print(states)
    # Generate code
    prog = """
        #include <stdlib.h>
        #include <stdio.h>
        #include <string.h>

        struct WordNode {
            unsigned char* ptr;
            size_t len;
            struct WordNode* next;
        };
    """
    statelist = [(k, states[k])for k in states]
    # Generate input-match-and-verify for every state
    for idx,kv in enumerate(statelist):
        chrs,symm = kv
        symmc = {*symm}
        ordchrs = {e for t in symm for e in t}
        for a in ordchrs:
            for b in ordchrs:
                for c in ordchrs:
                    if (a,c) in symmc and (a,b) in symm and (b,c) in symm:
                        symmc-= {(a,c)}
        prog+= f"""
            int match_{idx}(unsigned char* ptr, size_t len) {{
                size_t count_dummy = 0;
                {"".join([f"size_t count_{ch} = 0;" for ch,_ in chrs])}
                for (size_t i = 0; i < len;++i) {{
                    {"".join([f"if (ptr[i] == {ch}) {{if (count_{next((y for x,y in symmc if x == ch), 'dummy')})return 0;if (count_{ch} == 0) {{{''.join([f'count_{sm}+= count_{sm} ? 0 : 1;' for sm in ordchrs if (sm,ch) in symm])}count_{ch} = 1;}}}}" for ch in ordchrs])}
                    {"else ".join(["if(0)count_dummy=0;"]+[f"if (ptr[i] == {ch})count_{ch}+= 1;" for ch,_ in chrs])}else return 0;
                }}
                {"".join([f"count_{ch}-=count_{ch}?1:0;" for ch in ordchrs])}
                {"".join([f"if (count_{ch}{['?','?','!=1','>1','<2','==1','<1','<0'][amt]})return 0;" for ch,amt in chrs])}
                return 1;
            }}
        """
    prog+= f"""
        int match(unsigned char* ptr, size_t len) {{
            {"".join([f"if (match_{idx}(ptr, len))return 1;" for idx in range(len(statelist))])}
            return 0;
        }}

        int terminator(unsigned char* ptr, size_t len) {{
            {"".join([f"unsigned char arr_{idx}[{len(t)}]={{{str([*t])[1:-1]}}};if(len == {len(t)} && 0==memcmp(arr_{idx},ptr,len))return 1;" for idx,t in enumerate(terms)])}
            return 0;
        }}

        void infloop(){{
            while (1){{}}
        }}

        void mainloop(unsigned char* input, size_t len) {{
            if (!match(input, len)){{free(input);infloop();}};
            if (terminator(input, len)){{
                fwrite(input, 1, len, stdout);
                exit(0);
            }}
            struct WordNode startval = {{
                input,
                len,
                NULL,
            }};
            struct WordNode* start = malloc(sizeof(struct WordNode));
            if (start == NULL)exit(-1);
            *start = startval;
            struct WordNode* end = start;
            while (1) {{
                if (start == NULL)continue;
    """
    for a,b in rules:
        prog+= f"""
                if (start->len + {len(b)} >= {len(a)}) {{
                    unsigned char a[{len(a)}] = {{{str([*a])[1:-1]}}};
                    unsigned char b[{len(b)}] = {{{str([*b])[1:-1]}}};
                    size_t totlen = start->len + {len(b)} - {len(a)};
                    unsigned char arr[totlen ? totlen : 1];
                    for (size_t i = 0; i + {len(a)} < len + 1;++i) {{
                        size_t j = 0;
                        for (;j < {len(a)}; ++j) {{
                            if (a[j] != start->ptr[i+j])break;
                        }}
                        if (j == {len(a)}) {{
                            for(size_t k = 0;k < {len(b)};k++) {{
                                arr[i+k] = b[k];
                            }}
                            for(size_t p = i + {len(a)};p < start->len;p++) {{
                                arr[p+{len(b)}-{len(a)}] = start->ptr[p];
                            }}
                            if (match(arr, totlen)) {{
                                if (terminator(arr, totlen)) {{
                                    fwrite(arr, 1, totlen, stdout);
                                    exit(0);
                                }} else {{
                                    unsigned char* copy = malloc(totlen ? totlen : 1);
                                    if (copy == NULL)exit(-1);
                                    memcpy(copy, arr, totlen);
                                    struct WordNode* next = malloc(sizeof(struct WordNode));
                                    if (next == NULL)exit(-1);
                                    struct WordNode nextdata = {{
                                        copy,
                                        totlen,
                                        NULL,
                                    }};
                                    *next = nextdata;
                                    end->next = next;
                                    end = next;
                                }}
                            }}
                        }}
                        arr[i] = start->ptr[i];
                    }}
                }}
        """
    prog+= """
                struct WordNode* tmp = start;
                start = start->next;
                free(tmp->ptr);
                free(tmp);
            }
        }

        // Read stdin and write size to i
        char* getStdin(size_t* i)
        {
            int c;
            *i = 0;
            char* ptrBuff = malloc(1);

            while ((c = getchar()) && c != EOF)
            {
                if ((ptrBuff = (char*)realloc(ptrBuff, sizeof (char)+*i)) != NULL)
                    ptrBuff[(*i)++] = c;
                else
                {
                    free(ptrBuff);
                    return NULL;
                }
            }

            if (ptrBuff != NULL)
                ptrBuff[*i] = '\0';

            return ptrBuff;
        }

        int main() {
            size_t len;
            char* ptr = getStdin(&len);
            mainloop(ptr, len);
            return -2;
        }
    """
    with open("osht.c", "w") as f:
        f.write(prog)


def charcounts(lit):
    counts = {}
    for ch in lit:
        counts[ch] = ch in counts and counts[ch] + 1 or 1
    return frozenset((ch,2**min(counts[ch],2))for ch in lit)

def lit_symm(lit):
    s = {(a,b) for a in lit for b in lit if a != b}
    for j in range(len(lit)):
        for i in range(j):
            b = lit[i]
            a = lit[j]
            s-= {(a,b)}
    return charcounts(lit), s

def charcounts_comp(substring, counts):
    subc = charcounts(substring)
    lut = {k:v for k,v in counts}
    return next((0 for ch,amt in subc if ch not in lut or lut[ch]<amt),1)


def merge_symm(a,b):
    return a&b

def trans_symm(chrs, symm, a, b):
    counts = {}
    for ch in a:
        counts[ch] = counts[ch] + 1 if ch in counts else 1
    for ch in b:
        counts[ch] = counts[ch] - 1 if ch in counts else -1
    chrs_dict = {k:v for k,v in chrs}
    a_set,a_symm = lit_symm(a)
    a_dict = {k:v for k,v in a_set}
    tot = a_dict | chrs_dict
    ret_dict = {}
    #print(f"\t\tDbg: tot:{tot}, b_dict:{a_dict}")
    for ch in tot:
        amt = ch in chrs_dict and chrs_dict[ch] or 1
        diff = ch in counts and counts[ch]
        #print(f"\t\t\tTrying {chr(ch)}, amt: {amt}, diff: {diff}")
        ret = 0
        if amt & 4:
            ret|= (diff < -1) | 2*(diff < 0) | 4
        if diff == 1:
            ret|= amt << 1
        elif diff > 1:
            ret|= 4
        else:
            ret|= amt >> -diff
        ret = ret & 7
        if ret > 1:
            ret_dict[ch] = ret
    # Calculate the symms
    new_symm = set()
    for x,y in symm:
        #print(f"\t\t\tTrying {chr(x)}{chr(y)}")
        # These symbols will not be part of the output, so skip them
        if x not in ret_dict or y not in ret_dict:
            #print(f"\t\t\t Skipped because {x not in ret_dict} or {y not in ret_dict}")
            continue
        # No way it affects the order
        if x not in b and y not in b:
            #print(f"\t\t\t Added because x not in b and y not in b")
            new_symm|= {(x,y)}
            continue
        # Out of order in a
        fail = False
        for i in range(len(a)):
            if a[i] != x:continue
            for j in range(i):
                if a[j] == y:fail = True;break#print(f"\t\t\t Failed because a[{j}]=={chr(y)} and a[{i}]=={chr(x)}");break
            if fail:break
        if fail:continue
        # If x in a, does something imply b appears not after y?
        if x in a:
            for ch in b:
                if ch == x or (ch == y and chrs_dict[y] == 2) or (ch, y) in symm:break
            else:
                #print(f"\t\t\t Failed because nothing implies b appears not after {chr(y)}")
                fail = True
        if fail:continue
        # If y in a, does something imply b appears not before x?
        if y in a:
            for ch in b:
                if ch == y or (ch == x and chrs_dict[x] == 2) or (x, ch) in symm:break
            else:
                #print(f"\t\t\t Failed because nothing implies b appears not before {chr(x)}")
                fail = True
        #print(f"\t\t\t Conditions passed so adding {chr(x)}{chr(y)}")
        new_symm|= {(x,y)}
    # Add single char symmetries caused by new additions
    for ch0 in a:
        if ch0 in chrs_dict:continue
        for ch1 in chrs_dict:
            if ch1 in a:continue
            if ch1 not in ret_dict:continue
            for chb in b:
                if (ch1, chb) in symm:
                    #print(f"\t\t\t Adding single char symmetry {chr(ch1)}{chr(ch0)} because {chr(ch1)}{chr(chb)}")
                    new_symm|= {(ch1,ch0)}
                if (chb, ch1) in symm:
                    #print(f"\t\t\t Adding single char symmetry {chr(ch0)}{chr(ch1)} because {chr(chb)}{chr(ch1)}")
                    new_symm|= {(ch0, ch1)}


    # Add symmetries in a that are preserved
    for x,y in a_symm:
        add = None
        if x not in chrs_dict or y not in chrs_dict:
            add = {(x,y)}
        else:
            continue
        if x in chrs_dict:
            for z,_x in new_symm:
                if _x == x:add|= {(z,y)}
        if y in chrs_dict:
            for _y, z in new_symm:
                if _y == y:add|= {(x,z)}
        new_symm|= add

    return frozenset({(k, ret_dict[k]) for k in ret_dict}), new_symm

def xprint(k,v):
    out = "("
    for ch,amt in k:
        if amt < 2 or amt > 7:print("amt",amt);exit(-1)
        amtdsc = ["??", "0?", "1", "0 or 1", "2 or more", "not 1", "1 or more", "any amount"][amt]
        out+= f"{chr(ch)} -> {amtdsc}, "
    out+= ") => ["
    for a,b in v:
        out+= f" {chr(a)}{chr(b)} "
    out+= "]"
    print(out)

code =\
b"""
yes
=|
=!
a|b=|>
>b=b>
>!c=!
|!=yes
"""[1:-1]

res = compile_f(code)