Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- -- * * * BEHIND THE SCENES * * *
- -- Tend to be simpler functions, used within methods
- -- Returns the size of a table, including numeric and non-numeric keys.
- local function tablesize(t)
- local sz=0
- for k, v in pairs(t) do
- sz=sz+1
- end
- return sz
- end
- -- I was hoping it wouldn't come to this, but these tree data structures are driving me crazy and I need a way to visualize what's in them.
- local function printtree(t, depth)
- depth=depth or 0
- for k, v in pairs(t) do
- if type(v) == "table" then
- print(string.rep(" ", depth) .. "{" .. k .. "=")
- printtree(v, depth+1)
- elseif type(v) == "function" then
- print(string.rep(" ", depth) .. "{" .. k .. "=" .. type(v) .. "}")
- else
- print(string.rep(" ", depth) .. "{" .. k .. "=" .. v .. "}")
- end
- end
- end
- -- Copy a table.
- local function tablecopy(t)
- local tcopy={}
- for k, v in pairs(t) do
- if type(v)=="table" then
- tcopy[k]=tablecopy(v)
- else
- tcopy[k]=v
- end
- end
- return tcopy
- end
- -- Export a table. Used specifically for exporting our scores table, which has syllables, scores, weights, and savings-- everything we need to pick up from where we left off. Path is optional.
- local function exportstats(t, path)
- path=path or "C:/CommonSyllables.txt"
- local s=""
- for k, v in pairs(t) do
- s=s .. k .. ";"
- if type(v)=="table" then
- for i=1,#v do
- s=s .. v[i] .. ","
- end
- s=s .. "\n"
- end
- end
- local f=io.open(path, "w+")
- f:write(s)
- f:flush()
- end
- -- Imports our stats table. Path is optional.
- local function importstats(path)
- path=path or "C:/CommonSyllables.txt"
- stats={}
- for s in io.lines(path) do
- k=string.match(s, "(.+);")
- stats[k]={}
- for v in string.gmatch(s, "([%w%-]+),") do
- if not v then
- table.insert(stats[k], nil)
- else
- table.insert(stats[k],tonumber(v))
- end
- end
- end
- return stats
- end
- -- Exports the graveyard. Because they are stored as a set, it's only necessary to list them in the file. Syllables are newline delimited to make them easier to read using io.lines.
- local function exportgraveyard(t, path)
- path=path or "C:/SyllableGraveyard.txt"
- local s=""
- for k in pairs(t) do
- s=s .. k .. "\n"
- end
- local f=io.open(path, "w+")
- f:write(s)
- f:flush()
- end
- -- Imports the graveyard.
- local function importgraveyard(path)
- path=path or "C:/SyllableGraveyard.txt"
- graveyard={}
- for s in io.lines(path) do
- graveyard[s]=true
- end
- return graveyard
- end
- -- As of 5.4, unpack has been changed to table.unpack. I'm reassigning it here, just in case...
- unpack = unpack or table.unpack
- -- Outputs a vector with our alphabet. Initial/final and apostrophe come first so it's easy to exclude them, if necessary.
- local function alph()
- local v={}
- function v:insert(x) table.insert(self,x) end
- v:insert("_")
- v:insert("\'") -- Apostrophe
- for i = 0x61, 0x7A do -- a through z
- v:insert(string.char(i))
- end
- return v
- end
- -- Several syllables are "forbidden" in pattern matching but still accidentally pop in. This function excludes them.
- local function permissible(s)
- local forbs = {"[^%a'].*[^%a']", -- " x ", words (x can be any number of characters, including 0)
- "[%a'][^%a']+[%a']" -- "x y", interior spaces, spanning words (any number of spaces between x and y but at least one)
- }
- for i=1,#forbs do
- if string.match(s, forbs[i]) then
- return false
- end
- end
- return true
- end
- -- Takes a string and substitutes in underscores for non-alphabetical, non-apostrophe characters. This is done in advance, but we don't want errors if one slips through.
- local function subin_(s)
- return string.gsub(s, "[^%a']+", "_")
- end
- -- Less than function, passable to sorting functions.
- local function lt(a, b) return a < b end
- -- Greater than function, passable to sorting functions.
- local function gt(a, b) return a > b end
- -- Sorts a list of tables by chosen element and function. Could instead use a metamethod to define __lt on a table but would probably be the same amount of work.
- local function sortby(t, e, f)
- e = e or 2 -- By default, sorts by second element.
- f = f or lt -- By default, sorts in increasing order.
- local function comp(a, b) return f(a[e], b[e]) end
- return table.sort(t, comp)
- end
- -- How deep in a tree are our syllables? Finds the Huffman score based on the tree and outputs it to result table.
- local function treedepth(tree, depth, result)
- result=result or {}
- depth=depth or 0
- for k, v in pairs(tree) do
- if type(v)=="table" then
- if type(v[1])=="string" then
- result[v[1]] = depth
- else
- result = treedepth(v, depth+1, result)
- end
- end
- end
- return result
- end
- -- Counts the number of instances of syllable s in the provided text. You can also make s a pattern, if you'd like.
- local function countsylls(text, s)
- local c=0
- for k in string.gmatch(text, s) do
- c=c+1
- end
- return c
- end
- -- Uses a greedy algorithm to find a good (but maybe not best) covering of a word. Can also be used to find the potential savings of longer syllables. Optional argument tally, associated with dontreset in compress below, tallies the weights in alph[1] if requested. I'm using a greedy algorithm because I strongly suspect this problem is NP-complete (see "set cover problem" on Wikipedia).
- local function greedy(alph, word, tally)
- word=subin_(word)
- -- ii is our index, ll and rr are the left and right positions of a match. They're nil for now because we want to use ll to end the loop.
- local ii, ll, rr = 0
- repeat
- -- ii starts at 0 and is incremented first because we want to use its value after the loop ends.
- ii=ii+1
- -- alph[3] is the extended alphabet sorted by potential savings.
- ll, rr = string.find(word, alph[3][ii][1])
- until ll
- local bits=alph[2][alph[3][ii][1]][1]
- if tally then
- alph[2][alph[3][ii][1]][2] = alph[2][alph[3][ii][1]][2] + 1
- end
- -- Only keep looking to the left if there's still a word there.
- if ll>1 then
- bits = bits + greedy(alph, string.sub(word, 1, ll-1), tally)
- end
- -- Only keep looking to the right if there's still a word there.
- if rr<#word then
- bits = bits + greedy(alph, string.sub(word, rr+1, #word), tally)
- end
- return bits
- end
- -- Uses the greedy algorithm above to compress a text using the alphabet we've formed. Outputs the number of bits we've compressed it to. Takes about 80 seconds to run using the single-letter alphabet, might run *faster* with syllables because they'll remove larger chunks at a time via our greedy algorithm. Optional argument suppressprint keeps the function from printing the bit total. Optional argument dontreset prevents the algorithm from re-tallying the syllable weights. Default is to reset the tallies so that we can update our weights and Huffman scores.
- local function compress(alph, text, suppressprint, dontreset)
- if not(dontreset) then
- for k, v in pairs(alph[2]) do
- v[2]=0
- end
- end
- local bittotal = 0
- -- The slightly awkward construction where we match words and then inject the underscores back in is necessary because if we include the _ delimeters in string.gmatch, it only picks up every other word because the next word overlaps with the previous one.
- for s in string.gmatch(text, "[%a']+") do
- bittotal = bittotal + greedy(alph, "_" .. s .. "_", not(dontreset))
- end
- -- Gotta divide by two because underscores appear at both the beginnings and ends of words. Add one to avoid fencepost error (9 words have eight spaces between them, plus _'s at start and end).
- if not(dontreset) then
- alph[2]["_"][2] = math.floor(alph[2]["_"][2]/2 + 0.5) + 1
- end
- alph[1]={} -- Reset our weights, then copy the new weights in their place.
- for k,v in pairs(alph[2]) do
- table.insert(alph[1], {k, v[2]})
- end
- alph:sort()
- -- 7948006 is the number of bits used with a one-character-per-codeword Huffman code.
- local totalsavings=7948006 - bittotal
- if not(suppressprint) then
- print("Compressed the text to " .. bittotal .. " bits, a total savings of " .. totalsavings .. ".")
- end
- return bittotal
- end
- -- For each entry in our alphabet computes the "Huffman score" , the minimum number of bits needed to create a prefix-free code including that syllable. Note that this doesn't actually create the Huffman code (which wouldn't be all that difficult)-- we just want the number of bits needed to encode a syllable.
- local function huffman(alph)
- alph:sort() -- Sort alph's weights if necessary
- local a=tablecopy(alph)
- local tree={}
- if #a[1]>1 then
- -- /-- What a mess! Have to sandwich the weight in the middle because I'm sorting by the second element. Will probably want to relable it ["w"] to get back on track...
- tree = {tablecopy(a[1][1]), a[1][1][2]+a[1][2][2], tablecopy(a[1][2])}
- table.insert(a[1], tablecopy(tree))
- table.remove(a[1], 1)
- table.remove(a[1], 1)
- huffman(a)
- else
- scores = treedepth(a[1])
- end
- return scores
- end
- -- Resets Huffman scores to arbitrary n.
- local function resetscores(alph, n)
- n=n or 10
- for k,v in pairs(alph[2]) do
- v[1] = n
- end
- end
- -- Resets all savings to some value to encourage searching for new, better syllable combinations. That value is optionally set with the second argument, n, although its value shouldn't make any difference.
- local function resetsavings(alph, n)
- n=n or 10
- for i=1,#alph[3] do
- alph[3][i][2] = n
- end
- end
- -- Nudges Huffman scores.
- local function nudgescores(alph, t)
- t=t or {{0.005, -3}, {0.025, -2}, {0.16, -1}, {0.68, 0}, {0.975, 1}, {0.995, 2}, {1, 3}}
- for k,v in pairs(alph[2]) do
- local RNG = math.random()
- local j=1
- while RNG>t[j][1] do
- j=j+1
- end
- v[1] = v[1] + t[j][2]
- end
- return alph
- end
- -- Nudges the savings in random directions. This can help find better syllable groupings Optional argument t is a table representing the cumulative distribution function, for use with Lua's RNG. t[i][1] must be between 0 and 1 and in strictly increasing order. t[i][2] represents how far to nudge based on the RNG falling between t[i-1][1] and t[i][1]. The default distribution I've employed is based on the standard normal distribution.
- local function nudgesavings(alph, t)
- t=t or {{0.005, -3}, {0.025, -2}, {0.16, -1}, {0.68, 0}, {0.975, 1}, {0.995, 2}, {1, 3}}
- for i=1,#alph[3] do
- local RNG = math.random()
- local j=1
- while RNG>t[j][1] do
- j=j+1
- end
- alph[3][i][2] = alph[3][i][2] + t[j][2]
- end
- return alph
- end
- -- Returns a Boolean reflecting whether the word we've created is any good based on a few simple rules off the top of my head: 1) must have at least one vowel, 2) no letter triplicates, 3) not a real word. That last condition, not a real word, can be overridden with includerealwords, which is optional. (And yeah, I know "hmm" and "shh" and "cwm" don't have vowels, but come on...)
- local function isgoodword(word, dictionary, includerealwords)
- local vowels={"a", "e", "i", "o", "u", "y"}
- local fullalphabet=alph()
- local good = true
- -- At least one vowel.
- for i=1,#vowels do
- if not(string.find(word, vowels[i])) then
- good=false
- end
- end
- -- No letter triplicates.
- for i=1,#fullalphabet do
- if string.find(word, string.rep(fullalphabet[i], 3)) then
- good=false
- end
- end
- -- Not a real word.
- -- TODDO: Account for possibility that word does/does not have underscores at its start and end.
- if not(includerealwords) then
- if string.find(dictionary, word) then
- good=false
- end
- end
- return good
- end
- -- * * *
- -- * * * METHODS * * *
- -- Typically don't want to call these directly
- -- Finds the total weight out of a specified node. Useful for generating random paths and pruning.
- -- This is a slightly awkward function but is necessary because huffman is recursive and trying to assign scores in there just ends up feeding them into the wrong table.
- local function scoresintoalph(alph)
- local scores=huffman(alph)
- for i=1,#alph[1] do
- alph[2][alph[1][i][1]] = {scores[alph[1][i][1]], alph[1][i][2]}
- end
- end
- -- Sorts all the tables in the alphabet.
- local function sortalphs(alph)
- sortby(alph[1], 2, lt)
- if alph[3] and alph[4] then -- Can't sort what doesn't exist. The sort function is called in initializealphabet before these are created.
- table.sort(alph[3], function(a,b) return a[2]>b[2] or (a[2]==b[2] and #a[1]>#b[1]) end) -- Sort from high savings to low or, if equal, from most characters to fewest.
- sortby(alph[4], 2, gt)
- end
- end
- -- Constructs a list of candidate syllables according to how promising they are. Also punts them to the graveyard if they suck.
- local function getcandidates(alph, text)
- local candlist={}
- for s1, v1 in pairs(alph[2]) do
- if v1[2]>1 then -- Component syllables must have at least two instances. Consider changing this threshold for efficiency.
- for s2, v2 in pairs(alph[2]) do
- if v2[2]>1 then
- local h1, h2 = v1[1], v2[1]
- local S, H = s1 .. s2, h1+h2
- if permissible(S) and not alph[5][S] then
- local c=countsylls(text, S) -- count
- local ps=H*c -- potential savings
- -- If the candidate syllable was found earlier in our comprehensive search, replace it only if the potential savings is now better.
- if candlist[S] then
- candlist[S] = candlist[S][3]<ps and {H, c, ps} or candlist[S]
- else
- candlist[S] = {H, c, ps}
- end
- end
- end
- end
- end
- end
- -- Create a sortable version of our list of candidates, then sort it.
- local sortable={}
- for k, v in pairs(candlist) do
- table.insert(sortable, {k, unpack(v)})
- end
- sortby(sortable, 4, gt)
- return sortable
- end
- -- How many bits would be necessary if this syllable were absent? Finds how many bits it takes to encode a syllable efficiently from shorter syllables. It's worth having an eye on the greedy algorithm, which is probably good enough for most words but falling short here could have rippling consequences down the line. For that reason, it might be worth it to (eventually) bite the bullet and perform a rigorous (NP-complete) check of syllable decomposition.
- local function savings(alph)
- local length=0
- for k in pairs(alph[2]) do
- length = math.max(length, #k)
- end
- while length>1 do -- Search from long syllables to short so savings of short syllables don't affect savings of long syllables.
- for k, v in pairs(alph[2]) do
- if #k==length then
- --[[
- for i=1,#alph[3] do
- if alph[3][i][1]==k then
- table.remove(alph[3],i)
- break
- end
- end
- --]]
- local found, i=false, 1
- repeat
- if alph[3][i][1]==k then
- table.remove(alph[3],i)
- found=true
- end
- i=i+1
- until found
- local h=v[1]
- local bits = greedy(alph, k)
- local sv = bits-h -- Savings
- alph[2][k][3]=sv
- table.insert(alph[3], {k, sv})
- alph:sort() -- We removed the syllable from alph[3] but we need to sort after adding it back in because it's important to subsequent greedy algorithm searches.
- end
- end
- length=length-1
- end
- end
- -- Saves our progress. Second argument, paths, is a table of strings with the paths for exporting our syllables and graveyard respectively.
- local function savesyllables(alph, paths)
- paths=paths or {nil,nil}
- exportstats(alph[2], paths[1])
- exportgraveyard(alph[5], paths[2])
- end
- -- Grows our list of allowed syllables. Optional argument howmany is how many syllables to add to our list, with default value 30.
- local function growsyllables(alph, text, howmany)
- local cc = getcandidates(alph, text)
- howmany = howmany or math.min(30, #cc) -- Add 30 syllables by default but can add fewer if too many of our syllable candidates are in the graveyard.
- alph:sort()
- local i, added = 1, 0 -- i is the index through which we check new syllables, added is the number of syllables actually added thus far in the loop.
- while added<howmany do
- if not alph[2][cc[i][1]] then -- only add syllables if they're not already in our alphabet
- table.insert(alph[1], {cc[i][1], cc[i][3]})
- alph[2][cc[i][1]] = {1, cc[i][3]} --{cc[i][2]-1, cc[i][3]} -- For lack of a better idea, give each new syllable a Huffman score of one bit and give it a weight of its entire number of matches.
- table.insert(alph[3], {cc[i][1], cc[i][2]-1})
- added=added+1
- end
- i=i+1
- end
- for i=1,#cc do
- if cc[i][3]<2 then -- Each code word must appear at least *twice* since it costs at least its own Huffman score to add it to the dictionary. Interestingly, this means "zyzzyva" might make it into our code words because even though it only appears twice ("zyzzyva" and "zyzzyvas"), it uses such expensive syllables that it has the potential to save a few bits.
- alph[5][cc[i][1]]=true
- end
- end
- alph:sort() -- Need to sort because we've added a bunch of new syllables and we want their savings sorted for use in the compress function.
- compress(alph, text) -- This is a throwaway iteration of the compression algorithm because we artificially assumed each new syllable saves one bit. After applying the subroutine, we can get accurate Huffman scores.
- alph:getscores()
- alph:getsavings()
- alph:sort()
- end
- -- Iteratively compresses the text until either a steady state is reached or it begins to *cost* more bits than the previous iteration. This allows us to get accurate measures of the weights and Huffman score.
- local function itercompress(alph, text, suppressprint)
- local prevalph={}
- local curr=-1
- repeat
- prevalph=tablecopy(alph)
- prev, curr = curr, compress(alph, text)
- alph:getscores()
- alph:getsavings()
- alph:sort()
- until prev>0 and curr-prev>=0
- alph=tablecopy(prevalph)
- return prev, alph
- end
- -- Uses only syllables with 0 or negative savings in compression. Used to find the best syllables that have been "accidentally" excluded from our syllable list. Also collects double-losers in a "hospice" list, with anticipation they might end up in the graveyard.
- local function loserstournament(alph, text)
- local beta = tablecopy(alph) -- Copy our alphabet into a new table, beta, because we don't want to screw it up and/or accidentally save over our good progress.
- local i=1
- while i<=#beta[3] do -- Gotta use a while loop because we'll be shrinking beta[3] along the way.
- if beta[3][i][2]>0 then
- beta[2][beta[3][i][1]]=nil
- table.remove(beta[3], i)
- else
- beta[3][i][2]=0 -- Reset all savings to 0 so all syllables are on equal footing.
- i=i+1
- end
- end
- itercompress(beta, text)
- local hospice={}
- for i=1,#beta[3] do
- if beta[3][i][2]<0 then
- hospice[beta[3][i][1]]=true
- end
- end
- return beta, hospice
- end
- -- Print alph[1], the list of weights.
- local function printweights(alph)
- for i=1,#alph[1] do
- print(i, alph[1][i][1], alph[1][i][2])
- end
- end
- -- Print alph[2], the record of Huffman scores, weights, and savings
- local function printstats(alph)
- for k, v in pairs(alph[2]) do
- print(k, v[1], v[2], v[3])
- end
- end
- -- Print alph[3], the list of savings
- local function printsavings(alph)
- for i=1,#alph[3] do
- print(i, alph[3][i][1], alph[3][i][2])
- end
- end
- -- Print alph[4], the list of bits per character.
- local function printtotalsavings(alph)
- for i=1,#alph[4] do
- print(i, alph[4][i][1], alph[4][i][2])
- end
- end
- -- Print alph[5], the graveyard.
- local function printgraveyard(alph)
- for k in pairs(alph[5]) do
- print(k)
- end
- end
- -- Print only some of the tables.
- local function printsome(alph, ...)
- local arg={...}
- for i=1,#arg do
- if arg[i]=="w" then
- printweights(alph)
- elseif arg[i]=="st" then
- printstats(alph)
- elseif arg[i]=="sv" then
- printsavings(alph)
- elseif arg[i]=="ts" then
- printtotalsavings(alph)
- elseif arg[i]=="g" then
- printgraveyard(alph)
- end
- end
- end
- -- Print everything.
- local function printall(alph)
- print("WEIGHTS:")
- alph:printweights()
- print("STATS:")
- alph:printstats()
- print("SAVINGS:")
- alph:printsavings()
- print("TOTAL SAVINGS:")
- alph:printts()
- print("GRAVEYARD:")
- alph:printgraveyard()
- end
- -- We have a graph g of unconnected nodes. Add connections to them (can even connect to self).
- local function makearcs(g)
- --[[
- Weights are simulated by having g[k][v] point to a table with {w,g[v]}. For example, suppose there are 723 instances of t --> h. Then g["t"]["h"] leads to {723, g["h"]}. This way, we can easily find the probability of reaching h from t, then go to h.
- --]]
- for k1, v1 in pairs(g) do
- if type(v1)=="table" then
- for k2, v2 in pairs(g) do
- if type(v2)=="table" then
- g[k1][k2] = {["w"]=0, ["dest"]=g[k2]}
- end
- end
- g[""][k1] = {["w"]=0, ["dest"]=g[k1]} -- Also have to create arcs out of the empty string, our starting point.
- end
- end
- end
- -- Reset all arcs in g to zero.
- local function zeroarcs(g)
- for k1, v1 in pairs(g) do
- if type(v1)=="table" then
- for k2, v2 in pairs(v1) do
- if type(v2)=="table" then
- v2["w"]=0
- end
- end
- end
- end
- end
- -- Uses the alphabet associated with g and selected text to populate the arcs of g.
- local function getarcs(g, alph, text)
- local beta=tablecopy(alph) -- Don't want to mess up our alphabet (although we should be done modifying it at this point).
- for s in string.gmatch(text, "[%a']+") do -- For each word...
- s= "_" .. s .. "_"
- beta[1]={} -- Reset all weights to zero.
- for k, v in pairs(beta[2]) do
- v[2]=0 -- Reset all weights to zero in the stats table as well.
- end
- greedy(beta, "_" .. s .. "_", true)
- local path = {} -- path is a linked list from syllable to syllable. Each element is a table whose key is the start of that syllable and contains two elements: the syllable and link to the next syllable.
- for i=1,#beta[3] do
- local count = beta[2][beta[3][i][1]][2]
- if count>0 then
- local start=0 -- Start searching for this syllable here.
- for j=1,count do
- key, start = string.find(s, beta[3][i][1], start)
- start=start+1
- path[key]={[1]=beta[3][i][1], ["destnum"]=start} -- Pointers are kind of broken or something, so instead I'm adding the key "destnum". After this loop, all the destnums are consolidated into their respective destinations proper.
- --print(j, key, start, beta[3][i][1])
- --if path[1] then print("!!!", path[1]["dest"], path[3]) end
- end
- s=string.gsub(s, beta[3][i][1], string.rep("*", string.len(beta[3][i][1])))
- --print(s)
- end
- end
- local k=1
- local curr, prev=path[k], nil
- g[""]["WWW"]=g[""]["WWW"]+1 -- WWW represents the total weight into/out of each node. (Except "", for which it's just out, and terminal nodes, which have none out.)
- g[""][curr[1]]["w"] = g[""][curr[1]]["w"] + 1 -- Start from the empty string and add weights to prefixes.
- k=curr["destnum"]
- prev, curr = curr, path[k]
- repeat
- g[prev[1]][curr[1]]["w"] = g[prev[1]][curr[1]]["w"] + 1
- g[prev[1]]["WWW"]=g[prev[1]]["WWW"]+1
- k=curr["destnum"]
- prev, curr = curr, path[k]
- until not curr
- --[=[
- print(path[1][1], path[3][1], path[4][1])
- print("AAA")
- local k=path[1]
- repeat
- print(k[1])
- k=k["dest"]
- until not k[1]
- print("BBB")
- local step=path[1]
- g[""]["WWW"]=g[""]["WWW"]+1 -- WWW represents the total weight into/out of each node. (Except "", for which it's just out, and terminal nodes, which have none out.)
- g[""][step[1]]["w"] = g[""][step[1]]["w"] + 1 -- Start from the empty string and add weights to prefixes.
- repeat
- g[step[1]]["WWW"] = g[step[1]]["WWW"] + 1
- print(step[1], step["dest"][1])--, g[step[1]][step["dest"][1]]["w"])
- for k,v in pairs(g[step[1]][step["dest"][1]]) do print(k, v) end
- g[step[1]][step["dest"][1]]["w"] = g[step[1]][step["dest"][1]]["w"] + 1
- step=step["dest"]
- until #step["dest"]==0
- --]=]
- end
- return g
- end
- -- Take a single random step from current syllable syl. Suppose we're at a node with two destinations: "a" with weight 3 and "b" with weight 5. We construct table dests with elements {"a", "b"} and table sumweights with elements {0, 3, 8}. Roll an RNG from 1 to 8 inclusive. If it rolls 1, 2, or 3, go to "a". If it rolls 4, 5, 6, 7, or 8, go to "b".
- local function takestep(g, syl)
- local dests={} -- Possible destinations after rolling our RNG.
- local sumweights={0}
- for k, v in pairs(g[syl]) do
- if type(v) == "table" then
- table.insert(dests, k)
- table.insert(sumweights, sumweights[#sumweights]+g[syl][k]["w"]) -- Make the next value of sumweights the previous value plus the weight represented by the arc.
- end
- end
- --table.remove(sumweights, 1) -- We started with a superfluous sumweights[1]=0 for arithmetic purposes, so now we delete that entry.
- local RNG = math.random(sumweights[#sumweights]) -- sumweights[#sumweights] is just "WWW" for the current node, so I guess I didn't need to calculate it after all...
- local i=1
- repeat
- i = i+1
- until (sumweights[i-1] < RNG and RNG <= sumweights[i])
- i=i-1 -- Gotta subtract out 1 to correct for our dummy 0 to start sumweights. There's probably a clever way of doing all this but I'm getting tired.
- return dests[i]
- end
- -- Walks randomly along graph g, creating a fake word.
- local function randomwalk(g)
- local s, syl = "", ""
- syl=takestep(g, syl)
- s = s .. syl
- repeat
- syl = takestep(g, syl)
- s = s .. syl
- until string.find(syl, "_") --string.sub(syl, #syl, #syl)=="_" -- Repeat until our syllable ends in _
- return s
- end
- -- * * *
- -- * * * INTERMEDIATE * * *
- -- These are some functions that depend on the methods up the page but before initialization. Not sure where else to put them...
- -- Loads methods into our alphabet table.
- local function loadmethods(xalph)
- function xalph:getweights(text) end -- Incomplete. Need to construct a method that scans the text for efficient encoding and tallies up weights.
- function xalph:getscores() scoresintoalph(self) end
- function xalph:getsavings() savings(self) end
- function xalph:sort() sortalphs(self) end
- --function xalph:candidates(text) getcandidates(self, text) end -- Methods don't return anything, only act on parent table. Have to externalize this.
- function xalph:printweights() printweights(self) end
- function xalph:printstats() printstats(self) end
- function xalph:printsavings() printsavings(self) end
- function xalph:printts() printtotalsavings(self) end
- function xalph:printgraveyard() printgraveyard(self) end
- function xalph:print(...) printsome(self, ...) end
- function xalph:printall() printall(self)end
- function xalph:save(paths) savesyllables(self, paths) end
- function xalph:growsyllables(text, howmany) growsyllables(self, text, howmany) end
- function xalph:itercompress(text, suppressprint) itercompress(self, text, suppressprint) end
- function xalph:resetscores() resetscores(self) end
- function xalph:resetsavings() resetsavings(self) end
- end
- -- Loads the stats table and the graveyard, then pulls the information from the stats table into the other tables and sorts them. Exports the recovered alphabet table so we can pick up where we left off.
- local function loadsyllables(paths)
- paths = paths or {nil, nil}
- local alph={{},{},{},{},{}}
- loadmethods(alph)
- alph[2]=importstats(paths[1])
- alph[5]=importgraveyard(paths[2])
- for k, v in pairs(alph[2]) do
- table.insert(alph[1], {k, v[2]})
- local sv=v[3] or 0 -- If it's one character, v[3] is nil, so instead make it 0 for 0 savings.
- table.insert(alph[3], {k, sv})
- table.insert(alph[4], {k, v[2]*sv}) -- Here I'm defining alph[4] to be a list of total savings because why not?
- end
- alph:sort()
- return alph
- end
- -- * * *
- -- * * * INITIALIZATION * * *
- -- For creating variables and stuff in the first place
- -- Outputs a cleaned-up version of the requested text. Replaces all non-alphabetical characters with underscores. Optional second argument adds a short phrase with apostrophes if you want them.
- local function choosetext(title, addapostrophes)
- apostrophes = addapostrophes and "'tis ne'er e'er o'er m' l'o'er o' clo'er_" or "" -- Many sample texts don't include apostrophes, so here are some to append if necessary.
- -- A selection of training texts. Use shorter, more repetitive texts for troubleshooting, rapid execution, and easier pattern recognition.
- filenames = {["dictionary"] = "C:/Program Files/Lua/Lua5.4/dictionary.txt", -- Complete dictionary via https://norvig.com/ngrams/enable1.txt (Doesn't include 1 letter words or contractions.)
- ["GEaH"] = "C:/Program Files/Lua/Lua5.4/Green_Eggs_and_Ham.txt", -- Green Eggs and Ham via https://www.clear.rice.edu/comp200/resources/texts/Green%20Eggs%20and%20Ham.txt
- }
- local pathname=filenames[title] or filenames["GEaH"] -- If no match is found, just load Green Eggs and Ham because who doesn't like Green Eggs and Ham?
- local f=assert(io.open(pathname, "r"))
- text = "_" .. string.gsub(string.lower(f:read("*all")), "[^%a']+", "_") .. apostrophes .. "_" --.. apostrophes -- Append space to start and end to help pattern matching if there are words at start and/or end. Also substitutes underscores in for non-alphabetical characters.
- f:close()
- return text
- end
- -- Creates the directed graph with keys corresponding to ASCII values for each character.
- local function initializegraph(alph)
- local g={}
- g[""]={["WWW"]=0}
- alph:sort()
- local i=1
- while alph[3][i][2]>=0 do
- g[alph[3][i][1]]={["WWW"]=0} -- WWW represents the total weight into/out of each node. (Except "", for which it's just out, and terminal nodes, which have none out.) I'd use W except that might be a valid key to another node.
- i=i+1
- end
- function g:makearcs() makearcs(self) end
- function g:zeroarcs() zeroarcs(self) end
- function g:getarcs(alph, text) getarcs(self, alph, text) end
- g:makearcs()
- return g
- end
- -- Creates our alphabet table and conducts an initial search of the sample text to establish weights. Also adds methods and cleans up unused characters. Optional argument includezero leaves in characters that are unused (in case you're interested in doing something like building a Huffman code from Shakespeare and using it to encode the dictionary).
- local function initializealphabet(text, includezero)
- local xalph={{}} -- Gotta call it xalph to avoid collision with alph function, which we'll use immediately.
- --[[
- Structure of xalph:
- 1: list: { {syllable1, weight}, {syllable2, weight}, ... } -- Sorted low to high for ease of computing Huffman scores.
- 2: record: { syllable2 = {Huffman score, weight, savings}, syllable2 = {Huffman score, weight, savings}, ...} -- All the information except indexed by syllable for ease of lookup.
- 3: list: { {syllable1, savings}, {syllable2, savings}, ...} -- "savings" is the total Huffman score of the syllable's constituent syllables minus its own Huffman score. Thus, if a syllable is S = s1 .. s2 .. s3 with Huffman scores B, b1, b2, and b3, savings is b1 + b2 + b3 - B. Sorted high to low.
- 4: list: Not sure. Could make it total savings or total bits used... -- { {syllable1, bits per character}, {syllable2, bits per character}, ...} -- Sorted low to high. Probably won't use this.
- 5: {syllable1, syllable2, ...} -- The "graveyard". Potential syllable combinations that are so rare that they aren't worth adding.
- --]]
- loadmethods(xalph)
- local a=alph()
- for i=1,#a do
- w=countsylls(text, a[i])
- if w>0 or includezero then
- table.insert(xalph[1], {a[i], w})
- end
- end
- xalph[2]={}
- xalph:getscores()
- xalph[3]={} -- Empty at first because our alphabet is just single characters. Maybe create a function that permits initializing with multiple characters by back-searching for component syllables.
- for k, v in pairs(xalph[2]) do
- table.insert(xalph[3], {k, 0}) -- Initial savings are 0 because no single letter can save any value over itself.
- end
- xalph[4]={}
- for syll, v in pairs(xalph[2]) do
- table.insert(xalph[4], {syll, v[2]/#syll}) -- Old definition of xalph[4] is bits per character. I'm leaving it in because it doesn't matter and it populates the table until we replace that information with total savings.
- end
- xalph[5]={} -- The graveyard.
- return xalph
- end
- -- * * *
- -- * * * START OF ACTUAL PROGRAM * * *
- -- Text choices include "dictionary" (a list of English words) and "GEaH" (Green Eggs and Ham). See the choosetext function for a list of texts and websites where to find them.
- local tt=choosetext("dictionary")
- -- Uncomment these lines if it's your first time using the program.
- ---[[
- local aa=initializealphabet(text)
- aa:getscores()
- aa:getsavings()
- aa:save()
- --]]
- -- Run this line if you'd like to pick up where you left off.
- --local aa=loadsyllables()
- -- Run this chunk if you'd like to assign random Huffman scores and savings to all syllables. I should probably put all this in a function.
- --[=[
- local best = 7948006
- local madethecut = {}
- for k in pairs(aa[2]) do
- madethecut[k]=0
- end
- while true do
- aa:resetsavings()
- aa:resetscores()
- for i=1,5 do
- nudgescores(aa)
- nudgesavings(aa)
- end
- aa:sort()
- compress(aa, tt)
- local bits
- bits, aa = itercompress(aa, tt)
- if bits<best then
- best=bits
- aa:save()
- print("New best found: " .. bits .. " bits after " .. os.clock() .. " seconds.")
- end
- local i=1
- repeat
- madethecut[aa[3][i][1]] = madethecut[aa[3][i][1]] + 1
- i=i+1
- until aa[3][i][2]<0
- local s=""
- for k,v in pairs(madethecut) do
- s=s .. k .. "," .. v .. "\n"
- end
- local f=io.open("C:/MadeTheCut.txt", "w+")
- f:write(s)
- f:flush()
- f:close()
- end
- --]=]
- -- Run this chunk if you want to create a graph from your syllables and walk that graph.
- --[[
- local gg = initializegraph(aa)
- gg:getarcs(aa, tt)
- for i=1,50 do
- local ss=randomwalk(gg)
- print(ss)
- end
- --]]
- -- Uncomment these lines to grow our syllable list until user interrupts the process. Results are saved to the default file location.
- --[[
- while true do
- aa:growsyllables(tt)
- aa:itercompress(tt)
- aa:print("w", "st", "sv")
- aa:save()
- end
- --]]
- -- Uncomment these lines to conduct a "losers' tournament", finding the best syllables that have been excluded from consideration. Double-losers are collected in a "hospice" table.
- --[[
- bb, hospice = loserstournament(aa, tt)
- bb:print("w", "st", "sv")
- for k in pairs(hospice) do
- print(k)
- end
- --]]
- print(os.clock())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement