Advertisement
Guest User

Make random English words

a guest
Dec 16th, 2021
61
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Lua 34.00 KB | None | 0 0
  1. -- * * * BEHIND THE SCENES * * *
  2. -- Tend to be simpler functions, used within methods
  3.  
  4. -- Returns the size of a table, including numeric and non-numeric keys.
  5. local function tablesize(t)
  6.     local sz=0
  7.     for k, v in pairs(t) do
  8.         sz=sz+1
  9.     end
  10.    
  11.     return sz
  12. end
  13.  
  14. -- I was hoping it wouldn't come to this, but these tree data structures are driving me crazy and I need a way to visualize what's in them.
  15. local function printtree(t, depth)
  16.     depth=depth or 0
  17.     for k, v in pairs(t) do
  18.         if type(v) == "table" then
  19.             print(string.rep("   ", depth) .. "{" .. k .. "=")
  20.             printtree(v, depth+1)
  21.         elseif type(v) == "function" then
  22.             print(string.rep("   ", depth) .. "{" .. k .. "=" .. type(v) .. "}")
  23.         else
  24.             print(string.rep("   ", depth) .. "{" .. k .. "=" .. v .. "}")
  25.         end
  26.     end
  27. end
  28.  
  29. -- Copy a table.
  30. local function tablecopy(t)
  31.     local tcopy={}
  32.     for k, v in pairs(t) do
  33.         if type(v)=="table" then
  34.             tcopy[k]=tablecopy(v)
  35.         else
  36.             tcopy[k]=v
  37.         end
  38.     end
  39.     return tcopy
  40. end
  41.  
  42. -- Export a table.  Used specifically for exporting our scores table, which has syllables, scores, weights, and savings-- everything we need to pick up from where we left off.  Path is optional.
  43. local function exportstats(t, path)
  44.     path=path or "C:/CommonSyllables.txt"
  45.     local s=""
  46.     for k, v in pairs(t) do
  47.         s=s .. k .. ";"
  48.         if type(v)=="table" then
  49.             for i=1,#v do
  50.                 s=s .. v[i] .. ","
  51.             end
  52.             s=s .. "\n"
  53.         end
  54.     end
  55.     local f=io.open(path, "w+")
  56.     f:write(s)
  57.     f:flush()
  58. end
  59.  
  60. -- Imports our stats table.  Path is optional.
  61. local function importstats(path)
  62.     path=path or "C:/CommonSyllables.txt"
  63.     stats={}
  64.     for s in io.lines(path) do
  65.         k=string.match(s, "(.+);")
  66.         stats[k]={}
  67.         for v in string.gmatch(s, "([%w%-]+),") do
  68.             if not v then
  69.                 table.insert(stats[k], nil)
  70.             else
  71.                 table.insert(stats[k],tonumber(v))
  72.             end
  73.         end
  74.     end
  75.     return stats
  76. end
  77.  
  78. -- Exports the graveyard.  Because they are stored as a set, it's only necessary to list them in the file.  Syllables are newline delimited to make them easier to read using io.lines.
  79. local function exportgraveyard(t, path)
  80.     path=path or "C:/SyllableGraveyard.txt"
  81.     local s=""
  82.     for k in pairs(t) do
  83.         s=s .. k .. "\n"
  84.     end
  85.     local f=io.open(path, "w+")
  86.     f:write(s)
  87.     f:flush()
  88. end
  89.  
  90. -- Imports the graveyard.
  91. local function importgraveyard(path)
  92.     path=path or "C:/SyllableGraveyard.txt"
  93.     graveyard={}
  94.     for s in io.lines(path) do
  95.         graveyard[s]=true
  96.     end
  97.     return graveyard
  98. end
  99.  
  100. -- As of 5.4, unpack has been changed to table.unpack.  I'm reassigning it here, just in case...
  101. unpack = unpack or table.unpack
  102.  
  103. -- Outputs a vector with our alphabet.  Initial/final and apostrophe come first so it's easy to exclude them, if necessary.
  104. local function alph()
  105.     local v={}
  106.     function v:insert(x) table.insert(self,x) end
  107.    
  108.     v:insert("_")
  109.     v:insert("\'")                  -- Apostrophe
  110.     for i = 0x61, 0x7A do           -- a through z
  111.         v:insert(string.char(i))
  112.     end
  113.    
  114.     return v
  115. end
  116.  
  117. -- Several syllables are "forbidden" in pattern matching but still accidentally pop in.  This function excludes them.
  118. local function permissible(s)
  119.     local forbs = {"[^%a'].*[^%a']",            -- " x ", words (x can be any number of characters, including 0)
  120.                    "[%a'][^%a']+[%a']"              -- "x y", interior spaces, spanning words (any number of spaces between x and y but at least one)
  121.                   }
  122.  
  123.     for i=1,#forbs do
  124.         if string.match(s, forbs[i]) then
  125.             return false
  126.         end
  127.     end
  128.     return true
  129. end
  130.  
  131. -- Takes a string and substitutes in underscores for non-alphabetical, non-apostrophe characters.  This is done in advance, but we don't want errors if one slips through.
  132. local function subin_(s)
  133.     return string.gsub(s, "[^%a']+", "_")
  134. end
  135.  
  136. -- Less than function, passable to sorting functions.
  137. local function lt(a, b) return a < b end
  138.  
  139. -- Greater than function, passable to sorting functions.
  140. local function gt(a, b) return a > b end
  141.  
  142. -- Sorts a list of tables by chosen element and function.  Could instead use a metamethod to define __lt on a table but would probably be the same amount of work.
  143. local function sortby(t, e, f)
  144.     e = e or 2      -- By default, sorts by second element.
  145.     f = f or lt -- By default, sorts in increasing order.
  146.     local function comp(a, b) return f(a[e], b[e]) end
  147.     return table.sort(t, comp)
  148. end
  149.  
  150. -- How deep in a tree are our syllables?  Finds the Huffman score based on the tree and outputs it to result table.
  151. local function treedepth(tree, depth, result)
  152.     result=result or {}
  153.     depth=depth or 0
  154.     for k, v in pairs(tree) do
  155.         if type(v)=="table" then
  156.             if type(v[1])=="string" then
  157.                 result[v[1]] = depth
  158.             else
  159.                 result = treedepth(v, depth+1, result)
  160.             end
  161.         end
  162.     end
  163.     return result
  164. end
  165.  
  166. -- Counts the number of instances of syllable s in the provided text.  You can also make s a pattern, if you'd like.
  167. local function countsylls(text, s)
  168.     local c=0
  169.     for k in string.gmatch(text, s) do
  170.         c=c+1
  171.     end
  172.    
  173.     return c
  174. end
  175.  
  176. -- Uses a greedy algorithm to find a good (but maybe not best) covering of a word.  Can also be used to find the potential savings of longer syllables.  Optional argument tally, associated with dontreset in compress below, tallies the weights in alph[1] if requested.  I'm using a greedy algorithm because I strongly suspect this problem is NP-complete (see "set cover problem" on Wikipedia).
  177. local function greedy(alph, word, tally)
  178.     word=subin_(word)
  179.     -- ii is our index, ll and rr are the left and right positions of a match.  They're nil for now because we want to use ll to end the loop.
  180.     local ii, ll, rr = 0
  181.     repeat
  182.         -- ii starts at 0 and is incremented first because we want to use its value after the loop ends.
  183.         ii=ii+1
  184.         -- alph[3] is the extended alphabet sorted by potential savings.
  185.         ll, rr = string.find(word, alph[3][ii][1])
  186.     until ll
  187.    
  188.     local bits=alph[2][alph[3][ii][1]][1]
  189.     if tally then
  190.         alph[2][alph[3][ii][1]][2] = alph[2][alph[3][ii][1]][2] + 1
  191.     end
  192.    
  193.     -- Only keep looking to the left if there's still a word there.
  194.     if ll>1 then
  195.         bits = bits + greedy(alph, string.sub(word, 1, ll-1), tally)
  196.     end
  197.    
  198.     -- Only keep looking to the right if there's still a word there.
  199.     if rr<#word then
  200.         bits = bits + greedy(alph, string.sub(word, rr+1, #word), tally)
  201.     end
  202.    
  203.     return bits
  204. end
  205.  
  206. -- Uses the greedy algorithm above to compress a text using the alphabet we've formed.  Outputs the number of bits we've compressed it to.  Takes about 80 seconds to run using the single-letter alphabet, might run *faster* with syllables because they'll remove larger chunks at a time via our greedy algorithm.  Optional argument suppressprint keeps the function from printing the bit total.  Optional argument dontreset prevents the algorithm from re-tallying the syllable weights.  Default is to reset the tallies so that we can update our weights and Huffman scores.
  207. local function compress(alph, text, suppressprint, dontreset)
  208.     if not(dontreset) then
  209.         for k, v in pairs(alph[2]) do
  210.             v[2]=0
  211.         end
  212.     end
  213.  
  214.     local bittotal = 0
  215.     -- The slightly awkward construction where we match words and then inject the underscores back in is necessary because if we include the _ delimeters in string.gmatch, it only picks up every other word because the next word overlaps with the previous one.
  216.     for s in string.gmatch(text, "[%a']+") do
  217.         bittotal = bittotal + greedy(alph, "_" .. s .. "_", not(dontreset))
  218.     end
  219.    
  220.     -- Gotta divide by two because underscores appear at both the beginnings and ends of words.  Add one to avoid fencepost error (9 words have eight spaces between them, plus _'s at start and end).
  221.     if not(dontreset) then
  222.         alph[2]["_"][2] = math.floor(alph[2]["_"][2]/2 + 0.5) + 1
  223.     end
  224.    
  225.     alph[1]={}  -- Reset our weights, then copy the new weights in their place.
  226.     for k,v in pairs(alph[2]) do
  227.         table.insert(alph[1], {k, v[2]})
  228.     end
  229.     alph:sort()
  230.    
  231.     -- 7948006 is the number of bits used with a one-character-per-codeword Huffman code.
  232.     local totalsavings=7948006 - bittotal
  233.     if not(suppressprint) then
  234.         print("Compressed the text to " .. bittotal .. " bits, a total savings of " .. totalsavings .. ".")
  235.     end
  236.     return bittotal
  237. end
  238.  
  239. -- For each entry in our alphabet computes the "Huffman score" , the minimum number of bits needed to create a prefix-free code including that syllable.  Note that this doesn't actually create the Huffman code (which wouldn't be all that difficult)-- we just want the number of bits needed to encode a syllable.
  240. local function huffman(alph)
  241.     alph:sort() -- Sort alph's weights if necessary
  242.     local a=tablecopy(alph)
  243.     local tree={}
  244.     if #a[1]>1 then
  245.         --                                    /-- What a mess!  Have to sandwich the weight in the middle because I'm sorting by the second element.  Will probably want to relable it ["w"] to get back on track...
  246.         tree = {tablecopy(a[1][1]), a[1][1][2]+a[1][2][2], tablecopy(a[1][2])}
  247.         table.insert(a[1], tablecopy(tree))
  248.         table.remove(a[1], 1)
  249.         table.remove(a[1], 1)
  250.         huffman(a)
  251.     else
  252.         scores = treedepth(a[1])
  253.     end
  254.     return scores
  255. end
  256.  
  257. -- Resets Huffman scores to arbitrary n.
  258. local function resetscores(alph, n)
  259.     n=n or 10
  260.     for k,v in pairs(alph[2]) do
  261.         v[1] = n
  262.     end
  263. end
  264.  
  265. -- Resets all savings to some value to encourage searching for new, better syllable combinations.  That value is optionally set with the second argument, n, although its value shouldn't make any difference.
  266. local function resetsavings(alph, n)
  267.     n=n or 10
  268.     for i=1,#alph[3] do
  269.         alph[3][i][2] = n
  270.     end
  271. end
  272.  
  273. -- Nudges Huffman scores.
  274. local function nudgescores(alph, t)
  275.     t=t or {{0.005, -3}, {0.025, -2}, {0.16, -1}, {0.68, 0}, {0.975, 1}, {0.995, 2}, {1, 3}}
  276.     for k,v in pairs(alph[2]) do
  277.         local RNG = math.random()
  278.         local j=1
  279.         while RNG>t[j][1] do
  280.             j=j+1
  281.         end
  282.         v[1] = v[1] + t[j][2]
  283.     end
  284.     return alph
  285. end
  286.  
  287. -- Nudges the savings in random directions.  This can help find better syllable groupings   Optional argument t is a table representing the cumulative distribution function, for use with Lua's RNG.  t[i][1] must be between 0 and 1 and in strictly increasing order.  t[i][2] represents how far to nudge based on the RNG falling between t[i-1][1] and t[i][1].  The default distribution I've employed is based on the standard normal distribution.
  288. local function nudgesavings(alph, t)
  289.     t=t or {{0.005, -3}, {0.025, -2}, {0.16, -1}, {0.68, 0}, {0.975, 1}, {0.995, 2}, {1, 3}}
  290.     for i=1,#alph[3] do
  291.         local RNG = math.random()
  292.         local j=1
  293.         while RNG>t[j][1] do
  294.             j=j+1
  295.         end
  296.         alph[3][i][2] = alph[3][i][2] + t[j][2]
  297.     end
  298.     return alph
  299. end
  300.  
  301. -- Returns a Boolean reflecting whether the word we've created is any good based on a few simple rules off the top of my head: 1) must have at least one vowel, 2) no letter triplicates, 3) not a real word.  That last condition, not a real word, can be overridden with includerealwords, which is optional.  (And yeah, I know "hmm" and "shh" and "cwm" don't have vowels, but come on...)
  302. local function isgoodword(word, dictionary, includerealwords)
  303.     local vowels={"a", "e", "i", "o", "u", "y"}
  304.     local fullalphabet=alph()
  305.     local good = true
  306.    
  307.     -- At least one vowel.
  308.     for i=1,#vowels do
  309.         if not(string.find(word, vowels[i])) then
  310.             good=false
  311.         end
  312.     end
  313.    
  314.     -- No letter triplicates.
  315.     for i=1,#fullalphabet do
  316.         if string.find(word, string.rep(fullalphabet[i], 3)) then
  317.             good=false
  318.         end
  319.     end
  320.    
  321.     -- Not a real word.
  322.     -- TODDO: Account for possibility that word does/does not have underscores at its start and end.
  323.     if not(includerealwords) then
  324.         if string.find(dictionary, word) then
  325.             good=false
  326.         end
  327.     end
  328.    
  329.     return good
  330. end
  331. -- * * *
  332.  
  333.  
  334.  
  335. -- * * * METHODS * * *
  336. -- Typically don't want to call these directly
  337. -- Finds the total weight out of a specified node.  Useful for generating random paths and pruning.
  338.  
  339. -- This is a slightly awkward function but is necessary because huffman is recursive and trying to assign scores in there just ends up feeding them into the wrong table.
  340. local function scoresintoalph(alph)
  341.     local scores=huffman(alph)
  342.     for i=1,#alph[1] do
  343.         alph[2][alph[1][i][1]] = {scores[alph[1][i][1]], alph[1][i][2]}
  344.     end
  345. end
  346.  
  347. -- Sorts all the tables in the alphabet.
  348. local function sortalphs(alph)
  349.     sortby(alph[1], 2, lt)
  350.     if alph[3] and alph[4] then     -- Can't sort what doesn't exist.  The sort function is called in initializealphabet before these are created.
  351.         table.sort(alph[3], function(a,b) return a[2]>b[2] or (a[2]==b[2] and #a[1]>#b[1]) end)     -- Sort from high savings to low or, if equal, from most characters to fewest.
  352.         sortby(alph[4], 2, gt)
  353.     end
  354. end
  355.  
  356. -- Constructs a list of candidate syllables according to how promising they are.  Also punts them to the graveyard if they suck.
  357. local function getcandidates(alph, text)
  358.     local candlist={}
  359.     for s1, v1 in pairs(alph[2]) do
  360.         if v1[2]>1 then     -- Component syllables must have at least two instances.  Consider changing this threshold for efficiency.
  361.             for s2, v2 in pairs(alph[2]) do
  362.                 if v2[2]>1 then
  363.                     local h1, h2 = v1[1], v2[1]
  364.                     local S, H = s1 .. s2, h1+h2
  365.                     if permissible(S) and not alph[5][S] then
  366.                         local c=countsylls(text, S)     -- count
  367.                         local ps=H*c                    -- potential savings
  368.                        
  369.                         -- If the candidate syllable was found earlier in our comprehensive search, replace it only if the potential savings is now better.
  370.                         if candlist[S] then
  371.                             candlist[S] = candlist[S][3]<ps and {H, c, ps} or candlist[S]
  372.                         else
  373.                             candlist[S] = {H, c, ps}
  374.                         end
  375.                     end
  376.                 end
  377.             end
  378.         end
  379.     end
  380.     -- Create a sortable version of our list of candidates, then sort it.
  381.     local sortable={}
  382.     for k, v in pairs(candlist) do
  383.         table.insert(sortable, {k, unpack(v)})
  384.     end
  385.     sortby(sortable, 4, gt)
  386.     return sortable
  387. end
  388.  
  389. -- How many bits would be necessary if this syllable were absent?  Finds how many bits it takes to encode a syllable efficiently from shorter syllables.  It's worth having an eye on the greedy algorithm, which is probably good enough for most words but falling short here could have rippling consequences down the line.  For that reason, it might be worth it to (eventually) bite the bullet and perform a rigorous (NP-complete) check of syllable decomposition.
  390. local function savings(alph)
  391.     local length=0
  392.     for k in pairs(alph[2]) do
  393.         length = math.max(length, #k)
  394.     end
  395.     while length>1 do       -- Search from long syllables to short so savings of short syllables don't affect savings of long syllables.
  396.         for k, v in pairs(alph[2]) do
  397.             if #k==length then
  398.                 --[[
  399.                 for i=1,#alph[3] do
  400.                     if alph[3][i][1]==k then
  401.                         table.remove(alph[3],i)
  402.                         break
  403.                     end
  404.                 end
  405.                 --]]
  406.                 local found, i=false, 1
  407.                 repeat
  408.                     if alph[3][i][1]==k then
  409.                         table.remove(alph[3],i)
  410.                         found=true
  411.                     end
  412.                     i=i+1
  413.                 until found
  414.                
  415.                 local h=v[1]
  416.                 local bits = greedy(alph, k)
  417.                
  418.                
  419.                 local sv = bits-h       -- Savings
  420.                 alph[2][k][3]=sv
  421.                 table.insert(alph[3], {k, sv})
  422.                
  423.                 alph:sort()     -- We removed the syllable from alph[3] but we need to sort after adding it back in because it's important to subsequent greedy algorithm searches.
  424.             end
  425.         end
  426.         length=length-1
  427.     end
  428. end
  429.  
  430. -- Saves our progress.  Second argument, paths, is a table of strings with the paths for exporting our syllables and graveyard respectively.
  431. local function savesyllables(alph, paths)
  432.     paths=paths or {nil,nil}
  433.     exportstats(alph[2], paths[1])
  434.     exportgraveyard(alph[5], paths[2])
  435. end
  436.  
  437. -- Grows our list of allowed syllables.  Optional argument howmany is how many syllables to add to our list, with default value 30.
  438. local function growsyllables(alph, text, howmany)
  439.     local cc = getcandidates(alph, text)
  440.     howmany = howmany or math.min(30, #cc)      -- Add 30 syllables by default but can add fewer if too many of our syllable candidates are in the graveyard.
  441.     alph:sort()
  442.    
  443.     local i, added = 1, 0       -- i is the index through which we check new syllables, added is the number of syllables actually added thus far in the loop.
  444.     while added<howmany do
  445.         if not alph[2][cc[i][1]] then       -- only add syllables if they're not already in our alphabet
  446.             table.insert(alph[1], {cc[i][1], cc[i][3]})
  447.             alph[2][cc[i][1]] = {1, cc[i][3]}       --{cc[i][2]-1, cc[i][3]}        -- For lack of a better idea, give each new syllable a Huffman score of one bit and give it a weight of its entire number of matches.
  448.             table.insert(alph[3], {cc[i][1], cc[i][2]-1})
  449.             added=added+1
  450.         end
  451.         i=i+1
  452.     end
  453.    
  454.     for i=1,#cc do
  455.         if cc[i][3]<2 then      -- Each code word must appear at least *twice* since it costs at least its own Huffman score to add it to the dictionary.  Interestingly, this means "zyzzyva" might make it into our code words because even though it only appears twice ("zyzzyva" and "zyzzyvas"), it uses such expensive syllables that it has the potential to save a few bits.
  456.                 alph[5][cc[i][1]]=true
  457.         end
  458.     end
  459.    
  460.     alph:sort()     -- Need to sort because we've added a bunch of new syllables and we want their savings sorted for use in the compress function.
  461.     compress(alph, text)        -- This is a throwaway iteration of the compression algorithm because we artificially assumed each new syllable saves one bit.  After applying the subroutine, we can get accurate Huffman scores.
  462.     alph:getscores()
  463.     alph:getsavings()
  464.     alph:sort()
  465. end
  466.  
  467. -- Iteratively compresses the text until either a steady state is reached or it begins to *cost* more bits than the previous iteration.  This allows us to get accurate measures of the weights and Huffman score.
  468. local function itercompress(alph, text, suppressprint)
  469.     local prevalph={}
  470.     local curr=-1
  471.     repeat
  472.         prevalph=tablecopy(alph)
  473.         prev, curr = curr, compress(alph, text)
  474.         alph:getscores()
  475.         alph:getsavings()
  476.         alph:sort()
  477.     until prev>0 and curr-prev>=0
  478.     alph=tablecopy(prevalph)
  479.     return prev, alph
  480. end
  481.  
  482. -- Uses only syllables with 0 or negative savings in compression.  Used to find the best syllables that have been "accidentally" excluded from our syllable list.  Also collects double-losers in a "hospice" list, with anticipation they might end up in the graveyard.
  483. local function loserstournament(alph, text)
  484.     local beta = tablecopy(alph)        -- Copy our alphabet into a new table, beta, because we don't want to screw it up and/or accidentally save over our good progress.
  485.     local i=1
  486.     while i<=#beta[3] do        -- Gotta use a while loop because we'll be shrinking beta[3] along the way.
  487.         if beta[3][i][2]>0 then
  488.             beta[2][beta[3][i][1]]=nil
  489.             table.remove(beta[3], i)
  490.         else
  491.             beta[3][i][2]=0     -- Reset all savings to 0 so all syllables are on equal footing.
  492.             i=i+1
  493.         end
  494.     end
  495.     itercompress(beta, text)
  496.     local hospice={}
  497.     for i=1,#beta[3] do
  498.         if beta[3][i][2]<0 then
  499.             hospice[beta[3][i][1]]=true
  500.         end
  501.     end
  502.     return beta, hospice
  503. end
  504.  
  505. -- Print alph[1], the list of weights.
  506. local function printweights(alph)
  507.     for i=1,#alph[1] do
  508.         print(i, alph[1][i][1], alph[1][i][2])
  509.     end
  510. end
  511.  
  512. -- Print alph[2], the record of Huffman scores, weights, and savings
  513. local function printstats(alph)
  514.     for k, v in pairs(alph[2]) do
  515.         print(k, v[1], v[2], v[3])
  516.     end
  517. end
  518.  
  519. -- Print alph[3], the list of savings
  520. local function printsavings(alph)
  521.     for i=1,#alph[3] do
  522.         print(i, alph[3][i][1], alph[3][i][2])
  523.     end
  524. end
  525.  
  526. -- Print alph[4], the list of bits per character.
  527. local function printtotalsavings(alph)
  528.     for i=1,#alph[4] do
  529.         print(i, alph[4][i][1], alph[4][i][2])
  530.     end
  531. end
  532.  
  533. -- Print alph[5], the graveyard.
  534. local function printgraveyard(alph)
  535.     for k in pairs(alph[5]) do
  536.         print(k)
  537.     end
  538. end
  539.  
  540. -- Print only some of the tables.
  541. local function printsome(alph, ...)
  542.     local arg={...}
  543.     for i=1,#arg do
  544.         if arg[i]=="w" then
  545.             printweights(alph)
  546.         elseif arg[i]=="st" then
  547.             printstats(alph)
  548.         elseif arg[i]=="sv" then
  549.             printsavings(alph)
  550.         elseif arg[i]=="ts" then
  551.             printtotalsavings(alph)
  552.         elseif arg[i]=="g" then
  553.             printgraveyard(alph)
  554.         end
  555.     end
  556. end
  557.  
  558. -- Print everything.
  559. local function printall(alph)
  560.     print("WEIGHTS:")
  561.     alph:printweights()
  562.     print("STATS:")
  563.     alph:printstats()
  564.     print("SAVINGS:")
  565.     alph:printsavings()
  566.     print("TOTAL SAVINGS:")
  567.     alph:printts()
  568.     print("GRAVEYARD:")
  569.     alph:printgraveyard()
  570. end
  571.  
  572. -- We have a graph g of unconnected nodes.  Add connections to them (can even connect to self).
  573. local function makearcs(g)
  574.     --[[
  575.         Weights are simulated by having g[k][v] point to a table with {w,g[v]}.  For example, suppose there are 723 instances of t --> h.  Then g["t"]["h"] leads to {723, g["h"]}.  This way, we can easily find the probability of reaching h from t, then go to h.
  576.     --]]
  577.     for k1, v1 in pairs(g) do
  578.         if type(v1)=="table" then
  579.             for k2, v2 in pairs(g) do
  580.                 if type(v2)=="table" then
  581.                     g[k1][k2] = {["w"]=0, ["dest"]=g[k2]}
  582.                 end
  583.             end
  584.            
  585.             g[""][k1] = {["w"]=0, ["dest"]=g[k1]}       -- Also have to create arcs out of the empty string, our starting point.
  586.         end
  587.     end
  588. end
  589.  
  590. -- Reset all arcs in g to zero.
  591. local function zeroarcs(g)
  592.     for k1, v1 in pairs(g) do
  593.         if type(v1)=="table" then
  594.             for k2, v2 in pairs(v1) do
  595.                 if type(v2)=="table" then
  596.                     v2["w"]=0
  597.                 end
  598.             end
  599.         end
  600.     end
  601. end
  602.  
  603. -- Uses the alphabet associated with g and selected text to populate the arcs of g.
  604. local function getarcs(g, alph, text)
  605.     local beta=tablecopy(alph)      -- Don't want to mess up our alphabet (although we should be done modifying it at this point).
  606.     for s in string.gmatch(text, "[%a']+") do       -- For each word...
  607.         s= "_" .. s .. "_"
  608.         beta[1]={}      -- Reset all weights to zero.
  609.         for k, v in pairs(beta[2]) do
  610.             v[2]=0      -- Reset all weights to zero in the stats table as well.
  611.         end
  612.         greedy(beta, "_" .. s .. "_", true)
  613.        
  614.         local path = {}     -- path is a linked list from syllable to syllable.  Each element is a table whose key is the start of that syllable and contains two elements: the syllable and link to the next syllable.
  615.         for i=1,#beta[3] do
  616.             local count = beta[2][beta[3][i][1]][2]
  617.             if count>0 then
  618.                 local start=0       -- Start searching for this syllable here.
  619.                 for j=1,count do
  620.                     key, start = string.find(s, beta[3][i][1], start)
  621.                     start=start+1
  622.                     path[key]={[1]=beta[3][i][1], ["destnum"]=start}        -- Pointers are kind of broken or something, so instead I'm adding the key "destnum".  After this loop, all the destnums are consolidated into their respective destinations proper.
  623.                     --print(j, key, start, beta[3][i][1])
  624.                     --if path[1] then print("!!!", path[1]["dest"], path[3]) end
  625.                 end
  626.                 s=string.gsub(s, beta[3][i][1], string.rep("*", string.len(beta[3][i][1])))
  627.                 --print(s)
  628.             end
  629.         end
  630.        
  631.         local k=1
  632.         local curr, prev=path[k], nil
  633.         g[""]["WWW"]=g[""]["WWW"]+1     -- WWW represents the total weight into/out of each node.  (Except "", for which it's just out, and terminal nodes, which have none out.)
  634.         g[""][curr[1]]["w"] = g[""][curr[1]]["w"] + 1       -- Start from the empty string and add weights to prefixes.
  635.         k=curr["destnum"]
  636.         prev, curr = curr, path[k]
  637.         repeat
  638.             g[prev[1]][curr[1]]["w"] = g[prev[1]][curr[1]]["w"] + 1
  639.             g[prev[1]]["WWW"]=g[prev[1]]["WWW"]+1
  640.             k=curr["destnum"]
  641.             prev, curr = curr, path[k]
  642.         until not curr
  643.        
  644.         --[=[
  645.         print(path[1][1], path[3][1], path[4][1])
  646.        
  647.         print("AAA")
  648.         local k=path[1]
  649.         repeat
  650.             print(k[1])
  651.             k=k["dest"]
  652.         until not k[1]
  653.         print("BBB")
  654.        
  655.         local step=path[1]
  656.         g[""]["WWW"]=g[""]["WWW"]+1     -- WWW represents the total weight into/out of each node.  (Except "", for which it's just out, and terminal nodes, which have none out.)
  657.         g[""][step[1]]["w"] = g[""][step[1]]["w"] + 1       -- Start from the empty string and add weights to prefixes.
  658.         repeat
  659.             g[step[1]]["WWW"] = g[step[1]]["WWW"] + 1
  660.             print(step[1], step["dest"][1])--, g[step[1]][step["dest"][1]]["w"])
  661.             for k,v in pairs(g[step[1]][step["dest"][1]]) do print(k, v) end
  662.             g[step[1]][step["dest"][1]]["w"] = g[step[1]][step["dest"][1]]["w"] + 1
  663.             step=step["dest"]
  664.         until #step["dest"]==0
  665.         --]=]
  666.     end
  667.     return g
  668. end
  669.  
  670. -- Take a single random step from current syllable syl.  Suppose we're at a node with two destinations: "a" with weight 3 and "b" with weight 5.  We construct table dests with elements {"a", "b"} and table sumweights with elements {0, 3, 8}.  Roll an RNG from 1 to 8 inclusive.  If it rolls 1, 2, or 3, go to "a".  If it rolls 4, 5, 6, 7, or 8, go to "b".
  671. local function takestep(g, syl)
  672.     local dests={}      -- Possible destinations after rolling our RNG.
  673.     local sumweights={0}
  674.     for k, v in pairs(g[syl]) do
  675.         if type(v) == "table" then
  676.             table.insert(dests, k)
  677.             table.insert(sumweights, sumweights[#sumweights]+g[syl][k]["w"])        -- Make the next value of sumweights the previous value plus the weight represented by the arc.
  678.         end
  679.     end
  680.    
  681.     --table.remove(sumweights, 1)       -- We started with a superfluous sumweights[1]=0 for arithmetic purposes, so now we delete that entry.
  682.     local RNG = math.random(sumweights[#sumweights])        -- sumweights[#sumweights] is just "WWW" for the current node, so I guess I didn't need to calculate it after all...
  683.     local i=1
  684.     repeat
  685.         i = i+1
  686.     until (sumweights[i-1] < RNG and RNG <= sumweights[i])
  687.     i=i-1       -- Gotta subtract out 1 to correct for our dummy 0 to start sumweights.  There's probably a clever way of doing all this but I'm getting tired.
  688.    
  689.     return dests[i]
  690. end
  691.  
  692. -- Walks randomly along graph g, creating a fake word.
  693. local function randomwalk(g)
  694.     local s, syl = "", ""
  695.     syl=takestep(g, syl)
  696.     s = s .. syl
  697.     repeat
  698.         syl = takestep(g, syl)
  699.         s = s .. syl
  700.     until string.find(syl, "_")     --string.sub(syl, #syl, #syl)=="_"      -- Repeat until our syllable ends in _
  701.     return s
  702. end
  703. -- * * *
  704.  
  705.  
  706. -- * * * INTERMEDIATE * * *
  707. -- These are some functions that depend on the methods up the page but before initialization.  Not sure where else to put them...
  708. -- Loads methods into our alphabet table.
  709. local function loadmethods(xalph)
  710.     function xalph:getweights(text) end     -- Incomplete.  Need to construct a method that scans the text for efficient encoding and tallies up weights.
  711.     function xalph:getscores() scoresintoalph(self) end
  712.     function xalph:getsavings() savings(self) end
  713.     function xalph:sort() sortalphs(self) end
  714.     --function xalph:candidates(text) getcandidates(self, text) end     -- Methods don't return anything, only act on parent table.  Have to externalize this.
  715.     function xalph:printweights() printweights(self) end
  716.     function xalph:printstats() printstats(self) end
  717.     function xalph:printsavings() printsavings(self) end
  718.     function xalph:printts() printtotalsavings(self) end
  719.     function xalph:printgraveyard() printgraveyard(self) end
  720.     function xalph:print(...) printsome(self, ...) end
  721.     function xalph:printall() printall(self)end
  722.     function xalph:save(paths) savesyllables(self, paths) end
  723.     function xalph:growsyllables(text, howmany) growsyllables(self, text, howmany) end
  724.     function xalph:itercompress(text, suppressprint) itercompress(self, text, suppressprint) end
  725.     function xalph:resetscores() resetscores(self) end
  726.     function xalph:resetsavings() resetsavings(self) end
  727. end
  728.  
  729. -- Loads the stats table and the graveyard, then pulls the information from the stats table into the other tables and sorts them.  Exports the recovered alphabet table so we can pick up where we left off.
  730. local function loadsyllables(paths)
  731.     paths = paths or {nil, nil}
  732.     local alph={{},{},{},{},{}}
  733.     loadmethods(alph)
  734.     alph[2]=importstats(paths[1])
  735.     alph[5]=importgraveyard(paths[2])
  736.    
  737.     for k, v in pairs(alph[2]) do
  738.         table.insert(alph[1], {k, v[2]})
  739.         local sv=v[3] or 0      -- If it's one character, v[3] is nil, so instead make it 0 for 0 savings.
  740.         table.insert(alph[3], {k, sv})
  741.         table.insert(alph[4], {k, v[2]*sv})     -- Here I'm defining alph[4] to be a list of total savings because why not?
  742.     end
  743.     alph:sort()
  744.     return alph
  745. end
  746. -- * * *
  747.  
  748.  
  749. -- * * * INITIALIZATION * * *
  750. -- For creating variables and stuff in the first place
  751.  
  752. -- Outputs a cleaned-up version of the requested text.  Replaces all non-alphabetical characters with underscores.  Optional second argument adds a short phrase with apostrophes if you want them.
  753. local function choosetext(title, addapostrophes)
  754.     apostrophes = addapostrophes and "'tis ne'er e'er o'er m' l'o'er o' clo'er_" or ""      -- Many sample texts don't include apostrophes, so here are some to append if necessary.
  755.  
  756.     -- A selection of training texts.  Use shorter, more repetitive texts for troubleshooting, rapid execution, and easier pattern recognition.
  757.     filenames = {["dictionary"] = "C:/Program Files/Lua/Lua5.4/dictionary.txt",                 -- Complete dictionary via https://norvig.com/ngrams/enable1.txt  (Doesn't include 1 letter words or contractions.)
  758.                  ["GEaH"]       = "C:/Program Files/Lua/Lua5.4/Green_Eggs_and_Ham.txt",         -- Green Eggs and Ham via https://www.clear.rice.edu/comp200/resources/texts/Green%20Eggs%20and%20Ham.txt
  759.                 }
  760.     local pathname=filenames[title] or filenames["GEaH"]        -- If no match is found, just load Green Eggs and Ham because who doesn't like Green Eggs and Ham?
  761.     local f=assert(io.open(pathname, "r"))
  762.  
  763.     text = "_" .. string.gsub(string.lower(f:read("*all")), "[^%a']+", "_") .. apostrophes .. "_"       --.. apostrophes            -- Append space to start and end to help pattern matching if there are words at start and/or end.  Also substitutes underscores in for non-alphabetical characters.
  764.     f:close()
  765.    
  766.     return text
  767.     end
  768.  
  769. -- Creates the directed graph with keys corresponding to ASCII values for each character.
  770. local function initializegraph(alph)
  771.     local g={}
  772.     g[""]={["WWW"]=0}
  773.    
  774.     alph:sort()
  775.     local i=1
  776.     while alph[3][i][2]>=0 do
  777.         g[alph[3][i][1]]={["WWW"]=0}        -- WWW represents the total weight into/out of each node.  (Except "", for which it's just out, and terminal nodes, which have none out.)  I'd use W except that might be a valid key to another node.
  778.         i=i+1
  779.     end
  780.    
  781.     function g:makearcs() makearcs(self) end
  782.     function g:zeroarcs() zeroarcs(self) end
  783.     function g:getarcs(alph, text) getarcs(self, alph, text) end
  784.    
  785.     g:makearcs()
  786.    
  787.     return g
  788. end
  789.  
  790. -- Creates our alphabet table and conducts an initial search of the sample text to establish weights.  Also adds methods and cleans up unused characters.  Optional argument includezero leaves in characters that are unused (in case you're interested in doing something like building a Huffman code from Shakespeare and using it to encode the dictionary).
  791. local function initializealphabet(text, includezero)
  792.     local xalph={{}}        -- Gotta call it xalph to avoid collision with alph function, which we'll use immediately.
  793.     --[[
  794.     Structure of xalph:
  795.       1: list: { {syllable1, weight}, {syllable2, weight}, ... }        -- Sorted low to high for ease of computing Huffman scores.
  796.       2: record: { syllable2 = {Huffman score, weight, savings}, syllable2 = {Huffman score, weight, savings}, ...}     -- All the information except indexed by syllable for ease of lookup.
  797.       3: list: { {syllable1, savings}, {syllable2, savings}, ...}       -- "savings" is the total Huffman score of the syllable's constituent syllables minus its own Huffman score.  Thus, if a syllable is S = s1 .. s2 .. s3 with Huffman scores B, b1, b2, and b3, savings is b1 + b2 + b3 - B.  Sorted high to low.
  798.       4: list: Not sure.  Could make it total savings or total bits used...                 -- { {syllable1, bits per character}, {syllable2, bits per character}, ...}     -- Sorted low to high.  Probably won't use this.
  799.       5: {syllable1, syllable2, ...}        -- The "graveyard".  Potential syllable combinations that are so rare that they aren't worth adding.
  800.     --]]
  801.    
  802.     loadmethods(xalph)
  803.    
  804.     local a=alph()
  805.     for i=1,#a do
  806.         w=countsylls(text, a[i])
  807.         if w>0 or includezero then
  808.             table.insert(xalph[1], {a[i], w})
  809.         end
  810.     end
  811.     xalph[2]={}
  812.     xalph:getscores()
  813.     xalph[3]={}     -- Empty at first because our alphabet is just single characters.  Maybe create a function that permits initializing with multiple characters by back-searching for component syllables.
  814.     for k, v in pairs(xalph[2]) do
  815.         table.insert(xalph[3], {k, 0})      -- Initial savings are 0 because no single letter can save any value over itself.
  816.     end
  817.     xalph[4]={}
  818.     for syll, v in pairs(xalph[2]) do
  819.         table.insert(xalph[4], {syll, v[2]/#syll})      -- Old definition of xalph[4] is bits per character.  I'm leaving it in because it doesn't matter and it populates the table until we replace that information with total savings.
  820.     end
  821.     xalph[5]={}     -- The graveyard.
  822.    
  823.     return xalph
  824. end
  825. -- * * *
  826.  
  827.  
  828.  
  829. -- * * * START OF ACTUAL PROGRAM * * *
  830.  
  831. -- Text choices include "dictionary" (a list of English words) and "GEaH" (Green Eggs and Ham).  See the choosetext function for a list of texts and websites where to find them.
  832. local tt=choosetext("dictionary")
  833.  
  834. -- Uncomment these lines if it's your first time using the program.
  835. ---[[
  836. local aa=initializealphabet(text)
  837. aa:getscores()
  838. aa:getsavings()
  839. aa:save()
  840. --]]
  841.  
  842. -- Run this line if you'd like to pick up where you left off.
  843. --local aa=loadsyllables()
  844.  
  845. -- Run this chunk if you'd like to assign random Huffman scores and savings to all syllables.  I should probably put all this in a function.
  846. --[=[
  847. local best = 7948006
  848. local madethecut = {}
  849. for k in pairs(aa[2]) do
  850.     madethecut[k]=0
  851. end
  852.  
  853. while true do
  854.     aa:resetsavings()
  855.     aa:resetscores()
  856.     for i=1,5 do
  857.         nudgescores(aa)
  858.         nudgesavings(aa)
  859.     end
  860.     aa:sort()
  861.     compress(aa, tt)
  862.     local bits
  863.     bits, aa = itercompress(aa, tt)
  864.     if bits<best then
  865.         best=bits
  866.         aa:save()
  867.         print("New best found: " .. bits .. " bits after " .. os.clock() .. " seconds.")
  868.     end
  869.    
  870.     local i=1
  871.     repeat
  872.         madethecut[aa[3][i][1]] = madethecut[aa[3][i][1]] + 1
  873.         i=i+1
  874.     until aa[3][i][2]<0
  875.    
  876.     local s=""
  877.     for k,v in pairs(madethecut) do
  878.         s=s .. k .. "," .. v .. "\n"
  879.     end
  880.     local f=io.open("C:/MadeTheCut.txt", "w+")
  881.     f:write(s)
  882.     f:flush()
  883.     f:close()
  884. end
  885. --]=]
  886.  
  887. -- Run this chunk if you want to create a graph from your syllables and walk that graph.
  888. --[[
  889. local gg = initializegraph(aa)
  890. gg:getarcs(aa, tt)
  891. for i=1,50 do
  892.     local ss=randomwalk(gg)
  893.     print(ss)
  894. end
  895. --]]
  896.  
  897. -- Uncomment these lines to grow our syllable list until user interrupts the process.  Results are saved to the default file location.
  898. --[[
  899. while true do
  900.     aa:growsyllables(tt)
  901.     aa:itercompress(tt)
  902.     aa:print("w", "st", "sv")
  903.     aa:save()
  904. end
  905. --]]
  906.  
  907. -- Uncomment these lines to conduct a "losers' tournament", finding the best syllables that have been excluded from consideration.  Double-losers are collected in a "hospice" table.
  908. --[[
  909. bb, hospice = loserstournament(aa, tt)
  910. bb:print("w", "st", "sv")
  911. for k in pairs(hospice) do
  912.     print(k)
  913. end
  914. --]]
  915.  
  916. print(os.clock())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement