DEX_V4_LEXER
TheMadWally | Nov 16th, 2018 | Lua

local yield,wrap = coroutine.yield,coroutine.wrap
local strfind = string.find
local strsub = string.sub
local append = table.insert
-- capture the enclosing environment (getfenv is Lua 5.1) so identifiers
-- can be checked against live globals when highlighting
local env = getfenv()

-- raise an error in the caller if argument `idx` is not of type `tp`
local function assert_arg(idx,val,tp)
    if type(val) ~= tp then
        error("argument "..idx.." must be "..tp, 2)
    end
end

local lexer = {}
local oldToken = "" -- last identifier that resolved to a global, for `a.b` chains

local NUMBER1 = '^[%+%-]?%d+%.?%d*[eE][%+%-]?%d+'
local NUMBER2 = '^[%+%-]?%d+%.?%d*'
local NUMBER3 = '^0x[%da-fA-F]+'
local NUMBER4 = '^%d+%.?%d*[eE][%+%-]?%d+'
local NUMBER5 = '^%d+%.?%d*'
local IDEN = '^[%a_][%w_]*'
local IDEN2 = '^[%s_]%.[%w_]*' -- unused; the trailing `&` in the paste was presumably meant to be `*`
local WSPACE = '^%s+'
local STRING1 = "^(['\"])%1" -- empty string
local STRING2 = [[^(['"])(\*)%2%1]]
local STRING3 = [[^(['"]).-[^\](\*)%2%1]]
local CHAR1 = "^''"
local CHAR2 = [[^'(\*)%1']]
local CHAR3 = [[^'.-[^\](\*)%1']]
local PREPRO = '^#.-[^\\]\n'
local PUNC = '^%p' -- a single punctuation character; anchored like the other patterns

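-- Quick illustration (added for this edit; not part of the original paste)
-- of what a few of these anchored patterns match:
--   ("1e5"):match('^%d+%.?%d*[eE][%+%-]?%d+')   --> "1e5"   (NUMBER4)
--   ("0xFF"):match('^0x[%da-fA-F]+')            --> "0xFF"  (NUMBER3)
--   ("foo_1"):match('^[%a_][%w_]*')             --> "foo_1" (IDEN)
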
local plain_matches,lua_matches,cpp_matches,lua_keyword,cpp_keyword

-- yield a token whose type is its own text (operators and punctuation)
local function tdump(tok)
    return yield(tok,tok)
end

-- numbers are converted with tonumber when options.number is set
local function ndump(tok,options)
    if options and options.number then
        tok = tonumber(tok)
    end
    return yield("number",tok)
end

-- regular strings, single or double quotes; usually we want them
-- without the quotes
local function sdump(tok,options)
    if options and options.string then
        tok = tok:sub(2,-2)
    end
    return yield("string",tok)
end

-- long Lua strings need extra work to get rid of the quotes
local function sdump_l(tok,options,findres)
    if options and options.string then
        local quotelen = 3
        if findres[3] then
            quotelen = quotelen + findres[3]:len()
        end
        tok = tok:sub(quotelen, -quotelen)
        if tok:sub(1, 1) == "\n" then
            tok = tok:sub(2)
        end
    end
    return yield("string",tok)
end

local function chdump(tok,options)
    if options and options.string then
        tok = tok:sub(2,-2)
    end
    return yield("char",tok)
end

local function cdump(tok)
    return yield('comment',tok)
end

local function wsdump (tok)
    return yield("space",tok)
end

local function pdump (tok)
    return yield('prepro',tok)
end

local function plain_vdump(tok)
    return yield("iden",tok)
end

-- classify a Lua identifier: keyword, global, punctuation, or plain identifier.
-- `oldToken` remembers the last global seen, so fields of a global table
-- (e.g. the `rep` in `string.rep`) are also reported as 'global'.
local function lua_vdump(tok)
    if lua_keyword[tok] then
        return yield("keyword",tok)
    elseif env[tok] then
        oldToken = tok
        return yield('global', tok)
    elseif tok:match(PUNC) then
        return yield('punc', tok)
    else
        if env[oldToken] and type(env[oldToken]) == "table" and rawget(env[oldToken], tok) ~= nil then
            return yield('global', tok)
        else
            return yield("iden",tok)
        end
    end
end

local function cpp_vdump(tok)
    if cpp_keyword[tok] then
        return yield("keyword",tok)
    else
        return yield("iden",tok)
    end
end

--- create a plain token iterator from a string or file-like object.
-- @tparam string|file s a string or a file-like object with `:read()` method returning lines.
-- @tab matches an optional match table - array of token descriptions.
-- A token is described by a `{pattern, action}` pair, where `pattern` should match
-- the token body and `action` is a function called when a token of the described type is found.
-- @tab[opt] filter a table of token types to exclude, by default `{space=true}`
-- @tab[opt] options a table of options; by default, `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
function lexer.scan(s,matches,filter,options)
    local file = type(s) ~= 'string' and s
    filter = filter or {space=true}
    options = options or {number=true,string=true}
    if filter then
        if filter.space then filter[wsdump] = true end
        if filter.comments then
            filter[cdump] = true
        end
    end
    if not matches then
        if not plain_matches then
            plain_matches = {
                {WSPACE,wsdump},
                {NUMBER3,ndump},
                {IDEN,plain_vdump},
                {NUMBER1,ndump},
                {NUMBER2,ndump},
                {STRING1,sdump},
                {STRING2,sdump},
                {STRING3,sdump},
                {'^.',tdump}
            }
        end
        matches = plain_matches
    end
    local function lex(first_arg)
        local line_nr = 0
        local next_line = file and file:read()
        local sz = file and 0 or #s
        local idx = 1

        -- res is the value used to resume the coroutine.
        local function handle_requests(res)
            while res do
                local tp = type(res)
                -- insert a token list
                if tp == 'table' then
                    res = yield('','')
                    for _,t in ipairs(res) do
                        res = yield(t[1],t[2])
                    end
                elseif tp == 'string' then -- or search up to some special pattern
                    local i1,i2 = strfind(s,res,idx)
                    if i1 then
                        local tok = strsub(s,i1,i2)
                        idx = i2 + 1
                        res = yield('',tok)
                    else
                        res = yield('','')
                        idx = sz + 1
                    end
                else
                    res = yield(line_nr,idx)
                end
            end
        end

        handle_requests(first_arg)
        if not file then line_nr = 1 end

        while true do
            if idx > sz then
                if file then
                    if not next_line then return end
                    s = next_line
                    line_nr = line_nr + 1
                    next_line = file:read()
                    if next_line then
                        s = s .. '\n'
                    end
                    idx, sz = 1, #s
                else
                    while true do
                        handle_requests(yield())
                    end
                end
            end

            for _,m in ipairs(matches) do
                local pat = m[1]
                local fun = m[2]
                local findres = {strfind(s,pat,idx)}
                local i1, i2 = findres[1], findres[2]
                if i1 then
                    local tok = strsub(s,i1,i2)
                    idx = i2 + 1
                    local res
                    if not (filter and filter[fun]) then
                        lexer.finished = idx > sz
                        res = fun(tok, options, findres)
                    end
                    if not file and tok:find("\n") then
                        -- Update line number.
                        local _, newlines = tok:gsub("\n", {})
                        line_nr = line_nr + newlines
                    end
                    handle_requests(res)
                    break
                end
            end
        end
    end
    return wrap(lex)
end

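-- Usage sketch (added for this edit; not part of the original paste):
--
--   local tok = lexer.scan("hello = 42")
--   print(tok())  --> iden    hello
--   print(tok())  --> =       =
--   print(tok())  --> number  42
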
local function isstring (s)
    return type(s) == 'string'
end

--- insert tokens into a stream.
-- @param tok a token stream
-- @param a1 a string is the type, a table is a token list and
-- a function is assumed to be a token-like iterator (returns type & value)
-- @string a2 a string is the value
function lexer.insert (tok,a1,a2)
    if not a1 then return end
    local ts
    if isstring(a1) and isstring(a2) then
        ts = {{a1,a2}}
    elseif type(a1) == 'function' then
        ts = {}
        for t,v in a1() do
            append(ts,{t,v})
        end
    else
        ts = a1
    end
    tok(ts)
end

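-- Example calls (added for this edit; not part of the original paste).
-- Pushes tokens into a stream so that later tok() calls see them:
--
--   lexer.insert(tok, 'iden', 'x')               -- a single type/value pair
--   lexer.insert(tok, {{'iden','x'},{',',','}})  -- a token list
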
--- get everything in a stream up to a newline.
-- @param tok a token stream
-- @return a string
function lexer.getline (tok)
    local _,v = tok('.-\n')
    return v
end

--- get the current line number.
-- @param tok a token stream
-- @return the line number.
-- if the input source is a file-like object,
-- also return the column.
function lexer.lineno (tok)
    return tok(0)
end

--- get the rest of the stream.
-- @param tok a token stream
-- @return a string
function lexer.getrest (tok)
    local _,v = tok('.+')
    return v
end

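-- Example (added for this edit; not part of the original paste):
--
--   local tok = lexer.lua("local a = 1\nlocal b = 2\n")
--   lexer.getline(tok)  --> "local a = 1\n" (consumes up to the newline)
--   lexer.getrest(tok)  --> "local b = 2\n"
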
--- get the Lua keywords as a set-like table.
-- So `res["and"]` etc. would be `true`.
-- @return a table
function lexer.get_keywords ()
    if not lua_keyword then
        lua_keyword = {
            ["and"] = true, ["break"] = true, ["do"] = true,
            ["else"] = true, ["elseif"] = true, ["end"] = true,
            ["false"] = true, ["for"] = true, ["function"] = true,
            ["if"] = true, ["in"] = true, ["local"] = true, ["nil"] = true,
            ["not"] = true, ["or"] = true, ["repeat"] = true,
            ["return"] = true, ["then"] = true, ["true"] = true,
            ["until"] = true, ["while"] = true,
        }
    end
    return lua_keyword
end

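-- Example (added for this edit): lexer.get_keywords()["while"] --> true,
-- lexer.get_keywords()["print"] --> nil
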
--- create a Lua token iterator from a string or file-like object.
-- Will return the token type and value.
-- @string s the string
-- @tab[opt] filter a table of token types to exclude, by default `{space=true,comments=true}`
-- @tab[opt] options a table of options; by default, `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
function lexer.lua(s,filter,options)
    filter = filter or {space=true,comments=true}
    lexer.get_keywords()
    if not lua_matches then
        lua_matches = {
            {WSPACE,wsdump},
            {NUMBER3,ndump},
            {IDEN,lua_vdump},
            {NUMBER4,ndump},
            {NUMBER5,ndump},
            {STRING1,sdump},
            {STRING2,sdump},
            {STRING3,sdump},
            {'^%-%-%[(=*)%[.-%]%1%]',cdump},
            {'^%-%-.-\n',cdump},
            {'^%[(=*)%[.-%]%1%]',sdump_l},
            {'^==',tdump},
            {'^~=',tdump},
            {'^<=',tdump},
            {'^>=',tdump},
            {'^%.%.%.',tdump},
            {'^%.%.',tdump},
            -- single punctuation goes through lua_vdump so it is typed 'punc';
            -- it must come after the comment and multi-character operator patterns
            {PUNC,lua_vdump},
            {'^.',tdump}
        }
    end
    return lexer.scan(s,lua_matches,filter,options)
end

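-- Usage sketch (added for this edit; not part of the original paste).
-- Assumes getfenv() exposed the standard globals when this module loaded,
-- so `string` resolves as a global and `rep` as one of its fields:
--
--   for t,v in lexer.lua('local s = string.rep("ab", 2)') do
--       io.write(t, " ", tostring(v), "; ")
--   end
--   -- keyword local; iden s; punc =; global string; punc .; global rep;
--   -- punc (; string ab; punc ,; number 2; punc );
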
--- create a C/C++ token iterator from a string or file-like object.
-- Will return the token type and value.
-- @string s the string
-- @tab[opt] filter a table of token types to exclude, by default `{space=true,comments=true}`
-- @tab[opt] options a table of options; by default, `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
function lexer.cpp(s,filter,options)
    filter = filter or {space=true,comments=true}
    if not cpp_keyword then
        cpp_keyword = {
            ["class"] = true, ["break"] = true, ["do"] = true, ["sizeof"] = true,
            ["else"] = true, ["continue"] = true, ["struct"] = true,
            ["false"] = true, ["for"] = true, ["public"] = true, ["void"] = true,
            ["private"] = true, ["protected"] = true, ["goto"] = true,
            ["if"] = true, ["static"] = true, ["const"] = true, ["typedef"] = true,
            ["enum"] = true, ["char"] = true, ["int"] = true, ["bool"] = true,
            ["long"] = true, ["float"] = true, ["true"] = true, ["delete"] = true,
            ["double"] = true, ["while"] = true, ["new"] = true,
            ["namespace"] = true, ["try"] = true, ["catch"] = true,
            ["switch"] = true, ["case"] = true, ["extern"] = true,
            ["return"] = true, ["default"] = true, ["unsigned"] = true, ["signed"] = true,
            ["union"] = true, ["volatile"] = true, ["register"] = true, ["short"] = true,
        }
    end
    if not cpp_matches then
        cpp_matches = {
            {WSPACE,wsdump},
            {PREPRO,pdump},
            {NUMBER3,ndump},
            {IDEN,cpp_vdump},
            {NUMBER4,ndump},
            {NUMBER5,ndump},
            {CHAR1,chdump},
            {CHAR2,chdump},
            {CHAR3,chdump},
            {STRING1,sdump},
            {STRING2,sdump},
            {STRING3,sdump},
            {'^//.-\n',cdump},
            {'^/%*.-%*/',cdump},
            {'^==',tdump},
            {'^!=',tdump},
            {'^<=',tdump},
            {'^>=',tdump},
            {'^->',tdump},
            {'^&&',tdump},
            {'^||',tdump},
            {'^%+%+',tdump},
            {'^%-%-',tdump},
            {'^%+=',tdump},
            {'^%-=',tdump},
            {'^%*=',tdump},
            {'^/=',tdump},
            {'^|=',tdump},
            {'^%^=',tdump},
            {'^::',tdump},
            {'^.',tdump}
        }
    end
    return lexer.scan(s,cpp_matches,filter,options)
end

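-- Usage sketch (added for this edit; not part of the original paste):
--
--   for t,v in lexer.cpp('int n = 0; // counter\n') do
--       io.write(t, " ", tostring(v), "; ")
--   end
--   -- keyword int; iden n; = =; number 0; ; ;
--   -- (the // comment is excluded by the default filter)
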
--- get a list of parameters separated by a delimiter from a stream.
-- @param tok the token stream
-- @string[opt=')'] endtoken end of list. Can be '\n'
-- @string[opt=','] delim separator
-- @return a list of token lists.
-- Note: the checks below compare token *types*, so this suits streams where
-- punctuation is typed as itself (`lexer.cpp`, plain `lexer.scan`); streams
-- from this paste's `lexer.lua` report punctuation with type 'punc'.
function lexer.get_separated_list(tok,endtoken,delim)
    endtoken = endtoken or ')'
    delim = delim or ','
    local parm_values = {}
    local level = 1 -- used to count ( and )
    local tl = {}
    local function tappend (tl,t,val)
        val = val or t
        append(tl,{t,val})
    end
    local is_end
    if endtoken == '\n' then
        is_end = function(t,val)
            return t == 'space' and val:find '\n'
        end
    else
        is_end = function (t)
            return t == endtoken
        end
    end
    local token,value
    while true do
        token,value=tok()
        if not token then return nil,'EOS' end -- end of stream is an error!
        if is_end(token,value) and level == 1 then
            append(parm_values,tl)
            break
        elseif token == '(' then
            level = level + 1
            tappend(tl,'(')
        elseif token == ')' then
            level = level - 1
            if level == 0 then -- finished with parm list
                append(parm_values,tl)
                break
            else
                tappend(tl,')')
            end
        elseif token == delim and level == 1 then
            append(parm_values,tl) -- a new parm
            tl = {}
        else
            tappend(tl,token,value)
        end
    end
    return parm_values,{token,value}
end

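-- Example (added for this edit; not part of the original paste). The stream
-- should already be positioned past the opening '(':
--
--   local tok = lexer.cpp('a, b*2, c)')
--   local parts = lexer.get_separated_list(tok)
--   -- parts[1] = {{'iden','a'}}
--   -- parts[2] = {{'iden','b'},{'*','*'},{'number',2}}
--   -- parts[3] = {{'iden','c'}}
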
--- get the next non-space token from the stream.
-- @param tok the token stream.
function lexer.skipws (tok)
    local t,v = tok()
    while t == 'space' do
        t,v = tok()
    end
    return t,v
end

local skipws = lexer.skipws

--- get the next token, which must be of the expected type.
-- Throws an error if the type does not match!
-- @param tok the token stream
-- @string expected_type the token type
-- @bool no_skip_ws if true, do not skip leading whitespace
function lexer.expecting (tok,expected_type,no_skip_ws)
    assert_arg(1,tok,'function')
    assert_arg(2,expected_type,'string')
    local t,v
    if no_skip_ws then
        t,v = tok()
    else
        t,v = skipws(tok)
    end
    if t ~= expected_type then error ("expecting "..expected_type,2) end
    return v
end

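-- Example (added for this edit; not part of the original paste):
--
--   local tok = lexer.cpp('int x;')
--   lexer.expecting(tok,'keyword')  --> "int"
--   lexer.expecting(tok,'iden')     --> "x"
--   lexer.expecting(tok,'iden')     --> error: "expecting iden"
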
return lexer