Sorry for bumping this very old post, but I never stopped working on this!
It’s a major part of my game Lua Learning so I kept making little tweaks.
@pobammer made some optimizations, making it noticeably faster.
I also updated the builtin dictionary to include the changes that Roblox has made over time!
Major Update August 6th 2020:
Huge rework to make improvements to lexer behavior.
It used to return operators and unknowns as tokens themselves, so your implementation had to handle random symbol tokens. The fact that the old lexer treated valid operators the same as random unicode was ridiculous. The fact that it could return emoji as the token was even crazier. ‘�’ should never have been a valid token return.
I’ve reworked it to guarantee that it will only return tokens from the documented list of possible tokens. This makes implementation simpler.
It also opened up a doorway to new token grouping optimizations for us! ((())) used to be six tokens, but thanks to our new operator token handling, its grouped as one. This can seriously help bring down total tokens per lex and make your highlighter run faster.
Also, minor improvements and pattern match fixes, etc.
The following code is up-to-date as of August 6th, 2020.
Source Code:
--[=[
Lexical scanner for creating a sequence of tokens from Lua source code.
This is a heavily modified and Roblox-optimized version of
the original Penlight Lexer module:
https://github.com/stevedonovan/Penlight
Authors:
stevedonovan <https://github.com/stevedonovan> ----------------- Original Penlight lexer author
ryanjmulder <https://github.com/ryanjmulder> ----------------- Penlight lexer contributer
mpeterv <https://github.com/mpeterv> ----------------- Penlight lexer contributer
Tieske <https://github.com/Tieske> ----------------- Penlight lexer contributer
boatbomber <https://github.com/boatbomber> ----------------- Roblox port, added builtin token, added patterns for incomplete strings and comments, bug fixes, behavior changes, token optimization
Sleitnick <https://github.com/Sleitnick> ----------------- Roblox optimizations
howmanysmall <https://github.com/howmanysmall> ----------------- Lua + Roblox optimizations
List of possible tokens:
- keyword
- builtin
- iden
- string
- number
- space
- comment
- operator
Usage:
local source = "for i = 1,n do end"
-- The 'scan' function returns a token iterator:
for token,src in lexer.scan(source) do
print(token, src)
end
> keyword for
> space
> iden i
> space
> operator =
> space
> number 1
> operator ,
> iden n
> space
> keyword do
> space
> keyword end
--]=]
local lexer = {}
local ipairs = ipairs
local NUMBER_A = "^0x[%da-fA-F]+"
local NUMBER_B = "^%d+%.?%d*[eE][%+%-]?%d+"
local NUMBER_C = "^%d+[%._]?[%d_eE]*"
local IDEN = "^[%a_][%w_]*"
local WSPACE = "^[ \t\n]+"
local STRING_EMPTY = "^(['\"])%1" --Empty String
local STRING_PLAIN = [=[^(['"])[%w%p \t\v\b\f\r\a]-([^%\]%1)]=] --TODO: Handle escaping escapes
local STRING_INCOMP_A = "^(['\"]).-\n" --Incompleted String with next line
local STRING_INCOMP_B = "^(['\"])[^\n]*" --Incompleted String without next line
local STRING_MULTI = "^%[(=*)%[.-%]%1%]" --Multiline-String
local STRING_MULTI_INCOMP = "^%[=*%[.-.*" --Incompleted Multiline-String
local COMMENT_MULTI = "^%-%-%[(=*)%[.-%]%1%]" --Completed Multiline-Comment
local COMMENT_MULTI_INCOMP = "^%-%-%[=*%[.-.*" --Incompleted Multiline-Comment
local COMMENT_PLAIN = "^%-%-.-\n" --Completed Singleline-Comment
local COMMENT_INCOMP = "^%-%-.*" --Incompleted Singleline-Comment
local TABLE_EMPTY = {}
local lua_keyword = {
["and"] = true, ["break"] = true, ["do"] = true, ["else"] = true, ["elseif"] = true,
["end"] = true, ["false"] = true, ["for"] = true, ["function"] = true, ["if"] = true,
["in"] = true, ["local"] = true, ["nil"] = true, ["not"] = true, ["while"] = true,
["or"] = true, ["repeat"] = true, ["return"] = true, ["then"] = true, ["true"] = true,
["self"] = true, ["until"] = true,
["continue"] = true,
["plugin"] = true, --Highlights as a keyword instead of a builtin cuz Roblox is weird
}
local lua_builtin = {
-- Lua Functions
["assert"] = true;["collectgarbage"] = true;["error"] = true;["getfenv"] = true;
["getmetatable"] = true;["ipairs"] = true;["loadstring"] = true;["newproxy"] = true;
["next"] = true;["pairs"] = true;["pcall"] = true;["print"] = true;["rawequal"] = true;
["rawget"] = true;["rawset"] = true;["select"] = true;["setfenv"] = true;["setmetatable"] = true;
["tonumber"] = true;["tostring"] = true;["type"] = true;["unpack"] = true;["xpcall"] = true;
-- Lua Variables
["_G"] = true;["_VERSION"] = true;
-- Lua Tables
["bit32"] = true;["coroutine"] = true;["debug"] = true;
["math"] = true;["os"] = true;["string"] = true;
["table"] = true;["utf8"] = true;
-- Roblox Functions
["delay"] = true;["elapsedTime"] = true;["gcinfo"] = true;["require"] = true;
["settings"] = true;["spawn"] = true;["tick"] = true;["time"] = true;["typeof"] = true;
["UserSettings"] = true;["wait"] = true;["warn"] = true;["ypcall"] = true;
-- Roblox Variables
["Enum"] = true;["game"] = true;["shared"] = true;["script"] = true;
["workspace"] = true;
-- Roblox Tables
["Axes"] = true;["BrickColor"] = true;["CellId"] = true;["CFrame"] = true;["Color3"] = true;
["ColorSequence"] = true;["ColorSequenceKeypoint"] = true;["DateTime"] = true;
["DockWidgetPluginGuiInfo"] = true;["Faces"] = true;["Instance"] = true;["NumberRange"] = true;
["NumberSequence"] = true;["NumberSequenceKeypoint"] = true;["PathWaypoint"] = true;
["PhysicalProperties"] = true;["PluginDrag"] = true;["Random"] = true;["Ray"] = true;["Rect"] = true;
["Region3"] = true;["Region3int16"] = true;["TweenInfo"] = true;["UDim"] = true;["UDim2"] = true;
["Vector2"] = true;["Vector2int16"] = true;["Vector3"] = true;["Vector3int16"] = true;
}
local function idump(tok)
return coroutine.yield("iden", tok)
end
local function odump(tok)
return coroutine.yield("operator", tok)
end
local function ndump(tok)
return coroutine.yield("number", tok)
end
local function sdump(tok)
return coroutine.yield("string", tok)
end
local function cdump(tok)
return coroutine.yield("comment", tok)
end
local function wsdump(tok)
return coroutine.yield("space", tok)
end
local function lua_vdump(tok)
if lua_keyword[tok] then
return coroutine.yield("keyword", tok)
elseif lua_builtin[tok] then
return coroutine.yield("builtin", tok)
else
return coroutine.yield("iden", tok)
end
end
local lua_matches = {
-- Indentifiers
{IDEN, lua_vdump},
-- Whitespace
{WSPACE, wsdump},
-- Numbers
{NUMBER_A, ndump},
{NUMBER_B, ndump},
{NUMBER_C, ndump},
-- Strings
{STRING_EMPTY, sdump},
{STRING_PLAIN, sdump},
{STRING_INCOMP_A, sdump},
{STRING_INCOMP_B, sdump},
{STRING_MULTI, sdump},
{STRING_MULTI_INCOMP, sdump},
-- Comments
{COMMENT_MULTI, cdump},
{COMMENT_MULTI_INCOMP, cdump},
{COMMENT_PLAIN, cdump},
{COMMENT_INCOMP, cdump},
-- Operators
{"^[:;/~%*%(%)%-=<>%[%]{},%.#%^%+%%]+", odump},
{"^.", idump}
}
--- Create a plain token iterator from a string.
-- @tparam string s a string.
function lexer.scan(s)
local startTick = tick()
lexer.finished = false
local function lex(first_arg)
local line_nr = 0
local sz = #s
local idx = 1
-- res is the value used to resume the coroutine.
local function handle_requests(res)
while res do
local tp = type(res)
-- Insert a token list:
if tp == "table" then
res = coroutine.yield("", "")
for _, t in ipairs(res) do
res = coroutine.yield(t[1], t[2])
end
elseif tp == "string" then -- Or search up to some special pattern:
local i1, i2 = string.find(s, res, idx)
if i1 then
idx = i2 + 1
res = coroutine.yield("", string.sub(s, i1, i2))
else
res = coroutine.yield("", "")
idx = sz + 1
end
else
res = coroutine.yield(line_nr, idx)
end
end
end
handle_requests(first_arg)
line_nr = 1
while true do
if idx > sz then
while true do
handle_requests(coroutine.yield())
end
end
for _, m in ipairs(lua_matches) do
local findres = table.create(2)
local i1, i2 = string.find(s, m[1], idx)
findres[1], findres[2] = i1, i2
if i1 then
local tok = string.sub(s, i1, i2)
idx = i2 + 1
lexer.finished = idx > sz
-- if lexer.finished then
-- print(tick()-startTick)
-- end
local res = m[2](tok, findres)
if string.find(tok, "\n") then
-- Update line number:
local _, newlines = string.gsub(tok, "\n", TABLE_EMPTY)
line_nr = line_nr + newlines
end
handle_requests(res)
break
end
end
end
end
return coroutine.wrap(lex)
end
return lexer
Edit: Check out the true latest on the GitHub repository instead! New features, perf improvements, and a syntax highlighter with RichText!