Lexer for RBX Lua

There’s this awesome lexer for Lua in the Penlight library. It works well, but has a few bugs.

@boatbomber came to the rescue and heavily modified the lexer, and added support for tokenizing built-in items (such as ‘print’ and ‘math’). I believe he also made some changes so that it could be invoked on-the-fly and still perform well. So the majority of the credit for this goes to him!

I then took his edits and made a couple optimizations of my own. So now we have a couple iterations of fixes and optimizations, and are left with a lexer that works really well for RBX Lua. I’m using it for my framework plugin, and I believe boatbomber is also using it in a project.

Here’s an example of the lexer being used to do syntax highlighting on its own source. This is a plugin widget, not the script editor:

Here’s the Lexer code:

--[[

	Lexical scanner for creating a sequence of tokens from Lua source code.

	This is a heavily modified and Roblox-optimized version of
	the original Penlight Lexer module:
		https://github.com/stevedonovan/Penlight

	Authors:
		stevedonovan <https://github.com/stevedonovan> ----------------- Original Penlight lexer author
		ryanjmulder  <https://github.com/ryanjmulder>  ----------------- Penlight lexer contributer
		mpeterv      <https://github.com/mpeterv>      ----------------- Penlight lexer contributer
		Tieske       <https://github.com/Tieske>       ----------------- Penlight lexer contributer
		boatbomber   <https://github.com/boatbomber>   ----------------- Roblox port, optimizations, and bug fixes
		Sleitnick    <https://github.com/Sleitnick>    ----------------- Roblox optimizations

	Usage:

		local source = "for i = 1,n do end"
		
		-- The 'scan' function returns a token iterator:
		for token,src in lexer.scan(source) do
			print(token, src)
		end

			> keyword for
			> iden    i
			> =       =
			> number  1
			> ,       ,
			> iden    n
			> keyword do
			> keyword end

	List of tokens:
		- keyword
		- builtin
		- iden
		- string
		- number
		- space
		- comment

	Other tokens that don't fall into the above categories
	will simply be returned as itself. For instance, operators
	like "+" will simply return "+" as the token.

--]]

local lexer = {}

local yield, wrap  = coroutine.yield, coroutine.wrap
local strfind      = string.find
local strsub       = string.sub
local append       = table.insert
local type         = type

local NUMBER1	= "^[%+%-]?%d+%.?%d*[eE][%+%-]?%d+"
local NUMBER2	= "^[%+%-]?%d+%.?%d*"
local NUMBER3	= "^0x[%da-fA-F]+"
local NUMBER4	= "^%d+%.?%d*[eE][%+%-]?%d+"
local NUMBER5	= "^%d+%.?%d*"
local IDEN		= "^[%a_][%w_]*"
local WSPACE	= "^%s+"
local STRING1	= "^(['\"])%1"							--Empty String
local STRING2	= [[^(['"])(\*)%2%1]]
local STRING3	= [[^(['"]).-[^\](\*)%2%1]]
local STRING4	= "^(['\"]).-.*"						--Incompleted String
local STRING5	= "^%[(=*)%[.-%]%1%]"					--Multiline-String
local STRING6	= "^%[%[.-.*"							--Incompleted Multiline-String
local CHAR1		= "^''"
local CHAR2		= [[^'(\*)%1']]
local CHAR3		= [[^'.-[^\](\*)%1']]
local PREPRO	= "^#.-[^\\]\n"
local MCOMMENT1	= "^%-%-%[(=*)%[.-%]%1%]"				--Completed Multiline-Comment
local MCOMMENT2	= "^%-%-%[%[.-.*"						--Incompleted Multiline-Comment
local SCOMMENT1	= "^%-%-.-\n"							--Completed Singleline-Comment
local SCOMMENT2	= "^%-%-.-.*"							--Incompleted Singleline-Comment

local lua_keyword = {
	["and"] = true,  ["break"] = true,  ["do"] = true,      ["else"] = true,      ["elseif"] = true,
	["end"] = true,  ["false"] = true,  ["for"] = true,     ["function"] = true,  ["if"] = true,
	["in"] = true,   ["local"] = true,  ["nil"] = true,     ["not"] = true,       ["while"] = true,
	["or"] = true,   ["repeat"] = true, ["return"] = true,  ["then"] = true,      ["true"] = true,
	["self"] = true, ["until"] = true
}

local lua_builtin = {
	["assert"] = true;["collectgarbage"] = true;["error"] = true;["_G"] = true;
	["gcinfo"] = true;["getfenv"] = true;["getmetatable"] = true;["ipairs"] = true;
	["loadstring"] = true;["newproxy"] = true;["next"] = true;["pairs"] = true;
	["pcall"] = true;["print"] = true;["rawequal"] = true;["rawget"] = true;["rawset"] = true;
	["select"] = true;["setfenv"] = true;["setmetatable"] = true;["tonumber"] = true;
	["tostring"] = true;["type"] = true;["unpack"] = true;["_VERSION"] = true;["xpcall"] = true;
	["delay"] = true;["elapsedTime"] = true;["require"] = true;["spawn"] = true;["tick"] = true;
	["time"] = true;["typeof"] = true;["UserSettings"] = true;["wait"] = true;["warn"] = true;
	["game"] = true;["Enum"] = true;["script"] = true;["shared"] = true;["workspace"] = true;
	["Axes"] = true;["BrickColor"] = true;["CFrame"] = true;["Color3"] = true;["ColorSequence"] = true;
	["ColorSequenceKeypoint"] = true;["Faces"] = true;["Instance"] = true;["NumberRange"] = true;
	["NumberSequence"] = true;["NumberSequenceKeypoint"] = true;["PhysicalProperties"] = true;
	["Random"] = true;["Ray"] = true;["Rect"] = true;["Region3"] = true;["Region3int16"] = true;
	["TweenInfo"] = true;["UDim"] = true;["UDim2"] = true;["Vector2"] = true;["Vector3"] = true;
	["Vector3int16"] = true;["next"] = true;
	["os"] = true;
		--["os.time"] = true;["os.date"] = true;["os.difftime"] = true;
	["debug"] = true;
		--["debug.traceback"] = true;["debug.profilebegin"] = true;["debug.profileend"] = true;
	["math"] = true;
		--["math.abs"] = true;["math.acos"] = true;["math.asin"] = true;["math.atan"] = true;["math.atan2"] = true;["math.ceil"] = true;["math.clamp"] = true;["math.cos"] = true;["math.cosh"] = true;["math.deg"] = true;["math.exp"] = true;["math.floor"] = true;["math.fmod"] = true;["math.frexp"] = true;["math.ldexp"] = true;["math.log"] = true;["math.log10"] = true;["math.max"] = true;["math.min"] = true;["math.modf"] = true;["math.noise"] = true;["math.pow"] = true;["math.rad"] = true;["math.random"] = true;["math.randomseed"] = true;["math.sign"] = true;["math.sin"] = true;["math.sinh"] = true;["math.sqrt"] = true;["math.tan"] = true;["math.tanh"] = true;
	["coroutine"] = true;
		--["coroutine.create"] = true;["coroutine.resume"] = true;["coroutine.running"] = true;["coroutine.status"] = true;["coroutine.wrap"] = true;["coroutine.yield"] = true;
	["string"] = true;
		--["string.byte"] = true;["string.char"] = true;["string.dump"] = true;["string.find"] = true;["string.format"] = true;["string.len"] = true;["string.lower"] = true;["string.match"] = true;["string.rep"] = true;["string.reverse"] = true;["string.sub"] = true;["string.upper"] = true;["string.gmatch"] = true;["string.gsub"] = true;
	["table"] = true;
		--["table.concat"] = true;["table.insert"] = true;["table.remove"] = true;["table.sort"] = true;
}

local function tdump(tok)
	return yield(tok, tok)
end

local function ndump(tok)
	return yield("number", tok)
end

local function sdump(tok)
	return yield("string", tok)
end

local function cdump(tok)
	return yield("comment", tok)
end

local function wsdump(tok)
	return yield("space", tok)
end

local function lua_vdump(tok)
	if (lua_keyword[tok]) then
		return yield("keyword", tok)
	elseif (lua_builtin[tok]) then
		return yield("builtin", tok)
	else
		return yield("iden", tok)
	end
end

local lua_matches = {
	{IDEN,      lua_vdump},        -- Indentifiers
	{WSPACE,    wsdump},           -- Whitespace
	{NUMBER3,   ndump},            -- Numbers
	{NUMBER4,   ndump},
	{NUMBER5,   ndump},
	{STRING1,   sdump},            -- Strings
	{STRING2,   sdump},
	{STRING3,   sdump},
	{STRING4,   sdump},
	{STRING5,   sdump},            -- Multiline-Strings
	{STRING6,   sdump},            -- Multiline-Strings
	
	{MCOMMENT1, cdump},            -- Multiline-Comments
	{MCOMMENT2, cdump},			
	{SCOMMENT1, cdump},            -- Singleline-Comments
	{SCOMMENT2, cdump},
	
	{"^==",     tdump},            -- Operators
	{"^~=",     tdump},
	{"^<=",     tdump},
	{"^>=",     tdump},
	{"^%.%.%.", tdump},
	{"^%.%.",   tdump},
	{"^.",      tdump}
}

local num_lua_matches = #lua_matches


--- Create a plain token iterator from a string.
-- @tparam string s a string.
function lexer.scan(s)

	local function lex(first_arg)

		local line_nr = 0
		local sz = #s
		local idx = 1

		-- res is the value used to resume the coroutine.
		local function handle_requests(res)
			while (res) do
				local tp = type(res)
				-- Insert a token list:
				if (tp == "table") then
					res = yield("", "")
					for i = 1,#res do
						local t = res[i]
						res = yield(t[1], t[2])
					end
				elseif (tp == "string") then -- Or search up to some special pattern:
					local i1, i2 = strfind(s, res, idx)
					if (i1) then
						local tok = strsub(s, i1, i2)
						idx = (i2 + 1)
						res = yield("", tok)
					else
						res = yield("", "")
						idx = (sz + 1)
					end
				else
					res = yield(line_nr, idx)
				end
			end
		end

		handle_requests(first_arg)
		line_nr = 1

		while (true) do

			if (idx > sz) then
				while (true) do
					handle_requests(yield())
				end
			end

			for i = 1,num_lua_matches do
				local m = lua_matches[i]
				local pat = m[1]
				local fun = m[2]
				local findres = {strfind(s, pat, idx)}
				local i1, i2 = findres[1], findres[2]
				if (i1) then
					local tok = strsub(s, i1, i2)
					idx = (i2 + 1)
					lexer.finished = (idx > sz)
					local res = fun(tok, findres)
					if (tok:find("\n")) then
						-- Update line number:
						local _,newlines = tok:gsub("\n", {})
						line_nr = (line_nr + newlines)
					end
					handle_requests(res)
					break
				end
			end

		end

	end

	return wrap(lex)

end

return lexer
58 Likes

This is awesome!
What do you mean by framework plugin?
Will it be open source?
Will your plugin make custom global via setfenv practical?

AeroGameFramework, which is a framework I made mostly for myself so that I could code games faster and in a more organized manner. It’s open-sourced. The plugin is a helper utility that makes it much easier to use the framework in Studio. The code previewer widget is using this lexer for code highlighting.

2 Likes

What other features does your plugin have? (I feel like the plugin readme is not very explicit)

It includes a hierarchy (similar to the Explorer) that allows you to only see code within the framework. If you watch my documentation videos on the framework, you will see the plugin in use

1 Like

btw thought i’d let you know about https://www.youtube.com/watch?v=DCIISE78HIA

1 Like

lol

1 Like

Here’s a neat trick that lets you automatically determine if a variable name is a built-in keyword like Vector3 or print.

local function isBuiltInKeyword(varName)
	local env = getfenv()
	return env[varName] ~= nil and rawget(env, varName) == nil
end
9 Likes

This is as neato as incognito

Sorry for bumping this very old post, but I never stopped working on this!
It’s a major part of my game Lua Learning so I kept making little tweaks.

@pobammer made some optimizations, making it noticeably faster.

I also updated the builtin dictionary to include the changes that Roblox has made over time!


Major Update August 6th 2020:

Huge rework to make improvements to lexer behavior.
It used to return operators and unknowns as tokens themselves, so your implementation had to handle random symbol tokens. The fact that the old lexer treated valid operators the same as random unicode was ridiculous. The fact that it could return emoji as the token was even crazier. ‘�’ should never have been a valid token return.

I’ve reworked it to guarantee that it will only return tokens from the documented list of possible tokens. This makes implementation simpler.

It also opened up a doorway to new token grouping optimizations for us! ((())) used to be six tokens, but thanks to our new operator token handling, its grouped as one. This can seriously help bring down total tokens per lex and make your highlighter run faster.

Also, minor improvements and pattern match fixes, etc.


The following code is up-to-date as of August 6th, 2020.
Source Code:

--[=[
	Lexical scanner for creating a sequence of tokens from Lua source code.
	This is a heavily modified and Roblox-optimized version of
	the original Penlight Lexer module:
		https://github.com/stevedonovan/Penlight
	Authors:
		stevedonovan <https://github.com/stevedonovan> ----------------- Original Penlight lexer author
		ryanjmulder <https://github.com/ryanjmulder> ----------------- Penlight lexer contributer
		mpeterv <https://github.com/mpeterv> ----------------- Penlight lexer contributer
		Tieske <https://github.com/Tieske> ----------------- Penlight lexer contributer
		boatbomber <https://github.com/boatbomber> ----------------- Roblox port, added builtin token, added patterns for incomplete strings and comments, bug fixes, behavior changes, token optimization
		Sleitnick <https://github.com/Sleitnick> ----------------- Roblox optimizations
		howmanysmall <https://github.com/howmanysmall> ----------------- Lua + Roblox optimizations


	List of possible tokens:
		- keyword
		- builtin
		- iden
		- string
		- number
		- space
		- comment
		- operator

	Usage:
		local source = "for i = 1,n do end"
		
		-- The 'scan' function returns a token iterator:
		for token,src in lexer.scan(source) do
			print(token, src)
		end
			> keyword for
			> space  
			> iden i
			> space  
			> operator =
			> space  
			> number 1
			> operator ,
			> iden n
			> space  
			> keyword do
			> space  
			> keyword end
			
	
--]=]

local lexer = {}

local ipairs = ipairs

local NUMBER_A = "^0x[%da-fA-F]+"
local NUMBER_B = "^%d+%.?%d*[eE][%+%-]?%d+"
local NUMBER_C = "^%d+[%._]?[%d_eE]*"
local IDEN = "^[%a_][%w_]*"
local WSPACE = "^[ \t\n]+"
local STRING_EMPTY = "^(['\"])%1"							--Empty String
local STRING_PLAIN = [=[^(['"])[%w%p \t\v\b\f\r\a]-([^%\]%1)]=]	--TODO: Handle escaping escapes
local STRING_INCOMP_A = "^(['\"]).-\n"						--Incompleted String with next line
local STRING_INCOMP_B = "^(['\"])[^\n]*"					--Incompleted String without next line
local STRING_MULTI = "^%[(=*)%[.-%]%1%]"					--Multiline-String
local STRING_MULTI_INCOMP = "^%[=*%[.-.*"						--Incompleted Multiline-String
local COMMENT_MULTI = "^%-%-%[(=*)%[.-%]%1%]"				--Completed Multiline-Comment
local COMMENT_MULTI_INCOMP = "^%-%-%[=*%[.-.*"				--Incompleted Multiline-Comment
local COMMENT_PLAIN = "^%-%-.-\n"							--Completed Singleline-Comment
local COMMENT_INCOMP = "^%-%-.*"							--Incompleted Singleline-Comment

local TABLE_EMPTY = {}

local lua_keyword = {
	["and"] = true, ["break"] = true, ["do"] = true, ["else"] = true, ["elseif"] = true,
	["end"] = true, ["false"] = true, ["for"] = true, ["function"] = true, ["if"] = true,
	["in"] = true, ["local"] = true, ["nil"] = true, ["not"] = true, ["while"] = true,
	["or"] = true, ["repeat"] = true, ["return"] = true, ["then"] = true, ["true"] = true,
	["self"] = true, ["until"] = true,
	
	["continue"] = true,
	
	["plugin"] = true, --Highlights as a keyword instead of a builtin cuz Roblox is weird
}


local lua_builtin = {
	-- Lua Functions
	["assert"] = true;["collectgarbage"] = true;["error"] = true;["getfenv"] = true;
	["getmetatable"] = true;["ipairs"] = true;["loadstring"] = true;["newproxy"] = true;
	["next"] = true;["pairs"] = true;["pcall"] = true;["print"] = true;["rawequal"] = true;
	["rawget"] = true;["rawset"] = true;["select"] = true;["setfenv"] = true;["setmetatable"] = true;
	["tonumber"] = true;["tostring"] = true;["type"] = true;["unpack"] = true;["xpcall"] = true;

	-- Lua Variables
	["_G"] = true;["_VERSION"] = true;

	-- Lua Tables
	["bit32"] = true;["coroutine"] = true;["debug"] = true;
	["math"] = true;["os"] = true;["string"] = true;
	["table"] = true;["utf8"] = true;

	-- Roblox Functions
	["delay"] = true;["elapsedTime"] = true;["gcinfo"] = true;["require"] = true;
	["settings"] = true;["spawn"] = true;["tick"] = true;["time"] = true;["typeof"] = true;
	["UserSettings"] = true;["wait"] = true;["warn"] = true;["ypcall"] = true;

	-- Roblox Variables
	["Enum"] = true;["game"] = true;["shared"] = true;["script"] = true;
	["workspace"] = true;

	-- Roblox Tables
	["Axes"] = true;["BrickColor"] = true;["CellId"] = true;["CFrame"] = true;["Color3"] = true;
	["ColorSequence"] = true;["ColorSequenceKeypoint"] = true;["DateTime"] = true;
	["DockWidgetPluginGuiInfo"] = true;["Faces"] = true;["Instance"] = true;["NumberRange"] = true;
	["NumberSequence"] = true;["NumberSequenceKeypoint"] = true;["PathWaypoint"] = true;
	["PhysicalProperties"] = true;["PluginDrag"] = true;["Random"] = true;["Ray"] = true;["Rect"] = true;
	["Region3"] = true;["Region3int16"] = true;["TweenInfo"] = true;["UDim"] = true;["UDim2"] = true;
	["Vector2"] = true;["Vector2int16"] = true;["Vector3"] = true;["Vector3int16"] = true;
}

local function idump(tok)
	return coroutine.yield("iden", tok)
end

local function odump(tok)
	return coroutine.yield("operator", tok)
end

local function ndump(tok)
	return coroutine.yield("number", tok)
end

local function sdump(tok)
	return coroutine.yield("string", tok)
end

local function cdump(tok)
	return coroutine.yield("comment", tok)
end

local function wsdump(tok)
	return coroutine.yield("space", tok)
end

local function lua_vdump(tok)
	if lua_keyword[tok] then
		return coroutine.yield("keyword", tok)
	elseif lua_builtin[tok] then
		return coroutine.yield("builtin", tok)
	else
		return coroutine.yield("iden", tok)
	end
end

local lua_matches = {
	-- Indentifiers
	{IDEN, lua_vdump},
	
	 -- Whitespace
	{WSPACE, wsdump},
	
	-- Numbers
	{NUMBER_A, ndump},
	{NUMBER_B, ndump},
	{NUMBER_C, ndump},
	
	-- Strings
	{STRING_EMPTY, sdump},
	{STRING_PLAIN, sdump},
	{STRING_INCOMP_A, sdump},
	{STRING_INCOMP_B, sdump},
	{STRING_MULTI, sdump},
	{STRING_MULTI_INCOMP, sdump},
	
	-- Comments
	{COMMENT_MULTI, cdump},			
	{COMMENT_MULTI_INCOMP, cdump},
	{COMMENT_PLAIN, cdump},
	{COMMENT_INCOMP, cdump},
	
	-- Operators
	{"^[:;/~%*%(%)%-=<>%[%]{},%.#%^%+%%]+", odump},
	{"^.", idump}
}

--- Create a plain token iterator from a string.
-- @tparam string s a string.	
	
function lexer.scan(s)
	local startTick = tick()
	lexer.finished = false
	
	local function lex(first_arg)
		local line_nr = 0
		local sz = #s
		local idx = 1
		
		-- res is the value used to resume the coroutine.
		local function handle_requests(res)
			while res do
				local tp = type(res)
				-- Insert a token list:
				if tp == "table" then
					res = coroutine.yield("", "")
					for _, t in ipairs(res) do
						res = coroutine.yield(t[1], t[2])
					end
				elseif tp == "string" then -- Or search up to some special pattern:
					local i1, i2 = string.find(s, res, idx)
					if i1 then
						idx = i2 + 1
						res = coroutine.yield("", string.sub(s, i1, i2))
					else
						res = coroutine.yield("", "")
						idx = sz + 1
					end
				else
					res = coroutine.yield(line_nr, idx)
				end
			end
		end
		
		handle_requests(first_arg)
		line_nr = 1
		
		while true do
			if idx > sz then
				while true do
					handle_requests(coroutine.yield())
				end
			end
			for _, m in ipairs(lua_matches) do
				local findres = table.create(2)
				local i1, i2 = string.find(s, m[1], idx)
				findres[1], findres[2] = i1, i2
				if i1 then
					local tok = string.sub(s, i1, i2)
					idx = i2 + 1
					lexer.finished = idx > sz
--					if lexer.finished then
--						print(tick()-startTick)
--					end
					
					local res = m[2](tok, findres)
					
					if string.find(tok, "\n") then
						-- Update line number:
						local _, newlines = string.gsub(tok, "\n", TABLE_EMPTY)
						line_nr = line_nr + newlines
					end
					
					handle_requests(res)
					break
				end
			end
		end
	end
	return coroutine.wrap(lex)
end

return lexer
20 Likes

As a note, I’m still testing the table.create function. I think table.pack might be faster, but I don’t even know.

2 Likes

I’ve always complained that Lua’s ‘regex’ was bad.

Then someone made a lexer with it. Nice job…
But seems like it does not have the facilities for a code generation :wink:

Oops! There was a bug with incompleted single line strings that could cause it to error.

I fixed it, and edited my above reply with my current (fixed) version as of 12/12/19.

2 Likes

So, how do you use the lexer?
What does it return, a function?

image

use os.clock, it’s faster and more precise

2 Likes

This new version is not fully backwards compatible.

(Which is why it’s a new reply and not just another edit to the one above)


Previous versions had a space token type. Seeing as the source content of them was always invisible text, I felt that it was pretty wasteful to have tons of token returns of effectively blank tokens.

This new version no longer has a space token and instead all token types can have trailing/leading whitespace in them. This cuts down on the total token count by a massive amount, averaging around 30%-50% lower counts.


Simple example of what this means:

local SourceSample = "for i = 1, n do end"

for token,src in lexer.scan(source) do
	print(token, "'"...src..."'")
end

Old Output: (14 Token returns)

keyword 'for'
space ' '
iden 'i'
space ' '
operator '='
space ' '
number '1'
operator ','
space ' '
iden 'n'
space ' '
keyword 'do'
space ' '
keyword 'end'

New Output: (8 Token returns)

keyword 'for '
iden 'i '
operator '= '
number '1'
operator ', '
iden 'n '
keyword 'do '
keyword 'end'

Source Code: Last updated 8/16/2020

--[=[
	Lexical scanner for creating a sequence of tokens from Lua source code.
	This is a heavily modified and Roblox-optimized version of
	the original Penlight Lexer module:
		https://github.com/stevedonovan/Penlight
	Authors:
		stevedonovan <https://github.com/stevedonovan> ----------------- Original Penlight lexer author
		ryanjmulder <https://github.com/ryanjmulder> ----------------- Penlight lexer contributer
		mpeterv <https://github.com/mpeterv> ----------------- Penlight lexer contributer
		Tieske <https://github.com/Tieske> ----------------- Penlight lexer contributer
		boatbomber <https://github.com/boatbomber> ----------------- Roblox port, added builtin token, added patterns for incomplete syntax, bug fixes, behavior changes, token optimization
		Sleitnick <https://github.com/Sleitnick> ----------------- Roblox optimizations
		howmanysmall <https://github.com/howmanysmall> ----------------- Lua + Roblox optimizations
	
	Usage:
		local source = "for i = 1, n do end"
		
		-- The 'scan' function returns a token iterator:
		for token,src in lexer.scan(source) do
			print(token, "'"..src.."'")
		end
		->	keyword 'for '
		->	iden 'i '
		->	operator '= '
		->	number '1'
		->	operator ', '
		->	iden 'n '
		->	keyword 'do '
		->	keyword 'end'
			
	List of possible tokens:
		- iden
		- keyword
		- builtin
		- string
		- number
		- comment
		- operator
--]=]

local lexer = {}

local ipairs = ipairs

local Prefix,Suffix,Cleaner = "^[ \t\n\0\a\b\v\f\r]*", "[ \t\n\0\a\b\v\f\r]*", "[ \t\n\0\a\b\v\f\r]+"
local NUMBER_A = "0x[%da-fA-F]+"
local NUMBER_B = "%d+%.?%d*[eE][%+%-]?%d+"
local NUMBER_C = "%d+[%._]?[%d_eE]*"
local OPERATORS = "[:;<>/~%*%(%)%-={},%.#%^%+%%]+"
local BRACKETS = "[%[%]]+" -- needs to be separate pattern from other operators or it'll mess up multiline strings
local IDEN = "[%a_][%w_]*"
local STRING_EMPTY = "(['\"])%1"							--Empty String
local STRING_PLAIN = [=[(['"])[%w%p \t\v\b\f\r\a]-([^%\]%1)]=]	--TODO: Handle escaping escapes
local STRING_INCOMP_A = "(['\"]).-\n"						--Incompleted String with next line
local STRING_INCOMP_B = "(['\"])[^\n]*"					--Incompleted String without next line
local STRING_MULTI = "%[(=*)%[.-%]%1%]"					--Multiline-String
local STRING_MULTI_INCOMP = "%[=*%[.-.*"						--Incompleted Multiline-String
local COMMENT_MULTI = "%-%-%[(=*)%[.-%]%1%]"				--Completed Multiline-Comment
local COMMENT_MULTI_INCOMP = "%-%-%[=*%[.-.*"				--Incompleted Multiline-Comment
local COMMENT_PLAIN = "%-%-.-\n"							--Completed Singleline-Comment
local COMMENT_INCOMP = "%-%-.*"							--Incompleted Singleline-Comment

local TABLE_EMPTY = {}

local lua_keyword = {
	["and"] = true, ["break"] = true, ["do"] = true, ["else"] = true, ["elseif"] = true,
	["end"] = true, ["false"] = true, ["for"] = true, ["function"] = true, ["if"] = true,
	["in"] = true, ["local"] = true, ["nil"] = true, ["not"] = true, ["while"] = true,
	["or"] = true, ["repeat"] = true, ["return"] = true, ["then"] = true, ["true"] = true,
	["self"] = true, ["until"] = true,
	
	["continue"] = true,
	
	["plugin"] = true, --Highlights as a keyword instead of a builtin cuz Roblox is weird
}


local lua_builtin = {
	-- Lua Functions
	["assert"] = true;["collectgarbage"] = true;["error"] = true;["getfenv"] = true;
	["getmetatable"] = true;["ipairs"] = true;["loadstring"] = true;["newproxy"] = true;
	["next"] = true;["pairs"] = true;["pcall"] = true;["print"] = true;["rawequal"] = true;
	["rawget"] = true;["rawset"] = true;["select"] = true;["setfenv"] = true;["setmetatable"] = true;
	["tonumber"] = true;["tostring"] = true;["type"] = true;["unpack"] = true;["xpcall"] = true;

	-- Lua Variables
	["_G"] = true;["_VERSION"] = true;

	-- Lua Tables
	["bit32"] = true;["coroutine"] = true;["debug"] = true;
	["math"] = true;["os"] = true;["string"] = true;
	["table"] = true;["utf8"] = true;

	-- Roblox Functions
	["delay"] = true;["elapsedTime"] = true;["gcinfo"] = true;["require"] = true;
	["settings"] = true;["spawn"] = true;["tick"] = true;["time"] = true;["typeof"] = true;
	["UserSettings"] = true;["wait"] = true;["warn"] = true;["ypcall"] = true;

	-- Roblox Variables
	["Enum"] = true;["game"] = true;["shared"] = true;["script"] = true;
	["workspace"] = true;

	-- Roblox Tables
	["Axes"] = true;["BrickColor"] = true;["CellId"] = true;["CFrame"] = true;["Color3"] = true;
	["ColorSequence"] = true;["ColorSequenceKeypoint"] = true;["DateTime"] = true;
	["DockWidgetPluginGuiInfo"] = true;["Faces"] = true;["Instance"] = true;["NumberRange"] = true;
	["NumberSequence"] = true;["NumberSequenceKeypoint"] = true;["PathWaypoint"] = true;
	["PhysicalProperties"] = true;["PluginDrag"] = true;["Random"] = true;["Ray"] = true;["Rect"] = true;
	["Region3"] = true;["Region3int16"] = true;["TweenInfo"] = true;["UDim"] = true;["UDim2"] = true;
	["Vector2"] = true;["Vector2int16"] = true;["Vector3"] = true;["Vector3int16"] = true;
}

local function idump(tok)
	--print("tok unknown:",tok)
	return coroutine.yield("iden", tok)
end

local function odump(tok)
	return coroutine.yield("operator", tok)
end

local function ndump(tok)
	return coroutine.yield("number", tok)
end

local function sdump(tok)
	return coroutine.yield("string", tok)
end

local function cdump(tok)
	return coroutine.yield("comment", tok)
end

local function lua_vdump(tok)
	-- Since we merge spaces into the tok, we need to remove them
	-- in order to check the actual word it contains
	local cleanTok = string.gsub(tok,Cleaner,"")
		
	if lua_keyword[cleanTok] then
		return coroutine.yield("keyword", tok)
	elseif lua_builtin[cleanTok] then
		return coroutine.yield("builtin", tok)
	else
		return coroutine.yield("iden", tok)
	end
end

local lua_matches = {
	-- Indentifiers
	{Prefix.. IDEN ..Suffix, lua_vdump},
	
	-- Numbers
	{Prefix.. NUMBER_A ..Suffix, ndump},
	{Prefix.. NUMBER_B ..Suffix, ndump},
	{Prefix.. NUMBER_C ..Suffix, ndump},
	
	-- Strings
	{Prefix.. STRING_EMPTY ..Suffix, sdump},
	{Prefix.. STRING_PLAIN ..Suffix, sdump},
	{Prefix.. STRING_INCOMP_A ..Suffix, sdump},
	{Prefix.. STRING_INCOMP_B ..Suffix, sdump},
	{Prefix.. STRING_MULTI ..Suffix, sdump},
	{Prefix.. STRING_MULTI_INCOMP ..Suffix, sdump},
	
	-- Comments
	{Prefix.. COMMENT_MULTI ..Suffix, cdump},			
	{Prefix.. COMMENT_MULTI_INCOMP ..Suffix, cdump},
	{Prefix.. COMMENT_PLAIN ..Suffix, cdump},
	{Prefix.. COMMENT_INCOMP ..Suffix, cdump},
	
	-- Operators
	{Prefix.. OPERATORS ..Suffix, odump},
	{Prefix.. BRACKETS ..Suffix, odump},
	
	-- Unknown
	{"^.", idump}
}

--- Create a plain token iterator from a string.
-- @tparam string s a string.	
	
function lexer.scan(s)
	local startTime = os.clock()
	lexer.finished = false
	
	local function lex(first_arg)
		local line_nr = 0
		local sz = #s
		local idx = 1
		
		-- res is the value used to resume the coroutine.
		local function handle_requests(res)
			while res do
				local tp = type(res)
				-- Insert a token list:
				if tp == "table" then
					res = coroutine.yield("", "")
					for _, t in ipairs(res) do
						res = coroutine.yield(t[1], t[2])
					end
				elseif tp == "string" then -- Or search up to some special pattern:
					local i1, i2 = string.find(s, res, idx)
					if i1 then
						idx = i2 + 1
						res = coroutine.yield("", string.sub(s, i1, i2))
					else
						res = coroutine.yield("", "")
						idx = sz + 1
					end
				else
					res = coroutine.yield(line_nr, idx)
				end
			end
		end
		
		handle_requests(first_arg)
		line_nr = 1
		
		while true do
			if idx > sz then
				while true do
					handle_requests(coroutine.yield())
				end
			end
			for _, m in ipairs(lua_matches) do
				local findres = table.create(2)
				local i1, i2 = string.find(s, m[1], idx)
				findres[1], findres[2] = i1, i2
				if i1 then
					local tok = string.sub(s, i1, i2)
					idx = i2 + 1
					lexer.finished = idx > sz
--					if lexer.finished then
--						print(string.format("Lex took %.2f milliseconds", (os.clock()-startTime)*1000 ))
--					end
					
					local res = m[2](tok, findres)
					
					if string.find(tok, "\n") then
						-- Update line number:
						local _, newlines = string.gsub(tok, "\n", TABLE_EMPTY)
						line_nr = line_nr + newlines
					end
					
					handle_requests(res)
					break
				end
			end
		end
	end
	return coroutine.wrap(lex)
end

return lexer
11 Likes

Sorry for bumping this thread but I got interested in this so I decided to make one myself, when I “tested” it, it’s apparently 5x faster than @boatbomber’s one and this OP’s one. lexer3 is the OP’s one while lexer2 is the @boatbomber’s one while lexer1 is the script for my lexer.

Test code:

local s = "for i = 1, n do end";
local a = os.clock();
for _ = 1, 1000 do
	for token, src in lexer1.lex(s) do
	end;
end;
print(string.format("%.0f μs ", (os.clock() - a) * 1E6))

local a = os.clock();
for _ = 1, 1000 do
	for _, token, src in lexer2.scan(s) do
	end;
end;
print(string.format("%.0f μs ", (os.clock() - a) * 1E6))

local a = os.clock();
for _ = 1, 1000 do
	for _, token, src in lexer3.scan(s) do
	end;
end;
print(string.format("%.0f μs ", (os.clock() - a) * 1E6))

Outputs

2444 μs
13428 μs
16222 μs

Code:

--[=[
	A Lua lexical scanner
	BSD 2-Clause Licence
	Copyright ©, 2020 - Blockzez (devforum.roblox.com/u/Blockzez and github.com/Blockzez)
	All rights reserved.
	
	Redistribution and use in source and binary forms, with or without
	modification, are permitted provided that the following conditions are met:
	
	1. Redistributions of source code must retain the above copyright notice, this
	   list of conditions and the following disclaimer.
	
	2. Redistributions in binary form must reproduce the above copyright notice,
	   this list of conditions and the following disclaimer in the documentation
	   and/or other materials provided with the distribution.
	
	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
	FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
	SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
]=]
local lexer = { };

-- 
local identifiers = {
	-- Reserved keywords
	['and'] = 'keyword', ['break'] = 'keyword', ['continue'] = 'keyword', ['do'] = 'keyword', ['else'] = 'keyword', ['elseif'] = 'keyword', ['end'] = 'keyword',
	['false'] = 'keyword', ['for'] = 'keyword', ['function'] = 'keyword', ['goto'] = 'keyword', ['if'] = 'keyword', ['in'] = 'keyword', ['local'] = 'keyword',
	['nil'] = 'keyword', ['not'] = 'keyword', ['or'] = 'keyword', ['repeat'] = 'keyword', ['return'] = 'keyword', ['then'] = 'keyword', ['true'] = 'keyword',
	['until'] = 'keyword', ['while'] = 'keyword',

	-- Lua globals
	assert = 'builtin', collectgarbage = 'builtin', error = 'builtin', getfenv = 'builtin', getmetatable = 'builtin', ipairs = 'builtin', loadstring = 'builtin', next = 'builtin', newproxy = 'builtin',
	pairs = 'builtin', pcall = 'builtin', print = 'builtin', rawequal = 'builtin', rawget = 'builtin', rawset = 'builtin', select = 'builtin', setfenv = 'builtin', setmetatable = 'builtin',
	tonumber = 'builtin', tostring = 'builtin', type = 'builtin', unpack = 'builtin', xpcall = 'builtin',
	--
	_G = 'builtin', _VERSION = 'builtin',
	
	-- Lua libraries
	bit32 = 'builtin', coroutine = 'builtin', debug = 'builtin', math = 'builtin', os = 'builtin', string = 'builtin', table = 'builtin', utf8 = 'builtin',

	-- Roblox globals
	delay = 'builtin', elapsedTime = 'builtin', require = 'builtin', settings = 'builtin', spawn = 'builtin', stats = 'builtin', tick = 'builtin', UserSettings = 'builtin', wait = 'builtin', warn = 'builtin',
	--
	Enum = 'builtin', game = 'builtin', shared = 'builtin', script = 'builtin', plugin = 'builtin', workspace = 'builtin',

	-- Depcreated
	printidentity = 'builtin_deprecated', version = 'builtin_deprecated',
	
	-- Roblox types
	Axes = 'builtin', BrickColor = 'builtin', CFrame = 'builtin', Color3 = 'builtin', ColorSequence = 'builtin', ColorSequenceKeypoint = 'builtin', DateTime = 'builtin', DockWidgetPluginGuiInfo = 'builtin',
	Faces = 'builtin', Instance = 'builtin', NumberRange = 'builtin', NumberSequence = 'builtin', NumberSequenceKeypoint = 'builtin', PathWaypoint = 'builtin', PhysicalProperties = 'builtin', Random = 'builtin',
	Ray = 'builtin', RaycastParams = 'builtin', Rect = 'builtin', Region3 = 'builtin', Region3int16 = 'builtin', TweenInfo = 'builtin', UDim = 'builtin', UDim2 = 'builtin', Vector2 = 'builtin',
	Vector2int16 = 'builtin', Vector3 = 'builtin', Vector3int16 = 'builtin',
};

local operator = {
	[0x2B] = true, [0x2D] = true, [0x2A] = true, [0x2F] = true, [0x25] = true,
	[0x3C] = true, [0x3E] = true, [0x3D] = true, [0x7E] = true,
	[0x26] = true, [0x7C] = true, [0x5E] = true,
	[0x23] = true, [0x2E] = true,
	[0x28] = true, [0x29] = true, [0x2C] = true,
	[0x5B] = true, [0x5D] = true, [0x3A] = true,
	[0x7B] = true, [0x7D] = true, [0x2E] = true,
};

local bases = {
	[0x42] = 2, [0x62] = 2,
	[0x4F] = 8, [0x6F] = 8,
	[0x58] = 16, [0x78] = 16
};

local function to_char_array(self)
	local len = #self;
	if len <= 7997 then
		return { s = self, string.byte(self, 1, len) };
	end;
	local clen = math.ceil(len / 7997);
	local ret = table.create(len);
	for i = 1, clen do
		local c = table.pack(string.byte(self, i * 7997 - 7996, i * 7997 - (i == clen and 7997 - ((len - 1) % 7997 + 1) or 0)));
		table.move(c, 1, c.n, i * 7997 - 7996, ret);
	end;
	ret.s = self;
	return ret;
end;

local function next_lex(codes, i0)
	local c = codes[i0];
	if not c then
		return;
	end;
	local ttype = 'other';
	local i1 = i0;
	if (c >= 0x30 and c <= 0x39) or (c == 0x2E and (codes[i0 + 1] and codes[i0 + 1] >= 0x30 and codes[i0 + 1] <= 0x39)) then
		-- Numbers
		local isfloat, has_expt = c == 0x2E, false;
		if c == 0x30 and bases[codes[i1 + 1]] then
			i1 += 2;
		end;
		while true do
			i1 += 1;
			c = codes[i1];
			if c == 0x2E then
				if isfloat then
					break;
				end;
				isfloat = true;
			elseif c == 0x45 or c == 0x65 then
				if isfloat or has_expt then
					break
				end;
				has_expt = true;
			elseif (not c) or (c < 0x30 or c > 0x39 and c ~= 0x5F) then
				break;
			end;
		end;
		ttype = 'number';
	elseif c == 0x22 or c == 0x27 then
		-- Strings
		repeat
			i1 += 1;
			if codes[i1] == 0x5C then
				i1 += 1;
				local c2 = codes[i1];
				if c2 == 0x75 then
					i1 += 5;
				elseif c2 == 0x78 then
					i1 += 3;
				else
					i1 += 1;
				end;
			end;
		until codes[i1] == c or not codes[i1];
		i1 += 1;
		ttype = 'string';
	elseif operator[c] then
		-- Operators/Comments/Strings
		if c == 0x2D and codes[i0 + 1] == 0x2D then
			i1 += 2;
			local eq_sign = -1;
			if codes[i1] == 0x5B then
				repeat
					i1 += 1;
					eq_sign += 1;
				until codes[i1] ~= 0x3D or not codes[i1];
			end;
			if eq_sign > -1 then
				repeat
					i1 = table.find(codes, 0x5D, i1 + 1) or #codes + 1;
					local c = eq_sign;
					while c > 0 and codes[i1 + 1] == 0x3D do
						c -= 1;
						i1 += 1;
					end;
				until c == 0 or not codes[i1];
				i1 += eq_sign + 2;
			else
				repeat
					i1 += 1;
				until codes[i1] == 0x0A or not codes[i1];
			end;
			ttype = "comment";
		elseif c == 0x5B and (codes[i0 + 1] == 0x5B or codes[i0 + 1] == 0x3D) then
			local eq_sign = -1;
			repeat
				i1 += 1;
				eq_sign += 1;
			until codes[i1] ~= 0x3D or not codes[i1];
			repeat
				i1 = table.find(codes, 0x5D, i1 + 1) or #codes + 1;
				local c = eq_sign;
				while c > 0 and codes[i1 + 1] == 0x3D do
					c -= 1;
					i1 += 1;
				end;
			until c == 0 or not codes[i1];
			i1 += eq_sign + 2;
			ttype = "string";
		else
			ttype = "operator";
			repeat
				i1 += 1;
			until not operator[codes[i1]];
		end;
	elseif (c >= 0x41 and c <= 0x5A) or (c >= 0x61 and c <= 0x7A) or c == 0x5F then
		-- Identifiers
		repeat
			i1 += 1;
			c = codes[i1];
		until (not c) or (c < 0x30 or c > 0x39) and (c < 0x41 or c > 0x5A) and (c < 0x61 or c > 0x7A) and c ~= 0x5F;
		ttype = identifiers[string.sub(codes.s, i0, i1 - 1)] or "identifier";
	else
		-- Others
		repeat
			i1 += 1;
			c = codes[i1];
		until (not c) or (c < 0x30 or c > 0x39) and (c < 0x41 or c > 0x5A) and (c < 0x61 or c > 0x7A) and c ~= 0x5F and c ~= 0x22 and c ~= 0x27 and not operator[c];
		ttype = "other"
	end;
	-- Whitespaces
	while codes[i1] and ((codes[i1] >= 0x09 and codes[i1] <= 0x0D) or codes[i1] == 0x20) do
		i1 += 1;
	end;
	return i1, ttype, string.sub(codes.s, i0, i1 - 1);
end;

function lexer.lex(str)
	return next_lex, to_char_array(str), 1;
end;

return lexer;
7 Likes

I love yours! Definitely faster to do things down at that level instead of plain string manipulation.

Probably harder to maintain and debug though, but that’s the price you pay for such speed.

Thank you for sharing!

I made a pretty massive feature addition!

Non-sequential reading via a reusable Navigator object!

For my Code Outline plugin, I needed to be able to look at tokens both ahead and behind in order to gather data on variable names & values.

However, the lexer did not support reading in arbitrary order, it only had a sequential iterator (lexer.scan) to read from.

My initial solution was very inefficient- I created an array, then sequentially lexed the source and stored all the resulting tokens in the array. Then, I iterated over the array and could just index it wherever I wanted, whenever I wanted.
As I’m sure you realized, that means I had to loop over every token twice and possibly index it even more times. One loop to generate and store the token, and one loop over the array, and possibly indexing more if it was a relevant token for the work.

Obviously, I wasn’t going to sit around and accept that.

I wrote lexer.navigator() as a built-in wrapper for lexer.scan() that would allow me to read in any order I wanted, and only generate tokens when called for. There’s some minor performance overhead (especially cuz it still has to generate sequentially internally) but it’s better than multiple loops!


API & Usage:

Calling lexer.navigator() returns a Navigator.

Navigator:Destroy()
Cleans up the Navigator object.

Navigator:SetSource(Source)
Clears any old data and prepares the navigator to lex the given Source.

Navigator.Next
Iterator function that behaves like lexer.scan.

Navigator.Peek(PeekAmount)
Function to see a token that is PeekAmount away from where the Next iterator is currently. Passing a negative value will look backward, passing a positive value will look ahead.

local source = "for i = 1, n do end"
		
-- The 'navigator' function returns a navigator object:
-- Navigators allow you to use nav.Peek() for non-sequential reads
local nav = lexer.navigator()
nav:SetSource(source) -- You can reuse navigators by setting a new Source
		
for token,src in nav.Next do
	print(token, "'"..src.."'")
	local peektoken,peeksrc = nav.Peek(2) -- You can peek backwards by passing a negative input
	if peektoken then
		print("  Peeked ahead by 2:",peektoken,"'"..peeksrc.."'")
	end
end
		
-->	keyword 'for '
-->	  Peeked ahead by 2: operator '= '
-->	iden 'i '
-->	  Peeked ahead by 2: number '1'
-->	operator '= '
-->	  Peeked ahead by 2: operator ', '
-->	number '1'
-->	  Peeked ahead by 2: iden 'n '
-->	operator ', '
-->	  Peeked ahead by 2: keyword 'do '
-->	iden 'n '
-->	  Peeked ahead by 2: keyword 'end'
-->	keyword 'do '
-->	keyword 'end'

Notes:

When you peek ahead, it’ll sequentially generate tokens to that desired token and then return it. Then, the .Next iterator will get to use those token values generated by that peek, until it passes the point the peek when until.
When you peek behind, it just grabs the token data out of its token cache so there’s nearly no performance hit.

The navigator should theoretically work with any version of the lexer, since it’s wrapping the given lexer.scan() function. Therefore, if you’re still relying on an older version’s behavior, you can just add the lexer.navigator() function to the end of your module and it should behave as expected.

Navigator Source
function lexer.navigator()
	
	local nav = {
		Source = "";
		TokenCache = table.create(50);
		
		_RealIndex = 0;
		_UserIndex = 0;
		_ScanThread = nil;
	}
	
	function nav:Destroy()
		self.Source = nil
		self._RealIndex = nil;
		self._UserIndex = nil;
		self.TokenCache = nil;
		self._ScanThread = nil;
	end
	
	function nav:SetSource(SourceString)
		self.Source = SourceString
		
		self._RealIndex = 0;
		self._UserIndex = 0;
		table.clear(self.TokenCache)
		
		self._ScanThread = coroutine.create(function()
			for Token,Src in lexer.scan(self.Source) do
				self._RealIndex += 1
				self.TokenCache[self._RealIndex] = {Token; Src;}
				coroutine.yield(Token,Src)
			end
		end)
	end

	function nav.Next()
		nav._UserIndex += 1
		
		if nav._RealIndex >= nav._UserIndex then
			-- Already scanned, return cached
			return table.unpack(nav.TokenCache[nav._UserIndex])
		else
			if coroutine.status(nav._ScanThread) == 'dead' then
				-- Scan thread dead
				return
			else
				local success, token, src = coroutine.resume(nav._ScanThread)
				if success and token then
					-- Scanned new data
					return token,src
				else
					-- Lex completed
					return
				end
			end
		end
		
	end
	
	function nav.Peek(PeekAmount)
		local GoalIndex = nav._UserIndex + PeekAmount
		
		if nav._RealIndex >= GoalIndex then
			-- Already scanned, return cached
			if GoalIndex > 0 then
				return table.unpack(nav.TokenCache[GoalIndex])
			else
				-- Invalid peek
				return
			end
		else
			if coroutine.status(nav._ScanThread) == 'dead' then
				-- Scan thread dead
				return
			else
				
				local IterationsAway = GoalIndex - nav._RealIndex
				
				local success, token, src = nil,nil,nil
				
				for i=1, IterationsAway do
					success, token, src = coroutine.resume(nav._ScanThread)
					if not (success or token) then
						-- Lex completed
						break
					end
				end
				
				return token,src
			end
		end
		
	end
	
	return nav
end
Full Latest Lexer Source
--[=[
	Lexical scanner for creating a sequence of tokens from Lua source code.
	This is a heavily modified and Roblox-optimized version of
	the original Penlight Lexer module:
		https://github.com/stevedonovan/Penlight
	Authors:
		stevedonovan <https://github.com/stevedonovan> ----------- Original Penlight lexer author
		ryanjmulder <https://github.com/ryanjmulder> ------------- Penlight lexer contributer
		mpeterv <https://github.com/mpeterv> --------------------- Penlight lexer contributer
		Tieske <https://github.com/Tieske> ----------------------- Penlight lexer contributer
		boatbomber <https://github.com/boatbomber> --------------- Roblox port, added builtin token, added patterns for incomplete syntax, bug fixes, behavior changes, token optimization
		Sleitnick <https://github.com/Sleitnick> ----------------- Roblox optimizations
		howmanysmall <https://github.com/howmanysmall> ----------- Lua + Roblox optimizations
		boatbomber <https://github.com/boatbomber> --------------- Added lexer.navigator() for non-sequential reads
	
	List of possible tokens:
		- iden
		- keyword
		- builtin
		- string
		- number
		- comment
		- operator
	
	Usage:
		local source = "for i = 1, n do end"
		
		-- The 'scan' function returns a token iterator:
		for token,src in lexer.scan(source) do
			print(token, "'"..src.."'")
		end
		-->	keyword 'for '
		-->	iden 'i '
		-->	operator '= '
		-->	number '1'
		-->	operator ', '
		-->	iden 'n '
		-->	keyword 'do '
		-->	keyword 'end'
		
		-- The 'navigator' function returns a navigator object:
		-- Navigators allow you to use nav.Peek() for non-sequential reads
		local nav = lexer.navigator()
		nav:SetSource(source) -- You can reuse navigators by setting a new Source
		
		for token,src in nav.Next do
			print(token, "'"..src.."'")
			local peektoken,peeksrc = nav.Peek(2) -- You can peek backwards by passing a negative input
			if peektoken then
				print("  Peeked ahead by 2:",peektoken,"'"..peeksrc.."'")
			end
		end
		
		-->	keyword 'for '
		-->	  Peeked ahead by 2: operator '= '
		-->	iden 'i '
		-->	  Peeked ahead by 2: number '1'
		-->	operator '= '
		-->	  Peeked ahead by 2: operator ', '
		-->	number '1'
		-->	  Peeked ahead by 2: iden 'n '
		-->	operator ', '
		-->	  Peeked ahead by 2: keyword 'do '
		-->	iden 'n '
		-->	  Peeked ahead by 2: keyword 'end'
		-->	keyword 'do '
		-->	keyword 'end'
			
	
--]=]

local lexer = {}

local Prefix,Suffix,Cleaner = "^[ \t\n\0\a\b\v\f\r]*", "[ \t\n\0\a\b\v\f\r]*", "[ \t\n\0\a\b\v\f\r]+"
local NUMBER_A = "0x[%da-fA-F]+"
local NUMBER_B = "%d+%.?%d*[eE][%+%-]?%d+"
local NUMBER_C = "%d+[%._]?[%d_eE]*"
local OPERATORS = "[:;<>/~%*%(%)%-=,{}%.#%^%+%%]+"
local BRACKETS = "[%[%]]+" -- needs to be separate pattern from other operators or it'll mess up multiline strings
local IDEN = "[%a_][%w_]*"
local STRING_EMPTY = "(['\"])%1"							--Empty String
local STRING_PLAIN = [=[(['"])[%w%p \t\v\b\f\r\a]-([^%\]%1)]=]	--TODO: Handle escaping escapes
local STRING_INCOMP_A = "(['\"]).-\n"						--Incompleted String with next line
local STRING_INCOMP_B = "(['\"])[^\n]*"					--Incompleted String without next line
local STRING_MULTI = "%[(=*)%[.-%]%1%]"					--Multiline-String
local STRING_MULTI_INCOMP = "%[=*%[.-.*"						--Incompleted Multiline-String
local COMMENT_MULTI = "%-%-%[(=*)%[.-%]%1%]"				--Completed Multiline-Comment
local COMMENT_MULTI_INCOMP = "%-%-%[=*%[.-.*"				--Incompleted Multiline-Comment
local COMMENT_PLAIN = "%-%-.-\n"							--Completed Singleline-Comment
local COMMENT_INCOMP = "%-%-.*"							--Incompleted Singleline-Comment

local TABLE_EMPTY = {}

local lua_keyword = {
	["and"] = true, ["break"] = true, ["do"] = true, ["else"] = true, ["elseif"] = true,
	["end"] = true, ["false"] = true, ["for"] = true, ["function"] = true, ["if"] = true,
	["in"] = true, ["local"] = true, ["nil"] = true, ["not"] = true, ["while"] = true,
	["or"] = true, ["repeat"] = true, ["return"] = true, ["then"] = true, ["true"] = true,
	["self"] = true, ["until"] = true,

	["continue"] = true,

	["plugin"] = true, --Highlights as a keyword instead of a builtin cuz Roblox is weird
}


local lua_builtin = {
	-- Lua Functions
	["assert"] = true;["collectgarbage"] = true;["error"] = true;["getfenv"] = true;
	["getmetatable"] = true;["ipairs"] = true;["loadstring"] = true;["newproxy"] = true;
	["next"] = true;["pairs"] = true;["pcall"] = true;["print"] = true;["rawequal"] = true;
	["rawget"] = true;["rawset"] = true;["select"] = true;["setfenv"] = true;["setmetatable"] = true;
	["tonumber"] = true;["tostring"] = true;["type"] = true;["unpack"] = true;["xpcall"] = true;

	-- Lua Variables
	["_G"] = true;["_VERSION"] = true;

	-- Lua Tables
	["bit32"] = true;["coroutine"] = true;["debug"] = true;
	["math"] = true;["os"] = true;["string"] = true;
	["table"] = true;["utf8"] = true;

	-- Roblox Functions
	["delay"] = true;["elapsedTime"] = true;["gcinfo"] = true;["require"] = true;
	["settings"] = true;["spawn"] = true;["tick"] = true;["time"] = true;["typeof"] = true;
	["UserSettings"] = true;["wait"] = true;["warn"] = true;["ypcall"] = true;

	-- Roblox Variables
	["Enum"] = true;["game"] = true;["shared"] = true;["script"] = true;
	["workspace"] = true;

	-- Roblox Tables
	["Axes"] = true;["BrickColor"] = true;["CellId"] = true;["CFrame"] = true;["Color3"] = true;
	["ColorSequence"] = true;["ColorSequenceKeypoint"] = true;["DateTime"] = true;
	["DockWidgetPluginGuiInfo"] = true;["Faces"] = true;["Instance"] = true;["NumberRange"] = true;
	["NumberSequence"] = true;["NumberSequenceKeypoint"] = true;["PathWaypoint"] = true;
	["PhysicalProperties"] = true;["PluginDrag"] = true;["Random"] = true;["Ray"] = true;["Rect"] = true;
	["Region3"] = true;["Region3int16"] = true;["TweenInfo"] = true;["UDim"] = true;["UDim2"] = true;
	["Vector2"] = true;["Vector2int16"] = true;["Vector3"] = true;["Vector3int16"] = true;
}

local function idump(tok)
	--print("tok unknown:",tok)
	return coroutine.yield("iden", tok)
end

local function odump(tok)
	return coroutine.yield("operator", tok)
end

local function ndump(tok)
	return coroutine.yield("number", tok)
end

local function sdump(tok)
	return coroutine.yield("string", tok)
end

local function cdump(tok)
	return coroutine.yield("comment", tok)
end

local function lua_vdump(tok)
	-- Since we merge spaces into the tok, we need to remove them
	-- in order to check the actual word it contains
	local cleanTok = string.gsub(tok,Cleaner,"")

	if lua_keyword[cleanTok] then
		return coroutine.yield("keyword", tok)
	elseif lua_builtin[cleanTok] then
		return coroutine.yield("builtin", tok)
	else
		return coroutine.yield("iden", tok)
	end
end

local lua_matches = {
	-- Indentifiers
	{Prefix.. IDEN ..Suffix, lua_vdump},

	-- Numbers
	{Prefix.. NUMBER_A ..Suffix, ndump},
	{Prefix.. NUMBER_B ..Suffix, ndump},
	{Prefix.. NUMBER_C ..Suffix, ndump},

	-- Strings
	{Prefix.. STRING_EMPTY ..Suffix, sdump},
	{Prefix.. STRING_PLAIN ..Suffix, sdump},
	{Prefix.. STRING_INCOMP_A ..Suffix, sdump},
	{Prefix.. STRING_INCOMP_B ..Suffix, sdump},
	{Prefix.. STRING_MULTI ..Suffix, sdump},
	{Prefix.. STRING_MULTI_INCOMP ..Suffix, sdump},

	-- Comments
	{Prefix.. COMMENT_MULTI ..Suffix, cdump},			
	{Prefix.. COMMENT_MULTI_INCOMP ..Suffix, cdump},
	{Prefix.. COMMENT_PLAIN ..Suffix, cdump},
	{Prefix.. COMMENT_INCOMP ..Suffix, cdump},

	-- Operators
	{Prefix.. OPERATORS ..Suffix, odump},
	{Prefix.. BRACKETS ..Suffix, odump},

	-- Unknown
	{"^.", idump}
}

--- Create a plain token iterator from a string.
-- @tparam string s a string.	

function lexer.scan(s)
	local startTime = os.clock()
	lexer.finished = false

	local function lex(first_arg)
		local line_nr = 0
		local sz = #s
		local idx = 1

		-- res is the value used to resume the coroutine.
		local function handle_requests(res)
			while res do
				local tp = type(res)
				-- Insert a token list:
				if tp == "table" then
					res = coroutine.yield("", "")
					for _, t in ipairs(res) do
						res = coroutine.yield(t[1], t[2])
					end
				elseif tp == "string" then -- Or search up to some special pattern:
					local i1, i2 = string.find(s, res, idx)
					if i1 then
						idx = i2 + 1
						res = coroutine.yield("", string.sub(s, i1, i2))
					else
						res = coroutine.yield("", "")
						idx = sz + 1
					end
				else
					res = coroutine.yield(line_nr, idx)
				end
			end
		end

		handle_requests(first_arg)
		line_nr = 1

		while true do
			if idx > sz then
				while true do
					handle_requests(coroutine.yield())
				end
			end
			for _, m in ipairs(lua_matches) do
				local findres = {}
				local i1, i2 = string.find(s, m[1], idx)
				findres[1], findres[2] = i1, i2
				if i1 then
					local tok = string.sub(s, i1, i2)
					idx = i2 + 1
					lexer.finished = idx > sz
					--if lexer.finished then
					--	print(string.format("Lex took %.2f ms", (os.clock()-startTime)*1000 ))
					--end

					local res = m[2](tok, findres)

					if string.find(tok, "\n") then
						-- Update line number:
						local _, newlines = string.gsub(tok, "\n", TABLE_EMPTY)
						line_nr = line_nr + newlines
					end

					handle_requests(res)
					break
				end
			end
		end
	end
	return coroutine.wrap(lex)
end

function lexer.navigator()
	
	local nav = {
		Source = "";
		TokenCache = table.create(50);
		
		_RealIndex = 0;
		_UserIndex = 0;
		_ScanThread = nil;
	}
	
	function nav:Destroy()
		self.Source = nil
		self._RealIndex = nil;
		self._UserIndex = nil;
		self.TokenCache = nil;
		self._ScanThread = nil;
	end
	
	function nav:SetSource(SourceString)
		self.Source = SourceString
		
		self._RealIndex = 0;
		self._UserIndex = 0;
		table.clear(self.TokenCache)
		
		self._ScanThread = coroutine.create(function()
			for Token,Src in lexer.scan(self.Source) do
				self._RealIndex += 1
				self.TokenCache[self._RealIndex] = {Token; Src;}
				coroutine.yield(Token,Src)
			end
		end)
	end

	function nav.Next()
		nav._UserIndex += 1
		
		if nav._RealIndex >= nav._UserIndex then
			-- Already scanned, return cached
			return table.unpack(nav.TokenCache[nav._UserIndex])
		else
			if coroutine.status(nav._ScanThread) == 'dead' then
				-- Scan thread dead
				return
			else
				local success, token, src = coroutine.resume(nav._ScanThread)
				if success and token then
					-- Scanned new data
					return token,src
				else
					-- Lex completed
					return
				end
			end
		end
		
	end
	
	function nav.Peek(PeekAmount)
		local GoalIndex = nav._UserIndex + PeekAmount
		
		if nav._RealIndex >= GoalIndex then
			-- Already scanned, return cached
			if GoalIndex > 0 then
				return table.unpack(nav.TokenCache[GoalIndex])
			else
				-- Invalid peek
				return
			end
		else
			if coroutine.status(nav._ScanThread) == 'dead' then
				-- Scan thread dead
				return
			else
				
				local IterationsAway = GoalIndex - nav._RealIndex
				
				local success, token, src = nil,nil,nil
				
				for i=1, IterationsAway do
					success, token, src = coroutine.resume(nav._ScanThread)
					if not (success or token) then
						-- Lex completed
						break
					end
				end
				
				return token,src
			end
		end
		
	end
	
	return nav
end

return lexer
16 Likes

I can’t wait to try this mad fortune :yum:
You really make our lives easier. Thanks to the awesome open source devs :heart: