JSON Lexer - Did I do it right?

I wrote a lexer for practice, which is meant to be used in a highlighter. Keep in mind this has syntax inaccuracies and is not made for production use.

I want to know if this is the correct implementation for a standard lexer. Thank you in advance!

-- Lexer class table; instances created by Lexer.new delegate method
-- lookup here via the __index metamethod.
local Lexer = {}
Lexer.__index = Lexer

--- Construct a new Lexer over the given source string.
-- @tparam string source the text to tokenize
-- @treturn table a fresh Lexer instance
function Lexer.new(source)
	local instance = setmetatable({
		source = source,
		location = 0,          -- index of the most recently consumed character
		current_char = nil,    -- last character returned by :next()
		current_token = nil,   -- type of the token currently being tracked
		tracked_chars = {},    -- characters accumulated for the pending token
		tokens = {}            -- completed {type, value} pairs
	}, Lexer)
	return instance
end

--- Reset the lexer back to its initial state, keeping the same source.
-- BUG FIX: the original assigned `Lexer.new(self.source)` to the local
-- parameter `self`, which never affects the caller's reference — the
-- caller's object was only stripped of its metatable. Resetting the
-- fields in place actually works.
-- @treturn table self, for chaining
function Lexer:reset()
	self.location = 0
	self.current_char = nil
	self.current_token = nil
	self.tracked_chars = {}
	self.tokens = {}
	return self
end

--- Tear down the lexer: detach its metatable and release its fields.
-- BUG FIX: the original ended with `self = nil`, which only clears the
-- local parameter and does nothing for the caller; the caller must drop
-- its own reference for the table to be garbage-collected. Also clear
-- `source`/`current_*` so no data lingers on the husk.
function Lexer:destroy()
	setmetatable(self, nil)
	self.source = nil
	self.current_char = nil
	self.current_token = nil
	self.tracked_chars = nil
	self.tokens = nil
end

-- Navigation

--- Advance one character: consume it, record it for the pending token,
-- and return it (an empty string once past the end of the source).
function Lexer:next()
	local pos = self.location + 1
	local ch = self.source:sub(pos, pos)
	self.location = pos
	self.current_char = ch
	self.tracked_chars[#self.tracked_chars + 1] = ch
	return ch
end

--- Look at the character at position `n` (default: the current location)
-- shifted by `offset` (default: 0), without consuming anything.
-- @treturn string a one-character string, or "" when out of range
function Lexer:peek(n, offset)
	local pos = (n or self.location) + (offset or 0)
	return self.source:sub(pos, pos)
end

--- Flush the characters tracked so far as one finished token.
-- Appends a {type, value} pair to `tokens` and clears the tracking state.
function Lexer:eat()
	local token_type = self.current_token
	local token_value = table.concat(self.tracked_chars)
	self.tokens[#self.tokens + 1] = { token_type, token_value }
	self.current_token = nil
	self.tracked_chars = {}
end

-- Scanning

-- Characters that may begin each token type, as type -> set-of-chars.
local start_chars = {}

start_chars.OPERATOR = {}
for op in ("=/*+-{}[]:;,"):gmatch(".") do
	start_chars.OPERATOR[op] = true
end

start_chars.NUMBER = {}
for digit = 0, 9 do
	start_chars.NUMBER[tostring(digit)] = true
end

start_chars.STRING = { ['"'] = true }

-- First letters of the JSON literals true / false / null.
start_chars.KEYWORD = { t = true, f = true, n = true }

start_chars.WHITESPACE = { [" "] = true, ["\t"] = true, ["\n"] = true }

--- Return a truthy value when `chr` can start a token of type `token`.
local function find_token(chr, token)
	local set = start_chars[token]
	return set and set[chr]
end

-- Complete keyword literals recognized by the lexer.
local keywords = { "null", "true", "false" }

local number_chars = {
	-- non-digit characters that can fit in a number
	-- e for exponent, x for hex (?), . for decimal
	e = true, x = true, ["."] = true
}

--- Tokenize the source and return the accumulated token list.
-- @tparam[opt] string source new source text; replaces the stored source
-- @treturn table array of {type, value} pairs
function Lexer:scan(source)
	if source then
		-- BUG FIX: the original passed `source` through an undefined
		-- global `trim`, which raised an error whenever an argument was
		-- supplied. Trim surrounding whitespace with a pattern instead.
		self.source = source:match("^%s*(.-)%s*$")
	end
	source = self.source
	local length = source:len()

	-- `<` rather than `~=` so the loop cannot spin forever if
	-- `location` ever moves past `length`.
	while self.location < length do
		local current_char = self:next()
		local current_token = self.current_token
		local next_char = self:peek(nil, 1)
		local prev_char = self:peek(nil, -1)

		if not current_token then
			-- Start a new token
			-- - Single-character tokens get eaten immediately
			-- - >1 character tokens get passed on
			-- - >=1 character tokens include their ending exception here

			if find_token(current_char, "OPERATOR") then
				self.current_token = "OPERATOR"
				self:eat()
			elseif find_token(current_char, "STRING") then
				self.current_token = "STRING"
			elseif find_token(current_char, "KEYWORD") then
				self.current_token = "KEYWORD"
			elseif find_token(current_char, "NUMBER") then
				self.current_token = "NUMBER"
				-- Eat now unless the next character continues the number
				-- (a digit, or e/x/. that is itself followed by a digit).
				if not (
					(number_chars[next_char] and find_token(self:peek(nil, 2), "NUMBER"))
					or find_token(next_char, "NUMBER")
					) then
					self:eat()
				end
			elseif find_token(current_char, "WHITESPACE") then
				self.current_token = "WHITESPACE"
				if not find_token(next_char, "WHITESPACE") then
					self:eat()
				end
			end
		else
			-- There's a current token and it's tracking.
			-- Decide when to eat it

			if current_token == "NUMBER" then
				if not (
					(number_chars[next_char] and find_token(self:peek(nil, 2), "NUMBER"))
					or find_token(next_char, "NUMBER")
					) then
					self:eat()
				end
				-- else keep going
			elseif current_token == "STRING" then
				-- Close only on an unescaped quote.
				if (
					find_token(current_char, "STRING")
					and prev_char:match("\\") == nil
					) then
					self:eat()
				end
				-- else keep going
			elseif current_token == "KEYWORD" then
				-- Eat as soon as the tracked characters spell a keyword.
				local last_word = table.concat(self.tracked_chars)
				for _, keyword in ipairs(keywords) do
					if last_word == keyword then
						self:eat()
						break
					end
				end
				-- else keep going
			elseif current_token == "WHITESPACE" then
				if not find_token(next_char, "WHITESPACE") then
					self:eat()
				end
				-- else keep going
			end
		end
	end

	-- BUG FIX: flush any token still in progress at end-of-input; the
	-- original silently dropped e.g. an unterminated trailing string.
	if self.current_token then
		self:eat()
	end
	return self.tokens
end

--- Dump every collected token to stdout as "TYPE<TAB>value".
function Lexer:print()
	for _, token in ipairs(self.tokens) do
		print(token[1], token[2])
	end
end

return Lexer

This is run with:

local lexer = require ("lexer").new(src)
lexer:scan()
lexer:print()

Which outputs:

WHITESPACE	
OPERATOR	{
WHITESPACE	
	
STRING	"Name"
OPERATOR	:
WHITESPACE	 
STRING	"My Cool Model"
OPERATOR	,
...
1 Like

Since self is just a normal parameter inside methods, destroy and reset probably don’t behave like you want at all. There are no other obvious issues, though.

I do suggest that, in future non-practice lexers, you leverage the power of Lua string patterns (specifically string.find) to ease token matching. It’s usually more efficient and easier to read.

3 Likes