I wrote a lexer for practice; it is meant to be used in a syntax highlighter. Keep in mind that it has syntax inaccuracies and is not intended for production use.
I would like to know whether this is a correct implementation of a standard lexer. Thank you in advance!
-- Prototype table for lexer objects. Instances resolve missing fields
-- (i.e. their methods) through the metatable's __index, which is Lexer itself.
local Lexer = {}
Lexer.__index = Lexer

-- Create a lexer over `source`.
-- @param source  text to tokenize; may be nil and supplied later via scan()
-- @return a new instance with its position at 0 and empty token buffers
function Lexer.new(source)
  local instance = setmetatable({}, Lexer)
  instance.source = source
  instance.location = 0         -- index of the last character consumed
  instance.current_char = nil   -- character most recently returned by next()
  instance.current_token = nil  -- type name of the token being tracked, if any
  instance.tracked_chars = {}   -- characters consumed since the last eat()
  instance.tokens = {}          -- finished tokens as { type, text } pairs
  return instance
end
-- Rewind this lexer so its current source can be scanned again from the start.
--
-- The state is reinitialised in place. Assigning a fresh object to `self`
-- would only rebind the method's local parameter and leave the caller's
-- object untouched, and stripping the metatable would make every later
-- method call on the object fail, so neither is done here.
function Lexer:reset()
  self.location = 0
  self.current_char = nil
  self.current_token = nil
  self.tracked_chars = {}
  self.tokens = {}
end
-- Tear down this lexer: detach it from the Lexer prototype and drop its
-- token buffers. The `source` field is left as it is.
-- The caller's own reference cannot be cleared from inside the method
-- (`self` is just a local parameter), so callers should discard it themselves.
function Lexer:destroy()
  setmetatable(self, nil)
  self.tracked_chars = nil
  self.tokens = nil
end
-- Navigation
-- Advance one character, remember it as the current character and add it
-- to the text being tracked for the token in progress.
-- @return the character now under the cursor ("" once past the end of source)
function Lexer:next()
  self.location = self.location + 1
  local position = self.location
  local chr = self.source:sub(position, position)
  self.current_char = chr
  local tracked = self.tracked_chars
  tracked[#tracked + 1] = chr
  return chr
end
-- Look at a character without consuming it.
-- @param n       base index into the source; defaults to the current location
-- @param offset  amount added to the base index; defaults to 0
-- @return the single character at n + offset, as produced by string.sub
function Lexer:peek(n, offset)
  local index = (n or self.location) + (offset or 0)
  return self.source:sub(index, index)
end
-- Finish the token in progress: join the tracked characters into its text,
-- append a { type, text } pair to the token list, then clear the tracking
-- state so the next character starts a new token.
function Lexer:eat()
  local kind = self.current_token
  local text = table.concat(self.tracked_chars)
  local tokens = self.tokens
  tokens[#tokens + 1] = { kind, text }
  self.current_token = nil
  self.tracked_chars = {}
end
-- Scanning
-- For each token type, the set of characters that can begin a token of that
-- type. Each set maps character -> true.
local start_chars
do
  -- Build a set whose keys are the individual characters of `chars`.
  local function char_set(chars)
    local set = {}
    for i = 1, #chars do
      set[chars:sub(i, i)] = true
    end
    return set
  end

  start_chars = {
    OPERATOR = char_set("=/*+-{}[]:;,"),
    NUMBER = char_set("0123456789"),
    STRING = char_set('"'),
    KEYWORD = char_set("tfn"),
    WHITESPACE = char_set(" \t\n"),
  }
end
-- Report whether `chr` can start a token of type `token`.
-- @param chr    single character to test
-- @param token  token type name used as a key into start_chars
-- @return true when chr is in that type's start set; nil when the type is
--         unknown or the character is not in its set
local function find_token(chr, token)
  local set = start_chars[token]
  if not set then
    return set
  end
  return set[chr]
end
-- Complete keyword spellings. While a KEYWORD token is being tracked, the
-- scanner compares the tracked text against each entry of this list.
local keywords = { "null", "true", "false" }
-- Non-digit characters that are allowed inside a number.
local number_chars = {
  e = true,     -- exponent marker
  x = true,     -- hex marker (?)
  ["."] = true, -- decimal point
}
-- True when the NUMBER token being tracked continues past the current
-- character: either the next character is a digit, or it is one of the
-- number_chars (e, x, .) immediately followed by a digit.
local function number_continues(lexer, next_char)
  if find_token(next_char, "NUMBER") then
    return true
  end
  if number_chars[next_char] and find_token(lexer:peek(nil, 2), "NUMBER") then
    return true
  end
  return false
end

-- True when the quote at the lexer's current location is not escaped, i.e.
-- it is preceded by an even number of consecutive backslashes. Looking at
-- only one preceding character would misread `"a\\"` (an escaped backslash
-- followed by the closing quote) as an escaped quote.
local function quote_is_unescaped(lexer)
  local backslashes = 0
  local index = lexer.location - 1
  while index >= 1 and lexer.source:sub(index, index) == "\\" do
    backslashes = backslashes + 1
    index = index - 1
  end
  return backslashes % 2 == 0
end

-- Tokenize the source text and return the token list.
--
-- @param source  optional replacement source. When given, it is stripped of
--                leading/trailing whitespace, stored on the lexer, and the
--                scan state is rewound so the new text is lexed from its
--                first character with a fresh token list.
-- @return self.tokens, a sequence of { type, text } pairs
--
-- Known limitations kept from the original design: characters that start no
-- token type are not emitted on their own (they stay in the tracking buffer
-- and end up in the text of the next token), and a KEYWORD token only ends
-- when the tracked text equals one of the entries in `keywords`.
function Lexer:scan(source)
  if source then
    -- Trim with a pattern; this file defines no `trim` function, so calling
    -- one here would raise "attempt to call a nil value".
    self.source = source:match("^%s*(.-)%s*$")
    self.location = 0
    self.current_char = nil
    self.current_token = nil
    self.tracked_chars = {}
    self.tokens = {}
  end
  source = self.source
  local length = source:len()

  -- `<` rather than `~=` so the loop still terminates if the location is
  -- already past the end of the source.
  while self.location < length do
    local current_char = self:next()
    local current_token = self.current_token
    local next_char = self:peek(nil, 1)

    if not current_token then
      -- Start a new token.
      -- - Single-character tokens get eaten immediately.
      -- - Multi-character tokens are left open for later iterations.
      -- - Tokens that may be one or more characters check their ending here.
      if find_token(current_char, "OPERATOR") then
        self.current_token = "OPERATOR"
        self:eat()
      elseif find_token(current_char, "STRING") then
        self.current_token = "STRING"
      elseif find_token(current_char, "KEYWORD") then
        self.current_token = "KEYWORD"
      elseif find_token(current_char, "NUMBER") then
        self.current_token = "NUMBER"
        if not number_continues(self, next_char) then
          self:eat()
        end
      elseif find_token(current_char, "WHITESPACE") then
        self.current_token = "WHITESPACE"
        if not find_token(next_char, "WHITESPACE") then
          self:eat()
        end
      end
    elseif current_token == "NUMBER" then
      if not number_continues(self, next_char) then
        self:eat()
      end
    elseif current_token == "STRING" then
      -- Close the string on a quote that is not escaped.
      if find_token(current_char, "STRING") and quote_is_unescaped(self) then
        self:eat()
      end
    elseif current_token == "KEYWORD" then
      -- Eat as soon as the tracked text spells a complete keyword.
      local last_word = table.concat(self.tracked_chars)
      for _, keyword in ipairs(keywords) do
        if last_word == keyword then
          self:eat()
          break
        end
      end
    elseif current_token == "WHITESPACE" then
      if not find_token(next_char, "WHITESPACE") then
        self:eat()
      end
    end
  end

  -- Flush a token that was still open when the input ended (for example an
  -- unterminated string) instead of silently dropping its text.
  if self.current_token then
    self:eat()
  end

  return self.tokens
end
-- Write every finished token to standard output, one per line, as the
-- token type followed by the token text (print separates them with a tab).
function Lexer:print()
  for _, token in ipairs(self.tokens) do
    local kind, text = token[1], token[2]
    print(kind, text)
  end
end
return Lexer
This is run with:
-- Load the lexer module and build an instance over `src`
-- (`src` is the text to tokenize; it is not defined in this snippet).
local lexer = require ("lexer").new(src)
-- Tokenize the stored source.
lexer:scan()
-- Print each token's type and text.
lexer:print()
Which outputs:
WHITESPACE
OPERATOR {
WHITESPACE
STRING "Name"
OPERATOR :
WHITESPACE
STRING "My Cool Model"
OPERATOR ,
...