Gex - Regex for lua

This regex library does not follow any particular standard, so some patterns may work differently than you expect.


I made a regex library for Lua that I think could be useful for developers. The module is a work in progress: not all features are implemented yet, but the basics are in place, such as:

  • sequence /[abcd]/
  • group /(var)?(let)?/
  • alternative /(yellow)|(green)/

How to use

local gex = require(...) -- path to gex module

-- Text searched by both examples below.
local sampleText = "yellow and green colors"

-- Alternation between two groups: "yellow" or "green".
local colorPattern = gex.compile("(yellow)|(green)")
-- The third argument of find is the `multiple` flag.
print(colorPattern:find(sampleText, 1, true))
print(colorPattern:match(sampleText))

-- Character set with a "zero or more" quantifier.
local vowelPattern = gex.compile("[aeuio]*")
print(vowelPattern:find(sampleText, 1, true))
Code to Test
-- Runs a single regex test case and reports the outcome.
-- Compiles `regex`, matches it against `match`, then compares both the
-- "found" flag and the matched substring with the expected values.
-- Prints the report on success and warns with the same report on failure.
--   test        label shown in the report
--   regex       gex pattern source
--   match       subject text the pattern is matched against
--   expect      substring expected to be matched ("" when nothing should match)
--   expectFound whether a match is expected
local function testMatch(test: string, regex: string, match: string, expect: string, expectFound: boolean)
	local r = gex.compile(regex)
	local f, a, b = r:match(match)

	-- string.sub raises when the start index is nil, so a result without a
	-- start position is treated as an empty match instead of aborting the
	-- remaining test cases.
	local c = ""
	if a ~= nil then
		c = match:sub(a, b)
	end

	local passed = f == expectFound and c == expect
	local report = ("[MATCH][%s] %s Found: %s Expected: %s Regex: /%s/ Got: \"%s\" Expected: \"%s\""):format(
		test,
		passed and "Successful" or "Failed",
		tostring(f),
		tostring(expectFound),
		regex,
		c,
		expect
	)

	if passed then
		print(report)
	else
		warn(report)
	end
end

-- Builds `count` random e-mail-like strings.
-- Each string is 5-20 lowercase letters, an "@" about half of the time,
-- 5-10 more letters, a "." about half of the time, then 5-8 final letters,
-- so only some of the results have the shape of an address.
-- Returns the strings as an array.
local function generateFakeEmails(count: number): {string}
	local alphabet = "qwertyuiopasdfghjklzxcvbnm"
	local alphabetSize = #alphabet

	-- Appends between `minLen` and `maxLen` random letters to `parts`.
	local function appendLetters(parts: {string}, minLen: number, maxLen: number)
		for _ = 1, math.random(minLen, maxLen) do
			local pick = math.random(1, alphabetSize)
			table.insert(parts, alphabet:sub(pick, pick))
		end
	end

	local generated = {}
	for _ = 1, count do
		local parts = {}

		appendLetters(parts, 5, 20)
		if math.random(1, 2) == 1 then
			table.insert(parts, "@")
		end

		appendLetters(parts, 5, 10)
		if math.random(1, 2) == 1 then
			table.insert(parts, ".")
		end

		appendLetters(parts, 5, 8)

		table.insert(generated, table.concat(parts))
	end
	return generated
end

-- Runs every match test case in order through testMatch.
-- Each row is { label, regex, subject text, expected substring, expected found flag }.
local function runTests()
	local cases = {
		-- literals and quantifiers
		{ "exact", "a", "a", "a", true },
		{ "exact", "ab", "ab", "ab", true },
		{ "one or more", "a+", "aaaa", "aaaa", true },
		{ "multiple quanitities", "a+b", "aaaabbb", "aaaab", true },
		{ "optional", "a?", "b", "", false },
		{ "optional", "a?", "aaaa", "a", true },
		{ "zero or more", "a*", "", "", false },
		{ "zero or more", "a*", "aaaaaaaaaaaa", "aaaaaaaaaaaa", true },

		-- character classes
		{ "alphanumeric", "\\w+", "wo3rd50", "wo3rd50", true },
		{ "digit", "\\d+", "165xyz", "165", true },
		{ "alphabetic", "\\a+", "word100", "word", true },
		{ "wildcard", "\\.+", "d293 *&($# HJDS nckd	 127", "d293 *&($# HJDS nckd	 127", true },

		-- alternatives
		{ "alternative", "a+|b+", "a", "a", true },
		{ "alternative", "a+|b+", "b", "b", true },

		-- e-mail shaped input
		{ "email", "\\w+@\\w+.\\w+", "fakeemail@fake.domen", "fakeemail@fake.domen", true },
		{ "email", "\\w+@\\w+.\\w+", "fakeemail@fake", "fakeemail@fake", false },

		-- groups
		{ "group", "(var)|(let)", "var let be that", "var", true },
		{ "group", "(var)|(let)", "let var be this", "let", true },

		-- sequences
		{ "sequence", "[abcd]", "a", "a", true },
		{ "sequence", "[abcd]", "b", "b", true },
		{ "sequence", "[abcd]", "c", "c", true },
		{ "sequence", "[abcd]", "d", "d", true },
		{ "sequence(find `f`)", "[abcd]", "f", "", false },

		-- repetition ranges
		{ "range(3)", "\\d{3}", "10000", "100", true },
		{ "range(2, 5)", "\\d{2, 5}", "", "", false },
		{ "range(2, 5)", "\\d{2, 5}", "1234567890", "12345", true },
		{ "range(2,)", "\\d{2,}", "100", "100", true },

		-- anchors
		{ "start and end line", "\\^\\w+\\$", "hello world", "hello", true },
	}

	for index = 1, #cases do
		local case = cases[index]
		testMatch(case[1], case[2], case[3], case[4], case[5])
	end
end

-- Benchmarks gex on generated e-mail-like text of increasing size.
-- For each size it builds the text, times compiling the pattern plus one
-- find call with the `multiple` flag set, and prints the elapsed time, the
-- text length, and the share of generated strings that produced a match.
local function speedTest()
	-- Same pattern the "email" cases in runTests use. gex writes character
	-- classes with a backslash (\w), not with Lua's %w, so the previous
	-- "%w+@%w+.%w+" did not describe an e-mail in gex syntax.
	local emailRegex = "\\w+@\\w+.\\w+"

	-- Times one run over `emails` generated strings joined with ",\n".
	local function speedTestFakeEmails(emails: number)
		local fakeEmails = generateFakeEmails(emails)
		local fakeEmailsString = table.concat(fakeEmails, ",\n")

		-- os.clock is used for interval timing; only the difference between
		-- the two readings matters.
		local start = os.clock()

		local ValidEmails = gex.compile(emailRegex):find(fakeEmailsString, 1, true)

		print(("tooks: %fs for calculate string length of %i; %s"):format(os.clock() - start, fakeEmailsString:len(), ("%i%% valid emails of %i"):format(#ValidEmails / #fakeEmails * 100, #fakeEmails)))
	end

	speedTestFakeEmails(1)
	speedTestFakeEmails(5)
	speedTestFakeEmails(10)
	speedTestFakeEmails(100)
	speedTestFakeEmails(1000)
	speedTestFakeEmails(2000)
	speedTestFakeEmails(5000)
end

-- Run the match test cases first, then the timing benchmark.
runTests()
speedTest()

Methods

  • gex.compile(regex: string): Gex

  • gex:find(text: string, init: number?, multiple: boolean?): {{value: string, start: number, finish: number}}

  • gex:match(text: string, init: number?, multiple: boolean?): {value: string, start: number, finish: number}

11 Likes

What does this actually do? I don't get it.

How fast is Gex for large strings or complex expressions? I know RegEx suffers performance issues with some expressions so will Gex have similar issues?

1 Like

This version specifically is not fast when used on large text.

I’ve tried to make a much faster implementation, but the new implementation currently has a lot of bugs.

This version of gex has performance issues with large chunks of text.

1 Like