Text compression

unfortunately, the revised module still seems to choke on bytes \0 and \255. this seems to be caused by off-by-one errors on lines 27 and 39 (1, 31 and 1, 255 - 181; should seemingly be 0, 31 and 1, 255 - 180).

however, my attempted fix causes a different error to appear, on line 76 due to tobase92 being called with a nil argument.

i don’t understand the code enough to properly fix this. can somebody more knowledgeable please take a look?

2 Likes

I actually fixed it a while back for myself, but I never bothered to post a new reply, so here you go.

-- Module by 1waffle1 and boatbomber, optimized and fixed by iiau
-- https://devforum.roblox.com/t/text-compression/163637/37

local dictionary = {} -- key to len

do -- populate dictionary
	local length = 0
	for i = 32, 127 do
		if i ~= 34 and i ~= 92 then
			local c = string.char(i)
			dictionary[c] = length
			dictionary[length] = c
			length = length + 1
		end
	end
end

local escapemap_126, escapemap_127 = {}, {}
local unescapemap_126, unescapemap_127 = {}, {}

local blacklisted_126 = { 34, 92 }
for i = 126, 180 do
	table.insert(blacklisted_126, i)
end

do -- Populate escape map
	-- represents the numbers 0-31, 34, 92, 126, 127 (36 characters)
	-- and 128-180 (52 characters)
	-- https://devforum.roblox.com/t/text-compression/163637/5
	for i = 0, 31 + #blacklisted_126 do
		local b = blacklisted_126[i - 31]
		local s = i + 32

		-- Note: 126 and 127 are magic numbers
		local c = string.char(b or i)
		local e = string.char(s + (s >= 34 and 1 or 0) + (s >= 91 and 1 or 0))

		escapemap_126[c] = e
		unescapemap_126[e] = c
	end

	for i = 1, 255 - 180 do
		local c = string.char(i + 180)
		local s = i + 34
		local e = string.char(s + (s >= 92 and 1 or 0))

		escapemap_127[c] = e
		unescapemap_127[e] = c
	end
end

local function escape(s)
	-- escape the control characters 0-31, double quote 34, backslash 92, Tilde 126, and DEL 127 (36 chars)
	-- escape characters 128-180 (53 chars)
	return string.gsub(string.gsub(s, '[%c"\\\126-\180]', function(c)
		return "\126" .. escapemap_126[c]
	end), '[\181-\255]', function(c)
		return "\127" .. escapemap_127[c]
	end)
end
local function unescape(s)
	return string.gsub(string.gsub(s, "\127(.)", function(e)
		return unescapemap_127[e]
	end), "\126(.)", function(e)
		return unescapemap_126[e]
	end)
end

local b93Cache = {}
local function tobase93(n)
	local value = b93Cache[n]
	if value then
		return value
	end

	local c = n
	value = ""
	repeat
		local remainder = n % 93
		value = dictionary[remainder] .. value
		n = (n - remainder) / 93
	until n == 0

	b93Cache[c] = value
	return value
end

local b10Cache = {}
local function tobase10(value)
	local n = b10Cache[value]
	if n then
		return n
	end

	n = 0
	for i = 1, #value do
		n = n + math.pow(93, i - 1) * dictionary[string.sub(value, -i, -i)]
	end

	b10Cache[value] = n
	return n
end

local function compress(text)
	assert(type(text) == "string", "bad argument #1 to 'compress' (string expected, got " .. typeof(text) .. ")")
	local dictionaryCopy = table.clone(dictionary)
	local key, sequence, size = "", {}, #dictionary

	local width, spans, span = 1, {}, 0
	local function listkey(k)
		local value = tobase93(dictionaryCopy[k])
		local valueLength = #value
		if valueLength > width then
			width, span, spans[width] = valueLength, 0, span
		end
		table.insert(sequence, string.rep(" ", width - valueLength) .. value)
		span += 1
	end
	text = escape(text)
	for i = 1, #text do
		local c = string.sub(text, i, i)
		local new = key .. c
		if dictionaryCopy[new] then
			key = new
		else
			listkey(key)
			key = c
			size += 1
			dictionaryCopy[new] = size
			dictionaryCopy[size] = new
		end
	end
	listkey(key)
	spans[width] = span
	return table.concat(spans, ",") .. "|" .. table.concat(sequence)
end

local function decompress(text)
	assert(type(text) == "string", "bad argument #1 to 'decompress' (string expected, got " .. typeof(text) .. ")")
	local dictionaryCopy = table.clone(dictionary)
	local sequence, spans, content = {}, string.match(text, "(.-)|(.*)")
	local groups, start = {}, 1
	for span in string.gmatch(spans, "%d+") do
		local width = #groups + 1
		groups[width] = string.sub(content, start, start + span * width - 1)
		start = start + span * width
	end
	local previous

	for width, group in ipairs(groups) do
		for value in string.gmatch(group, string.rep(".", width)) do
			local entry = dictionaryCopy[tobase10(value)]
			if previous then
				if entry then
					table.insert(dictionaryCopy, previous .. string.sub(entry, 1, 1))
				else
					entry = previous .. string.sub(previous, 1, 1)
					table.insert(dictionaryCopy, entry)
				end
				table.insert(sequence, entry)
			else
				sequence[1] = entry
			end
			previous = entry
		end
	end
	return unescape(table.concat(sequence))
end

return { compress = compress, decompress = decompress }

It turns out I didn’t need to change it to base-92. Instead I found out that the escapemap had overlap with character codes required to unescape it. So I have to make an entire dictionary just for unescaping the escape characters.

I’m currently using this live production in one of my games, Felandia, serving hundreds of daily players and I have not experienced any issues whatsoever.

Enjoy!

8 Likes

I’m exporting training data for a neural network and this compressed my json file from 31 script objects (199999 characters per script) to just 13. Good stuff!

1 Like

I’ve been rewriting this code into Typescript yesterday for my Datastore optimizations and sharding mechanics.
However as I read conversation, it looks like there are many improved versions, would someone point out which of these versions are production stable?
Thanks alot!

2 Likes

I went through most (some had problems), but the one modified by @iiau has been working great.

2 Likes

[EDIT] By using maybeyield from this reply by Anaminus, post Reply from Anaminus I was able to yield every .5 sec or so to avoid exhaust from occouring, just leaving this if anyone else run into this issue

Great module! One issue I run into is when compressing big texts is the “Script timeout: exhausted allowed execution time” Would it be possible to implement some way where you can customize how much of the processing power the system uses, lets say 70% of cpu or something along that?

1 Like

Hello. I have been using this module for around 2 years now to compress image data for CanvasDraw, which is my pixel-based graphics library for roblox.

I am wondering if there’s any room to further improve performance as I am having lag spike issues compressing a large 20,000 to 130,000 character strings.

I am using the optimised version edited by boatbomber:

--!native

-- Module by 1waffle1, optimized by boatbomber
-- https://devforum.roblox.com/t/text-compression/163637

local gsub = string.gsub
local sub = string.sub
local insert = table.insert
local rep = string.rep

local dictionary = {}
do -- populate dictionary
	local length = 0
	for i = 32, 127 do
		if i ~= 34 and i ~= 92 then
			local c = string.char(i)
			dictionary[c], dictionary[length] = length, c
			length = length + 1
		end
	end
end

local escapemap = {}
do -- Populate escape map
	for i = 1, 34 do
		i = ({ 34, 92, 127 })[i - 31] or i
		local c, e = string.char(i), string.char(i + 31)
		escapemap[c], escapemap[e] = e, c
	end
end

local function escape(s)
	return gsub(s, '[%c"\\]', function(c)
		return "\127" .. escapemap[c]
	end)
end
local function unescape(s)
	return gsub(s, "\127(.)", function(c)
		return escapemap[c]
	end)
end

local function copy(t)
	local new = {}
	for k, v in pairs(t) do
		new[k] = v
	end
	return new
end

local b93Cache = {}
local function tobase93(n)
	local value = b93Cache[n]
	if value then
		return value
	end

	value = ""
	repeat
		local remainder = n % 93
		value = dictionary[remainder] .. value
		n = (n - remainder) / 93
	until n == 0

	b93Cache[n] = value
	return value
end

local b10Cache = {}
local function tobase10(value)
	local n = b10Cache[value]
	if n then
		return n
	end

	n = 0
	for i = 1, #value do
		n = n + 93 ^ (i - 1) * dictionary[sub(value, -i, -i)]
	end

	b10Cache[value] = n
	return n
end

local function compress(text)
	local dictionaryCopy = copy(dictionary)
	local key, sequence, size = "", {}, #dictionaryCopy
	local width, spans, span = 1, {}, 0
	local function listkey(k)
		local value = tobase93(dictionaryCopy[k])
		local valueLength = #value
		if valueLength > width then
			width, span, spans[width] = valueLength, 0, span
		end
		insert(sequence, rep(" ", width - valueLength) .. value)
		span += 1
	end
	text = escape(text)
	for i = 1, #text do
		local c = sub(text, i, i)
		local new = key .. c
		if dictionaryCopy[new] then
			key = new
		else
			listkey(key)
			key = c
			size += 1
			dictionaryCopy[new], dictionaryCopy[size] = size, new
		end
	end
	listkey(key)
	spans[width] = span
	return table.concat(spans, ",") .. "|" .. table.concat(sequence)
end

local function decompress(text)
	local dictionaryCopy = copy(dictionary)
	local sequence, spans, content = {}, string.match(text, "(.-)|(.*)")
	local groups, start = {}, 1
	for span in string.gmatch(spans, "%d+") do
		local width = #groups + 1
		groups[width] = sub(content, start, start + span * width - 1)
		start = start + span * width
	end
	local previous

	for width, group in ipairs(groups) do
		for value in string.gmatch(group, rep(".", width)) do
			local entry = dictionaryCopy[tobase10(value)]
			if previous then
				if entry then
					insert(dictionaryCopy, previous .. sub(entry, 1, 1))
				else
					entry = previous .. sub(previous, 1, 1)
					insert(dictionaryCopy, entry)
				end
				insert(sequence, entry)
			else
				sequence[1] = entry
			end
			previous = entry
		end
	end
	return unescape(table.concat(sequence))
end

return { Compress = compress, Decompress = decompress }

I am specifically trying to get decompression to be as fast as possible

1 Like

Would Parallel Lua be at all a possibility here? I’m not sure. It’s the only thing I can think of other than just reducing the amount of data you’re trying to compress.

4 Likes

It could be, but I have no idea how I would even approach implementing that to this. Would buffers be useful at all?

1 Like

It think that buffers should make it faster, as everytime a function that manipulates string is used, it needs to create that string in memory. I may not be exactly right but that’s how i remember it worked. So it is possible that buffers would make it faster.

1 Like

It might not be possible if you compress the string as a whole, however, if the string can be split into segments, then yes, it is definitely possible. In one of my game, I have a big string that I decompress using my parallel lua module, but that big string has a separator (I used “_”), separating the data of the different games stored


I use string.split() to split the string, and then do some math to spread the work amongst a set number of actors. Removing some of the string manipulation steps could improve performance further, but this alone made it a lot quicker

This also allows me to update only a section of the giant string, meaning I don’t have to compress and decompress everything every time I update part of it

Another approach would be to have each “section” be a fixed length, which I don’t know how would work, but is definitely possible


One this that annoys me with this module is that it uses base 93, while I use base 92 for the serialization (it uses a modified version of squash). This is for the separator I use, mentioned above

2 Likes

So… after spending several days on this, trying to figure out if there’s a way to further optimize this.

I ended up bringing the compression speed 10-50% faster, on average, with buffers!

--!native
-- Module by 1waffle1 and boatbomber, optimized and fixed by iiau
-- https://devforum.roblox.com/t/text-compression/163637/37

local lzw = {}
lzw.__index = lzw

lzw.new = function(BUF_SIZE: number?)
	local self = setmetatable({}, lzw)
	
	self.BUF_SIZE = BUF_SIZE or 8388608 --* 8 MB should be enough for most things on Roblox
	self.buf = buffer.create(self.BUF_SIZE)

	self.strToLength = {}
	self.lengthToStr = buffer.create(94) -- base93
	self.bidirectionalDict = {}

	local lengthToStr = self.lengthToStr
	local strToLength = self.strToLength
	local bidirectionalDict = self.bidirectionalDict

	do -- populate dictionary
		local length = 0
		for i = 32, 127 do
			if i ~= 34 and i ~= 92 then
				local c = string.char(i)
				strToLength[c] = length

				bidirectionalDict[length] = c
				bidirectionalDict[c] = length

				buffer.writeu8(lengthToStr, length, i)
				length = length + 1
			end
		end
	end


	local escapemap_126, escapemap_127 = {}, {}
	local unescapemap_126, unescapemap_127 = {}, {}

	local blacklisted_126 = { 34, 92 }
	for i = 126, 180 do
		table.insert(blacklisted_126, i)
	end

	do -- Populate escape map
		-- represents the numbers 0-31, 34, 92, 126, 127 (36 characters)
		-- and 128-180 (52 characters)
		-- https://devforum.roblox.com/t/text-compression/163637/5
		for i = 0, 31 + #blacklisted_126 do
			local b = blacklisted_126[i - 31]
			local s = i + 32

			-- Note: 126 and 127 are magic numbers
			local c = string.char(b or i)
			local e = string.char(s + (s >= 34 and 1 or 0) + (s >= 91 and 1 or 0))

			escapemap_126[c] = e
			unescapemap_126[e] = c
		end

		for i = 1, 255 - 180 do
			local c = string.char(i + 180)
			local s = i + 34
			local e = string.char(s + (s >= 92 and 1 or 0))

			escapemap_127[c] = e
			unescapemap_127[e] = c
		end
	end

	local function escape(s : string)
		-- escape the control characters 0-31, double quote 34, backslash 92, Tilde 126, and DEL 127 (36 chars)
		-- escape characters 128-180 (53 chars)
		return string.gsub(string.gsub(s, '[%c"\\\126-\180]', function(c)
			return "\126" .. escapemap_126[c]
		end), '[\181-\255]', function(c)
			return "\127" .. escapemap_127[c]
		end)
	end
	local function unescape(s : string)
		return string.gsub(string.gsub(s, "\127(.)", function(e)
			return unescapemap_127[e]
		end), "\126(.)", function(e)
			return unescapemap_126[e]
		end)
	end

	local b93Cache = {}
	local b10Cache = {}
	local function tobase93(n: number) : string
		local value : string? = b93Cache[n]
		if value then
			return value
		end

		local c = n
		value = ""
		repeat
			local remainder = n % 93
			value = string.char(buffer.readu8(lengthToStr, remainder)) .. value
			n = (n - remainder) / 93
		until n == 0

		b93Cache[c] = value
		return value
	end

	local function tobase10(value: string) : number
		local n = b10Cache[value]
		if n then
			return n
		end
	
		n = 0
		for i = 1, #value do
			n = n + math.pow(93, i - 1) * strToLength[string.sub(value, -i, -i)]
		end
	
		b10Cache[value] = n
		return n
	end

	self.escape = escape
	self.unescape = unescape
	self.tobase93 = tobase93
	self.tobase10 = tobase10

	return self
end

function lzw:compress(text: string)
	assert(type(text) == "string", "bad argument #1 to 'compress' (string expected, got " .. typeof(text) .. ")")

	local buf = buffer.fromstring((self.escape :: (string) -> string)(text))
	local tobase93 = self.tobase93
	local dictionaryCopy : {[string] : number} = table.clone(self.strToLength)
	local sequence: buffer, size : number = self.buf, 93
	local width: number, spans: {[number]: number}, span : number = 1, {}, 0
	local ptrA: number, ptrB: number = 0, 1
	local key: string = ""
	local cursor: number = 0

	buffer.fill(sequence, 0, 0)
	
	local function listkey(k : string)  -- string to length
		local value = tobase93(dictionaryCopy[k])
		local valueLength = #value
		if valueLength > width then
			width, span, spans[width] = valueLength, 0, span
		end
		for _ = 1, width - valueLength do
			buffer.writeu8(sequence, cursor, 32)
			cursor += 1
		end
		buffer.writestring(sequence, cursor, value)
		cursor += valueLength
		
		span += 1
	end

	local len = buffer.len(buf)
	while ptrB <= len do
		local new : string = buffer.readstring(buf, ptrA, ptrB - ptrA)

		if dictionaryCopy[new] then
			key = new
			ptrB += 1
		else
			listkey(key)
			ptrA = ptrB - 1
			size += 1
			dictionaryCopy[new] = size
		end
	end
	listkey(key)

	spans[width] = span

	return table.concat(spans, ",") .. "|" .. buffer.readstring(sequence, 0, cursor)
end

function lzw:decompress(text: string)
	assert(type(text) == "string", "bad argument #1 to 'decompress' (string expected, got " .. typeof(text) .. ")")
	local dictionaryCopy = table.clone(self.bidirectionalDict)
	local tobase10 = self.tobase10
	local sequence, spans, content = {}, string.match(text, "(.-)|(.*)")
	local groups, start = {}, 1
	for span in string.gmatch(spans, "%d+") do
		local width = #groups + 1
		groups[width] = string.sub(content, start, start + span * width - 1)
		start = start + span * width
	end
	local previous

	for width, group in ipairs(groups) do
		for value in string.gmatch(group, string.rep(".", width)) do
			local entry = dictionaryCopy[tobase10(value)]
			if previous then
				if entry then
					table.insert(dictionaryCopy, previous .. string.sub(entry, 1, 1))
				else
					entry = previous .. string.sub(previous, 1, 1)
					table.insert(dictionaryCopy, entry)
				end
				table.insert(sequence, entry)
			else
				sequence[1] = entry
			end
			previous = entry
		end
	end
	return self.unescape(table.concat(sequence))
end

return lzw

I’ve also turned this into moreso a class, where you create a new encoder and decoder with lzw.new(bufferSize? = 8mb).

Methods are:

lzw:compress(string)
lzw:decompress(string)

And that’s it!
Decompression remains unchanged - I believe that encoding will generally used more in practice (think: ProfileService autosaves).

How does this optimization save time?
Turns out, that table indexing and buffer accessing are the same. But, we can save time on buffer writing! This makes use of writing the sequence into a fixed block memory instead of through a vector, which doesn’t need to spend CPU work to internally resize the array. Also, it’s faster to index a buffer rather than index a substring of a string.
The default size is 8 MB which is more than enough for DataStores (maximum is 4mb).

This is backwards compatible with my last release in this thread.

2 Likes

I’ve optimized this even further! By directly pushing the characters into the buffer after computing the base-93 literals, I’m able to cut down the first time compression speed an additional 25-50% in most cases!

Small text (<1024) characters see the most improvement by nearly 60-80%!

The only drawback is that I’m unable to memoize the results of the base-93 calculation, as reading a string from the buffer for the sole purpose of storing it in a dictionary will mean significant slowdowns - meaning that it may be slower on repeat calculations than in the original implementation, especially on large inputs.

However, given that in most scenarios you have constantly evolving save data on players and different players - it won’t be a big issue!

Furthermore, optimizing the escape function with buffers also provided a small speed increase.

Edit: This has been fixed on 10/16/24! 10/17/24 :partying_face:

10/16/24

Line 171 just needed to be deleted from
cursor += sz
into
cursor += sz - 1
Also removed unnecessary buffer.fill.

Line 171 just needed to be deleted (cursor += sz + 1)

DO NOT USE THIS. See my reply below for a fixed version.
--!native
-- Module by 1waffle1 and boatbomber, optimized and fixed by iiau
-- https://devforum.roblox.com/t/text-compression/163637/37

local lzw = {}
lzw.__index = lzw
--local concatSize = 512

lzw.new = function(BUF_SIZE: number?)
	local self = setmetatable({}, lzw)
	
	self.BUF_SIZE = BUF_SIZE or 8388608 --* 8 MB should be enough for most things on Roblox
	self.buf = buffer.create(self.BUF_SIZE)
	--self.concat = buffer.create(concatSize) -- storage to concat strings (is this faster?)
	--local concat = self.concat

	self.strToLength = {}
	self.lengthToStr = buffer.create(94) -- base93
	self.bidirectionalDict = {}

	local lengthToStr = self.lengthToStr
	local strToLength = self.strToLength
	local bidirectionalDict = self.bidirectionalDict

	do -- populate dictionary
		local length = 0
		for i = 32, 127 do
			if i ~= 34 and i ~= 92 then
				local c = string.char(i)
				strToLength[c] = length

				bidirectionalDict[length] = c
				bidirectionalDict[c] = length

				buffer.writeu8(lengthToStr, length, i)
				length = length + 1
			end
		end
	end


	local escapemap_126, escapemap_127 = {}, {}
	local unescapemap_126, unescapemap_127 = {}, {}

	local blacklisted_126 = { 34, 92 }
	for i = 126, 180 do
		table.insert(blacklisted_126, i)
	end

	do -- Populate escape map
		-- represents the numbers 0-31, 34, 92, 126, 127 (36 characters)
		-- and 128-180 (52 characters)
		-- https://devforum.roblox.com/t/text-compression/163637/5
		for i = 0, 31 + #blacklisted_126 do
			local b = blacklisted_126[i - 31]
			local s = i + 32

			-- Note: 126 and 127 are magic numbers
			local c = b or i
			local e = s + (s >= 34 and 1 or 0) + (s >= 91 and 1 or 0)

			escapemap_126[c] = e
			unescapemap_126[string.char(e)] = string.char(c)
		end

		for i = 1, 255 - 180 do
			local c = i + 180
			local s = i + 34
			local e = s + (s >= 92 and 1 or 0)

			escapemap_127[c] = e
			unescapemap_127[string.char(e)] = string.char(c)
		end
	end

	local function escape(s : string)
		local len = #s
		local refbuf = buffer.fromstring(s)
		local newbuf = buffer.create(len * 2) -- worst case scenario

		local c = 1 -- 1 indexed
		local f = string.find(s, '[%c"\\\126-\255]') -- 1 indexed
		local bufcursor = 0

		while f do
			if c < f then
				buffer.copy(newbuf, bufcursor, refbuf, c - 1, f - c)
				bufcursor += f - c
				c = f
			end

			local byte = buffer.readu8(refbuf, f - 1)
			if byte >= 181 then
				local e = escapemap_127[byte]
				buffer.writeu8(newbuf, bufcursor, 127)
				buffer.writeu8(newbuf, bufcursor + 1, e)
			else
				local e = escapemap_126[byte]
				buffer.writeu8(newbuf, bufcursor, 126)
				buffer.writeu8(newbuf, bufcursor + 1, e)
			end
			
			c += 1
			bufcursor += 2
			f = string.find(s, '[%c"\\\126-\255]', f + 1)
		end
		
		if c <= len then
			buffer.copy(newbuf, bufcursor, refbuf, c - 1, len - c + 1)
			bufcursor += len - c + 1
		end

		return newbuf, bufcursor
	end

	local function unescape(s : string)
		return string.gsub(string.gsub(s, "\127(.)", function(e)
			return unescapemap_127[e]
		end), "\126(.)", function(e)
			return unescapemap_126[e]
		end)
	end

	local b10Cache = {}

	local function tobase10(value: string) : number
		local n = b10Cache[value]
		if n then
			return n
		end
	
		n = 0
		for i = 1, #value do
			n = n + math.pow(93, i - 1) * bidirectionalDict[string.sub(value, -i, -i)]
		end
	
		b10Cache[value] = n
		return n
	end

	self.escape = escape
	self.unescape = unescape
	--self.tobase93 = tobase93
	self.tobase10 = tobase10

	return self
end

function lzw:compress(text: string)
	assert(type(text) == "string", "bad argument #1 to 'compress' (string expected, got " .. typeof(text) .. ")")

	local buf, len = self.escape(text)
	local dictionaryCopy : {[string] : number} = table.clone(self.strToLength)
	local sequence: buffer, size : number = self.buf, 93
	local width: number, spans: {[number]: number}, span : number = 1, {}, 0
	local ptrA: number, ptrB: number = 0, 1
	local key: string = ""
	local cursor : number = 0
	local lengthToStr : buffer = self.lengthToStr
	local depth : number

	local writeasbase93 = function(n: number) : string
		local sz = 0
		repeat
			local remainder = n % 93
			buffer.copy(sequence, cursor + depth - 1 - sz, lengthToStr, remainder, 1)
			n = (n - remainder) / 93
			sz += 1
		until n == 0
	end
	
	local function listkey(k : string)  -- string to length
		local n = dictionaryCopy[k]
		depth = n <= 1 and 1 or math.ceil(math.log(n, 93))
		if depth > width then
			width, span, spans[width] = depth, 0, span
		end
		
		for _ = 1, width - depth do
			buffer.writeu8(sequence, cursor, 32)
			cursor += 1
		end
		writeasbase93(n)
		cursor += depth
		
		span += 1
	end

	while ptrB <= len do
		local new : string = buffer.readstring(buf, ptrA, ptrB - ptrA)

		if dictionaryCopy[new] then
			key = new
			ptrB += 1
		else
			listkey(key)
			ptrA = ptrB - 1
			size += 1
			dictionaryCopy[new] = size
		end
	end
	listkey(key)

	spans[width] = span

	return table.concat(spans, ",") .. "|" .. buffer.readstring(sequence, 0, cursor)
end

function lzw:decompress(text: string)
	assert(type(text) == "string", "bad argument #1 to 'decompress' (string expected, got " .. typeof(text) .. ")")
	local dictionaryCopy = table.clone(self.bidirectionalDict)
	local tobase10 = self.tobase10
	local sequence, spans, content = {}, string.match(text, "(.-)|(.*)")
	local groups, start = {}, 1
	for span in string.gmatch(spans, "%d+") do
		local width = #groups + 1
		groups[width] = string.sub(content, start, start + span * width - 1)
		start = start + span * width
	end
	local previous

	for width, group in ipairs(groups) do
		for value in string.gmatch(group, string.rep(".", width)) do
			local entry = dictionaryCopy[tobase10(value)]
			if previous then
				if entry then
					table.insert(dictionaryCopy, previous .. string.sub(entry, 1, 1))
				else
					entry = previous .. string.sub(previous, 1, 1)
					table.insert(dictionaryCopy, entry)
				end
				table.insert(sequence, entry)
			else
				sequence[1] = entry
			end
			previous = entry
		end
	end
	return self.unescape(table.concat(sequence))
end

return lzw
3 Likes

One of the smartest things I’ve ever seen base-93 is a crazy hack.

Hello, I tried using this but kept having an error occur when the string length is above 1:
attempt to perform arithmetic (mul) on number and nil

This error is from the base10 function from when attempting to convert string to length.

n += math.pow(93, i - 1) * strToLength[string.sub(value, -i, -i)]

While I work on a fix, in the meantime just use the decode from this code.

EDIT Never mind. I see what’s going on. Turns out every character seems to have the \0 character in between every character when it’s encoded. I’ll see what’s going on.

EDIT 2 See my last post. It is fixed

Is this backwards compatible with the original string compressor? Like if I had data stored with a string compressed from boatbombers version, would that decompress successfully with this?

1 Like

I just so happen to have a project which could benefit from this particular due to the repetitive patterns of my map building game like the one you mentioned would be a good use case.
Dungeon Builder Tech Demo - Roblox

No; as boatbomber’s version doesn’t account for character escapes that extend to ASCII values 128-255. You could however, modify how the escape function works to possibly have it work with the older versions.

Fixed another edge case.
Line 174 is replaced with depth = n == 0 and 1 or math.ceil(math.log(n + 1, 93)).

--!native
-- Module by 1waffle1 and boatbomber, optimized and fixed by iiau
-- https://devforum.roblox.com/t/text-compression/163637/37

local lzw = {}
lzw.__index = lzw
--local concatSize = 512

lzw.new = function(BUF_SIZE: number?)
	local self = setmetatable({}, lzw)
	
	self.BUF_SIZE = BUF_SIZE or 8388608 --* 8 MB should be enough for most things on Roblox
	self.buf = buffer.create(self.BUF_SIZE)
	--self.concat = buffer.create(concatSize) -- storage to concat strings (is this faster?)
	--local concat = self.concat

	self.strToLength = {}
	self.lengthToStr = buffer.create(94) -- base93
	self.bidirectionalDict = {}

	local lengthToStr = self.lengthToStr
	local strToLength = self.strToLength
	local bidirectionalDict = self.bidirectionalDict

	do -- populate dictionary
		local length = 0
		for i = 32, 127 do
			if i ~= 34 and i ~= 92 then
				local c = string.char(i)
				strToLength[c] = length

				bidirectionalDict[length] = c
				bidirectionalDict[c] = length

				buffer.writeu8(lengthToStr, length, i)
				length = length + 1
			end
		end
	end


	local escapemap_126, escapemap_127 = {}, {}
	local unescapemap_126, unescapemap_127 = {}, {}

	local blacklisted_126 = { 34, 92 }
	for i = 126, 180 do
		table.insert(blacklisted_126, i)
	end

	do -- Populate escape map
		-- represents the numbers 0-31, 34, 92, 126, 127 (36 characters)
		-- and 128-180 (52 characters)
		-- https://devforum.roblox.com/t/text-compression/163637/5
		for i = 0, 31 + #blacklisted_126 do
			local b = blacklisted_126[i - 31]
			local s = i + 32

			-- Note: 126 and 127 are magic numbers
			local c = b or i
			local e = s + (s >= 34 and 1 or 0) + (s >= 91 and 1 or 0)

			escapemap_126[c] = e
			unescapemap_126[string.char(e)] = string.char(c)
		end

		for i = 1, 255 - 180 do
			local c = i + 180
			local s = i + 34
			local e = s + (s >= 92 and 1 or 0)

			escapemap_127[c] = e
			unescapemap_127[string.char(e)] = string.char(c)
		end
	end

	local function escape(s : string)
		local len = #s
		local refbuf = buffer.fromstring(s)
		local newbuf = buffer.create(len * 2) -- worst case scenario

		local c = 1 -- 1 indexed
		local f = string.find(s, '[%c"\\\126-\255]') -- 1 indexed
		local bufcursor = 0

		while f do
			if c < f then
				buffer.copy(newbuf, bufcursor, refbuf, c - 1, f - c)
				bufcursor += f - c
				c = f
			end

			local byte = buffer.readu8(refbuf, f - 1)
			if byte >= 181 then
				local e = escapemap_127[byte]
				buffer.writeu8(newbuf, bufcursor, 127)
				buffer.writeu8(newbuf, bufcursor + 1, e)
			else
				local e = escapemap_126[byte]
				buffer.writeu8(newbuf, bufcursor, 126)
				buffer.writeu8(newbuf, bufcursor + 1, e)
			end
			
			c += 1
			bufcursor += 2
			f = string.find(s, '[%c"\\\126-\255]', f + 1)
		end
		
		if c <= len then
			buffer.copy(newbuf, bufcursor, refbuf, c - 1, len - c + 1)
			bufcursor += len - c + 1
		end

		return newbuf, bufcursor
	end

	local function unescape(s : string)
		return string.gsub(string.gsub(s, "\127(.)", function(e)
			return unescapemap_127[e]
		end), "\126(.)", function(e)
			return unescapemap_126[e]
		end)
	end

	local b10Cache = {}

	local function tobase10(value: string) : number
		local n = b10Cache[value]
		if n then
			return n
		end
	
		n = 0
		for i = 1, #value do
			n = n + math.pow(93, i - 1) * bidirectionalDict[string.sub(value, -i, -i)]
		end
	
		b10Cache[value] = n
		return n
	end

	self.escape = escape
	self.unescape = unescape
	--self.tobase93 = tobase93
	self.tobase10 = tobase10

	return self
end

function lzw:compress(text: string)
	assert(type(text) == "string", "bad argument #1 to 'compress' (string expected, got " .. typeof(text) .. ")")

	local buf, len = self.escape(text)
	local dictionaryCopy : {[string] : number} = table.clone(self.strToLength)
	local sequence: buffer, size : number = self.buf, 93
	local width: number, spans: {[number]: number}, span : number = 1, {}, 0
	local ptrA: number, ptrB: number = 0, 1
	local key: string = ""
	local cursor : number = 0
	local lengthToStr : buffer = self.lengthToStr
	local depth : number

	local writeasbase93 = function(n: number) : string
		local sz = 0
		repeat
			local remainder = n % 93
			buffer.copy(sequence, cursor + depth - 1 - sz, lengthToStr, remainder, 1)
			n = (n - remainder) / 93
			sz += 1
		until n == 0
	end
	
	local function listkey(k : string)  -- string to length
		local n = dictionaryCopy[k]
		depth = n == 0 and 1 or math.ceil(math.log(n + 1, 93))
		if depth > width then
			width, span, spans[width] = depth, 0, span
		end
		
		for _ = 1, width - depth do
			buffer.writeu8(sequence, cursor, 32)
			cursor += 1
		end
		writeasbase93(n)
		cursor += depth
		
		span += 1
	end

	while ptrB <= len do
		local new : string = buffer.readstring(buf, ptrA, ptrB - ptrA)

		if dictionaryCopy[new] then
			key = new
			ptrB += 1
		else
			listkey(key)
			ptrA = ptrB - 1
			size += 1
			dictionaryCopy[new] = size
		end
	end
	listkey(key)

	spans[width] = span

	return table.concat(spans, ",") .. "|" .. buffer.readstring(sequence, 0, cursor)
end

function lzw:decompress(text: string)
	assert(type(text) == "string", "bad argument #1 to 'decompress' (string expected, got " .. typeof(text) .. ")")
	local dictionaryCopy = table.clone(self.bidirectionalDict)
	local tobase10 = self.tobase10
	local sequence, spans, content = {}, string.match(text, "(.-)|(.*)")
	local groups, start = {}, 1
	for span in string.gmatch(spans, "%d+") do
		local width = #groups + 1
		groups[width] = string.sub(content, start, start + span * width - 1)
		start = start + span * width
	end
	local previous

	for width, group in ipairs(groups) do
		for value in string.gmatch(group, string.rep(".", width)) do
			local entry = dictionaryCopy[tobase10(value)]
			if previous then
				if entry then
					table.insert(dictionaryCopy, previous .. string.sub(entry, 1, 1))
				else
					entry = previous .. string.sub(previous, 1, 1)
					table.insert(dictionaryCopy, entry)
				end
				table.insert(sequence, entry)
			else
				sequence[1] = entry
			end
			previous = entry
		end
	end
	return self.unescape(table.concat(sequence))
end

return lzw