Text compression

Is this backwards compatible with the original string compressor? Like if I had data stored with a string compressed from boatbombers version, would that decompress successfully with this?

1 Like

I just so happen to have a project which could benefit from this particular due to the repetitive patterns of my map building game like the one you mentioned would be a good use case.
Dungeon Builder Tech Demo - Roblox

No; as boatbomber’s version doesn’t account for character escapes that extend to ASCII values 128-255. You could however, modify how the escape function works to possibly have it work with the older versions.

Fixed another edge case.
Line 174 is replaced with depth = n == 0 and 1 or math.ceil(math.log(n + 1, 93)).

--!native
-- Module by 1waffle1 and boatbomber, optimized and fixed by iiau
-- https://devforum.roblox.com/t/text-compression/163637/37

local lzw = {}
lzw.__index = lzw
--local concatSize = 512

lzw.new = function(BUF_SIZE: number?)
	local self = setmetatable({}, lzw)
	
	self.BUF_SIZE = BUF_SIZE or 8388608 --* 8 MB should be enough for most things on Roblox
	self.buf = buffer.create(self.BUF_SIZE)
	--self.concat = buffer.create(concatSize) -- storage to concat strings (is this faster?)
	--local concat = self.concat

	self.strToLength = {}
	self.lengthToStr = buffer.create(94) -- base93
	self.bidirectionalDict = {}

	local lengthToStr = self.lengthToStr
	local strToLength = self.strToLength
	local bidirectionalDict = self.bidirectionalDict

	do -- populate dictionary
		local length = 0
		for i = 32, 127 do
			if i ~= 34 and i ~= 92 then
				local c = string.char(i)
				strToLength[c] = length

				bidirectionalDict[length] = c
				bidirectionalDict[c] = length

				buffer.writeu8(lengthToStr, length, i)
				length = length + 1
			end
		end
	end


	local escapemap_126, escapemap_127 = {}, {}
	local unescapemap_126, unescapemap_127 = {}, {}

	local blacklisted_126 = { 34, 92 }
	for i = 126, 180 do
		table.insert(blacklisted_126, i)
	end

	do -- Populate escape map
		-- represents the numbers 0-31, 34, 92, 126, 127 (36 characters)
		-- and 128-180 (52 characters)
		-- https://devforum.roblox.com/t/text-compression/163637/5
		for i = 0, 31 + #blacklisted_126 do
			local b = blacklisted_126[i - 31]
			local s = i + 32

			-- Note: 126 and 127 are magic numbers
			local c = b or i
			local e = s + (s >= 34 and 1 or 0) + (s >= 91 and 1 or 0)

			escapemap_126[c] = e
			unescapemap_126[string.char(e)] = string.char(c)
		end

		for i = 1, 255 - 180 do
			local c = i + 180
			local s = i + 34
			local e = s + (s >= 92 and 1 or 0)

			escapemap_127[c] = e
			unescapemap_127[string.char(e)] = string.char(c)
		end
	end

	local function escape(s : string)
		local len = #s
		local refbuf = buffer.fromstring(s)
		local newbuf = buffer.create(len * 2) -- worst case scenario

		local c = 1 -- 1 indexed
		local f = string.find(s, '[%c"\\\126-\255]') -- 1 indexed
		local bufcursor = 0

		while f do
			if c < f then
				buffer.copy(newbuf, bufcursor, refbuf, c - 1, f - c)
				bufcursor += f - c
				c = f
			end

			local byte = buffer.readu8(refbuf, f - 1)
			if byte >= 181 then
				local e = escapemap_127[byte]
				buffer.writeu8(newbuf, bufcursor, 127)
				buffer.writeu8(newbuf, bufcursor + 1, e)
			else
				local e = escapemap_126[byte]
				buffer.writeu8(newbuf, bufcursor, 126)
				buffer.writeu8(newbuf, bufcursor + 1, e)
			end
			
			c += 1
			bufcursor += 2
			f = string.find(s, '[%c"\\\126-\255]', f + 1)
		end
		
		if c <= len then
			buffer.copy(newbuf, bufcursor, refbuf, c - 1, len - c + 1)
			bufcursor += len - c + 1
		end

		return newbuf, bufcursor
	end

	local function unescape(s : string)
		return string.gsub(string.gsub(s, "\127(.)", function(e)
			return unescapemap_127[e]
		end), "\126(.)", function(e)
			return unescapemap_126[e]
		end)
	end

	local b10Cache = {}

	local function tobase10(value: string) : number
		local n = b10Cache[value]
		if n then
			return n
		end
	
		n = 0
		for i = 1, #value do
			n = n + math.pow(93, i - 1) * bidirectionalDict[string.sub(value, -i, -i)]
		end
	
		b10Cache[value] = n
		return n
	end

	self.escape = escape
	self.unescape = unescape
	--self.tobase93 = tobase93
	self.tobase10 = tobase10

	return self
end

function lzw:compress(text: string)
	assert(type(text) == "string", "bad argument #1 to 'compress' (string expected, got " .. typeof(text) .. ")")

	local buf, len = self.escape(text)
	local dictionaryCopy : {[string] : number} = table.clone(self.strToLength)
	local sequence: buffer, size : number = self.buf, 93
	local width: number, spans: {[number]: number}, span : number = 1, {}, 0
	local ptrA: number, ptrB: number = 0, 1
	local key: string = ""
	local cursor : number = 0
	local lengthToStr : buffer = self.lengthToStr
	local depth : number

	local writeasbase93 = function(n: number) : string
		local sz = 0
		repeat
			local remainder = n % 93
			buffer.copy(sequence, cursor + depth - 1 - sz, lengthToStr, remainder, 1)
			n = (n - remainder) / 93
			sz += 1
		until n == 0
	end
	
	local function listkey(k : string)  -- string to length
		local n = dictionaryCopy[k]
		depth = n == 0 and 1 or math.ceil(math.log(n + 1, 93))
		if depth > width then
			width, span, spans[width] = depth, 0, span
		end
		
		for _ = 1, width - depth do
			buffer.writeu8(sequence, cursor, 32)
			cursor += 1
		end
		writeasbase93(n)
		cursor += depth
		
		span += 1
	end

	while ptrB <= len do
		local new : string = buffer.readstring(buf, ptrA, ptrB - ptrA)

		if dictionaryCopy[new] then
			key = new
			ptrB += 1
		else
			listkey(key)
			ptrA = ptrB - 1
			size += 1
			dictionaryCopy[new] = size
		end
	end
	listkey(key)

	spans[width] = span

	return table.concat(spans, ",") .. "|" .. buffer.readstring(sequence, 0, cursor)
end

function lzw:decompress(text: string)
	assert(type(text) == "string", "bad argument #1 to 'decompress' (string expected, got " .. typeof(text) .. ")")
	local dictionaryCopy = table.clone(self.bidirectionalDict)
	local tobase10 = self.tobase10
	local sequence, spans, content = {}, string.match(text, "(.-)|(.*)")
	local groups, start = {}, 1
	for span in string.gmatch(spans, "%d+") do
		local width = #groups + 1
		groups[width] = string.sub(content, start, start + span * width - 1)
		start = start + span * width
	end
	local previous

	for width, group in ipairs(groups) do
		for value in string.gmatch(group, string.rep(".", width)) do
			local entry = dictionaryCopy[tobase10(value)]
			if previous then
				if entry then
					table.insert(dictionaryCopy, previous .. string.sub(entry, 1, 1))
				else
					entry = previous .. string.sub(previous, 1, 1)
					table.insert(dictionaryCopy, entry)
				end
				table.insert(sequence, entry)
			else
				sequence[1] = entry
			end
			previous = entry
		end
	end
	return self.unescape(table.concat(sequence))
end

return lzw