Text compression

I wrote a text compression program which can be used to fit more data inside of a datastore key. It’s based on the LZW algorithm and uses the 127 different characters from \1 to \127 that datastores support.

edit: Some characters take up more than one character worth of space when encoded and stored in a datastore. Control characters mostly take up 6 characters each, and " and \ each take up 2 characters. This has been updated to avoid using such characters in the compressed output. 93 unique characters are used.

For an example benchmark, I gathered the sources of all of the default chat and camera scripts and concatenated them into one string. It was able to be compressed from 126590 characters to 62317, less than half the original size, in under 0.2 seconds. If you’re compressing data that’s repetitive and contains lots of common substrings then it will get a compression ratio much better than half.

I should warn that if you’re taking something that’s larger than what can be stored in a datastore, compressing it isn’t a guarantee that the result will be small enough to fit in a single key.

Source:

local dictionary, length = {}, 0
for i = 32, 127 do
	if i ~= 34 and i ~= 92 then
		local c = string.char(i)
		dictionary[c], dictionary[length] = length, c
		length = length + 1
	end
end

local escapemap = {}
for i = 1, 34 do
	i = ({34, 92, 127})[i-31] or i
	local c, e = string.char(i), string.char(i + 31)
	escapemap[c], escapemap[e] = e, c
end
local function escape(s)
	return (s:gsub("[%c\"\\]", function(c)
		return "\127"..escapemap[c]
	end))
end

local function unescape(s)
	return (s:gsub("\127(.)", function(c)
		return escapemap[c]
	end))
end

local function copy(t)
	local new = {}
	for k, v in pairs(t) do
		new[k] = v
	end
	return new
end

local function tobase93(n)
	local value = ""
	repeat
		local remainder = n%93
		value = dictionary[remainder]..value
		n = (n - remainder)/93
	until n == 0
	return value
end

local function tobase10(value)
	local n = 0
	for i = 1, #value do
		n = n + 93^(i-1)*dictionary[value:sub(-i, -i)]
	end
	return n
end

local function compress(text)
	local dictionary = copy(dictionary)
	local key, sequence, size = "", {}, #dictionary
	local width, spans, span = 1, {}, 0
	local function listkey(key)
		local value = tobase93(dictionary[key])
		if #value > width then
			width, span, spans[width] = #value, 0, span
		end
		sequence[#sequence+1] = (" "):rep(width - #value)..value
		span = span + 1
	end
	text = escape(text)
	for i = 1, #text do
		local c = text:sub(i, i)
		local new = key..c
		if dictionary[new] then
			key = new
		else
			listkey(key)
			key, size = c, size+1
			dictionary[new], dictionary[size] = size, new
		end
	end
    listkey(key)
	spans[width] = span
    return table.concat(spans, ",").."|"..table.concat(sequence)
end

local function decompress(text)
	local dictionary = copy(dictionary)
	local sequence, spans, content = {}, text:match("(.-)|(.*)")
	local groups, start = {}, 1
	for span in spans:gmatch("%d+") do
		local width = #groups+1
		groups[width] = content:sub(start, start + span*width - 1)
		start = start + span*width
	end
	local previous;
	for width = 1, #groups do
		for value in groups[width]:gmatch(('.'):rep(width)) do
			local entry = dictionary[tobase10(value)]
			if previous then
				if entry then
					sequence[#sequence+1] = entry
					dictionary[#dictionary+1] = previous..entry:sub(1, 1)
				else
					entry = previous..previous:sub(1, 1)
					sequence[#sequence+1] = entry
					dictionary[#dictionary+1] = entry
				end
			else
				sequence[1] = entry
			end
			previous = entry
		end
	end
	return unescape(table.concat(sequence))
end
previous source

This code uses all 127 characters that datastores support, however, most control characters take up 6 characters worth of space when encoded, so the result is much bigger.

local dictionary = {}
for i = 1, 127 do
	local c = string.char(i)
	dictionary[c], dictionary[i-1] = i-1, c
end

local function copy(t)
	local new = {}
	for k, v in pairs(t) do
		new[k] = v
	end
	return new
end

local function tobase127(n)
	local value = ""
	repeat
		local remainder = n%127
		value = dictionary[remainder]..value
		n = (n - remainder)/127
	until n == 0
	return value
end

local function tobase10(value)
	local n = 0
	for i = 1, #value do
		n = n + 127^(i-1)*dictionary[value:sub(-i, -i)]
	end
	return n
end

local function compress(text)
	local dictionary = copy(dictionary)
	local key, sequence, size = "", {}, #dictionary
	local width, spans, span = 1, {}, 0
	local function listkey(key)
		local value = tobase127(dictionary[key])
		if #value > width then
			width, span, spans[width] = #value, 0, span
		end
		sequence[#sequence+1] = ("\1"):rep(width - #value)..value
		span = span + 1
	end
	for i = 1, #text do
		local c = text:sub(i, i)
		local new = key..c
		if dictionary[new] then
			key = new
		else
			listkey(key)
			key, size = c, size+1
			dictionary[new], dictionary[size] = size, new
		end
	end
    listkey(key)
	spans[width] = span
    return table.concat(spans, ",").."|"..table.concat(sequence)
end

local function decompress(text)
	local dictionary = copy(dictionary)
	local sequence, spans, content = {}, text:match("(.-)|(.*)")
	local groups, start = {}, 1
	for span in spans:gmatch("%d+") do
		local width = #groups+1
		groups[width] = content:sub(start, start + span*width - 1)
		start = start + span*width
	end
	local previous;
	for width = 1, #groups do
		for value in groups[width]:gmatch(('.'):rep(width)) do
			local entry = dictionary[tobase10(value)]
			if previous then
				if entry then
					sequence[#sequence+1] = entry
					dictionary[#dictionary+1] = previous..entry:sub(1, 1)
				else
					entry = previous..previous:sub(1, 1)
					sequence[#sequence+1] = entry
					dictionary[#dictionary+1] = entry
				end
			else
				sequence[1] = entry
			end
			previous = entry
		end
	end
    return table.concat(sequence)
end

The two functions compress(text) and decompress(text) are used to compress and decompress a string. If you want to use this as a module, you can add return {compress = compress, decompress = decompress} to the end of it.

228 Likes
I'm so EXCITED about my compression
I Love Compression pt 2: Huge Voxel Terrain Serialization
Most efficiently saving data
Whats the limit of a table in DataStore?
How do you "Compress" tables?
Data saving large, player made instances
Why do some ASCII characters take up more than 1 space in DataStores?
Compressing CFrame Values
Compressing Text (Zip?)
Saving/loading chunks in a chunk-based game
String compressor isn't working well enough
QuadaStore - The ROBLOX Datastore Solution
Are there any actual good text compression libraries out there?
Any efficient JSON string compression algorithms for lua?
How do i compress data
How to reduce lag from in pairs() loop
Saving EVERY SINGLE World Data
Rbx-deflate - text compression library
Does HttpService:JSONEncode() have a max string length?
BufferTemplates | A super-aggressive compression library
Which takes up more space to save?
How Can You Handle A Huge Database?
Setting up a Datastore
Datastore Code Review
Conceptualizing a save state architecture
A* Pathfinding Support Using Heaps
Recording player movement and saving it in a Datastore
Compression for limited data (Datastore)
World being sizes too big to be saved
Outputting binary data
Is using a simple datastore good enough to store tables for tycoons?
Preset Manager Plugin - Bulk apply properties, attributes, tags, and collisions
In-Engine Compression Algorithms & Binary Format Processing
Voxel Data Compression
DataStore:SetAsync() data limits
Is storing UserData in Binary worth it?
Is it better to have multiple DataStores?
String Compression (zlib/deflate)
This how do i extremely reduced data packet size
This how do i extremely reduced data packet size
Data efficiency
How to store data in seeds?

This exact module is what allowed me to compress saves in HomeBuilder 2 so compact

Here is the biggest build any of my players have ever made, overall it’s 126,811 base parts between 13,070 models. All of the data in JSON format alone is 547,887 characters long, and after being compressed with this module it’s only 169,020 characters long (30.8% of the original size). This is 65%(aprox) of the max string length for a datastore key(260,000 characters just about), and it took her a couple weeks of playing a few hours a day to build.

HB2 Build

I cannot recommend this enough, it’s really fast too, and super easy to use

40 Likes

Compression - Script.rbxm (OLD) (3.1 KB)

Compression - Script.rbxm (UPDATED) (3.6 KB)

I hope you don’t mind me uploading this as a Script + Module for people who want to save this as a File here.

:+1:t2:

You should upload it as a Model too.

9 Likes

This was very informative. Thank you for writing this.

1 Like

I never considered compressing strings before. Since I save a lot of my data as strings this will probably come in handy sooner of later…

3 Likes

I found that some characters are not really a single byte when they’re stored in a datastore because they’re encoded in JSON.

here's a table of encodings
1 \u0001
2 \u0002
3 \u0003
4 \u0004
5 \u0005
6 \u0006
7 \u0007
8 \b
9 \t
10 \n
11 \u000B
12 \f
13 \r
14 \u000E
15 \u000F
16 \u0010
17 \u0011
18 \u0012
19 \u0013
20 \u0014
21 \u0015
22 \u0016
23 \u0017
24 \u0018
25 \u0019
26 \u001A
27 \u001B
28 \u001C
29 \u001D
30 \u001E
31 \u001F
34 \"
92 \\

I’ve updated the program to avoid using these characters. It will escape these characters before compression and replace them with \127 followed by a different character, and interpret the escaped characters upon decompression. The initial dictionary size is 93 instead of 127 now, so the compression rate is slightly worse (although technically it’s better now than it was before). @ColdSmoke’s example now compresses to 33.2% of the original size, or 182,258 characters, which is 70% of a datastore key’s maximum capacity.

6 Likes

I coded functions to change base of an unsigned integer, they looked so similiar to yours that I considered posting them!

local encode, decode do
	
	--offset=33 base=92 --default
	--offset=48 base=10 --decimal
	function encode(int, base, offset)
		local codepoints = {}
		local i = 0
		while int + 1 >= base^i do
			local code = math.floor(int/base^i % base + offset)
			table.insert(codepoints, 1, code)
			i = i + 1
		end
		return utf8.char(unpack(codepoints))
	end
	
	function decode(s, base, offset)
		local int = 0
		local length = #s
		for i, code in utf8.codes(s) do
			int = int + (code-offset)*base^(length-i)
		end
		return int
	end
end

Anyone interested feel free to use it.
Note: if you would like to use smaller bases, you may consider using
Variant tonumber(Variant arg, int base = 10).

1 Like

Can’t get it to work or am I doing something wrong?

local MS = require(script.ModuleScript)

local HttpService = game.HttpService

local Dictionary = {

Cash = 500,

Level = 1,

EXP = {0,50},

Kills = 0,

Deaths = 0,

Rank = 'Rookie',

Membership = 'None',

}

print(#HttpService:JSONEncode(Dictionary))

local Compressed; Compressed = MS.compress(Dictionary)

print(#Compressed)

attempt to call method ‘gsub’ (a nil value)

Stack Begin

Script ‘ServerScriptService.Script.ModuleScript’, Line 17 - upvalue escape

Script ‘ServerScriptService.Script.ModuleScript’, Line 66 - field compress

Script ‘ServerScriptService.Script’, Line 17

Stack End

1 Like

You need to compress the JSON string, not the table.

Something like:

local JSONstring = HttpService:JSONEncode(Dictionary)

local Compressed = MS.compress(JSONstring)
5 Likes

Let me try that and I’ll get back to you.

(Edit)

Okay it works!

But I did this and the results was

print(#JSONstring)

print(#Compressed)

92

166

print(JSONstring)

print(Compressed)

"Cash":500,"EXP":[0,50],"Deaths":0,"Membership":"None","Level":1,"Rank":"Rookie","Kills":0}

1,80|{! A C a s h!# : 5 0 0 ,!# E X P!) [!-!+ ]!. A D e a t h s!)!-!# M e m b e r!’ i p!)!# N o n e!#!8 L e v e l!) 1!8 R a n k!L A R o o k i!Q A!8 K i l l!? A : 0 }


Seems like it still needs some work

@1waffle1

2 Likes

The compression works better with longer strings that have a lot of repeating patterns.

For smaller strings, I’d just leave them as they are.

7 Likes

It doesn’t. It is impossible for any compression algorithm to map every input to a smaller output. Your string is already so small that it doesn’t need to be (and can’t be) compressed.

14 Likes

So, lets say I have a minecraft-like game, and I want to save block changes. An example of one block changes saved to my string format would be something like “B1/1000/100/1000/3/1/3999/a/b/c,”

  • B1 means the block type. Ex: B1,B2,B3
  • 1000/100/1000 is the grid coordinate (1grid=4studs)
  • 3/1/3999 would be the amount of times the block is repeated on respective axis (x,y,z)
  • a, b and c would be special properties if needed (ex: door would have a property as 1 or 0, to indicate if its opened)
    Would this be already too compressed for your code?

Information that has patterns or biases can be simplified. It looks like your format uses lots of / and numbers, which is an attribute which can be shortened by using more different types of characters to represent common repeated substrings like /1/. If you try feeding it a large example then you can see for yourself how much it can be compressed.

1 Like

Yeah, I was thinking that I could compress this even more. I thought of doing a dictionary with a lot of subdivisions. For example, a lot of “air” was created starting on the same x coordinate 300, but with different y and z. Instead of writing “Air/x1/y1/z1,Air/x1/y1/z2,Air/x1/y1/z3” over and over I do “Air={x1={y1={z1,z2,z3}},y2={…}},x2={…},x3={…}}”.

Combined with your compressor, I guess it will take a lot of time for some minecraft like game to fill the datastore with map changes.

I’ve been loving this module! Thank you so much for sharing this.
It can be a bit slow when dealing with large data, so I decided to try optimizing.

Decompression is now 34.8% faster and compression is 6.6% faster in my test.

Bench
local default = require(script.default)
local optimized = require(script.optimized)

local HttpService = game:GetService("HttpService")

return {

	ParameterGenerator = function()
		local data = {}

		for i=1, 30 do
			local characters = table.create(100)
			for n=1, 100 do
				characters[n] = math.random(40, 120)
			end

			data[HttpService:GenerateGUID(false)] = string.char(table.unpack(characters))
		end

		return HttpService:JSONEncode(data)
	end;

	Functions = {
		["optimized"] = function(Profiler, Input)
			Profiler.Begin("compress")
			local compressed = optimized.Compress(Input)
			Profiler.End()

			Profiler.Begin("Decompress")
			local decompressed = optimized.Decompress(compressed)
			Profiler.End()
		end;

		["default"] = function(Profiler, Input)
			Profiler.Begin("compress")
			local compressed = default.Compress(Input)
			Profiler.End()

			Profiler.Begin("Decompress")
			local decompressed = default.Decompress(compressed)
			Profiler.End()
		end;
	};
}

Changes:

  • Cached base10 and base93 conversions
  • Used string global functions instead of string methods (ie string.sub(s) instead of s:sub())
  • Used table.insert(t) instead of t[#t+1] since this is faster in latest Luau as it avoids checking table length
  • Used ipairs instead of numeric+indexing to loop over groups
  • Used Luau’s += where possible
  • Fixed shadowed variables
  • Used StyLua 0.11.3 for standardized formatting

Edit: go use the fixed version shared later in this thread

38 Likes

Thank you so much for providing this optimisation!

I was inefficiently sending a gigantic table of data over remote events between the server and client that must’ve been several megabytes large and was incredibly slow. I fixed this by converting the table into JSON using HttpService’s JSONEncode then compressing the JSON using this optimised module, then decompressing and JSONDecode back into a table again. It’s now lightning fast, with little noticeable changes in overall performance, thank you so much!

2 Likes

For some weird reason string.char(3) causes it to break

image

local test = require(script.ModuleScript)

local compressed = test.Compress("hey")

print(compressed)

local compressedbroken = test.Compress("hey" .. string.char(3))

print(compressedbroken)

Is there a way to prevent it from breaking the script?

image

3 Likes

excellent for datastore map saving systems.

Thanks for this!

It works great,with number compression from suphi,This was able to reduce the size by 60 percent and on one duplicated data case,94 percent!