Fastest Base85 Encoder/Decoder

About


This is a pure luau Base85 ( or Ascii85 ), a binary-to-text encoding that is more compact than Base64. However, is it not as compact as Base94 Encoder/Decoder but it is more standardized than Base94, that may result in a better speed execution.

Absolutely no use cases, extremely useless, just an encoder/decoder people can use for fun, I guess?


It seem to be as fast as the fastest Base64 implentation in Luau ( A.K.A Luau Fastest Cryptography )

Method Time (ms) Speed
Encode 2.5 - 4.5 225 - 420 MB/s
Decode 2.3 - 4.3 230 - 400 MB/s

Info: 1 MB Binary randomized, native code generation + Intel(R) Core™ i3-1005G1 CPU @ 1.20GHz


Source Code
--!optimize 2
--!native
--!strict

local P1, P2, P3, P4 = 85, 7225, 614125, 52200625

local ENC_LUT_HI = buffer.create(14000) 
local ENC_LUT_LO = buffer.create(30000)

local DECODE_A = buffer.create(262144)
local DECODE_B = buffer.create(262144)

for i = 0, 6995 do
	buffer.writeu16(ENC_LUT_HI, i * 2, (i // 85 + 33) + (i % 85 + 33) * 256)
end

for i = 0, 7224 do
	buffer.writeu32(ENC_LUT_LO, i * 4, ((i // 85 + 33) + (i % 85 + 33) * 256) * 65536)
end

for c1 = 0, 255 do
	local v1 = c1 - 33
	if v1 < 0 or v1 > 84 then continue end

	for c2 = 0, 255 do
		local v2 = c2 - 33
		if v2 < 0 or v2 > 84 then continue end

		local key = (c1 + c2 * 256) * 4

		buffer.writeu32(DECODE_A, key, v1 * P4 + v2 * P3 - 33)
		buffer.writeu32(DECODE_B, key, v1 * P2 + v2 * P1)
	end
end

local function Encode(Input: buffer): buffer
	local Len = buffer.len(Input)
	if Len == 0 then return buffer.create(0) end

	local Rem = Len % 4
	local Output = buffer.create(Len // 4 * 5 + (if Rem > 0 then Rem + 1 else 0))

	local OutIdx = 0
	local UnrollLimit = Len - 16

	local LutHi = ENC_LUT_HI
	local LutLo = ENC_LUT_LO

	for i = 0, UnrollLimit, 16 do
		local W1 = bit32.byteswap(buffer.readu32(Input, i))
		local W2 = bit32.byteswap(buffer.readu32(Input, i + 4))
		local W3 = bit32.byteswap(buffer.readu32(Input, i + 8))
		local W4 = bit32.byteswap(buffer.readu32(Input, i + 12))

		local P1_1 = W1 // 614125
		local R1_1 = W1 - P1_1 * 614125
		local P1_2 = W2 // 614125
		local R1_2 = W2 - P1_2 * 614125
		local P1_3 = W3 // 614125
		local R1_3 = W3 - P1_3 * 614125
		local P1_4 = W4 // 614125
		local R1_4 = W4 - P1_4 * 614125

		local P2_1 = R1_1 // 85
		local P2_2 = R1_2 // 85
		local P2_3 = R1_3 // 85
		local P2_4 = R1_4 // 85

		local ValHi1 = buffer.readu16(LutHi, P1_1 * 2)
		local ValLo1 = buffer.readu32(LutLo, P2_1 * 4)
		buffer.writeu32(Output, OutIdx, ValHi1 + ValLo1)
		buffer.writeu8(Output, OutIdx + 4, (R1_1 - P2_1 * 85) + 33)

		local ValHi2 = buffer.readu16(LutHi, P1_2 * 2)
		local ValLo2 = buffer.readu32(LutLo, P2_2 * 4)
		buffer.writeu32(Output, OutIdx + 5, ValHi2 + ValLo2)
		buffer.writeu8(Output, OutIdx + 9, (R1_2 - P2_2 * 85) + 33)

		local ValHi3 = buffer.readu16(LutHi, P1_3 * 2)
		local ValLo3 = buffer.readu32(LutLo, P2_3 * 4)
		buffer.writeu32(Output, OutIdx + 10, ValHi3 + ValLo3)
		buffer.writeu8(Output, OutIdx + 14, (R1_3 - P2_3 * 85) + 33)

		local ValHi4 = buffer.readu16(LutHi, P1_4 * 2)
		local ValLo4 = buffer.readu32(LutLo, P2_4 * 4)
		buffer.writeu32(Output, OutIdx + 15, ValHi4 + ValLo4)
		buffer.writeu8(Output, OutIdx + 19, (R1_4 - P2_4 * 85) + 33)

		OutIdx += 20
	end

	local InIdx = (OutIdx // 5) * 4
	while InIdx <= Len - 4 do
		local W = bit32.byteswap(buffer.readu32(Input, InIdx))
		local P1 = W // 614125
		local R1 = W - P1 * 614125
		local P2 = R1 // 85

		buffer.writeu32(Output, OutIdx, buffer.readu16(LutHi, P1 * 2) + buffer.readu32(LutLo, P2 * 4))
		buffer.writeu8(Output, OutIdx + 4, (R1 - P2 * 85) + 33)

		InIdx += 4
		OutIdx += 5
	end

	if Rem > 0 then
		local Val = 0
		for i = 0, Rem - 1 do
			Val = Val * 256 + buffer.readu8(Input, InIdx + i)
		end

		if Rem == 1 then
			Val *= 16777216 
		elseif Rem == 2 then
			Val *= 65536
		elseif Rem == 3 then
			Val *= 256
		end

		local P1 = Val // 614125
		local R1 = Val - P1 * 614125
		local P2 = R1 // 85

		local Part2 = buffer.readu32(LutLo, P2 * 4)
		buffer.writeu16(Output, OutIdx, buffer.readu16(LutHi, P1 * 2))
		if Rem > 1 then
			buffer.writeu8(Output, OutIdx + 2, (Part2 // 65536) % 256)
			if Rem > 2 then
				buffer.writeu8(Output, OutIdx + 3, Part2 // 16777216)
			end
		end
	end

	return Output
end

local function Decode(Input: buffer): buffer
	local Len = buffer.len(Input)
	if Len == 0 then return buffer.create(0) end

	local Rem = Len % 5
	local Output = buffer.create((Len // 5) * 4 + (if Rem > 0 then Rem - 1 else 0))

	local OutIdx = 0
	local UnrollLimit = Len - 20

	local DecA, DecB = DECODE_A, DECODE_B

	for i = 0, UnrollLimit, 20 do
		buffer.writeu32(Output, OutIdx, bit32.byteswap(
			buffer.readu32(DecA, buffer.readu16(Input, i) * 4)
				+ buffer.readu32(DecB, buffer.readu16(Input, i + 2) * 4)
				+ buffer.readu8(Input, i + 4))
		)

		buffer.writeu32(Output, OutIdx + 4, bit32.byteswap(
			buffer.readu32(DecA, buffer.readu16(Input, i + 5) * 4)
				+ buffer.readu32(DecB, buffer.readu16(Input, i + 7) * 4)
				+ buffer.readu8(Input, i + 9))
		)

		buffer.writeu32(Output, OutIdx + 8, bit32.byteswap(
			buffer.readu32(DecA, buffer.readu16(Input, i + 10) * 4)
				+ buffer.readu32(DecB, buffer.readu16(Input, i + 12) * 4)
				+ buffer.readu8(Input, i + 14))
		)

		buffer.writeu32(Output, OutIdx + 12, bit32.byteswap(
			buffer.readu32(DecA, buffer.readu16(Input, i + 15) * 4) 
				+ buffer.readu32(DecB, buffer.readu16(Input, i + 17) * 4) 
				+ buffer.readu8(Input, i + 19))
		)

		OutIdx += 16
	end

	local InIdx = (OutIdx // 4) * 5
	while InIdx <= Len - 5 do
		buffer.writeu32(Output, OutIdx, bit32.byteswap(
			buffer.readu32(DecA, buffer.readu16(Input, InIdx) * 4) 
				+ buffer.readu32(DecB, buffer.readu16(Input, InIdx + 2) * 4)
				+ buffer.readu8(Input, InIdx + 4))
		)

		InIdx += 5
		OutIdx += 4
	end

	if InIdx < Len then
		local CharsLeft = Len - InIdx
		local Sum = 0

		for i = 0, CharsLeft - 1 do
			local char = buffer.readu8(Input, InIdx + i)
			local val = char - 33

			if i == 0 then
				Sum += val * P4
			elseif i == 1 then
				Sum += val * P3
			elseif i == 2 then
				Sum += val * P2
			elseif i == 3 then
				Sum += val * P1
			end
		end

		for i = CharsLeft, 4 do
			if i == 1 then
				Sum += 84 * P3
			elseif i == 2 then
				Sum += 84 * P2
			elseif i == 3 then
				Sum += 84 * P1
			elseif i == 4 then
				Sum += 84
			end
		end

		buffer.writeu8(Output, OutIdx, bit32.extract(Sum, 24, 8))
		if CharsLeft > 2 then
			buffer.writeu8(Output, OutIdx + 1, bit32.extract(Sum, 16, 8))
			if CharsLeft > 3 then
				buffer.writeu8(Output, OutIdx + 2, bit32.extract(Sum, 8, 8))
			end
		end
	end

	return Output
end

return {
	Encode = Encode,
	Decode = Decode
}

1 Like

Small speed boost

--!optimize 2
--!native
--!strict

local POW1, POW2, POW3, POW4 = 85, 7225, 614125, 52200625

local ENC_LUT_HI = buffer.create(14000)
local ENC_LUT_LO = buffer.create(30000)
local ENC_LUT_5 = buffer.create(85)

local DECODE_A = buffer.create(262144)
local DECODE_B = buffer.create(262144)

do
	for i = 0, 6995 do
		buffer.writeu16(ENC_LUT_HI, i * 2, (i // 85 + 33) + (i % 85 + 33) * 256)
	end

	for i = 0, 7224 do
		buffer.writeu32(ENC_LUT_LO, i * 4, ((i // 85 + 33) + (i % 85 + 33) * 256) * 65536)
	end

	for i = 0, 84 do
		buffer.writeu8(ENC_LUT_5, i, i + 33)
	end

	for c1 = 0, 255 do
		local v1 = c1 - 33
		if v1 < 0 or v1 > 84 then continue end

		for c2 = 0, 255 do
			local v2 = c2 - 33
			if v2 < 0 or v2 > 84 then continue end

			local key = (c1 + c2 * 256) * 4
			buffer.writeu32(DECODE_A, key, v1 * POW4 + v2 * POW3 - 33)
			buffer.writeu32(DECODE_B, key, v1 * POW2 + v2 * POW1)
		end
	end
end

local function Encode(Input: buffer): buffer
	local Len = buffer.len(Input)
	if Len == 0 then return buffer.create(0) end

	local Rem = Len % 4
	local Output = buffer.create(Len // 4 * 5 + (if Rem > 0 then Rem + 1 else 0))

	local OutIdx = 0
	local UnrollLimit = Len - 16

	local LutHi = ENC_LUT_HI
	local LutLo = ENC_LUT_LO
	local Lut5 = ENC_LUT_5

	for i = 0, UnrollLimit, 16 do
		local W1 = bit32.byteswap(buffer.readu32(Input, i))
		local W2 = bit32.byteswap(buffer.readu32(Input, i + 4))
		local W3 = bit32.byteswap(buffer.readu32(Input, i + 8))
		local W4 = bit32.byteswap(buffer.readu32(Input, i + 12))

		local Hi1 = W1 // 614125
		local Lo1 = W1 - Hi1 * 614125
		local Mid1 = Lo1 // 85

		local Hi2 = W2 // 614125
		local Lo2 = W2 - Hi2 * 614125
		local Mid2 = Lo2 // 85

		local Hi3 = W3 // 614125
		local Lo3 = W3 - Hi3 * 614125
		local Mid3 = Lo3 // 85

		local Hi4 = W4 // 614125
		local Lo4 = W4 - Hi4 * 614125
		local Mid4 = Lo4 // 85

		buffer.writeu32(Output, OutIdx, buffer.readu16(LutHi, Hi1 * 2) + buffer.readu32(LutLo, Mid1 * 4))
		buffer.writeu8(Output, OutIdx + 4, buffer.readu8(Lut5, Lo1 - Mid1 * 85))

		buffer.writeu32(Output, OutIdx + 5, buffer.readu16(LutHi, Hi2 * 2) + buffer.readu32(LutLo, Mid2 * 4))
		buffer.writeu8(Output, OutIdx + 9, buffer.readu8(Lut5, Lo2 - Mid2 * 85))

		buffer.writeu32(Output, OutIdx + 10, buffer.readu16(LutHi, Hi3 * 2) + buffer.readu32(LutLo, Mid3 * 4))
		buffer.writeu8(Output, OutIdx + 14, buffer.readu8(Lut5, Lo3 - Mid3 * 85))

		buffer.writeu32(Output, OutIdx + 15, buffer.readu16(LutHi, Hi4 * 2) + buffer.readu32(LutLo, Mid4 * 4))
		buffer.writeu8(Output, OutIdx + 19, buffer.readu8(Lut5, Lo4 - Mid4 * 85))

		OutIdx += 20
	end

	local InIdx = (OutIdx // 5) * 4
	while InIdx <= Len - 4 do
		local W = bit32.byteswap(buffer.readu32(Input, InIdx))
		local Hi = W // 614125
		local Lo = W - Hi * 614125
		local Mid = Lo // 85

		buffer.writeu32(Output, OutIdx, buffer.readu16(LutHi, Hi * 2) + buffer.readu32(LutLo, Mid * 4))
		buffer.writeu8(Output, OutIdx + 4, buffer.readu8(Lut5, Lo - Mid * 85))

		InIdx += 4
		OutIdx += 5
	end

	if Rem > 0 then
		local Val = buffer.readu8(Input, InIdx)
		if Rem > 1 then
			Val = Val * 256 + buffer.readu8(Input, InIdx + 1)
			if Rem > 2 then
				Val = Val * 256 + buffer.readu8(Input, InIdx + 2)
				Val *= 256
			else
				Val *= 65536
			end
		else
			Val *= 16777216
		end

		local Hi = Val // 614125
		local Lo = Val - Hi * 614125
		local Mid = Lo // 85

		local HiPart = buffer.readu16(LutHi, Hi * 2)
		local LoPart = buffer.readu32(LutLo, Mid * 4)

		buffer.writeu8(Output, OutIdx, bit32.band(HiPart, 0xFF))
		buffer.writeu8(Output, OutIdx + 1, bit32.rshift(HiPart, 8))
		if Rem > 1 then
			buffer.writeu8(Output, OutIdx + 2, bit32.band(bit32.rshift(LoPart, 16), 0xFF))
			if Rem > 2 then
				buffer.writeu8(Output, OutIdx + 3, bit32.rshift(LoPart, 24))
			end
		end
	end

	return Output
end

local function Decode(Input: buffer): buffer
	local Len = buffer.len(Input)
	if Len == 0 then return buffer.create(0) end

	local Rem = Len % 5
	local Output = buffer.create((Len // 5) * 4 + (if Rem > 0 then Rem - 1 else 0))

	local OutIdx = 0
	local UnrollLimit = Len - 20

	local DecA = DECODE_A
	local DecB = DECODE_B

	for i = 0, UnrollLimit, 20 do
		local Key00 = buffer.readu16(Input, i) * 4
		local Key01 = buffer.readu16(Input, i + 2) * 4
		local Key10 = buffer.readu16(Input, i + 5) * 4
		local Key11 = buffer.readu16(Input, i + 7) * 4
		local Key20 = buffer.readu16(Input, i + 10) * 4
		local Key21 = buffer.readu16(Input, i + 12) * 4
		local Key30 = buffer.readu16(Input, i + 15) * 4
		local Key31 = buffer.readu16(Input, i + 17) * 4

		buffer.writeu32(Output, OutIdx, bit32.byteswap(
			buffer.readu32(DecA, Key00) + buffer.readu32(DecB, Key01) + buffer.readu8(Input, i + 4)))

		buffer.writeu32(Output, OutIdx + 4, bit32.byteswap(
			buffer.readu32(DecA, Key10) + buffer.readu32(DecB, Key11) + buffer.readu8(Input, i + 9)))

		buffer.writeu32(Output, OutIdx + 8, bit32.byteswap(
			buffer.readu32(DecA, Key20) + buffer.readu32(DecB, Key21) + buffer.readu8(Input, i + 14)))

		buffer.writeu32(Output, OutIdx + 12, bit32.byteswap(
			buffer.readu32(DecA, Key30) + buffer.readu32(DecB, Key31) + buffer.readu8(Input, i + 19)))

		OutIdx += 16
	end

	local InIdx = (OutIdx // 4) * 5
	while InIdx <= Len - 5 do
		local Key0 = buffer.readu16(Input, InIdx) * 4
		local Key1 = buffer.readu16(Input, InIdx + 2) * 4

		buffer.writeu32(Output, OutIdx, bit32.byteswap(
			buffer.readu32(DecA, Key0) + buffer.readu32(DecB, Key1) + buffer.readu8(Input, InIdx + 4)))

		InIdx += 5
		OutIdx += 4
	end

	if Rem > 0 then
		local C0 = buffer.readu8(Input, InIdx) - 33
		local C1 = if Rem > 1 then buffer.readu8(Input, InIdx + 1) - 33 else 84
		local C2 = if Rem > 2 then buffer.readu8(Input, InIdx + 2) - 33 else 84
		local C3 = if Rem > 3 then buffer.readu8(Input, InIdx + 3) - 33 else 84

		local Sum = C0 * POW4 + C1 * POW3 + C2 * POW2 + C3 * POW1 + 84

		buffer.writeu8(Output, OutIdx, bit32.rshift(Sum, 24))
		if Rem > 2 then
			buffer.writeu8(Output, OutIdx + 1, bit32.band(bit32.rshift(Sum, 16), 0xFF))
			if Rem > 3 then
				buffer.writeu8(Output, OutIdx + 2, bit32.band(bit32.rshift(Sum, 8), 0xFF))
			end
		end
	end

	return Output
end

return {
	Encode = Encode,
	Decode = Decode
}
1 Like

Thanks for the feedback! But there is something wrong when it come to my device:

Your decode seems a little bit faster after some test

But Encode are weirdly random


Like sometimes New wins Old, Old wins New, But Old usually faster. averagely. It seem to depend on type of devices and memory you are in. But I will try combine with your decode, and my encode

Yeah at this point it comes down to hardware, for me its faster on amd but as you said it can be slower on intel, its already very well optimized

1 Like