The road to 10 MHz: low level/more in depth lua optimization techniques

Good {{timeOfDay}} ! This is my first topic :smiley:
I am making a 16 bit CPU Emulator, and the current “clock speed” is around 6MHz with native code generation (really dependent on the computer running the emulator, you are more than welcome to post your results)
The problem is that it is not enough. The aim, as said in the title, is 10 MHz.

So, the first big question is “How can I optimize this further?”, given that I’ve already implemented all the changes described in the holy Roberto Ierusalimschy’s text https://www.lua.org/gems/sample.pdf. Localizing methods alone made a leap from 2MHz to 3.5MHz.

The second big question that had been lingering in my thoughts is as simple as “What’s the theoretical speed limit for a lua CPU emulator?”, however it is not the main focus of this post.

The actual code. Readability destroyed for speed. If you want something to be explained in the code/about the cpu then you’re welcome to ask!

--!native
-- Register file: 12 registers of 2 bytes each; register n lives at byte offset n*2.
local REGS = buffer.create(12*2) 

-- Register names in register-index order. The trailing comments give the 0-based
-- register index used by the emulator, so REGrepr[n+1] names register n.
local REGrepr = {
	"A", -- 0
	"B", -- 1
	"C", -- 2
	"D", -- 3
	"IP", -- 4
	"SS", -- 5
	"SO", -- 6
	"MS", -- 7
	"MO", -- 8
	"I", -- 9
	"O", -- 10
	"ST" -- 11 (0 0 0 0  0 0 0 0  0 0 0 0  0 0 CARRY SIGNED)
}

-- Emulated memory. LoadInstructions writes the program from offset 0, 8 bytes per instruction.
local RAM = buffer.create(65536) -- 64(0) kb is enough for everyone

-- Assembler module; Emulator.LoadInstructions runs its lex -> parse -> assemble pipeline.
local Parser = require(game:GetService("ReplicatedStorage"):WaitForChild("ZASM"))

-- Local aliases for the bit32/buffer functions used by the register, memory and instruction code.
local b32extract = bit32.extract
local readi = buffer.readi16
local writei = buffer.writei16
local readu = buffer.readu16
local writeu = buffer.writeu16
local band = bit32.band
local bor = bit32.bor

-- Cached signedness mode; writeREG refreshes it from bit 0 of ST whenever register 11 is written.
local is_signed = false

-- Reads the 16-bit value of register `reg` (0-based register index).
-- forceSign: true reads the value as signed, false reads it as unsigned;
-- when omitted (nil) the cached ST signedness mode `is_signed` decides.
-- Returns the register value as a number.
local function readREG(reg,forceSign)
	local signed = forceSign
	if signed == nil then
		-- `forceSign or is_signed` would throw away an explicit `false`, so test for nil instead
		signed = is_signed
	end
	return if signed then readi(REGS,reg*2) else readu(REGS,reg*2)
end

-- Writes `value` into register `reg` (0-based register index) as a 16-bit integer.
-- forceSign: true writes with the signed writer, false with the unsigned writer;
-- when omitted (nil) the cached ST signedness mode `is_signed` decides.
-- Writing register 11 (ST) refreshes the cached mode from ST bit 0.
local function writeREG(reg,value,forceSign)
	local signed = forceSign
	if signed == nil then
		-- `forceSign or is_signed` would throw away an explicit `false`, so test for nil instead
		signed = is_signed
	end
	if signed then writei(REGS,reg*2,value) else writeu(REGS,reg*2,value) end
	if reg == 11 then
		-- keep is_signed a real boolean: band() returns 0 or 1 and the number 0 is truthy in Lua
		is_signed = band(readu(REGS,22),1) ~= 0
	end
end

-- Reads the 16-bit value stored at byte offset `addr` of RAM.
-- forceSign: true reads the value as signed, false reads it as unsigned;
-- when omitted (nil) the cached ST signedness mode `is_signed` decides.
-- Returns the value as a number.
local function readRAM(addr,forceSign)
	local signed = forceSign
	if signed == nil then
		-- `forceSign or is_signed` would throw away an explicit `false`, so test for nil instead
		signed = is_signed
	end
	return if signed then readi(RAM,addr) else readu(RAM,addr)
end

-- Writes `value` as a 16-bit integer at byte offset `addr` of RAM.
-- forceSign: true writes with the signed writer, false with the unsigned writer;
-- when omitted (nil) the cached ST signedness mode `is_signed` decides.
local function writeRAM(addr,value,forceSign)
	local signed = forceSign
	if signed == nil then
		-- `forceSign or is_signed` would throw away an explicit `false`, so test for nil instead
		signed = is_signed
	end
	if signed then writei(RAM,addr,value) else writeu(RAM,addr,value) end
end

-- Resolves an instruction operand to its value.
-- f:    the instruction's flag bits
-- a:    the raw operand word
-- fbit: index of the flag bit that describes this operand
-- When bit `fbit` of `f` is 1 the operand is an immediate and is returned as is;
-- otherwise it is a register index and that register is read in the current signedness mode.
local function R_I(f,a,fbit)
	return if b32extract(f,fbit,1)==1 then a else readREG(a)
end

-- Builds a conditional-jump instruction handler from a comparison function.
-- compF(x, y) receives the resolved `a` and `b` operands and returns whether to jump.
-- The returned handler takes (f, a, b, c) and, when compF accepts, sets IP to `c`.
-- Raises "Jump offset out of range" when a taken jump targets an instruction outside RAM.
local function jumpDecorate(compF)
	return function(f,a,b,c)
		if compF(R_I(f,a,0), R_I(f,b,1)) then
			-- Emulator.Step fetches 8 bytes starting at IP*8, so the whole target instruction must lie inside RAM
			if c*8+8 > buffer.len(RAM) then
				error("Jump offset out of range")
			end
			writeREG(4,c,false) -- IP = c
		end
	end
end

-- Wraps a two-argument bitwise function into an instruction handler.
-- The handler resolves operand `a` (flag bit 0) and operand `b` (flag bit 1),
-- applies logicF to them and stores the result in the register named by the third operand.
local function logicDecorate(logicF)
	local function handler(flags,lhs,rhs,dest)
		local left = R_I(flags,lhs,0)
		local right = R_I(flags,rhs,1)
		writeREG(dest,logicF(left,right))
	end
	return handler
end

-- Instruction handlers; Emulator.Step picks the handler at index opcode + 1.
-- Every handler receives (f, a, b, c): the instruction's flag bits followed by its three operand words.
-- Flag bit 0 marks `a` as an immediate and flag bit 1 marks `b` as an immediate;
-- an operand that is not an immediate is a register index (see R_I).
local instrs = {

	function(f,a,b,c) -- mov: register b = a
		writeREG(b,R_I(f,a,0))
	end,
	function(f,a,b,c) -- add: register c = a + b
		local value = R_I(f,a,0)+R_I(f,b,1)
		-- largest value a 16-bit register can hold in the current mode
		local maxVal = if is_signed then 32767 else 65535
		if value > maxVal then
			writeREG(11,bor(readREG(11,false),0b10),false) -- ST = xxxx xxxx xxxx xx1x
		end
		-- wrap to 16 bits; % is floored in Lua, so a negative sum maps onto its two's-complement bit pattern
		writeREG(c,value%65536)
	end,
	function(f,a,b,c) -- sub: register c = a - b
		local value = R_I(f,a,0)-R_I(f,b,1)
		writeREG(c,value)
	end,
	function(f,a,b,c) -- mul: high 16 bits of a * b go to C (2), low 16 bits to D (3)
		local value = R_I(f,a,0)*R_I(f,b,1)
		writeREG(2,b32extract(value,16,16))
		writeREG(3,b32extract(value,0,16))
	end,

	-- bitwise instructions: register c = op(a, b)
	logicDecorate(bit32.band),
	logicDecorate(bit32.bor),
	logicDecorate(bit32.bxor),
	logicDecorate(bit32.bnot),

	-- jumps: IP = c when the comparison of a and b holds (the first one always jumps)
	jumpDecorate(function(a,b) return true end),
	jumpDecorate(function(a,b) return a < b  end),
	jumpDecorate(function(a,b) return a <= b  end),
	jumpDecorate(function(a,b) return a > b  end),
	jumpDecorate(function(a,b) return a >= b  end),
	jumpDecorate(function(a,b) return a == b  end),
	jumpDecorate(function(a,b) return a ~= b  end),

	function(f,a,b,c) -- save: RAM[MS + MO] = a
		local addr = readREG(7,false)+readREG(8,false)
		-- a 16-bit access touches addr and addr+1, so the last valid address is 65534
		assert(addr <= 65534,"Memory address is out of bounds")

		writeRAM(addr,R_I(f,a,0),false)
	end,
	function(f,a,b,c) -- load: register a = RAM[MS + MO]
		local addr = readREG(7,false)+readREG(8,false)
		assert(addr <= 65534,"Memory address is out of bounds")

		writeREG(a,readRAM(addr,false))
	end,

	function(f,a,b,c) -- push: RAM[SS + SO] = a, then SO moves up one word
		local addr = readREG(5,false)+readREG(6,false)
		assert(addr <= 65534,"Memory address is out of bounds")

		writeRAM(addr,R_I(f,a,0),false)
		-- each stack entry is a 2-byte word; stepping by 1 would overlap consecutive entries
		writeREG(6,readREG(6,false)+2,false)
	end,
	function(f,a,b,c) -- pop: SO moves down one word, then register a = RAM[SS + SO]
		writeREG(6,readREG(6,false)-2,false)

		local addr = readREG(5,false)+readREG(6,false)
		assert(addr <= 65534,"Memory address is out of bounds")

		writeREG(a,readRAM(addr,false))
	end,
	function(f,a,b,c) -- halt: tells the run loop in Emulator.Start to stop
		return 'HALT'
	end
}
local Emulator = {}

-- Executes a single instruction.
-- Reads IP, advances it by one, then decodes the 8-byte instruction at IP*8:
-- word 0 holds the opcode in bits 0-12 and the flags in bits 13-15, words 1-3 are the operands.
-- Returns whatever the instruction handler returns ('HALT' for the halt instruction).
-- Raises an error when the opcode has no handler.
function Emulator.Step()
	local IP = readREG(4,false)
	writeREG(4,IP+1,false)
	IP*=8
	local opcode = readu(RAM,IP)
	local instr = instrs[b32extract(opcode,0,13)+1]
	if instr == nil then
		-- build the message only on failure: an assert() argument would be concatenated on every step
		error("Unknown opcode ".. opcode .." at address ".. IP, 0)
	end
	return instr(b32extract(opcode,13,3),readu(RAM,IP+2),readu(RAM,IP+4),readu(RAM,IP+6))
end

-- Restores the emulator to its initial state: fresh zeroed registers and RAM,
-- unsigned mode, and the default stack and heap segment registers.
function Emulator.Reset()
	REGS = buffer.create(12*2)
	RAM = buffer.create(65536)
	-- ST is zero in the fresh register file, so the cached signedness mode must be cleared with it
	is_signed = false
	writeREG(5,16384,false) -- stack starts at 16384 bytes (8192 words)
	writeREG(7,32768,false) -- program heap starts at 32768 bytes (16384 words)
end

-- Assembles the source text `str` with the ZASM parser and copies the result into RAM.
-- Each assembled instruction provides four 16-bit words (indices 1-4); they are written
-- back to back, so every instruction occupies 8 bytes, starting at offset 0.
function Emulator.LoadInstructions(str)
	local program = Parser.assemble(Parser.parse(Parser.lex(str)))
	local base = 0
	for _,words in program do
		-- word k of the instruction lands at byte offset base + (k-1)*2
		for slot = 0,3 do
			writeu(RAM,base+slot*2,words[slot+1])
		end
		base += 8
	end
end
-- Runs the emulator in a spawned task: after task.desynchronize(), instructions
-- are stepped until one of them returns "HALT".
function Emulator.Start()
	local function run()
		task.desynchronize()
		repeat
			local status = Emulator.Step()
		until status == "HALT"
	end
	task.spawn(run)
end

return Emulator

7 Likes

Try adding type checking to each function.
Also please dont use

Because it does the opposite of optimization.
Easy solution would be adding

--!optimize 2
--!strict
--!native

And make sure nothing errors in strict mode and you should be fine.

1 Like

1. Type checking and --!strict gave 0 new performance
2. --!optimize 2 I think gave a bit of performance
3. The thing about not using local variables for function references decreases the performance by about 1.5MHz. The reason is simple, and im going to quote the holy text of texexec:

Access to external locals (that is, variables that are local to an enclosing
function) is not as fast as access to local variables, but it is still faster than
access to globals. Consider the next fragment:

-- Adds math.sin(i) to x for i = 1..1000000 and returns the total;
-- math.sin is looked up on every iteration.
function foo (x)
for i = 1, 1000000 do
x = x + math.sin(i)
end
return x
end
print(foo(10))

We can optimize it by declaring sin once, outside function foo:

-- Same loop as before, but math.sin is stored once in the local `sin`,
-- declared outside foo, and the loop calls that local instead.
local sin = math.sin
function foo (x)
for i = 1, 1000000 do
x = x + sin(i)
end
return x
end
print(foo(10))

This second code runs 30% faster than the original one.

wrong
--!optimize 2 does that thing for you
You basically fully annihilate performance by doing it manually

bring readi, writei, bit32 ops, buffer tables etc. into locals at the top so Lua doesn’t chase globals, then turn your readREG/writeREG calls into inline code in the arithmetic ops, cut that extra function call and replace the 13-bit b32extract + table lookup with a straight direct dispatch
e.g. goto or a big if chain so you skip one level of indirection, emit two or four instructions at once where possible or unroll tiny loops in your arithmetic so you do fewer loop control checks,
you can pull your signed/unsigned flag logic out of the inner loop & track a Boolean and select the right bit-op variants once, not every instruction, trust your emulator invariants,
remove redundant assert(addr ≤ 65535) inside hot code and do one big check at load time.
That should hopefully give you that jump from 6 MHz toward 10 MHz :+1:

1 Like

Thank you for your suggestions :​D
But could you please elaborate on… literally everything you’ve said? And focusing more on how to do not what to do would be helpful :3

After reading your comment more carefully, I think I understood a bit of what you were trying to say. And so, I sacrificed readability and threw away the principles of “clean code” to turn all instructions into one giant if.
Here is the new speed (in MHz)
image
I hereby declare my goals achieved, and this topic - closed.
Thank you, for all who had helped me :​D

3 Likes

Sorry to bump a thread with a solution; however, I will tell you that localising does not matter in Luau. This is a different language altogether; we have plenty of optimisations in it, and going to Lua ones isn’t particularly the best choice for performance guides, just something I thought should have been said.

oh, thanks for the info
but still, i think typing band is faster than bit32.band

1 Like

Typing-wise, likely yes; performance wise, unlikely much