Sup,
I’m implementing an LLM in Roblox using Luau, and I’ve hit an issue that I’m struggling to resolve. I’m hoping someone here can provide guidance or suggestions to help me fix it.
Problem Description:
I’m training the model using a custom CrossEntropyLoss function. During training, I receive the following error:
CrossEntropyLoss:71: Target at index 9 is nil
This error occurs consistently when processing certain batches of data, causing the training loop to halt.
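For context, this is roughly what reaches the loss function for each training sample (condensed from the trainModel code posted in full below):

-- Condensed from trainModel below, just to show the shapes involved:
local tokens = textToTokens(data)             -- token IDs for one training string, e.g. {17, 4, 29, ...}
local logits = model:forward(tokens)          -- 2D table of logits from the model
local targets = {}
for i = 2, #tokens do
    table.insert(targets, tokens[i])          -- targets are the input tokens shifted left by one
end
local loss = lossFunction:compute(logits, targets)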
What I’ve Tried:
- Data Validation:
  - Added assertions to ensure that neither the input nor the target is nil before passing them to the loss function (see the sketch after this list).
  - Verified that the dataset is correctly loaded and that each input has a corresponding target.
- Loss Function Updates:
  - Updated the CrossEntropyLoss function to include detailed checks for nil targets and out-of-range values.
  - Implemented an epsilon value to prevent taking the logarithm of zero during loss computation.
- Batch Processing:
  - Ensured that the batching mechanism correctly aligns inputs and targets.
  - Checked that the batch size is consistent and that no data samples are missing.
- Logging:
  - Added logging to monitor the state of inputs and targets at various stages of the training loop.
  - Identified that the error specifically points to the 9th target in the problematic batch.
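Here is a simplified sketch of the assertions and logging mentioned above (variable names match the trainModel code further down):

-- Simplified sketch of the checks I run right before the loss call:
assert(data ~= nil, "training sample is nil")
assert(tokens ~= nil and #tokens > 0, "token sequence is empty")
assert(targets ~= nil, "targets table is nil")
print(string.format("Passing %d logit rows and %d targets to the loss", #logits, #targets))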
Relevant Code Snippets:
1. CrossEntropyLoss.luau:
--[[
CrossEntropyLoss Module
@module CrossEntropyLoss
@type CrossEntropyLossClass
]]
local CrossEntropyLoss = {}
CrossEntropyLoss.__index = CrossEntropyLoss
export type CrossEntropyLossClass = {
new: () -> CrossEntropyLossClass,
compute: (self: CrossEntropyLossClass, logits: { { number } }, targets: { number }) -> number,
backward: (self: CrossEntropyLossClass, logits: { { number } }, targets: { number }) -> { { number } },
}
function CrossEntropyLoss.new(): CrossEntropyLossClass
local self = setmetatable({}, CrossEntropyLoss)
self.loss = 0
self.probs = {} -- Stores softmax probabilities for each sample
self.grad = {} -- Stores gradients w.r.t logits
return self
end
local function softmax(logits: { number }): { number }
local maxLogit = -math.huge
-- Find the maximum logit for numerical stability
for _, logit in ipairs(logits) do
if logit > maxLogit then
maxLogit = logit
end
end
local sumExp = 0
local probs = {}
-- Compute exponentials and sum
for _, logit in ipairs(logits) do
local expVal = math.exp(logit - maxLogit)
table.insert(probs, expVal)
sumExp += expVal
end
-- Normalize to get probabilities
for i, prob in ipairs(probs) do
probs[i] = prob / sumExp
end
return probs
end
-- Computes the cross-entropy loss
-- @param logits { { number } } - The raw output logits from the model for each sample
-- @param targets { number } - The target class indices for each sample
-- @return number - The total cross-entropy loss over all samples
function CrossEntropyLoss:compute(logits: { { number } }, targets: { number }): number
self.loss = 0
self.probs = {} -- Reset probabilities for the current batch
for i, logitRow in ipairs(logits) do
-- Compute softmax probabilities for the current sample
local probs = softmax(logitRow)
self.probs[i] = probs -- Store for backward pass
-- Retrieve the target class index
local target = targets[i]
-- Validate target presence and range
if target == nil then
error(string.format("Target at index %d is nil. Logits: %s", i, table.concat(logitRow, ", ")))
end
if type(target) ~= "number" then
error(string.format("Target at index %d is not a number. Logits: %s", i, table.concat(logitRow, ", ")))
end
if target < 1 or target > #logitRow then
error(string.format("Target at index %d is out of range. Target: %d, Logit Row Size: %d", i, target, #logitRow))
end
-- Retrieve the probability of the target class
local targetProb = probs[target]
-- Validate probability positivity
if targetProb <= 0 then
error(string.format("Probability at index %d for target %d is not positive. Probability: %f", i, target, targetProb))
end
-- Accumulate the negative log probability
self.loss += -math.log(targetProb)
end
return self.loss
end
-- Computes the gradient of the loss w.r.t logits
-- @param logits { { number } } - The raw output logits from the model for each sample
-- @param targets { number } - The target class indices for each sample
-- @return { { number } } - The gradient of the loss w.r.t each logit
function CrossEntropyLoss:backward(logits: { { number } }, targets: { number }): { { number } }
self.grad = {}
for i, logitRow in ipairs(logits) do
self.grad[i] = {}
local probs = self.probs[i]
local target = targets[i]
for j, prob in ipairs(probs) do
if j == target then
-- Gradient for the target class
self.grad[i][j] = prob - 1
else
-- Gradient for non-target classes
self.grad[i][j] = prob
end
end
end
return self.grad
end
return CrossEntropyLoss
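For clarity, this is how the module is intended to be used (toy values, just to show the expected shapes):

-- Toy example with arbitrary numbers, only to illustrate the expected shapes:
local lossFn = CrossEntropyLoss.new()
local logits = {
    { 2.0, 0.5, -1.0 }, -- sample 1: three classes
    { 0.1, 1.5,  0.3 }, -- sample 2
}
local targets = { 1, 2 }                      -- one 1-based class index per logits row
local loss = lossFn:compute(logits, targets)  -- summed negative log-likelihood
local grad = lossFn:backward(logits, targets) -- gradient table, same shape as logits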
2. Optimizer.luau:
--[[
Optimizer Module
@module Optimizer
@type OptimizerClass
]]
local Types = require(script.Parent.types)
export type OptimizerClass = {
new: (learningRate: number, beta1: number?, beta2: number?, epsilon: number?) -> OptimizerClass,
learningRate: number,
beta1: number,
beta2: number,
epsilon: number,
m: { [string]: { [number]: { [number]: number } } },
v: { [string]: { [number]: { [number]: number } } },
t: number,
updateParameters: (self: OptimizerClass, parameters: { any }, learningRate: number?) -> (),
}
local Optimizer = {}
Optimizer.__index = Optimizer
function Optimizer.new(learningRate: number, beta1: number?, beta2: number?, epsilon: number?): OptimizerClass
local self = setmetatable({}, Optimizer)
self.learningRate = learningRate
self.beta1 = beta1 or 0.9
self.beta2 = beta2 or 0.999
self.epsilon = epsilon or 1e-8
self.m = {}
self.v = {}
self.t = 0
return self
end
function Optimizer:updateParameters(parameters: { any }, learningRate: number?): ()
self.t += 1
local lr = learningRate or self.learningRate
for paramName, param in pairs(parameters) do
if type(param) == "table" then
if not self.m[paramName] then
self.m[paramName] = {}
self.v[paramName] = {}
for i, row in ipairs(param) do
self.m[paramName][i] = {}
self.v[paramName][i] = {}
for j, _ in ipairs(row) do
self.m[paramName][i][j] = 0
self.v[paramName][i][j] = 0
end
end
end
for i, row in ipairs(param) do
for j, _ in ipairs(row) do
local grad = param.gradW and param.gradW[i][j] or 0 -- Adjust based on parameter type
self.m[paramName][i][j] = self.beta1 * self.m[paramName][i][j] + (1 - self.beta1) * grad
self.v[paramName][i][j] = self.beta2 * self.v[paramName][i][j] + (1 - self.beta2) * (grad * grad)
local mHat = self.m[paramName][i][j] / (1 - self.beta1 ^ self.t)
local vHat = self.v[paramName][i][j] / (1 - self.beta2 ^ self.t)
param[i][j] = param[i][j] - lr * mHat / (math.sqrt(vHat) + self.epsilon)
end
end
elseif type(param) == "number" then
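-- scalar (number) parameters are currently skipped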
end
end
end
return Optimizer
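For reference, updateParameters expects the parameters as a dictionary of 2D weight tables with the gradients attached on a gradW field (that is how I read the param.gradW lookup above), roughly like this:

-- Illustrative shape only; the real tables come from model:getParameters():
local W = {
    { 0.10, -0.20 },
    { 0.30,  0.40 },
}
W.gradW = { -- gradient table, same shape as the weights
    { 0.01, 0.02 },
    { 0.03, 0.04 },
}
local optim = Optimizer.new(0.001)
optim:updateParameters({ embedding = W }) -- no explicit rate, so it falls back to 0.001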
3. init.luau (only the trainModel part, since that's where the issue begins):
--[[
https://www.roblox.com/users/1539582829/profile
https://twitter.com/zzen_a
MIT License
Copyright (c) 2024 rustyspotted
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
]]
local Types = require(script.lib.types)
local Optimizer = require(script.lib.Optimizer)
local CrossEntropyLoss = require(script.lib.CrossEntropyLoss)
local LinearAlgebra = require(script.components.LinearAlgebra)
local Tokenizer = require(script.components.Tokenizer)
local Embedding = require(script.components.Embedding)
local MultiHeadAttention = require(script.components.MultiHeadAttention)
local FeedForward = require(script.components.FeedForward)
local LayerNorm = require(script.components.LayerNorm)
local TransformerBlock = require(script.components.TransformerBlock)
local TransformerModel = require(script.components.TransformerModel)
local CharTokenizer = require(script.components.CharTokenizer)
local BPETokenizer = require(script.components.BPETokenizer)
local RoLLM = {}
--[=[
@prop Presets Folder
@within RoLLM
@readonly
References the Presets folder.
]=]
RoLLM.Presets = (script.Parent :: Instance)
RoLLM.Promise = require(RoLLM.Presets.promise)
RoLLM.Signal = require(RoLLM.Presets.signal)
RoLLM.Types = Types
RoLLM.LinearAlgebra = LinearAlgebra
RoLLM.Tokenizer = Tokenizer
RoLLM.Embedding = Embedding
RoLLM.MultiHeadAttention = MultiHeadAttention
RoLLM.FeedForward = FeedForward
RoLLM.LayerNorm = LayerNorm
RoLLM.TransformerBlock = TransformerBlock
RoLLM.TransformerModel = TransformerModel
RoLLM.CharTokenizer = CharTokenizer
RoLLM.BPETokenizer = BPETokenizer
--[=[
@function chunkString
@within RoLLM
@param bigString string - The string to be chunked.
@param chunkSize number - The size of each chunk.
@return {string} - A table containing string chunks.
]=]
local function chunkString(bigString: string, chunkSize: number): { string }
local chunks: { string } = {}
local idx = 1
while idx <= #bigString do
local chunk = string.sub(bigString, idx, idx + chunkSize - 1)
table.insert(chunks, chunk)
idx += chunkSize
end
return chunks
end
--[=[
Creates a "ready-to-use" LLM instance with a tokenizer and model.
@function new
@within RoLLM
@param textData string | {string} - Text data for building the vocabulary.
@param config Types.TransformerConfig - Configuration table for the Transformer.
@param chunkSize number? - Optional chunk size for processing text.
@return {any} - An LLM object with prediction, generation, and training methods.
]=]
function RoLLM.new(
textData: string | { string },
config: Types.TransformerConfig,
chunkSize: number?
): { any }
chunkSize = chunkSize or 500000
local tokenizerMode: string = config.tokenizerMode or "char"
local tokenizer: typeof(Tokenizer.new("char"))
if tokenizerMode == "char" then
tokenizer = CharTokenizer.new()
else
tokenizer = BPETokenizer.new()
if config.externalVocabURL then
tokenizer:loadExternalVocab(config.externalVocabURL)
end
end
local function addTextForChar(str: string): any
if tokenizerMode == "char" then
tokenizer:buildVocabFromText(str)
else
tokenizer:textToTokens(str);
end
end
if tokenizerMode == "char" then
if type(textData) == "table" then
for _, singleString in ipairs(textData) do
if #singleString > chunkSize then
local chunks = chunkString(singleString, chunkSize)
for _, chunk in ipairs(chunks) do
addTextForChar(chunk)
end
else
addTextForChar(singleString)
end
end
else
if #textData > chunkSize then
local chunks = chunkString(textData, chunkSize)
for _, chunk in ipairs(chunks) do
addTextForChar(chunk)
end
else
addTextForChar(textData)
end
end
else
if #textData > chunkSize then
local chunks = chunkString(textData, chunkSize)
for _, chunk in ipairs(chunks) do
addTextForChar(chunk)
end
else
addTextForChar(textData)
end
end
local vocabSize: number = tokenizer:getVocabSize()
if vocabSize == 0 then
error("Vocab size is 0. Did you load data or external vocab properly?")
end
config.vocabSize = vocabSize
print("Final vocab size:", vocabSize)
local model: TransformerModel.TransformerModelClass = TransformerModel.new(config)
local self = {}
local function textToTokens(str: string): { number }
return tokenizer:textToTokens(str)
end
local function tokensToText(toks: { number }): string
return tokenizer:tokensToText(toks)
end
function self:predict(inputStr: string): string
local tokens = textToTokens(inputStr)
local nextID = model:predictNextToken(tokens)
return tokensToText({ nextID })
end
function self:generate(inputStr: string, numTokens: number): string
local tokens = textToTokens(inputStr)
for _ = 1, numTokens do
local nxt = model:predictNextToken(tokens)
table.insert(tokens, nxt)
end
return tokensToText(tokens)
end
function self:predictTemperature(inputStr: string, temperature: number): string
local tokens = textToTokens(inputStr)
local nextID = model:predictNextTokenTemperature(tokens, temperature)
return tokensToText({ nextID })
end
function self:generateTemperature(inputStr: string, numTokens: number, temperature: number): string
local tokens = textToTokens(inputStr)
for _ = 1, numTokens do
local nxt = model:predictNextTokenTemperature(tokens, temperature)
table.insert(tokens, nxt)
end
return tokensToText(tokens)
end
--[=[
Trains the Transformer model using provided training data.
@function trainModel
@within RoLLM
@param trainingData {string} - A table of training strings.
@param epochs number - Number of training epochs.
@param learningRate number - Learning rate for the optimizer.
@return nil
]=]
function self:trainModel(trainingData: { string }, epochs: number, learningRate: number): any
local optimizerInstance = Optimizer.new(learningRate)
local lossFunction = CrossEntropyLoss.new()
print(string.format("Starting Training: %d epochs, Learning Rate: %f", epochs, learningRate))
for epoch = 1, epochs do
local totalLoss = 0
for _, data in ipairs(trainingData) do
local tokens = textToTokens(data)
if #tokens < 2 then
error(string.format("Input sequence is too short for training. Tokens: %s", table.concat(tokens, ", ")))
end
print(string.format("Input tokens for training: %s", table.concat(tokens, ", ")))
local logits = model:forward(tokens)
local targets = {}
for i = 2, #tokens do
table.insert(targets, tokens[i])
end
print(string.format("Generated targets: %s", table.concat(targets, ", ")))
local loss = lossFunction:compute(logits, targets)
totalLoss += loss
local gradOutput = lossFunction:backward(logits, targets)
model:backward(gradOutput)
optimizerInstance:updateParameters(model:getParameters())
end
print(string.format("Epoch %d/%d, Loss: %.4f", epoch, epochs, totalLoss))
end
print("Training Completed.")
end
self._tokenizer = tokenizer
self._model = model
return self
end
return RoLLM
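Finally, this is roughly how I call everything (the real config and training text are longer; trimmed here just to show the call shape):

-- Trimmed call site; the actual config and text data are longer:
local RoLLM = require(script.Parent.RoLLM) -- path shortened for the post
local llm = RoLLM.new(trainingText, {
    tokenizerMode = "char",
    -- ...rest of my TransformerConfig
})
llm:trainModel(trainingLines, 10, 0.01) -- { string } samples, epochs, learning rate

Any ideas on what could make the 9th target come back nil when every input appears to have a matching target would be greatly appreciated. Thanks!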