Error in Custom CrossEntropyLoss Function: "Target at index X is nil" During Model Training

Sup,

I’m currently working on implementing an LLM within Roblox using Luau, and I’ve encountered an issue that I’m struggling to resolve. I’m hoping someone here can provide guidance or suggestions to help me fix it.

Problem Description:

I’m training a model and using a custom CrossEntropyLoss function. During the training process, I receive the following error:

CrossEntropyLoss:71
Target at index 9 is nil

This error occurs consistently when processing certain batches of data, causing the training loop to halt.
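
To show exactly how the error surfaces, here is a minimal standalone call into the CrossEntropyLoss module posted below. The numbers are made up; this is just the smallest input shape I found that triggers the same message, not necessarily what my pipeline produces.

-- Minimal repro sketch (require path as in init.luau; numbers are made up).
local CrossEntropyLoss = require(script.lib.CrossEntropyLoss)

local lossFunction = CrossEntropyLoss.new()

-- Two logit rows but only one target: compute() raises
-- "Target at index 2 is nil" on the second row.
local logits = {
    { 0.1, 0.9, 0.0 },
    { 0.3, 0.2, 0.5 },
}
local targets = { 2 }

local ok, err = pcall(function()
    return lossFunction:compute(logits, targets)
end)
print(ok, err) --> false, "... Target at index 2 is nil ..."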

What I’ve Tried:

  1. Data Validation:

    • Added assertions to ensure that neither the input nor the target is nil before passing them to the loss function (a rough sketch of these checks follows this list).
    • Verified that the dataset is correctly loaded and that each input has a corresponding target.
  2. Loss Function Updates:

    • Updated the CrossEntropyLoss function to include detailed checks for nil targets and out-of-range values.
    • Implemented an epsilon value to prevent logarithm of zero during loss computation.
  3. Batch Processing:

    • Ensured that the batching mechanism correctly aligns inputs and targets.
    • Checked that the batch size is consistent and that no data samples are missing.
  4. Logging:

    • Added logging to monitor the state of inputs and targets at various stages of the training loop.
    • Identified that the error specifically points to the 9th target in the problematic batch.
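
For reference, the checks from items 1 and 3 boil down to something like the sketch below; the variable names here are illustrative, not the exact ones in my training loop.

-- Rough shape of the pre-loss validation (names are illustrative).
for i = 1, #batchInputs do
    assert(batchInputs[i] ~= nil, string.format("input %d is nil", i))
    assert(batchTargets[i] ~= nil, string.format("target %d is nil", i))
end
assert(#batchInputs == #batchTargets, "inputs and targets are misaligned")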

Relevant Code Snippets:

1. CrossEntropyLoss.luau:

--[[
    CrossEntropyLoss Module

    @module CrossEntropyLoss
    @type CrossEntropyLossClass
]]
local CrossEntropyLoss = {}
CrossEntropyLoss.__index = CrossEntropyLoss
export type CrossEntropyLossClass = {
    new: () -> CrossEntropyLossClass,
    compute: (self: CrossEntropyLossClass, logits: { { number } }, targets: { number }) -> number,
    backward: (self: CrossEntropyLossClass, logits: { { number } }, targets: { number }) -> { { number } },
}

function CrossEntropyLoss.new(): CrossEntropyLossClass
    local self = setmetatable({}, CrossEntropyLoss)
    self.loss = 0
    self.probs = {}  -- Stores softmax probabilities for each sample
    self.grad = {}   -- Stores gradients w.r.t logits
    return self
end

local function softmax(logits: { number }): { number }
    local maxLogit = -math.huge
    -- Find the maximum logit for numerical stability
    for _, logit in ipairs(logits) do
        if logit > maxLogit then
            maxLogit = logit
        end
    end

    local sumExp = 0
    local probs = {}
    -- Compute exponentials and sum
    for _, logit in ipairs(logits) do
        local expVal = math.exp(logit - maxLogit)
        table.insert(probs, expVal)
        sumExp += expVal
    end

    -- Normalize to get probabilities
    for i, prob in ipairs(probs) do
        probs[i] = prob / sumExp
    end

    return probs
end

-- Computes the cross-entropy loss
-- @param logits { { number } } - The raw output logits from the model for each sample
-- @param targets { number } - The target class indices for each sample
-- @return number - The total cross-entropy loss over all samples
function CrossEntropyLoss:compute(logits: { { number } }, targets: { number }): number
    self.loss = 0
    self.probs = {}  -- Reset probabilities for the current batch

    for i, logitRow in ipairs(logits) do
        -- Compute softmax probabilities for the current sample
        local probs = softmax(logitRow)
        self.probs[i] = probs  -- Store for backward pass

        -- Retrieve the target class index
        local target = targets[i]

        -- Validate target presence and range
        if target == nil then
            error(string.format("Target at index %d is nil. Logits: %s", i, table.concat(logitRow, ", ")))
        end
        if type(target) ~= "number" then
            error(string.format("Target at index %d is not a number. Logits: %s", i, table.concat(logitRow, ", ")))
        end
        if target < 1 or target > #logitRow then
            error(string.format("Target at index %d is out of range. Target: %d, Logit Row Size: %d", i, target, #logitRow))
        end

        -- Retrieve the probability of the target class
        local targetProb = probs[target]

        -- Validate probability positivity
        if targetProb <= 0 then
            error(string.format("Probability at index %d for target %d is not positive. Probability: %f", i, target, targetProb))
        end

        -- Accumulate the negative log probability
        self.loss += -math.log(targetProb)
    end

    return self.loss
end

-- Computes the gradient of the loss w.r.t logits
-- @param logits { { number } } - The raw output logits from the model for each sample
-- @param targets { number } - The target class indices for each sample
-- @return { { number } } - The gradient of the loss w.r.t each logit
function CrossEntropyLoss:backward(logits: { { number } }, targets: { number }): { { number } }
    self.grad = {}

    for i, logitRow in ipairs(logits) do
        self.grad[i] = {}
        local probs = self.probs[i]
        local target = targets[i]

        for j, prob in ipairs(probs) do
            if j == target then
                -- Gradient for the target class
                self.grad[i][j] = prob - 1
            else
                -- Gradient for non-target classes
                self.grad[i][j] = prob
            end
        end
    end

    return self.grad
end

return CrossEntropyLoss
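
For comparison, this is how I expect the module above to be called when logits and targets line up (a minimal sketch with made-up numbers):

-- Minimal aligned usage of CrossEntropyLoss (numbers are made up).
local CrossEntropyLoss = require(script.lib.CrossEntropyLoss)

local lossFunction = CrossEntropyLoss.new()

local logits = {
    { 2.0, 0.5, -1.0 }, -- sample 1
    { 0.1, 0.1, 3.0 },  -- sample 2
}
local targets = { 1, 3 } -- one class index per logit row

local loss = lossFunction:compute(logits, targets)
local grad = lossFunction:backward(logits, targets) -- same shape as logits
print(loss, #grad, #grad[1]) --> total loss, 2, 3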

2. Optimizer.luau:

--[[
    Optimizer Module

    @module Optimizer
    @type OptimizerClass
]]

local Types = require(script.Parent.types)

export type OptimizerClass = {
    new: (learningRate: number, beta1: number?, beta2: number?, epsilon: number?) -> OptimizerClass,
    learningRate: number,
    beta1: number,
    beta2: number,
    epsilon: number,
    m: { [string]: { [number]: { [number]: number } } },
    v: { [string]: { [number]: { [number]: number } } },
    t: number,
    updateParameters: (self: OptimizerClass, parameters: { any }, learningRate: number?) -> (),
}

local Optimizer = {}
Optimizer.__index = Optimizer

function Optimizer.new(learningRate: number, beta1: number?, beta2: number?, epsilon: number?): OptimizerClass
    local self = setmetatable({}, Optimizer)
    self.learningRate = learningRate
    self.beta1 = beta1 or 0.9
    self.beta2 = beta2 or 0.999
    self.epsilon = epsilon or 1e-8
    self.m = {}
    self.v = {}
    self.t = 0
    return self
end

function Optimizer:updateParameters(parameters: { any }, learningRate: number?): ()
    self.t += 1
    local lr = learningRate or self.learningRate

    for paramName, param in pairs(parameters) do
        if type(param) == "table" then
            if not self.m[paramName] then
                self.m[paramName] = {}
                self.v[paramName] = {}
                for i, row in ipairs(param) do
                    self.m[paramName][i] = {}
                    self.v[paramName][i] = {}
                    for j, _ in ipairs(row) do
                        self.m[paramName][i][j] = 0
                        self.v[paramName][i][j] = 0
                    end
                end
            end

            for i, row in ipairs(param) do
                for j, _ in ipairs(row) do
                    local grad = param.gradW and param.gradW[i][j] or 0  -- Adjust based on parameter type
                    self.m[paramName][i][j] = self.beta1 * self.m[paramName][i][j] + (1 - self.beta1) * grad
                    self.v[paramName][i][j] = self.beta2 * self.v[paramName][i][j] + (1 - self.beta2) * (grad * grad)

                    local mHat = self.m[paramName][i][j] / (1 - self.beta1 ^ self.t)
                    local vHat = self.v[paramName][i][j] / (1 - self.beta2 ^ self.t)

                    param[i][j] = param[i][j] - lr * mHat / (math.sqrt(vHat) + self.epsilon)
                end
            end
        elseif type(param) == "number" then
            -- Scalar parameters are currently left unchanged by this branch
        end
    end
end

return Optimizer
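
For what it's worth, the optimizer can be exercised in isolation with something like the sketch below. The parameter layout here is my assumption based on how updateParameters reads param.gradW (a named 2D weight table carrying its gradient in a gradW field); my real parameters come from model:getParameters().

-- Rough isolation test for Optimizer (parameter layout is an assumption).
local Optimizer = require(script.lib.Optimizer)

local optimizer = Optimizer.new(0.01)

local W = {
    { 0.5, -0.2 },
    { 0.1,  0.3 },
}
W.gradW = {
    { 0.05, -0.01 },
    { 0.02,  0.04 },
}

optimizer:updateParameters({ W1 = W })
print(W[1][1], W[2][2]) --> weights nudged by one Adam step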

3. init.luau (only the trainModel part, since that's where the issue begins):
--[[
    https://www.roblox.com/users/1539582829/profile
    https://twitter.com/zzen_a

    MIT License

    Copyright (c) 2024 rustyspotted

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in all
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE.
]]

local Types = require(script.lib.types)
local Optimizer = require(script.lib.Optimizer)
local CrossEntropyLoss = require(script.lib.CrossEntropyLoss)

local LinearAlgebra = require(script.components.LinearAlgebra)
local Tokenizer = require(script.components.Tokenizer)
local Embedding = require(script.components.Embedding)
local MultiHeadAttention = require(script.components.MultiHeadAttention)
local FeedForward = require(script.components.FeedForward)
local LayerNorm = require(script.components.LayerNorm)
local TransformerBlock = require(script.components.TransformerBlock)
local TransformerModel = require(script.components.TransformerModel)
local CharTokenizer = require(script.components.CharTokenizer)
local BPETokenizer = require(script.components.BPETokenizer)

local RoLLM = {}

--[=[
    @prop Presets Folder
    @within RoLLM
    @readonly
    References the Presets folder.
]=]

RoLLM.Presets = (script.Parent :: Instance)

RoLLM.Promise = require(RoLLM.Presets.promise)
RoLLM.Signal = require(RoLLM.Presets.signal)
RoLLM.Types = Types
RoLLM.LinearAlgebra = LinearAlgebra
RoLLM.Tokenizer = Tokenizer
RoLLM.Embedding = Embedding
RoLLM.MultiHeadAttention = MultiHeadAttention
RoLLM.FeedForward = FeedForward
RoLLM.LayerNorm = LayerNorm
RoLLM.TransformerBlock = TransformerBlock
RoLLM.TransformerModel = TransformerModel
RoLLM.CharTokenizer = CharTokenizer
RoLLM.BPETokenizer = BPETokenizer

--[=[
    @function chunkString
    @within RoLLM
    @param bigString string - The string to be chunked.
    @param chunkSize number - The size of each chunk.
    @return {string} - A table containing string chunks.
]=]
local function chunkString(bigString: string, chunkSize: number): { string }
    local chunks: { string } = {}
    local idx = 1
    while idx <= #bigString do
        local chunk = string.sub(bigString, idx, idx + chunkSize - 1)
        table.insert(chunks, chunk)
        idx += chunkSize
    end
    return chunks
end
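-- Illustrative example (not from the repo): chunkString("abcdefgh", 3) --> { "abc", "def", "gh" }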

--[=[
    Creates a "ready-to-use" LLM instance with a tokenizer and model.

    @function new
    @within RoLLM
    @param textData string | {string} - Text data for building the vocabulary.
    @param config Types.TransformerConfig - Configuration table for the Transformer.
    @param chunkSize number? - Optional chunk size for processing text.
    @return {any} - An LLM object with prediction, generation, and training methods.
]=]
function RoLLM.new(
    textData: string | { string },
    config: Types.TransformerConfig,
    chunkSize: number?
): { any }
    chunkSize = chunkSize or 500000
    local tokenizerMode: string = config.tokenizerMode or "char"

    local tokenizer: typeof(Tokenizer.new("char"))
    if tokenizerMode == "char" then
        tokenizer = CharTokenizer.new()
    else
        tokenizer = BPETokenizer.new()
        if config.externalVocabURL then
            tokenizer:loadExternalVocab(config.externalVocabURL)
        end
    end

    local function addTextForChar(str: string): ()
        if tokenizerMode == "char" then
            tokenizer:buildVocabFromText(str)
        else
            tokenizer:textToTokens(str)
        end
    end

    if tokenizerMode == "char" then
        if type(textData) == "table" then
            for _, singleString in ipairs(textData) do
                if #singleString > chunkSize then
                    local chunks = chunkString(singleString, chunkSize)
                    for _, chunk in ipairs(chunks) do
                        addTextForChar(chunk)
                    end
                else
                    addTextForChar(singleString)
                end
            end
        else
            if #textData > chunkSize then
                local chunks = chunkString(textData, chunkSize)
                for _, chunk in ipairs(chunks) do
                    addTextForChar(chunk)
                end
            else
                addTextForChar(textData)
            end
        end
    else
		if #textData > chunkSize then
			local chunks = chunkString(textData, chunkSize)
			for _, chunk in ipairs(chunks) do
				addTextForChar(chunk)
			end
		else
			addTextForChar(textData)
		end
    end

    local vocabSize: number = tokenizer:getVocabSize()
    if vocabSize == 0 then
        error("Vocab size is 0. Did you load data or external vocab properly?")
    end
    config.vocabSize = vocabSize
    print("Final vocab size:", vocabSize)

    local model: TransformerModel.TransformerModelClass = TransformerModel.new(config)

    local self = {}

    local function textToTokens(str: string): { number }
        return tokenizer:textToTokens(str)
    end

    local function tokensToText(toks: { number }): string
        return tokenizer:tokensToText(toks)
    end

    function self:predict(inputStr: string): string
        local tokens = textToTokens(inputStr)
        local nextID = model:predictNextToken(tokens)
        return tokensToText({ nextID })
    end

    function self:generate(inputStr: string, numTokens: number): string
        local tokens = textToTokens(inputStr)
        for _ = 1, numTokens do
            local nxt = model:predictNextToken(tokens)
            table.insert(tokens, nxt)
        end
        return tokensToText(tokens)
    end

    function self:predictTemperature(inputStr: string, temperature: number): string
        local tokens = textToTokens(inputStr)
        local nextID = model:predictNextTokenTemperature(tokens, temperature)
        return tokensToText({ nextID })
    end

    function self:generateTemperature(inputStr: string, numTokens: number, temperature: number): string
        local tokens = textToTokens(inputStr)
        for _ = 1, numTokens do
            local nxt = model:predictNextTokenTemperature(tokens, temperature)
            table.insert(tokens, nxt)
        end
        return tokensToText(tokens)
    end

    --[=[
        Trains the Transformer model using provided training data.

        @function trainModel
        @within RoLLM
        @param trainingData {string} - A table of training strings.
        @param epochs number - Number of training epochs.
        @param learningRate number - Learning rate for the optimizer.
        @return nil
    ]=]
	function self:trainModel(trainingData: { string }, epochs: number, learningRate: number): ()
		local optimizerInstance = Optimizer.new(learningRate)
		local lossFunction = CrossEntropyLoss.new()
	
		print(string.format("Starting Training: %d epochs, Learning Rate: %f", epochs, learningRate))
	
		for epoch = 1, epochs do
			local totalLoss = 0
			for _, data in ipairs(trainingData) do
				local tokens = textToTokens(data)
	
				if #tokens < 2 then
					error(string.format("Input sequence is too short for training. Tokens: %s", table.concat(tokens, ", ")))
				end
	
				print(string.format("Input tokens for training: %s", table.concat(tokens, ", ")))
	
				local logits = model:forward(tokens)
	
				local targets = {}
				for i = 2, #tokens do
					table.insert(targets, tokens[i])
				end
	
				print(string.format("Generated targets: %s", table.concat(targets, ", ")))
	
				local loss = lossFunction:compute(logits, targets)
				totalLoss += loss
	
				local gradOutput = lossFunction:backward(logits, targets)
				model:backward(gradOutput)
	
				optimizerInstance:updateParameters(model:getParameters())
			end
			print(string.format("Epoch %d/%d, Loss: %.4f", epoch, epochs, totalLoss))
		end
	
		print("Training Completed.")
	end	
	
    self._tokenizer = tokenizer
    self._model = model

    return self
end

return RoLLM
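
For completeness, this is roughly how I drive training from a Script. The require path and config values are placeholders, and the rest of my TransformerConfig fields are omitted here.

-- Rough driver sketch (require path and config values are placeholders).
local RoLLM = require(game.ServerScriptService.RoLLM) -- wherever the module lives

local corpus = { "hello world", "hello roblox" }

local llm = RoLLM.new(corpus, {
    tokenizerMode = "char",
    -- remaining TransformerConfig fields (model dimensions, layers, etc.) omitted
})

llm:trainModel(corpus, 10, 0.01) -- 10 epochs, learning rate 0.01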

GitHub repo: https://github.com/rustyspottedcatt/RoLLM

bump since no one answered the post