utf8.upper and utf8.lower

As a developer, I currently have no way to automatically convert non-English characters to uppercase or lowercase.

With the introduction of automatic localization, it is crucial to me as a developer that my GUIs are consistent.

A real-world example: suppose you have two localization entries for “Red” and “Brown”. In the game, however, you might sometimes want them to be uppercase (“RED” and “BROWN”). In English, you can easily do this with string.upper.

Now let’s say you want to localize your game. You translate “Red” to “Rojo” and “Brown” to “Marrón”. In your same code that turns the colors uppercase, “Marrón” will NOT get turned into “MARRÓN”. Rather, it gets turned into the unsightly “MARRóN”. Likewise with string.lower.

I propose either creating utf8.upper and utf8.lower or extending the default behavior of Lua’s string.upper and string.lower (which might be a bit tougher).

23 Likes

Hi. Bumping this because I have run into this issue recently.

This is my workaround.

7 Likes

Hello. Also bumping the thread because I ran into this issue too, and @Quenty’s workaround only accounts for à, á, â, ã, ä, å, æ, ç, è, é, ê, ë, ì, í, î, ï, ð, ñ, ò, ó, ô, õ, ö, ø, ù, ú, û, ü, ý, þ, ā, ă, ą, ć, ĉ, ċ, č, ď, đ, ē, ĕ, ė, ę, ě, ĝ, ğ, ġ, ģ, ĥ, ħ, ĩ, ī, ĭ, į, ı, ĳ, ĵ, ķ, ĺ, ļ, ľ, ŀ, ł, ń, ņ, ň, ŋ, ō, ŏ, ő, œ, ŕ, ŗ, ř, ś, ŝ, ş, š, ţ, ť, ŧ, ũ, ū, ŭ, ů, ű, ų, ŵ, ŷ, ÿ, ź, ż, ž, ſ, ƀ, ƃ, ƅ, ƈ, ƌ, ƒ, ƙ, ƣ and ơ, which isn’t enough for me.

Casing.rbxm (57·3 KiB)

From International 2.4, I’ve added toLocaleLower and toLocaleUpper as a workaround for this. Here’s my workaround without locale:

-- Case-conversion data tables, each loaded with require() from a child of `script`.
local casing = {
	-- Indexed as caseMapping.upper[code_point] / caseMapping.lower[code_point] by
	-- toupper/tolower; a missing entry means the code point is kept as-is.
	caseMapping = require(script:WaitForChild("caseMapping")),
	-- Loaded here but not referenced by the code in this snippet.
	moreAbove = require(script:WaitForChild("moreAbove")),
	-- specialCasing.upper / specialCasing.lower are iterated as (old, new) pairs
	-- and handed to replace() after the one-to-one mapping has been applied.
	specialCasing = require(script:WaitForChild("specialCasing")),
};

--- Encode an array of code points as a UTF-8 string.
-- The array is rewritten in place: every code point is overwritten with its
-- encoded character before the pieces are joined.
-- @param self array of integer code points
-- @return the concatenated UTF-8 string
local function concat_utf8(self)
	local index = 1;
	local code_point = self[index];
	-- Walk the sequence up to the first nil, exactly like ipairs would.
	while code_point ~= nil do
		self[index] = utf8.char(code_point);
		index = index + 1;
		code_point = self[index];
	end;
	return table.concat(self);
end;

--- Decode a UTF-8 string into an array of its code points.
-- @param self UTF-8 encoded string
-- @return a new array holding one integer code point per character, in order
local function code_utf8(self)
	local code_points = { };
	local count = 0;
	for _, code_point in utf8.codes(self) do
		count = count + 1;
		code_points[count] = code_point;
	end;
	return code_points;
end;

--- Replace occurrences of a sequence of values inside an array.
-- Scanning is left to right and non-overlapping: after a replacement the
-- search resumes just past the substituted values, so text that was just
-- written is never re-examined.
-- @param copy  when truthy, operate on a shallow copy and leave `self` untouched;
--              otherwise `self` is edited in place
-- @param self  array to search (arrays of code points in this module)
-- @param old   value, or array of values, to look for
-- @param new   value, or array of values, to substitute; nil or an empty
--              array deletes the match
-- @param max   optional cap on the number of replacements; nil or <= 0 means no cap
-- @param i     optional index at which the search starts (defaults to 1)
-- @param j     optional last index at which a match may start, expressed in
--              positions of the array as it was passed in
-- @return the edited array (the copy when `copy` is truthy, otherwise `self`)
local function replace(copy, self, old, new, max, i, j)
	old = type(old) == "table" and old or { old };
	new = type(new) == "table" and new or { new };
	local ret = copy and table.move(self, 1, #self, 1, table.create(#self)) or self;
	local old_len, new_len = #old, #new;
	if old_len == 0 then
		-- Nothing to search for.
		return ret;
	end;
	local growth = new_len - old_len;
	local last_start = j;
	local count = 0;
	-- The next search begins at pos + 1.
	local pos = i and (i - 1) or 0;
	while true do
		local found = table.find(ret, old[1], pos + 1);
		if not found or (last_start and found > last_start) then
			break;
		end;
		-- old[1] already matched at `found`; verify the rest of the sequence.
		local match = true;
		for offset = 2, old_len do
			if ret[found + offset - 1] ~= old[offset] then
				match = false;
				break;
			end;
		end;
		if match then
			-- Overwrite the slots both sequences share...
			local shared = math.min(new_len, old_len);
			for offset = 1, shared do
				ret[found + offset - 1] = new[offset];
			end;
			-- ...then shrink or grow the array to absorb the length difference.
			local tail = found + shared;
			for _ = 1, -growth do
				table.remove(ret, tail);
			end;
			for offset = 1, growth do
				table.insert(ret, tail + offset - 1, new[shared + offset]);
			end;
			-- Keep the upper bound pointing at the same original element.
			if last_start then
				last_start = last_start + growth;
			end;
			-- Resume after the replacement. This prevents an endless loop when
			-- `new` contains old[1], and avoids skipping the element that slid
			-- into place after a deletion.
			pos = found + new_len - 1;
			count += 1;
			if max and max > 0 and count >= max then
				break;
			end;
		else
			pos = found;
		end;
	end;
	return ret;
end;

-- Inclusive code-point ranges accepted by is_latin, stored as flat
-- (first, last) pairs; a single code point appears as a pair of equal values.
local LATIN_RANGES = {
	0x0041, 0x005A, 0x0061, 0x007A, 0x00AA, 0x00AA, 0x00BA, 0x00BA,
	0x00C0, 0x00D6, 0x00D8, 0x00F6, 0x00F8, 0x02B8, 0x02E0, 0x02E4,
	0x1D00, 0x1D25, 0x1D2C, 0x1D5C, 0x1D62, 0x1D65, 0x1D6B, 0x1D77,
	0x1D79, 0x1DBE, 0x1E00, 0x1EFF, 0x2071, 0x2071, 0x207F, 0x207F,
	0x2090, 0x209C, 0x212A, 0x212B, 0x2132, 0x2132, 0x214E, 0x214E,
	0x2160, 0x2188, 0x2C60, 0x2C7F, 0xA722, 0xA787, 0xA78B, 0xA78E,
	0xA790, 0xA793, 0xA7A0, 0xA7AA, 0xA7F8, 0xA7FF, 0xFB00, 0xFB06,
	0xFF21, 0xFF3A, 0xFF41, 0xFF5A,
};

--- Test whether a code point lies in one of the ranges listed in LATIN_RANGES.
-- @param c integer code point, or nil
-- @return true or false for a number; a nil/false argument is returned unchanged
local function is_latin(c)
	if not c then
		return c;
	end;
	for index = 1, #LATIN_RANGES, 2 do
		if c >= LATIN_RANGES[index] and c <= LATIN_RANGES[index + 1] then
			return true;
		end;
	end;
	return false;
end;

--- Uppercase an array of code points and encode the result as UTF-8.
-- The array is modified in place: each code point is first swapped for its
-- entry in casing.caseMapping.upper (when one exists), then every
-- (old, new) pair in casing.specialCasing.upper is applied with replace().
-- @param self array of integer code points (consumed by this call)
-- @return the uppercased UTF-8 string
local function toupper(self)
	local upper_map = casing.caseMapping.upper;
	for index, code_point in ipairs(self) do
		local mapped = upper_map[code_point];
		if mapped then
			self[index] = mapped;
		end;
	end;

	local special = casing.specialCasing.upper;
	for sequence, replacement in next, special do
		replace(false, self, sequence, replacement);
	end;

	return concat_utf8(self);
end;

-- Code points treated as whitespace when deciding whether a capital sigma
-- ends a word.
local whitespaces = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0, 0x1680, 0x2000, 0x2001,
	0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000 };

--- Lowercase an array of code points and encode the result as UTF-8.
-- The array is modified in place: each code point is swapped for its entry in
-- casing.caseMapping.lower (when one exists), except that a capital sigma at
-- the end of a word becomes the final-form small sigma. Afterwards every
-- (old, new) pair in casing.specialCasing.lower is applied with replace().
-- @param self array of integer code points (consumed by this call)
-- @return the lowercased UTF-8 string
local function tolower(self)
	local lower_map, upper_map = casing.caseMapping.lower, casing.caseMapping.upper;
	for i, v in ipairs(self) do
		local is_final_sigma = false;
		if v == 0x03A3 then
			-- self[i - 1] was already lowercased by the previous iteration.
			local prev, following = self[i - 1], self[i + 1];
			-- The predecessor must be a letter. Checking is_latin alone never
			-- fires inside Greek words, so also accept any code point that has
			-- an entry in either case-mapping table.
			-- NOTE(review): assumes caseMapping covers Greek letters - confirm.
			local after_letter = is_latin(prev)
				or (prev ~= nil and (upper_map[prev] ~= nil or lower_map[prev] ~= nil));
			local at_word_end = (not following) or table.find(whitespaces, following);
			is_final_sigma = (after_letter and at_word_end) and true or false;
		end;
		if is_final_sigma then
			-- Final form of sigma
			self[i] = 0x03C2;
		else
			self[i] = lower_map[v] or v;
		end;
	end;
	for old_value, new_value in next, casing.specialCasing.lower do
		replace(false, self, old_value, new_value);
	end;
	return concat_utf8(self);
end;

-- Public interface of the module: string-in, string-out case conversion.
local exports = { };

--- Return `str` (UTF-8) converted to uppercase.
function exports.ToUpper(str)
	local code_points = code_utf8(str);
	return toupper(code_points);
end;

--- Return `str` (UTF-8) converted to lowercase.
function exports.ToLower(str)
	local code_points = code_utf8(str);
	return tolower(code_points);
end;

return exports;
1 Like

Thanks! Are you ok if I merge this code into my UTF8 library under the MIT license?

Yep I’m ok with it. (30 char limit).

1 Like