-- Illustrative snippet from the question: the aim is to delete every character
-- that is not in the listed set of letters.
-- NOTE(review): `{and more characters}` is a placeholder left by the author, not
-- pattern syntax; as written those characters are literal members of the set.
-- NOTE(review): Lua patterns work on bytes, so a multi-byte character such as
-- "Ë" inside [...] contributes its individual bytes to the set.
-- `str` is assigned without `local`, so it is a global here.
str = ''
-- gsub returns the modified string (and a match count); the result is not
-- assigned, so `str` itself is left unchanged by this call.
str:gsub("[^a-zA-ZËë{and more characters}]", '')
However, I also want to include CJK characters, and listing all of them in the character class would take a lot of space.
For example: ジョンスミス! 山田"太郎 John$Smith → ジョンスミス 山田太郎 JohnSmith; 无。無。 → 无無; hë!!o (Gruß€) → hëo Gruß
That’s happening because 。, 、 and ؟ aren’t considered punctuation by Lua. The only way to do this is to go through the string manually (after applying the normal gsub) and gsub every character you want to remove; those characters are stored in a table.
-- Multi-byte punctuation marks that Lua's %p class does not recognise and that
-- therefore have to be removed explicitly.
local specialChars = {"。", "、", "؟"}
local str = "!无。$、¥『山田太郎』"

-- First pass: drop ASCII digits and punctuation.
str = str:gsub("[%d%p]+", "")

-- Second pass: delete every listed special character.
-- Each entry is removed with a single gsub over the whole string. Walking the
-- string one byte at a time (string.sub(str, i, i)) cannot work here, because
-- these characters are several bytes long in UTF-8 and a single byte never
-- compares equal to them.
for _, special in ipairs(specialChars) do
  -- Escape pattern magic characters so the entry is always matched literally.
  local escaped = special:gsub("[%^%$%(%)%%%.%[%]%*%+%-%?]", "%%%0")
  str = str:gsub(escaped, "")
end
I’m gonna assume this is going to be an expensive process
-- Sample input mixing Japanese text, ASCII letters, punctuation and spaces.
local str = "ジョンスミス! 山田'太郎 John$Smith"

-- "%A" matches every byte that %a does not classify as a letter; each match is
-- replaced with "" (i.e. deleted). gsub hands back the resulting string
-- together with the number of substitutions performed.
-- NOTE(review): patterns are byte-based, so spaces and the bytes of multi-byte
-- characters are also treated as non-letters by this class.
local filteredString, numOfMatches = str:gsub("%A", "")

print(filteredString)
I have a pattern here that captures individual Unicode (UTF-8 encoded) characters. From there you can pick out which ones aren’t letters.
-- Sample input: CJK, Arabic, Latin and a mix of ASCII / non-ASCII punctuation.
-- The "(Gruß€)" part was previously mis-encoded (cp1252 bytes read as UTF-8),
-- which hid the ß and meant the € entry below was never exercised.
local str = [[!无。$、¥『山田太郎』 مرحبا؟ ジョンスミス! 山田'太郎
John$Smith hë!!o (Gruß€) 无。無。]]

-- Non-ASCII characters that should be stripped in addition to %p and %d.
local punctuationString = [[。、¥『』؟€]]

-- Matches one UTF-8 encoded character: a lead byte (NUL, ASCII, or a
-- multi-byte lead in 194-244) followed by any number of continuation bytes.
local pattern = "[%z\1-\127\194-\244][\128-\191]*"

-- Build a set keyed by character so membership is a single table lookup.
local punctuation = {}
for c in punctuationString:gmatch(pattern) do
  punctuation[c] = true
end

-- Visit every character; returning "" deletes it, while returning nothing
-- makes gsub keep the original match.
str = str:gsub(pattern, function(character)
  if character:match("[%p%d]") or punctuation[character] then
    return ""
  end
end)

print(str)
Identifying everything that isn’t a letter is also harder than I thought.
In regex there’s \p{L}, which matches any kind of letter from any language, but Lua patterns have no equivalent, so the letter ranges have to be listed by hand.
Here’s my attempt:
-- Inclusive code point ranges that are treated as letters (plus the space).
-- Block boundaries follow https://jrgraphix.net/research/unicode_blocks.php,
-- with known punctuation, digits and math signs inside those blocks cut out.
-- This is a hand-maintained approximation, not a full Unicode letter table.
local LETTER_RANGES = {
  { 0x20, 0x20 },     -- space, kept so that words stay separated
  -- Latin (U+00D7 multiplication sign and U+00F7 division sign are skipped)
  { 0x41, 0x5A }, { 0x61, 0x7A },
  { 0xC0, 0xD6 }, { 0xD8, 0xF6 }, { 0xF8, 0x24F },
  -- Greek (U+037E question mark and U+0387 ano teleia are skipped)
  { 0x370, 0x37D }, { 0x37F, 0x386 }, { 0x388, 0x3FF },
  -- Arabic letters and vowel marks; skips U+0600-061F (signs, comma,
  -- semicolon, question mark), U+0660-066D (digits, percent, separators)
  -- and U+06D4 (full stop)
  { 0x620, 0x65F }, { 0x66E, 0x6D3 }, { 0x6D5, 0x6D5 },
  -- Chinese characters (kanji / hanzi), including extensions and
  -- compatibility ideographs
  { 0x4E00, 0x9FFF }, { 0x3400, 0x4DBF }, { 0x20000, 0x2A6DF },
  { 0x2A700, 0x2B73F }, { 0x2B740, 0x2B81F }, { 0x2B820, 0x2CEAF },
  { 0xF900, 0xFAFF }, { 0x2F800, 0x2FA1F },
  -- Hiragana, Katakana
  { 0x3041, 0x3096 }, { 0x30A1, 0x30FA },
  -- Hangul
  { 0xAC00, 0xD7AF },
}

-- Returns true when `code` falls inside one of LETTER_RANGES.
local function is_letter(code)
  for i = 1, #LETTER_RANGES do
    local range = LETTER_RANGES[i]
    if code >= range[1] and code <= range[2] then
      return true
    end
  end
  return false
end

--- Replace every non-letter character of a UTF-8 string.
-- ASCII digits and punctuation are always deleted outright by a first gsub
-- pass; every remaining character that is not a space or a letter from
-- LETTER_RANGES is substituted with `replace_to`.
-- Uses the `utf8` library (Lua 5.3+); `utf8.codes` raises an error if `str`
-- is not valid UTF-8.
-- @tparam string str UTF-8 encoded input
-- @tparam[opt=''] string replace_to text inserted in place of each non-letter
-- @treturn string the filtered string
local function replace_non_letters(str, replace_to)
  replace_to = replace_to or ''
  -- First pass: strip ASCII digits and punctuation (never replaced).
  str = str:gsub("[%d%p]", '')

  -- Collect the pieces and join once; repeated `..` in a loop is O(n^2).
  local parts = {}
  for _, code in utf8.codes(str) do
    if is_letter(code) then
      parts[#parts + 1] = utf8.char(code)
    else
      parts[#parts + 1] = replace_to
    end
  end
  return table.concat(parts)
end