diff --git a/server/lib/cleanup/languages/th.js b/server/lib/cleanup/languages/th.js index 5b26906f..4e26e670 100644 --- a/server/lib/cleanup/languages/th.js +++ b/server/lib/cleanup/languages/th.js @@ -15,25 +15,36 @@ function sortSentences(sentences) { // question mark http://www.royin.go.th/?page_id=10418 // exclamation mark http://www.royin.go.th/?page_id=10433 // Maiyamok http://www.royin.go.th/?page_id=10427 +// +// Emoji range from +// https://www.regextester.com/106421 +// https://stackoverflow.com/questions/10992921/how-to-remove-emoji-code-using-javascript function clean(sentences) { return sentences.map((sentence) => { return sentence - .replace(/[\u200b\u200c]/g, '') // removes zero-width chars (occurs in some Thai texts) - .replace(/:/g, ' : ') // add a space before and after colon - .replace(/\?/g, ' ? ') // adds a space before and after question mark - .replace(/!/g, ' ! ') // adds a space before and after exclamation mark - .replace(/,/g, ' ') // replaces comma with space - .replace(/\.(\.\s*)+/g, ' ') // replaces ellipsis (.., ...) 
with space - .replace(/\s\./g, ' ') // replaces orphan period with space - .replace(/(\u0E46\s*)+/g, '\u0E46') // condenses multiple Maiyamok to one Maiyamok - .replace(/\u0E46/g, ' \u0E46 ') // adds a space before and after Maiyamok - .replace(/\s+/g, ' ') // condenses multiple spaces to one space - .replace(/^\./, '') // removes periods at the beginning of the sentence - .replace(/^\s+/, '') // removes spaces at the beginning of the sentence - .replace(/\s+$/, '') // removes spaces at the end of the sentence - .replace(/\u0E40\u0E40/g, '\u0E41') // normalizes Sara E + Sara E -> Sara Ae - .replace(/\u0E4d([\u0E48\u0E49\u0E4A\u0E4B]*)\u0E32/g, '$1\u0E33') // normalizes Nikhahit + Sara Aa -> Sara Am - .replace(/([\u0E24\u0E26])\u0E32/g, '$1\u0E45') // normalizes Ru/Lu + Sara Aa -> Ru/Lu + Lakkhangyao + .replace(/[\u200b\u200c]/g, '') // remove zero-width chars (occurs in some Thai texts) + .replace(/\u00a9|\u00ae|[\u2000-\u3300]|[\u2580-\u27bf]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff]|[\ue000-\uf8ff]/g, '') // remove emoji + .replace(/:/g, ' : ') // add a space before and after colon + .replace(/\?/g, ' ? ') // add a space before and after question mark + .replace(/!/g, ' ! ') // add a space before and after exclamation mark + .replace(/,/g, ' ') // replace comma with space + .replace(/\.(\.\s*)+/g, ' ') // replace ellipsis (.., ...) 
with space + .replace(/\s\./g, ' ') // replace orphan period with space + .replace(/(\u0E46\s*)+/g, '\u0E46') // condense multiple Maiyamok to one Maiyamok + .replace(/\u0E46/g, ' \u0E46 ') // add a space before and after Maiyamok + .replace(/\s+/g, ' ') // condense multiple spaces to one space + .replace(/^\.+/, '') // remove periods at the beginning of the sentence + .replace(/^,+/, '') // remove commas at the beginning of the sentence + .replace(/,+$/, '') // remove commas at the end of the sentence + .replace(/^:+/, '') // remove colons at the beginning of the sentence + .replace(/:+$/, '') // remove colons at the end of the sentence + .replace(/^;+/, '') // remove semicolons at the beginning of the sentence + .replace(/;+$/, '') // remove semicolons at the end of the sentence + .replace(/^\s+/, '') // remove spaces at the beginning of the sentence + .replace(/\s+$/, '') // remove spaces at the end of the sentence + .replace(/\u0E40\u0E40/g, '\u0E41') // normalize Sara E + Sara E -> Sara Ae + .replace(/\u0E4d([\u0E48\u0E49\u0E4A\u0E4B]*)\u0E32/g, '$1\u0E33') // normalize Nikhahit + Sara Aa -> Sara Am + .replace(/([\u0E24\u0E26])\u0E32/g, '$1\u0E45') // normalize Ru/Lu + Sara Aa -> Ru/Lu + Lakkhangyao ; }); } diff --git a/server/lib/validation/languages/th.js b/server/lib/validation/languages/th.js index b708320d..141eda19 100644 --- a/server/lib/validation/languages/th.js +++ b/server/lib/validation/languages/th.js @@ -9,8 +9,8 @@ const MIN_LENGTH = 2; const MAX_LENGTH = 80; -// Numbers that are not allowed in a sentence depending on the language. For -// English this is 0-9 once or multiple times after each other. +// Numbers that are not allowed in a sentence depending on the language. +// For English this is 0-9 once or multiple times after each other. 
// Thai digits: \u0E50-\u0E59 (๐-๙) const NUMBERS_REGEX = /[0-9๐-๙]+/; @@ -44,7 +44,8 @@ const BEGIN_REGEX = /(^|\s+)[\u0E30\u0E32\u0E33\u0E45\u0E31\u0E34\u0E35\u0E36\u0 /* eslint-disable-next-line no-misleading-character-class */ const END_REGEX = /[\u0E40\u0E41\u0E42\u0E43\u0E44](\s+|$)/; -// The following symbols are disallowed, please update here as well and not just the regex +// The following symbols are disallowed, +// please update here as well and not just the regex // to make it easier to read: // < > + * \ # @ ^ [ ] ( ) / // Paiyannoi: \u0E2F ฯ (ellipsis, abbreviation) @@ -52,10 +53,10 @@ const END_REGEX = /[\u0E40\u0E41\u0E42\u0E43\u0E44](\s+|$)/; // Fongman: \u0E4F ๏ (used as bullet) // Angkhankhu: \u0E5A ๚ (used to mark end of section/verse) // Khomut: \u0E5B ๛ (used to mark end of chapter/document) -// -// Latin characters are disallowed as well, -// as they can introduce difficulty for pronunciation. -const SYMBOL_REGEX = /[<>+*\\#@^[\]()/\u0E2F\u0E46\u0E4F\u0E5A\u0E5B]|[A-Za-z]+/; +// Latin characters (difficult to pronounce) +// Emoji range from https://www.regextester.com/106421 and +// https://stackoverflow.com/questions/10992921/how-to-remove-emoji-code-using-javascript +const SYMBOL_REGEX = /[<>+*\\#@^[\]()/\u0E2F\u0E46\u0E4F\u0E5A\u0E5B]|[A-Za-z]+|(\u00a9|\u00ae|[\u2000-\u3300]|[\u2580-\u27bf]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff]|[\ue000-\uf8ff])/; // Any words consisting of uppercase letters or uppercase letters with a period // inbetween are considered abbreviations or acronyms.