Skip to content
This repository has been archived by the owner on May 10, 2023. It is now read-only.

Disallow emojis for Thai sentences #431

Merged
merged 7 commits into from
Apr 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 27 additions & 16 deletions server/lib/cleanup/languages/th.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,25 +15,36 @@ function sortSentences(sentences) {
// question mark http://www.royin.go.th/?page_id=10418
// exclamation mark http://www.royin.go.th/?page_id=10433
// Maiyamok http://www.royin.go.th/?page_id=10427
//
// Emoji range from
// https://www.regextester.com/106421
// https://stackoverflow.com/questions/10992921/how-to-remove-emoji-code-using-javascript
/**
 * Cleans up a list of sentences by running each one through a fixed sequence
 * of regex replacements (zero-width character removal, punctuation spacing,
 * whitespace condensing, and Thai vowel/mark normalization).
 *
 * The order of the steps matters: later steps operate on the output of the
 * earlier ones (e.g. whitespace is condensed only after the spacing steps
 * have inserted extra spaces).
 *
 * @param sentences - array of sentences; each element must support `.replace`
 *   (presumably strings — confirm against callers)
 * @returns a new array (from `map`) holding the cleaned sentence for each input
 */
function clean(sentences) {
return sentences.map((sentence) => {
return sentence
.replace(/[\u200b\u200c]/g, '') // removes zero-width chars (occurs in some Thai texts)
.replace(/:/g, ' : ') // add a space before and after colon
.replace(/\?/g, ' ? ') // adds a space before and after question mark
.replace(/!/g, ' ! ') // adds a space before and after exclamation mark
.replace(/,/g, ' ') // replaces comma with space
.replace(/\.(\.\s*)+/g, ' ') // replaces ellipsis (.., ...) with space
.replace(/\s\./g, ' ') // replaces orphan period with space
.replace(/(\u0E46\s*)+/g, '\u0E46') // condenses multiple Maiyamok to one Maiyamok
.replace(/\u0E46/g, ' \u0E46 ') // adds a space before and after Maiyamok
.replace(/\s+/g, ' ') // condenses multiple spaces to one space
.replace(/^\./, '') // removes periods at the beginning of the sentence
.replace(/^\s+/, '') // removes spaces at the beginning of the sentence
.replace(/\s+$/, '') // removes spaces at the end of the sentence
.replace(/\u0E40\u0E40/g, '\u0E41') // normalizes Sara E + Sara E -> Sara Ae
.replace(/\u0E4d([\u0E48\u0E49\u0E4A\u0E4B]*)\u0E32/g, '$1\u0E33') // normalizes Nikhahit + Sara Aa -> Sara Am
.replace(/([\u0E24\u0E26])\u0E32/g, '$1\u0E45') // normalizes Ru/Lu + Sara Aa -> Ru/Lu + Lakkhangyao
// NOTE(review): the steps from here on repeat the list above (with reworded
// comments) and add new ones. This looks like the removed and added sides of
// a diff rendered together — confirm which list is the intended code.
.replace(/[\u200b\u200c]/g, '') // remove zero-width chars (occurs in some Thai texts)
.replace(/\u00a9|\u00ae|[\u2000-\u3300]|[\u2580-\u27bf]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff]|[\ue000-\uf8ff]/g, '') // remove emoji
.replace(/:/g, ' : ') // add a space before and after colon
.replace(/\?/g, ' ? ') // add a space before and after question mark
.replace(/!/g, ' ! ') // add a space before and after exclamation mark
.replace(/,/g, ' ') // replace comma with space
.replace(/\.(\.\s*)+/g, ' ') // replace ellipsis (.., ...) with space
.replace(/\s\./g, ' ') // replace orphan period with space
.replace(/(\u0E46\s*)+/g, '\u0E46') // condense multiple Maiyamok to one Maiyamok
.replace(/\u0E46/g, ' \u0E46 ') // add a space before and after Maiyamok
.replace(/\s+/g, ' ') // condense multiple spaces to one space
.replace(/^\.+/, '') // remove periods at the beginning of the sentence
// NOTE(review): every comma has already been replaced with a space by the
// global comma step above, so the two comma-trimming steps below cannot match.
.replace(/^,+/, '') // remove commas at the beginning of the sentence
.replace(/,+$/, '') // remove commas at the end of the sentence
// NOTE(review): every colon has already been padded with spaces and the
// string is not trimmed until further down, so `^:+` and `:+$` cannot match
// here — a leading or trailing colon survives. Confirm the intended order.
.replace(/^:+/, '') // remove colons at the beginning of the sentence
.replace(/:+$/, '') // remove colons at the end of the sentence
.replace(/^;+/, '') // remove semicolons at the beginning of the sentence
.replace(/;+$/, '') // remove semicolons at the end of the sentence
.replace(/^\s+/, '') // remove spaces at the beginning of the sentence
.replace(/\s+$/, '') // remove spaces at the end of the sentence
.replace(/\u0E40\u0E40/g, '\u0E41') // normalize Sara E + Sara E -> Sara Ae
.replace(/\u0E4d([\u0E48\u0E49\u0E4A\u0E4B]*)\u0E32/g, '$1\u0E33') // normalize Nikhahit + Sara Aa -> Sara Am
.replace(/([\u0E24\u0E26])\u0E32/g, '$1\u0E45') // normalize Ru/Lu + Sara Aa -> Ru/Lu + Lakkhangyao
;
});
}
15 changes: 8 additions & 7 deletions server/lib/validation/languages/th.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
const MIN_LENGTH = 2;
const MAX_LENGTH = 80;

// Numbers that are not allowed in a sentence depending on the language. For
// English this is 0-9 once or multiple times after each other.
// Numbers that are not allowed in a sentence depending on the language.
// For English this is 0-9 once or multiple times after each other.
// Thai digits: \u0E50-\u0E59 (๐-๙)
const NUMBERS_REGEX = /[0-9๐-๙]+/;

Expand Down Expand Up @@ -44,18 +44,19 @@ const BEGIN_REGEX = /(^|\s+)[\u0E30\u0E32\u0E33\u0E45\u0E31\u0E34\u0E35\u0E36\u0
/* eslint-disable-next-line no-misleading-character-class */
const END_REGEX = /[\u0E40\u0E41\u0E42\u0E43\u0E44](\s+|$)/;

// The following symbols are disallowed, please update here as well and not just the regex
// The following symbols are disallowed,
// please update here as well and not just the regex
// to make it easier to read:
// < > + * \ # @ ^ [ ] ( ) /
// Paiyannoi: \u0E2F ฯ (ellipsis, abbreviation)
// Maiyamok: \u0E46 ๆ (repetition)
// Fongman: \u0E4F ๏ (used as bullet)
// Angkhankhu: \u0E5A ๚ (used to mark end of section/verse)
// Khomut: \u0E5B ๛ (used to mark end of chapter/document)
//
// Latin characters are disallowed as well,
// as they can introduce difficulty for pronunciation.
const SYMBOL_REGEX = /[<>+*\\#@^[\]()/\u0E2F\u0E46\u0E4F\u0E5A\u0E5B]|[A-Za-z]+/;
// Latin characters (difficult to pronounce)
// Emoji range from https://www.regextester.com/106421 and
// https://stackoverflow.com/questions/10992921/how-to-remove-emoji-code-using-javascript
const SYMBOL_REGEX = /[<>+*\\#@^[\]()/\u0E2F\u0E46\u0E4F\u0E5A\u0E5B]|[A-Za-z]+|(\u00a9|\u00ae|[\u2000-\u3300]|[\u2580-\u27bf]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff]|[\ue000-\uf8ff])/;

// Any words consisting of uppercase letters or uppercase letters with a period
// in between are considered abbreviations or acronyms.
Expand Down