Skip to content
This repository has been archived by the owner on May 10, 2023. It is now read-only.

Commit

Permalink
fix: disallow emojis for Thai sentences (#431)
Browse files Browse the repository at this point in the history
* Disallow emojis

* Adjust cleanup for .,:; at the beginning/end of sentence

* Allow question mark and exclamation mark

* Use specific match ([^..] does not work with multiple code-point emojis).

* Credit the emoji range

* Put multiple code point in \u{...}

* Change emoji code point to UTF-16 surrogate pair
  • Loading branch information
bact authored Apr 17, 2021
1 parent 54e2831 commit a33a6e2
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 23 deletions.
43 changes: 27 additions & 16 deletions server/lib/cleanup/languages/th.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,25 +15,36 @@ function sortSentences(sentences) {
// question mark http://www.royin.go.th/?page_id=10418
// exclamation mark http://www.royin.go.th/?page_id=10433
// Maiyamok http://www.royin.go.th/?page_id=10427
//
// Emoji range from
// https://www.regextester.com/106421
// https://stackoverflow.com/questions/10992921/how-to-remove-emoji-code-using-javascript
/**
 * Normalizes whitespace, punctuation and a few Thai character sequences in
 * each sentence, and removes emoji and zero-width characters.
 *
 * @param {string[]} sentences - Raw sentences.
 * @returns {string[]} A new array with the cleaned sentences, in the same order.
 */
function clean(sentences) {
  // Matched per UTF-16 code unit (no `u` flag): \ud83c-\ud83e are the high
  // surrogates of the supplementary-plane blocks where most emoji live.
  // NOTE: [\u2000-\u3300] is far wider than emoji. It also removes general
  // punctuation (curly quotes, dashes), arrows, math symbols, CJK punctuation
  // and kana, and it already contains [\u2580-\u27bf].
  const EMOJI_REGEX = /\u00a9|\u00ae|[\u2000-\u3300]|[\u2580-\u27bf]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff]|[\ue000-\uf8ff]/g;

  return sentences.map((sentence) => {
    return sentence
      .replace(/[\u200b\u200c]/g, '') // remove zero-width chars (occurs in some Thai texts)
      .replace(EMOJI_REGEX, '') // remove emoji
      .replace(/:/g, ' : ') // add a space before and after colon
      .replace(/\?/g, ' ? ') // add a space before and after question mark
      .replace(/!/g, ' ! ') // add a space before and after exclamation mark
      .replace(/,/g, ' ') // replace comma with space
      .replace(/\.(\.\s*)+/g, ' ') // replace ellipsis (.., ...) with space
      .replace(/\s\./g, ' ') // replace orphan period with space
      .replace(/(\u0E46\s*)+/g, '\u0E46') // condense multiple Maiyamok to one Maiyamok
      .replace(/\u0E46/g, ' \u0E46 ') // add a space before and after Maiyamok
      .replace(/\s+/g, ' ') // condense multiple spaces to one space
      // Colons were padded with spaces above, so leading/trailing punctuation
      // has to be stripped together with the surrounding whitespace; anchoring
      // on the bare character (/^:+/, /:+$/) would never match. Commas need no
      // handling here: every comma has already been replaced by a space.
      .replace(/^[\s.:;]+/, '') // remove spaces, periods, colons, semicolons at the beginning
      .replace(/[\s:;]+$/, '') // remove spaces, colons, semicolons at the end
      .replace(/\u0E40\u0E40/g, '\u0E41') // normalize Sara E + Sara E -> Sara Ae
      .replace(/\u0E4d([\u0E48\u0E49\u0E4A\u0E4B]*)\u0E32/g, '$1\u0E33') // normalize Nikhahit + Sara Aa -> Sara Am
      .replace(/([\u0E24\u0E26])\u0E32/g, '$1\u0E45'); // normalize Ru/Lu + Sara Aa -> Ru/Lu + Lakkhangyao
  });
}
15 changes: 8 additions & 7 deletions server/lib/validation/languages/th.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
const MIN_LENGTH = 2;
const MAX_LENGTH = 80;

// Numbers that are not allowed in a sentence depending on the language. For
// English this is 0-9 once or multiple times after each other.
// Numbers that are not allowed in a sentence depending on the language.
// For English this is 0-9 once or multiple times after each other.
// Thai digits: \u0E50-\u0E59 (๐-๙)
const NUMBERS_REGEX = /[0-9๐-๙]+/;

Expand Down Expand Up @@ -44,18 +44,19 @@ const BEGIN_REGEX = /(^|\s+)[\u0E30\u0E32\u0E33\u0E45\u0E31\u0E34\u0E35\u0E36\u0
/* eslint-disable-next-line no-misleading-character-class */
const END_REGEX = /[\u0E40\u0E41\u0E42\u0E43\u0E44](\s+|$)/;

// The following symbols are disallowed, please update here as well and not just the regex
// The following symbols are disallowed,
// please update here as well and not just the regex
// to make it easier to read:
// < > + * \ # @ ^ [ ] ( ) /
// Paiyannoi: \u0E2F ฯ (ellipsis, abbreviation)
// Maiyamok: \u0E46 ๆ (repetition)
// Fongman: \u0E4F ๏ (used as bullet)
// Angkhankhu: \u0E5A ๚ (used to mark end of section/verse)
// Khomut: \u0E5B ๛ (used to mark end of chapter/document)
//
// Latin characters are disallowed as well,
// as they can introduce difficulty for pronunciation.
const SYMBOL_REGEX = /[<>+*\\#@^[\]()/\u0E2F\u0E46\u0E4F\u0E5A\u0E5B]|[A-Za-z]+/;
// Latin characters (difficult to pronounce)
// Emoji range from https://www.regextester.com/106421 and
// https://stackoverflow.com/questions/10992921/how-to-remove-emoji-code-using-javascript
const SYMBOL_REGEX = /[<>+*\\#@^[\]()/\u0E2F\u0E46\u0E4F\u0E5A\u0E5B]|[A-Za-z]+|(\u00a9|\u00ae|[\u2000-\u3300]|[\u2580-\u27bf]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff]|[\ue000-\uf8ff])/;

// Any words consisting of uppercase letters or uppercase letters with a period
// in between are considered abbreviations or acronyms.
Expand Down

0 comments on commit a33a6e2

Please sign in to comment.