Skip to content
This repository has been archived by the owner on May 10, 2023. It is now read-only.

Commit

Permalink
add korean sentence validation #630 (#630)
Browse files Browse the repository at this point in the history
  • Loading branch information
sftblw authored Aug 8, 2022
1 parent 5a86a81 commit 70183de
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 0 deletions.
2 changes: 2 additions & 0 deletions server/lib/validation/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ const eo = require('./languages/eo');
const ig = require('./languages/ig');
const it = require('./languages/it');
const kab = require( './languages/kab');
const ko = require( './languages/ko');
const ne = require('./languages/ne');
const or = require('./languages/or');
const ru = require('./languages/ru');
Expand All @@ -25,6 +26,7 @@ const VALIDATORS = {
ig,
it,
kab,
ko,
ne,
or,
ru,
Expand Down
55 changes: 55 additions & 0 deletions server/lib/validation/languages/ko.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Minimum number of characters that qualify as a sentence.
const MIN_CHARACTERS = 1;

// Maximum number of characters allowed per sentence, to keep recordings at a manageable duration.
const MAX_CHARACTERS = 50;

/**
 * Ordered list of rules that mark a Korean sentence as invalid.
 *
 * Each entry carries a user-facing `error` message (in Korean) plus either
 *   - `fn(sentence)`: returns true when the sentence is invalid, or
 *   - `regex`: a pattern that matches when the sentence is invalid.
 *
 * Rules are listed from the most specific to the final catch-all allow-list.
 * NOTE(review): this assumes the consumer reports the first matching rule; the
 * consumer is not part of this file, so confirm against the validation index.
 */
const INVALIDATIONS = [
  {
    // Properly tokenizing Korean needs a heavy tokenizer (e.g. mecab-ko, nori, ...).
    // For merely counting letters such tokenizers are unnecessary, so the raw
    // string length is used.
    fn: (sentence) => {
      return sentence.length < MIN_CHARACTERS || sentence.length > MAX_CHARACTERS;
    },
    error: `문장의 글자 수는 ${MIN_CHARACTERS}글자 이상, ${MAX_CHARACTERS}글자 이하여야 합니다.`,
  },
  {
    // One Korean letter is composed of two or three jamo, in the order
    // consonant (1st) - vowel (2nd) - consonant (3rd, optional).
    // Using them separately is not allowed, since that could cause various
    // pronunciation issues.
    //
    // This regex matches standalone Hangul Compatibility Jamo: the consonants
    // ㄱ–ㅎ (U+3131–U+314E) and the vowels ㅏ–ㅣ (U+314F–U+3163). It does NOT match
    // the composed "Hangul Syllables" block (U+AC00–U+D7A3).
    regex: /[ㄱ-ㅎㅏ-ㅣ]/,
    error: '문장에는 자음이나 모음만 따로 있는 글자가 있어서는 안 됩니다.',
  },
  {
    // Korean letters (Hangul) have two kinds of Unicode code points.
    //
    // - Composed form (Unicode "Hangul Syllables": U+AC00–U+D7A3)
    //   - One code point holds two or three letters in a rectangular shape.
    //   - These are the normally used code points.
    // - Other forms
    //   - Other code points represent Korean letters as separate vowels and consonants.
    //   - They take roughly double the space in bytes.
    //   - Per the original author, they show up when a contributor uses the
    //     "Sebeolsik" keyboard layout, which is akin to Dvorak.
    //   - After NFC normalization (5a86a81), composable combinations of two or
    //     three characters (1st - 2nd - 3rd (optional)) become the composed form
    //     ("Hangul Syllables"). Characters that cannot be combined may remain.
    //
    // This regex matches the jamo blocks outside "Hangul Syllables":
    //   U+1100–U+11FF  Hangul Jamo
    //   U+A960–U+A97F  Hangul Jamo Extended-A
    //   U+D7B0–U+D7FF  Hangul Jamo Extended-B
    //   U+3130–U+318F  Hangul Compatibility Jamo
    regex: /[\u1100-\u11FF\uA960-\uA97F\uD7B0-\uD7FF\u3130-\u318F]/u,
    error: '문장에는 첫가끝 형태의 분해된 글자가 있어서는 안 됩니다. 완성형 글자를 입력해주세요.',
  },
  {
    // Since there are so many kinds of "should not be allowed" letters, it is
    // more convenient to allow only certain types of characters.
    // Examples of what this rejects:
    //   CJK Chinese letters, Japanese letters, Korean-specific Chinese letters (aka hanja),
    //   unused symbols (semicolon, colon - native Korean sentences do not contain them),
    //   symbols that are better excluded (quote, tilde, ...),
    //   characters that destructive NFKC normalization would turn into normal characters (ⓐ, ㈜, ...),
    //   historical Korean letters (aka 옛한글 - ㆆ, ㅿ, ㆁ, ...)
    //   ...
    regex: /[^가-힣.,?! ]/u,
    error: '문장에는 한글과 마침표, 쉼표, 느낌표, 물음표, 공백만 들어있어야 합니다.',
  },
];

// Export the Korean rule list; server/lib/validation/index.js requires this
// module as `ko` and registers it in its VALIDATORS map.
module.exports = {
INVALIDATIONS,
};

0 comments on commit 70183de

Please sign in to comment.