Skip to content

Commit

Permalink
Add Thai lex_attrs (#3655)
Browse files Browse the repository at this point in the history
* test spaCy commit to git, Fri 04052019 10:54

* change Data format from my format to master format

* ทัทั้งนี้ ---> ทั้งนี้

* delete stop_word translate from Eng

* Adjust formatting and readability

* add Thai norm_exception

* Add Dobita21 SCA

* edit รึ : หรือ

* Update Dobita21.md

* Auto-format

* Integrate norms into language defaults

* add acronym and some norm exception words

* add lex_attrs

* Add lexical attribute getters into the language defaults

* fix LEX_ATTRS


Co-authored-by: Donut <[email protected]>
Co-authored-by: Ines Montani <[email protected]>
  • Loading branch information
3 people committed May 1, 2019
1 parent ba1ff00 commit f95eced
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 0 deletions.
2 changes: 2 additions & 0 deletions spacy/lang/th/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .norm_exceptions import NORM_EXCEPTIONS
from .lex_attrs import LEX_ATTRS

from ..norm_exceptions import BASE_NORMS
from ...attrs import LANG, NORM
Expand Down Expand Up @@ -34,6 +35,7 @@ def __call__(self, text):

class ThaiDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda _text: "th"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
Expand Down
62 changes: 62 additions & 0 deletions spacy/lang/th/lex_attrs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# coding: utf8
from __future__ import unicode_literals

from ...attrs import LIKE_NUM


# Thai number words recognised by like_num(). Matching is by exact string
# equality, so every form that should count as a number must be listed.
_num_words = [
    # units
    "ศูนย์",
    "หนึ่ง",
    "สอง",
    "สาม",
    "สี่",
    "ห้า",
    "หก",
    "เจ็ด",
    "แปด",
    "เก้า",
    # tens, each followed by its "-one" form
    "สิบ",
    "สิบเอ็ด",
    "ยี่สิบ",
    "ยี่สิบเอ็ด",
    "สามสิบ",
    "สามสิบเอ็ด",
    "สี่สิบ",
    "สี่สิบเอ็ด",
    "ห้าสิบ",
    "ห้าสิบเอ็ด",
    "หกสิบ",
    "หกสิบเอ็ด",
    "เจ็ดสิบ",
    "เจ็ดสิบเอ็ด",
    "แปดสิบ",
    "แปดสิบเอ็ด",
    "เก้าสิบ",
    "เก้าสิบเอ็ด",
    # magnitudes
    "ร้อย",
    "พัน",
    "หมื่น",
    "แสน",
    "ล้าน",
    "พันล้าน",
    "หมื่นล้าน",
    "แสนล้าน",
    "ล้านล้าน",
    "ล้านล้านล้าน",
    "ล้านล้านล้านล้าน",
]


def like_num(text):
    """Return True if the token text looks like a number.

    A text is number-like when, after dropping one leading sign character
    ("+", "-", "±" or "~") and all "," and "." characters, it is either

    * made up entirely of digits,
    * a simple fraction "<digits>/<digits>", or
    * one of the Thai number words listed in ``_num_words``.

    text (unicode): The token text to check.
    RETURNS (bool): Whether the text resembles a number.
    """
    # Only a single leading sign character is stripped.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Remove thousands/decimal separators so "1,000.5" becomes pure digits.
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    # Exactly one slash: accept fractions such as "3/4".
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:
        return True
    return False


# Thai-specific lexical attribute getters, keyed by attribute ID. The Thai
# language defaults merge this dict into their lex_attr_getters.
LEX_ATTRS = {LIKE_NUM: like_num}

0 comments on commit f95eced

Please sign in to comment.