From f95ecedd83ce75b7062af6afaf47f9ed6fe59550 Mon Sep 17 00:00:00 2001
From: Dobita21 <39238314+Dobita21@users.noreply.github.com>
Date: Wed, 1 May 2019 17:03:14 +0700
Subject: [PATCH] Add Thai lex_attrs (#3655)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* test spaCy commit to git fri 04052019 10:54

* change Data format from my format to master format

* ทัทั้งนี้ ---> ทั้งนี้

* delete stop words translated from English

* Adjust formatting and readability

* add Thai norm_exception

* Add Dobita21 SCA

* edit รึ : หรือ,

* Update Dobita21.md

* Auto-format

* Integrate norms into language defaults

* add acronym and some norm exception words

* add lex_attrs

* Add lexical attribute getters into the language defaults

* fix LEX_ATTRS


Co-authored-by: Donut <dobita21@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>
---
 spacy/lang/th/__init__.py  |  2 ++
 spacy/lang/th/lex_attrs.py | 62 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)
 create mode 100644 spacy/lang/th/lex_attrs.py

diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index ba5b86d773d..b3150fa2fa7 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -5,6 +5,7 @@
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .norm_exceptions import NORM_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
 
 from ..norm_exceptions import BASE_NORMS
 from ...attrs import LANG, NORM
@@ -34,6 +35,7 @@ def __call__(self, text):
 
 class ThaiDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda _text: "th"
     lex_attr_getters[NORM] = add_lookups(
         Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
diff --git a/spacy/lang/th/lex_attrs.py b/spacy/lang/th/lex_attrs.py
new file mode 100644
index 00000000000..047d046c26f
--- /dev/null
+++ b/spacy/lang/th/lex_attrs.py
@@ -0,0 +1,62 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+
+_num_words = [  # Thai cardinal number words that like_num() matches verbatim
+    "ศูนย์",
+    "หนึ่ง",
+    "สอง",
+    "สาม",
+    "สี่",
+    "ห้า",
+    "หก",
+    "เจ็ด",
+    "แปด",
+    "เก้า",
+    "สิบ",
+    "สิบเอ็ด",
+    "ยี่สิบ",
+    "ยี่สิบเอ็ด",
+    "สามสิบ",
+    "สามสิบเอ็ด",
+    "สี่สิบ",
+    "สี่สิบเอ็ด",
+    "ห้าสิบ",
+    "ห้าสิบเอ็ด",
+    "หกสิบ", "หกสิบเอ็ด",  # sixty, sixty-one
+    "เจ็ดสิบ",
+    "เจ็ดสิบเอ็ด",
+    "แปดสิบ",
+    "แปดสิบเอ็ด",
+    "เก้าสิบ",
+    "เก้าสิบเอ็ด",
+    "ร้อย",
+    "พัน",
+    "ล้าน",
+    "พันล้าน",
+    "หมื่นล้าน",
+    "แสนล้าน",
+    "ล้านล้าน",
+    "ล้านล้านล้าน",
+    "ล้านล้านล้านล้าน",
+]
+
+
+def like_num(text):
+    """Return True if text is digits, a digit fraction, or a listed Thai number word."""
+    # Drop one leading sign/approximation mark, then every comma and period.
+    stripped = text[1:] if text.startswith(("+", "-", "±", "~")) else text
+    stripped = stripped.replace(",", "").replace(".", "")
+    if stripped.isdigit():
+        return True
+    # Exactly one "/" with digit strings on both sides counts as a fraction.
+    parts = stripped.split("/")
+    if len(parts) == 2 and all(part.isdigit() for part in parts):
+        return True
+    # Otherwise the token must be one of the entries in _num_words.
+    return stripped in _num_words
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}  # attribute ID -> getter; merged into ThaiDefaults.lex_attr_getters