-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
/
lex_attrs.py
92 lines (81 loc) · 2.97 KB
/
lex_attrs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# coding: utf8
from __future__ import unicode_literals
from ..norm_exceptions import BASE_NORMS
from ...attrs import NORM, LIKE_NUM
# fmt: off
# Hindi inflectional suffixes used by `norm` for light stemming.
# The table is bucketed by suffix length: the group at index i holds suffixes
# that are i + 1 characters long. `norm` depends on this, because it takes the
# length of a group's first entry as the number of characters to strip for
# every suffix in that group, so each group must stay length-homogeneous.
_stem_suffixes = [
["ो", "े", "ू", "ु", "ी", "ि", "ा"],
["कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें"],
["ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं"],
["ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां"],
["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"]
]
# fmt: on
# Hindi number words that `like_num` accepts as numeric tokens: zero through
# twenty, the tens from thirty to ninety, and the magnitude words hundred,
# thousand, lakh, crore, arab and kharab. Compound numbers (e.g. twenty-one)
# are not listed.
# Reference 1: https://en.wikipedia.org/wiki/Indian_numbering_system
# Reference 2: https://blogs.transparent.com/hindi/hindi-numbers-1-100/
_num_words = [
"शून्य",
"एक",
"दो",
"तीन",
"चार",
"पांच",
"छह",
"सात",
"आठ",
"नौ",
"दस",
"ग्यारह",
"बारह",
"तेरह",
"चौदह",
"पंद्रह",
"सोलह",
"सत्रह",
"अठारह",
"उन्नीस",
"बीस",
"तीस",
"चालीस",
"पचास",
"साठ",
"सत्तर",
"अस्सी",
"नब्बे",
"सौ",
"हज़ार",
"लाख",
"करोड़",
"अरब",
"खरब",
]
def norm(string):
    """Return the normalised form of a Hindi token.

    Tokens present in ``BASE_NORMS`` are mapped to the replacement stored
    there. Any other token is lightly stemmed: the longest suffix from
    ``_stem_suffixes`` that the token ends with is removed, provided the
    token is strictly longer than that suffix. Tokens that match no suffix
    are returned unchanged.

    string: The token text.
    RETURNS: The normalised text.
    """
    # normalise base exceptions, e.g. punctuation or currency symbols
    if string in BASE_NORMS:
        return BASE_NORMS[string]
    # set stem word as norm, if available, adapted from:
    # http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
    # http://research.variancia.com/hindi_stemmer/
    # https://github.com/taranjeet/hindi-tokenizer/blob/master/HindiTokenizer.py#L142
    # Groups are visited longest-first so the longest matching suffix wins.
    for suffix_group in reversed(_stem_suffixes):
        # All suffixes in a group share one length, so the first entry tells
        # us how many characters to strip for any match in the group.
        length = len(suffix_group[0])
        if len(string) <= length:
            # The token is too short for this group only. Shorter suffixes
            # may still apply, so move on to the next group rather than
            # abandoning the search (which left short tokens unstemmed).
            continue
        if string.endswith(tuple(suffix_group)):
            return string[:-length]
    return string
def like_num(text):
    """Check whether a token looks like a number.

    A token is number-like when, after dropping one leading sign character
    ("+", "-", "±" or "~") and removing digit-group commas and dots, it is
    made up only of digits, is a simple fraction of two digit strings
    ("3/4"), or is one of the Hindi number words in ``_num_words``.

    text: The token text.
    RETURNS: True if the token resembles a number, otherwise False.
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Strip digit-group separators and decimal points. Bare commas must be
    # removed so that "1,000" and "10,00,000" are recognised; the ", " form is
    # still stripped first so every input accepted before remains accepted.
    text = text.replace(", ", "").replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    return text.lower() in _num_words
# Lexical attribute getters defined by this module, keyed by attribute ID.
LEX_ATTRS = {
    NORM: norm,
    LIKE_NUM: like_num,
}