-
Notifications
You must be signed in to change notification settings - Fork 3
/
SOAS_2_lighttag.py
91 lines (69 loc) · 1.94 KB
/
SOAS_2_lighttag.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from pybo import BoPipeline
import re
from pathlib import Path
def basic_cleanup(text: str) -> str:
    """Flatten *text* into a single line for the base pipeline.

    Leading/trailing whitespace is stripped, every pair of consecutive
    newlines becomes one underscore, and any newline left over is removed.
    """
    # Double newlines must be converted before single newlines are dropped,
    # otherwise the '_' markers would never be produced.
    marked = re.sub(r'\n\n', '_', text.strip())
    return re.sub(r'\n', '', marked)
def keep_returns(text: str) -> str:
    """Replace every whitespace run in *text* with a single underscore.

    Preprocessing step of the suggestion pipeline. The text is stripped,
    runs of newlines are turned into a space, and then each run of
    whitespace (including those spaces) becomes one '_'.
    """
    trimmed = text.strip()
    # Line breaks are first normalised to a plain space ...
    spaced = re.sub(r'\n+', ' ', trimmed)
    # ... so that they are folded into '_' together with all other whitespace.
    return re.sub(r'\s+', '_', spaced)
def process(tokens):
    """Processing step of the suggestion pipeline (placeholder).

    The tokens are returned unchanged; no annotations are produced yet.

    An earlier draft walked over the tokens, collecting those that contain
    '།' or '_' into local variables that were never read, and printed a
    debug marker. That scan had no effect on the result, but it did exhaust
    iterator inputs before returning them, so it has been removed.

    Args:
        tokens: the tokens produced by the tokenizer stage.

    Returns:
        The very same ``tokens`` object, untouched.
    """
    # TODO: build the annotation suggestions here once their format is
    # decided.
    return tokens
def merge_spaces(tokens):
    """Attach each '_' token to the end of the token that precedes it.

    A '_' that appears before any other token has nothing to attach to and
    is kept as a token of its own. All other tokens are copied as they are.

    Returns a new list; *tokens* itself is not modified.
    """
    merged = []
    for token in tokens:
        if token != '_' or not merged:
            # Regular token, or a leading '_' with no predecessor.
            merged.append(token)
        else:
            merged[-1] = merged[-1] + token
    return merged
def lighttag_raw(tokens):
    """Concatenate *tokens* into one string, turning each '_' into a space."""
    joined = ''.join(tokens)
    return joined.replace('_', ' ')
def json_maker(tokens):
    """Formatter of the suggestion pipeline (not implemented yet).

    *tokens* is currently ignored and an empty string is always returned.
    """
    return ''
def lighttag_base_pipeline():
    """Build the BoPipeline that produces the plain text to be tagged.

    Arguments handed to BoPipeline, in order:
        pre:  basic_cleanup -- strips the text and removes line breaks
        tok:  'syls' -- presumably selects pybo's built-in syllable
              tokenizer; confirm against the pybo documentation
        proc: merge_spaces -- glues '_' tokens onto the preceding token
        frm:  lighttag_raw -- joins the tokens into plain text
    """
    stages = (basic_cleanup, 'syls', merge_spaces, lighttag_raw)
    return BoPipeline(*stages)
def lighttag_suggestion_pipeline():
    """Build the BoPipeline intended to produce tagging suggestions.

    Arguments handed to BoPipeline, in order:
        pre:  keep_returns -- replaces every whitespace run, line breaks
              included, with '_'
        tok:  'syls' -- presumably selects pybo's built-in syllable
              tokenizer; confirm against the pybo documentation
        proc: process -- still undecided in the original notes ("???")
        frm:  json_maker -- currently returns an empty string
    """
    stages = (keep_returns, 'syls', process, json_maker)
    return BoPipeline(*stages)
# Both pipelines are built up front; only the base one is run below.
pipeline1 = lighttag_base_pipeline()
pipeline2 = lighttag_suggestion_pipeline()

# Feed every .txt file found directly in the corpus folder to the base
# pipeline. The second argument of pipe_file is presumably the output
# folder -- confirm against the pybo documentation.
source_dir = Path('soas-segmentation/')
for f in source_dir.glob('*.txt'):
    pipeline1.pipe_file(f, 'lighttag/totag/')

# The suggestion pipeline is not run yet:
# pipeline2.pipe_file('soas-segmentation/mdzangs_blun.txt', 'lighttag/totag/')