# data.py
import json
import random
import numpy
import numpy.random
import keras.utils
from keras.preprocessing.sequence import pad_sequences
import model
from gensim.models import KeyedVectors
from gensim.models.keyedvectors import Vocab

# Column indices of the ten CoNLL-U fields
ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)

def read_conll(inp, max_sent=0, drop_tokens=True, drop_nulls=True):
    """Yields (sentence, comments) pairs from a CoNLL-U file; a sentence is a list of 10-column rows.
    `max_sent` limits the number of sentences yielded (0 means no limit); multiword token
    ranges (IDs like 1-2) and empty nodes (IDs like 1.1) are dropped by default."""
    comments = []
    sent = []
    yielded = 0
    for line in inp:
        line = line.strip()
        if line.startswith("#"):
            comments.append(line)
        elif not line:  # empty line: sentence boundary
            if sent:
                yield sent, comments
                yielded += 1
                if max_sent > 0 and yielded == max_sent:
                    break
            sent, comments = [], []
        else:
            cols = line.split("\t")
            if drop_tokens and "-" in cols[ID]:  # multiword token range
                continue
            if drop_nulls and "." in cols[ID]:  # empty node
                continue
            sent.append(cols)
    else:  # input exhausted without hitting max_sent: flush the trailing sentence
        if sent:
            yield sent, comments
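
# A minimal usage sketch of read_conll(); "train.conllu" is a hypothetical
# file name, not part of this repository:
#
#   with open("train.conllu") as f:
#       for sent, comments in read_conll(f, max_sent=2):
#           print(len(sent), "tokens,", len(comments), "comment lines")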

def read_embeddings(embeddings_filename, max_rank_emb):
    """Reads a .vector or .bin file and extends it with <PADDING>, <OOV> and <SENTROOT> entries"""
    binary = embeddings_filename.endswith(".bin")
    gensim_vectors = KeyedVectors.load_word2vec_format(embeddings_filename, binary=binary, limit=max_rank_emb)
    # Shift the existing words up by three slots first, so that the special
    # tokens added below keep their indices 0-2
    for word_record in gensim_vectors.vocab.values():
        word_record.index += 3
    gensim_vectors.vocab["<PADDING>"] = Vocab(index=0)
    gensim_vectors.vocab["<OOV>"] = Vocab(index=1)
    gensim_vectors.vocab["<SENTROOT>"] = Vocab(index=2)
    # Three small random rows for the special tokens, stacked on top of the embedding matrix
    special_rows = numpy.random.uniform(low=-0.01, high=0.01, size=(3, gensim_vectors.vectors.shape[1]))
    gensim_vectors.vectors = numpy.vstack([special_rows, gensim_vectors.vectors])
    # Normalize feature-wise, then scale every row to unit length
    gensim_vectors.vectors = keras.utils.normalize(gensim_vectors.vectors, axis=0)
    gensim_vectors.vectors = keras.utils.normalize(gensim_vectors.vectors)
    return gensim_vectors
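
# A sketch of loading embeddings, assuming a word2vec binary at the
# hypothetical path "vectors.bin":
#
#   vectors = read_embeddings("vectors.bin", max_rank_emb=100000)
#   assert vectors.vocab["<PADDING>"].index == 0  # special tokens sit in rows 0-2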

def build_dicts(inp):
    """Scans a CoNLL-U file and builds index dictionaries for characters, UPOS tags,
    dependency relations (plain and direction-marked) and feature values."""
    char_dict = {"<PAD>": 0, "<OOV>": 1}
    pos_dict = {"<OOV>": 0}
    deprel_dict = {"<OOV>": 0}
    feat_val_dict = {}  # e.g. "Number" -> {"<UNSET>": 0, "Sing": 1}
    for tree, comments in read_conll(inp):
        for cols in tree:
            for char in cols[FORM]:
                char_dict.setdefault(char, len(char_dict))
            pos_dict.setdefault(cols[UPOS], len(pos_dict))
            deprel_dict.setdefault(cols[DEPREL], len(deprel_dict))
            deprel_dict.setdefault(cols[DEPREL] + "-left", len(deprel_dict))
            deprel_dict.setdefault(cols[DEPREL] + "-right", len(deprel_dict))
            if cols[FEATS] != "_":
                for feat_val in cols[FEATS].split("|"):
                    feat, val = feat_val.split("=", 1)
                    feat_dict = feat_val_dict.setdefault(feat, {"<UNSET>": 0})
                    feat_dict.setdefault(val, len(feat_dict))
    return char_dict, pos_dict, deprel_dict, feat_val_dict
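
# vectorize_data() below expects the four dictionaries serialized as a JSON list,
# so a plausible way to produce its dicts file (file paths are hypothetical):
#
#   with open("train.conllu") as f:
#       dicts = build_dicts(f)
#   with open("dicts.json", "w") as f:
#       json.dump(dicts, f)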

def vectorize_word(cols, headword, left_deps, right_deps, left_sibling_rels, right_sibling_rels, output_features, char_dict, pos_dict, deprel_dict, feat_val_dict, word_vec_vocab):
    """`cols` is one row of CoNLL-U"""
    # Stuff on input
    char_seq = [char_dict.get(char, char_dict["<OOV>"]) for char in cols[FORM]]
    left_deprel = [deprel_dict.get(deprel, deprel_dict["<OOV>"]) for deprel in left_deps]
    right_deprel = [deprel_dict.get(deprel, deprel_dict["<OOV>"]) for deprel in right_deps]
    left_sibling_rels = [deprel_dict.get(deprel, deprel_dict["<OOV>"]) for deprel in left_sibling_rels]
    right_sibling_rels = [deprel_dict.get(deprel, deprel_dict["<OOV>"]) for deprel in right_sibling_rels]
    pos = pos_dict.get(cols[UPOS], pos_dict["<OOV>"])
    if int(cols[ID]) >= int(cols[HEAD]):  # distinguish left- and right-attached heads
        deprel = deprel_dict.get(cols[DEPREL] + "-left", deprel_dict["<OOV>"])
    else:
        deprel = deprel_dict.get(cols[DEPREL] + "-right", deprel_dict["<OOV>"])
    # Word index: try the exact form, then the lowercased form, then <OOV>
    if cols[FORM] in word_vec_vocab:
        word = word_vec_vocab[cols[FORM]].index
    elif cols[FORM].lower() in word_vec_vocab:
        word = word_vec_vocab[cols[FORM].lower()].index
    else:
        word = word_vec_vocab["<OOV>"].index
    headword_char_seq = [char_dict.get(char, char_dict["<OOV>"]) for char in headword]
    if headword in word_vec_vocab:
        headword = word_vec_vocab[headword].index
    elif headword.lower() in word_vec_vocab:
        headword = word_vec_vocab[headword.lower()].index
    else:
        headword = word_vec_vocab["<OOV>"].index
    # Stuff on output
    outputs = []
    example_feats = {}
    if cols[FEATS] != "_":
        for feat_val in cols[FEATS].split("|"):
            feat, val = feat_val.split("=", 1)
            example_feats[feat] = val
    for feat in output_features:  # every feature the model predicts
        if feat in example_feats:  # the feature is set on this word
            feat_dict = feat_val_dict[feat]
            # An unknown value is treated as unset
            outputs.append(feat_dict.get(example_feats[feat], feat_dict["<UNSET>"]))
        else:  # the feature is not set on this word
            outputs.append(feat_val_dict[feat]["<UNSET>"])
    return [char_seq, word, headword, headword_char_seq, left_deprel, right_deprel, left_sibling_rels, right_sibling_rels, pos, deprel], outputs
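
# How the output side behaves on a concrete (illustrative) row: given
# output_features == ["Case", "Number"] and cols[FEATS] == "Number=Sing",
# the word contributes outputs == [feat_val_dict["Case"]["<UNSET>"],
# feat_val_dict["Number"]["Sing"]], i.e. one class index per predicted feature.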

def vectorize_data(inp, dicts_filename, word_vec_vocab):
    """`inp` yields (tree, comments) pairs as produced by read_conll();
    `word_vec_vocab` is gensim's KeyedVectors.vocab with <OOV> and <PADDING> present"""
    with open(dicts_filename, "rt") as f:
        char_dict, pos_dict, deprel_dict, feat_val_dict = json.load(f)
    output_features = sorted(feat_val_dict.keys())
    result = []
    for tree, comments in inp:
        deprels = [[] for _ in range(len(tree) + 1)]  # indexed by head ID, 0 is the root
        for row_idx, cols in enumerate(tree):
            deprels[int(cols[HEAD])].append((row_idx, cols[DEPREL]))
        for row_idx, cols in enumerate(tree):
            # Dependents of the current word (its ID is row_idx+1), split by side
            left_deps = [deprel for (deprel_idx, deprel) in deprels[row_idx + 1] if deprel_idx < row_idx]
            right_deps = [deprel for (deprel_idx, deprel) in deprels[row_idx + 1] if deprel_idx > row_idx]
            if cols[HEAD] == "0":
                headword = "<SENTROOT>"
            else:
                headword = tree[int(cols[HEAD]) - 1][FORM]
            # Relations of the words sharing this word's head, split by side
            left_sibling_rels = []
            right_sibling_rels = []
            seen_self = False
            for sibling_idx, sibling_drel in deprels[int(cols[HEAD])]:
                if sibling_idx < row_idx:
                    left_sibling_rels.append(sibling_drel)
                elif sibling_idx > row_idx:
                    right_sibling_rels.append(sibling_drel)
                else:
                    seen_self = True
            assert seen_self  # the word itself must appear among its head's dependents
            result.append(vectorize_word(cols, headword, left_deps, right_deps, left_sibling_rels, right_sibling_rels, output_features, char_dict, pos_dict, deprel_dict, feat_val_dict, word_vec_vocab))
    return result, output_features

def get_inp_outp(vectorized_data, output_features, word_seq_len, shuffle=False):
    """`vectorized_data` is the data part of what vectorize_data() produces;
    returns ready-made dictionaries of inputs and outputs named by layer.
    `word_seq_len` can be None to pad to the longest sequence."""
    if shuffle:
        random.shuffle(vectorized_data)
    inputs = numpy.array([item[0] for item in vectorized_data])  # object array: one input list per word
    inputs_dict = {"inp_char_seq": pad_sequences(inputs[:, 0], padding="pre", maxlen=word_seq_len),
                   "inp_word": inputs[:, 1],
                   "inp_headword": inputs[:, 2],
                   "inp_headword_char_seq": pad_sequences(inputs[:, 3], padding="pre", maxlen=word_seq_len),
                   "inp_left_deps": pad_sequences(inputs[:, 4], padding="pre", maxlen=5),
                   "inp_right_deps": pad_sequences(inputs[:, 5], padding="post", maxlen=5),
                   "inp_left_sibling_rels": pad_sequences(inputs[:, 6], padding="pre", maxlen=5),
                   "inp_right_sibling_rels": pad_sequences(inputs[:, 7], padding="post", maxlen=5),
                   "inp_pos": inputs[:, 8],
                   "inp_deprel": inputs[:, 9]}
    outputs = numpy.array([item[1] for item in vectorized_data])
    outputs_dict = dict(("out_" + model.normname(feat), outputs[:, i]) for i, feat in enumerate(output_features))
    return inputs_dict, outputs_dict

def prep_data(inp, dicts_filename, word_vec_vocab, word_seq_len=None, shuffle=False):
    """Convenience wrapper: vectorizes `inp` and returns (inputs_dict, outputs_dict, output_features)"""
    data, output_features = vectorize_data(inp, dicts_filename, word_vec_vocab)
    inputs_dict, outputs_dict = get_inp_outp(data, output_features, word_seq_len, shuffle)
    return inputs_dict, outputs_dict, output_features

if __name__ == "__main__":
    pass
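    # An end-to-end sketch of preparing training data; all three paths are
    # hypothetical, and the dicts file must first be built with build_dicts():
    #
    #   vectors = read_embeddings("vectors.bin", max_rank_emb=100000)
    #   with open("train.conllu") as f:
    #       trees = list(read_conll(f))
    #   inputs, outputs, feats = prep_data(trees, "dicts.json", vectors.vocab, shuffle=True)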