-
Notifications
You must be signed in to change notification settings - Fork 0
/
avro_loaders.lua
308 lines (295 loc) · 10.6 KB
/
avro_loaders.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
local fio = require('fio')
local fun = require('fun')
local log = require('log')
local json = require('json')
local uuid = require('uuid')
local fiber = require('fiber')
local clock = require('clock')
local pavro = require('pregel.avro')
local ploader = require('pregel.loader')
local utils = require('utils')
local constants = require('constants')
-- Default feature-vector length; worker_additional_avro_loader falls back to
-- it when opts.feature_count is not supplied.
local FEATURE_COUNT = 300
-- NOTE(review): not referenced anywhere in the visible part of this file;
-- presumably a default total vertex count -- confirm before relying on it.
local VERTEX_COUNT = 16000000
--[[--
-- Avro schema (in JSON representation) is:
-- {
-- 'type': 'record',
-- 'name': 'KeyValuePair',
-- 'namespace': 'org.apache.avro.mapreduce',
-- 'fields': [{
-- 'name': 'key',
-- 'type': {
-- 'type': 'record',
-- 'name': 'User',
-- 'namespace': 'ru.mail.avro',
-- 'fields': [{
-- 'name': 'vid',
-- 'type': {'type': 'string'}
-- }, {
-- 'name': 'okid',
-- 'type': [
-- {'type': 'null'},
-- {'type': 'string'}
-- ]
-- }, {
-- 'name': 'email',
-- 'type': [
-- {'type': 'null'},
-- {'type': 'string'}
-- ]
-- }, {
-- 'name': 'vkid',
-- 'type': [
-- {'type': 'null'},
-- {'type': 'string'}
-- ]
-- }, {
-- 'name': 'category',
-- 'type': [
-- {'type': 'null'},
-- {'type': 'int'}
-- ]
-- }, {
-- 'name': 'start',
-- 'type': [
-- {'type': 'null'},
-- {'type': 'long'}
-- ]
-- }, {
-- 'name': 'end',
-- 'type': [
-- {'type': 'null'},
-- {'type': 'long'}
-- ]
-- }],
-- }
-- }, {
-- 'name': 'value',
-- 'type': {
-- 'type': 'record',
-- 'name': 'SparseFeatureVector',
-- 'namespace': 'ru.mail.avro',
-- 'fields': [{
-- 'name': 'features',
-- 'type': {
-- 'type': 'array',
-- 'items': {
-- 'type': 'record',
-- 'name': 'Feature',
-- 'fields': [{
-- 'name': 'feature_id',
-- 'type': {'type': 'string'}
-- }, {
-- 'name': 'value',
-- 'type': [
-- {'type': 'double'},
-- {'type': 'null'}
-- ]
-- }, {
-- 'name': 'timestamps',
-- 'type': [{
-- 'type': 'array',
-- 'items': {'type': 'int'}
-- }, {
-- 'type': 'null'
-- }]
-- }]
-- }
-- }
-- }],
-- }
-- }]
-- }
--]]--
--- Pad a feature vector with random values up to `feature_count` entries.
-- Every index in 1..feature_count that holds no value yet receives
-- math.random() * math.random(-30, 30); values already present are kept.
-- Indices are probed one by one instead of relying on `#features`: the
-- caller fills the table by feature number, so it may contain holes, and the
-- length operator is unspecified for such tables.
-- @param feature_count number of leading indices that must be populated
-- @param features table indexed by feature number; modified in place
local function append_feature_vector(feature_count, features)
    for idx = 1, feature_count do
        if features[idx] == nil then
            features[idx] = math.random() * math.random(-30, 30)
        end
    end
end
--- Read one Avro file and store a vertex for every record found in it.
-- Each record must be a 'KeyValuePair' whose 'key' is a 'User' record and
-- whose 'value' is a 'SparseFeatureVector' record; any deviation trips an
-- assertion. For every record a vertex table {key, features, vtype, status}
-- is built and handed to self:store_vertex(). The fiber yields after each
-- record and once more after the file is closed.
-- @param self loader object; must provide store_vertex(vertex)
-- @param filename path of the Avro file to read
-- @param cnt_cur ordinal of this file, used only in the log line
-- @param cnt_all total number of files, used only in the log line
-- @param feature_count optional; when not nil the feature vector is padded
--        via append_feature_vector(feature_count, features)
-- @return number of records read from the file
local function process_avro_file(self, filename, cnt_cur, cnt_all, feature_count)
    log.info('%03d/%03d processing %s', cnt_cur, cnt_all, filename)
    local avro_file = pavro.open(filename)
    local count = 0
    local begin_time = clock.time()
    while true do
        local line = avro_file:read_raw()
        if line == nil then break end
        assert(line:type() == pavro.RECORD and
               line:schema_name() == 'KeyValuePair')
        local key_object = {}
        local fea_object = {}
        -- parse key
        do
            local key = line:get('key')
            assert(key ~= nil and
                   key:type() == pavro.RECORD and
                   key:schema_name() == 'User')
            -- optional identifiers: each one is a union, unwrapped twice
            -- (presumably union -> selected branch -> value; confirm against
            -- the pregel.avro API)
            for _, v in ipairs{'okid', 'email', 'vkid'} do
                local obj = key:get(v)
                assert(obj:type() == pavro.UNION)
                local obj_value = obj:get():get()
                key_object[v] = obj_value
            end
            -- set category
            local category = key:get('category')
            assert(category ~= nil and
                   category:type() == pavro.UNION)
            local category_value = category:get()
            -- local category_value = category:get('int')
            key_object.category = category_value:get()
            -- set vid
            local vid = key:get('vid')
            assert(vid ~= nil and
                   vid:type() == pavro.STRING)
            local vid_value = vid:get()
            key_object.vid = vid_value
        end
        -- parse value
        do
            local val = line:get('value')
            assert(val:type() == pavro.RECORD and
                   val:schema_name() == 'SparseFeatureVector')
            local features = val:get('features')
            assert(features ~= nil and
                   features:type() == pavro.ARRAY)
            for _, feature in features:iterate() do
                assert(feature ~= nil and
                       feature:type() == pavro.RECORD and
                       feature:schema_name() == 'Feature')
                local fid_raw = feature:get('feature_id'):get()
                -- feature ids look like 'SVD_<n>' with n zero-based; shift
                -- by one to obtain a 1-based Lua index
                local fid_no = tonumber(fid_raw:match('SVD_(%d+)'))
                if fid_no == nil then
                    -- fail with a readable message instead of an opaque
                    -- "attempt to perform arithmetic on a nil value"
                    error('unexpected feature_id (want SVD_<number>): ' ..
                          tostring(fid_raw))
                end
                local fid = fid_no + 1
                local fval = feature:get('value'):get():get()
                -- NOTE(review): the schema comment in this file names the
                -- field 'timestamps', while 'timestamp' is queried here --
                -- confirm which one is intended
                local tst = feature:get('timestamp')
                assert(tst == nil, 'timestamp is not nil')
                fea_object[fid] = fval
            end
            if feature_count ~= nil then
                append_feature_vector(feature_count, fea_object)
            end
        end
        -- a vertex is MASTER only when its vid is a non-empty string equal
        -- to constants.MASTER_VERTEX_TYPE; everything else is DATA
        local vtype = constants.vertex_type.DATA
        if type(key_object.vid) == 'string' and
           #key_object.vid > 0 and
           key_object.vid == constants.MASTER_VERTEX_TYPE then
            vtype = constants.vertex_type.MASTER
        end
        local vertex = {
            key = key_object,
            features = fea_object,
            vtype = vtype,
            status = constants.node_status.NEW,
        }
        self:store_vertex(vertex)
        line:release()
        count = count + 1
        fiber.yield()
    end
    log.info('done processing %d values in %.3f seconds',
             count, clock.time() - begin_time)
    avro_file:close()
    fiber.yield()
    return count
end
--- Build a loader for the master that processes every '*.avro' file in
-- `path`, in sorted filename order.
-- @param master object passed through to ploader.new
-- @param path directory that is globbed for '*.avro' files
-- @return whatever ploader.new(master, <loader function>) returns
local function master_avro_loader(master, path)
    local function load_all(self)
        local pattern = fio.pathjoin(path, '*.avro')
        local found = fio.glob(pattern)
        table.sort(found)
        local total = #found
        log.info('%d found files found in path %s', total, pattern)
        for position = 1, total do
            -- no feature_count is passed, so feature vectors are not padded
            process_avro_file(self, found[position], position, total)
        end
    end
    return ploader.new(master, load_all)
end
--- Build a loader for a worker that processes its share of the '*.avro'
-- files found in `path`.
-- A file named 'part-m-<number>.avro' belongs to the worker for which
-- <number> % worker_count == current_idx - 1. Files whose name does not carry
-- such a number are skipped with a warning (previously they aborted the
-- loader with an arithmetic-on-nil error).
-- @param worker object passed through to ploader.new
-- @param path directory that is globbed for '*.avro' files
-- @return whatever ploader.new(worker, <loader function>) returns
local function worker_avro_loader(worker, path)
    local function loader(self, current_idx, worker_count)
        local avro_path = fio.pathjoin(path, '*.avro')
        local avro_files = fun.iter(fio.glob(avro_path)):filter(function(filename)
            -- '%.' matches a literal dot; a bare '.' would match any character
            local avrofile_no = tonumber(filename:match('part%-m%-(%d+)%.avro'))
            if avrofile_no == nil then
                log.warn('skipping %s: name does not match part-m-<number>.avro',
                         filename)
                return false
            end
            return avrofile_no % worker_count == current_idx - 1
        end):totable()
        table.sort(avro_files)
        log.info('%d found files found in path %s', #avro_files, avro_path)
        for idx, filename in ipairs(avro_files) do
            -- no feature_count is passed, so feature vectors are not padded
            process_avro_file(self, filename, idx, #avro_files)
        end
    end
    return ploader.new(worker, loader)
end
--- Produce a table of random feature values.
-- One value, math.random() * math.random(-30, 30), is generated for every
-- element yielded by fun.range(feature_count).
-- @param feature_count argument forwarded to fun.range
-- @return table of generated numbers
local function generate_random_features(feature_count)
    local function random_feature()
        return math.random() * math.random(-30, 30)
    end
    local generated = fun.range(feature_count):map(random_feature)
    return generated:totable()
end
--- Create a random vertex key.
-- The key always has an empty `vid` and a fresh uuid string as `email`.
-- A random draw from 0..1000000 decides whether one extra numeric id is
-- attached: `vkid` when the draw is divisible by 739, otherwise `okid` when
-- it is divisible by 839, otherwise none.
-- @return table with fields vid, email and optionally vkid or okid
local function generate_random_name()
    local identity = { vid = '', email = uuid.str() }
    local draw = math.random(0, 1000000)
    local extra_field
    if draw % 739 == 0 then
        extra_field = 'vkid'
    elseif draw % 839 == 0 then
        extra_field = 'okid'
    end
    if extra_field ~= nil then
        identity[extra_field] = math.random(200000, 10000000)
    end
    return identity
end
--- Assemble a synthetic DATA vertex in NEW status.
-- The key comes from generate_random_name() and the features from
-- generate_random_features(feature_count), generated in that order.
-- @param feature_count forwarded to generate_random_features
-- @return table with fields key, features, vtype and status
local function generate_random_vertex(feature_count)
    local vertex = {}
    vertex.key = generate_random_name()
    vertex.features = generate_random_features(feature_count)
    vertex.vtype = constants.vertex_type.DATA
    vertex.status = constants.node_status.NEW
    return vertex
end
--- Build a loader for a worker that reads its share of
-- '<opts.path>/tokens/*.avro' and then tops the data set up with randomly
-- generated vertices.
-- A file named 'part-m-<number>.avro' belongs to the worker for which
-- <number> % worker_count == current_idx - 1; files whose name does not carry
-- such a number are skipped with a warning instead of aborting the loader.
-- Feature vectors read from the files are padded to `feature_count` entries.
-- When opts.vertex_count is set and this worker read fewer vertices than
-- that, floor((vertex_count - read) / worker_count) random vertices are
-- stored in addition.
-- @param worker object passed through to ploader.new
-- @param opts table: path (string, required), feature_count (optional,
--        defaults to FEATURE_COUNT), vertex_count (optional)
-- @return whatever ploader.new(worker, <loader function>) returns
local function worker_additional_avro_loader(worker, opts)
    assert(type(opts) == 'table')
    assert(type(opts.path) == 'string')
    local path = opts.path
    local feature_count = opts.feature_count or FEATURE_COUNT
    local vertex_count = opts.vertex_count
    local function loader(self, current_idx, worker_count)
        local avro_path = fio.pathjoin(path, 'tokens', '*.avro')
        local avro_files = fun.iter(fio.glob(avro_path)):filter(function(filename)
            -- '%.' matches a literal dot; a bare '.' would match any character
            local avrofile_no = tonumber(filename:match('part%-m%-(%d+)%.avro'))
            if avrofile_no == nil then
                log.warn('skipping %s: name does not match part-m-<number>.avro',
                         filename)
                return false
            end
            return avrofile_no % worker_count == current_idx - 1
        end):totable()
        table.sort(avro_files)
        log.info('%d found files found in path %s', #avro_files, avro_path)
        local vertex_processed = 0
        for idx, filename in ipairs(avro_files) do
            vertex_processed = vertex_processed +
                process_avro_file(self, filename, idx, #avro_files, feature_count)
        end
        if vertex_count and vertex_processed < vertex_count then
            -- keep the remainder in a local: overwriting the captured
            -- vertex_count would corrupt the target for any later call
            local to_generate = math.floor(
                (vertex_count - vertex_processed) / worker_count)
            for id = 1, to_generate do
                -- give other fibers a chance to run every 100 vertices
                if id % 100 == 0 then
                    fiber.yield()
                end
                if id % 100000 == 0 then
                    log.info('<preload> generated %d/%d vertices', id, to_generate)
                end
                local vertex = generate_random_vertex(feature_count)
                self:store_vertex(vertex)
            end
        end
    end
    return ploader.new(worker, loader)
end
-- Public interface of the module: one loader factory per role.
local exports = {}
exports.master = master_avro_loader
exports.worker = worker_avro_loader
exports.worker_additional = worker_additional_avro_loader
return exports