-
Notifications
You must be signed in to change notification settings - Fork 0
/
Indexer.py
337 lines (290 loc) · 15.7 KB
/
Indexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
import glob, sys, os, hashlib
import sqlite3
import re
import time
from django.db import models
class Indexer(object):
@staticmethod
def homogeneise_meta(meta):
for m in meta:
if isinstance(meta[m], bytes):
if meta[m][:2] == b'\xfe\xff':
meta[m] = meta[m][2:].decode('utf-16be')
elif meta[m][:2] == b'\xff\xfe':
meta[m] = meta[m][2:].decode('utf-16le')
else:
try:
meta[m] = meta[m].decode('utf-8')
except UnicodeDecodeError:
try:
meta[m] = meta[m].decode('iso8859-1')
except UnicodeDecodeError:
try:
meta[m] = meta[m].decode('latin1')
except UnicodeDecodeError:
meta[m] = meta[m].decode('ascii')
if str(meta[m]).find('Ã') == -1:
continue
try:
meta[m] = meta[m].encode('iso8859-1').decode('utf-8')
except:
raise
return meta
@staticmethod
def index_pdf(file, last, conn):
excludes = os.environ.get('COMPTA_PDF_EXCLUDE_PATH', None)
for exclude in excludes.split('|'):
if exclude and file.find(exclude) > -1:
return False
mtime = os.path.getmtime(file)
if mtime <= last:
return False
fp = open(file, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser)
try:
meta = doc.info[0]
except IndexError:
meta = {}
meta = Indexer.homogeneise_meta(meta)
fp.seek(0)
hash_md5 = hashlib.md5()
for chunk in iter(lambda: fp.read(4096), b""):
hash_md5.update(chunk)
meta['md5'] = hash_md5.hexdigest()
meta['ctime'] = os.path.getctime(file)
meta['mtime'] = mtime
filename = os.path.basename(file)
file_date = None
searchisodate = re.search(r'(^|[^0-9])(20[0-9][0-9])-?([01][0-9])-?([0-9][0-9])', filename)
searchfradate = re.search(r'(^|[^0-9])([0-9][0-9])[-_]?([01][0-9])[-_]?(20[0-9][0-9])', filename)
searchpdfdate = None
if meta.get('CreationDate'):
searchpdfdate = re.search(r'(^|[^0-9])(20[0-9][0-9])-?([01][0-9])-?([0-9][0-9])', meta['CreationDate'])
if meta.get('facture:date'):
file_date = meta['facture:date']
elif searchisodate and int(searchisodate.group(2)) > 2000 and int(searchisodate.group(2)) < 2100 and int(searchisodate.group(3)) < 13 and int(searchisodate.group(4)) < 32:
file_date = searchisodate.group(2) + '-' + searchisodate.group(3) + '-' + searchisodate.group(4)
elif searchfradate and int(searchfradate.group(4)) > 2000 and int(searchfradate.group(4)) < 2100 and int(searchfradate.group(3)) < 13 and int(searchfradate.group(2)) < 32:
file_date = searchfradate.group(4) + '-' + searchfradate.group(3) + '-' + searchfradate.group(2)
elif searchpdfdate and int(searchpdfdate.group(2)) > 2000 and int(searchpdfdate.group(2)) < 2100 and int(searchpdfdate.group(3)) < 13 and int(searchpdfdate.group(4)) < 32:
file_date = searchpdfdate.group(2) + '-' + searchpdfdate.group(3) + '-' + searchpdfdate.group(4)
else:
file_date = time.strftime('%Y-%m-%d', time.gmtime(meta['ctime']))
if meta.get('ModDate'):
searchpdfdate = re.search(r'(^|[^0-9])(20[0-9][0-9])-?([01][0-9])-?([0-9][0-9])', meta['ModDate'])
if searchpdfdate and int(searchpdfdate.group(2)) > 2000 and int(searchpdfdate.group(2)) < 2100 and int(searchpdfdate.group(3)) < 13 and int(searchpdfdate.group(4)) < 32:
file_date2 = searchpdfdate.group(2) + '-' + searchpdfdate.group(3) + '-' + searchpdfdate.group(4)
if file_date2 < file_date:
file_date = file_date2
res = conn.execute("SELECT id FROM pdf_file WHERE fullpath = \"%s\" OR md5 = \"%s\"" % (file, meta['md5']))
has_file = res.fetchone()
if not has_file:
sql = "INSERT INTO pdf_file (fullpath, filename, md5, date, ctime, mtime) VALUES (\"%s\", \"%s\", \"%s\", \"%s\", %d, %d) ; " % (file, filename, meta['md5'], file_date, meta['ctime'], meta['mtime'])
conn.execute(sql)
else:
sql = 'UPDATE pdf_file SET filename = "%s", md5 = "%s", mtime = %d, date = "%s" WHERE fullpath = "%s" OR md5 = "%s"' % (filename, meta['md5'], meta['mtime'], file_date, file, meta['md5'])
conn.execute(sql)
sql_update = "UPDATE pdf_piece SET "
sql_update = sql_update + " filename = \"%s\", extention = \"pdf\" " % filename
sql_update = sql_update + ', md5 = "%s"' % meta['md5']
sql_update = sql_update + ', mtime = %d' % meta['mtime']
sql_update = sql_update + ", fullpath = \"%s\" " % file
need_update = False
if meta.get('facture:type'):
sql_update = sql_update + ", facture_type = \"%s\"" % meta['facture:type']
need_update = True
if meta.get('facture:author'):
sql_update = sql_update + ", facture_author = \"%s\" " % meta['facture:author']
need_update = True
if meta.get('facture:client'):
sql_update = sql_update + ", facture_client = \"%s\" " % meta['facture:client']
need_update = True
if meta.get('facture:identifier'):
sql_update = sql_update + ", facture_identifier = \"%s\" " % meta['facture:identifier']
need_update = True
elif meta.get('facture:id'):
sql_update = sql_update + ", facture_identifier = \"%s\" " % meta['facture:id']
need_update = True
if meta.get('facture:date'):
sql_update = sql_update + ", facture_date = \"%s\" " % meta['facture:date']
need_update = True
if meta.get('facture:libelle'):
sql_update = sql_update + ", facture_libelle = \"%s\" " % meta['facture:libelle'];
need_update = True
if meta.get('facture:HT'):
sql_update = sql_update + ', facture_prix_ht = %s ' % str(meta['facture:HT']).replace(',', '.');
need_update = True
if meta.get('facture:TVA'):
sql_update = sql_update + ', facture_prix_tax = %s ' % str(meta['facture:TVA']).replace(',', '.');
need_update = True
if meta.get('facture:TTC'):
sql_update = sql_update + ', facture_prix_ttc = %s ' % str(meta['facture:TTC']).replace(',', '.');
need_update = True
if meta.get('facture:devise'):
sql_update = sql_update + ", facture_devise = \"%s\" " % meta['facture:devise']
need_update = True
if meta.get('paiement:comment'):
sql_update = sql_update + ", paiement_comment = \"%s\" " % meta['paiement:comment']
need_update = True
if meta.get('paiement:proof'):
sql_update = sql_update + ", paiement_proof = \"%s\" " % meta['paiement:proof']
need_update = True
if meta.get('paiement:date'):
sql_update = sql_update + ", paiement_date = \"%s\" " % meta['paiement:date']
need_update = True
sql_update = sql_update + " WHERE fullpath = \"%s\" OR md5 = \"%s\"" % (file, meta['md5'])
sql_update = sql_update + " ; "
if not need_update:
return True
res = conn.execute("SELECT * FROM pdf_piece WHERE fullpath = \"%s\" OR md5 = \"%s\"" % (file, meta['md5']))
has_piece = res.fetchone()
sql = "INSERT INTO pdf_piece (fullpath, filename, md5, ctime, mtime) VALUES (\"%s\", \"%s\", \"%s\", %d, %d) ; " % (file, filename, meta['md5'], meta['ctime'], meta['mtime'])
if not has_piece:
conn.execute(sql)
conn.execute(sql_update)
conn.commit()
if os.environ.get('VERBOSE', None):
print("Index: %s" % file)
return True
@staticmethod
def index_banque(csv_url, force, conn):
import csv
import requests
from io import StringIO
import datetime
imported_at = datetime.datetime.now().timestamp()
updated_at = imported_at
last = None
res = conn.execute("SELECT mtime FROM pdf_banque ORDER BY mtime DESC LIMIT 1;");
fetch = res.fetchone()
if fetch:
last = fetch[0]
if not force and last and (updated_at - last) < 15 * 60:
return False
with requests.get(csv_url, stream=True) as r:
csv_raw = StringIO(r.text)
csv_reader = csv.reader(csv_raw, delimiter=",")
for csv_row in csv_reader:
if (csv_row[0] == 'date') or (len(csv_row) < 7):
continue
csv_row[1] = re.sub(r' +', ' ', csv_row[1])
csv_row[7] = re.sub(r' +', ' ', csv_row[7])
sql = "SELECT id FROM pdf_banque WHERE date = \"%s\" AND raw = \"%s\";" % (csv_row[0], csv_row[1])
res = conn.execute(sql)
row = res.fetchone()
if not row or not row[0]:
sql = "INSERT INTO pdf_banque (date, raw, ctime) VALUES (\"%s\", \"%s\" , \"%s\")" % (csv_row[0], csv_row[1], imported_at)
conn.execute(sql)
sql = "UPDATE pdf_banque SET "
sql = sql + 'amount = %s, ' % csv_row[2]
sql = sql + 'type = "%s", ' % csv_row[3]
sql = sql + 'banque_account = "%s", ' % csv_row[4]
sql = sql + 'rdate = "%s", ' % csv_row[5]
sql = sql + 'vdate = "%s", ' % csv_row[6]
sql = sql + 'label = "%s", ' % csv_row[7]
sql = sql + 'mtime = %d' % updated_at
sql = sql + " WHERE date = \"%s\" AND raw = \"%s\";" % (csv_row[0], csv_row[1])
conn.execute(sql)
if last:
res = conn.execute("UPDATE pdf_banque SET mtime = %d WHERE mtime = %d LIMIT 1" % (updated_at, last));
conn.commit()
if os.environ.get('VERBOSE', None):
print("Indexed: Banque")
return True
@staticmethod
def consolidate(conn):
res = conn.execute("SELECT id, date, raw, label FROM pdf_banque");
proof2banqueid = {}
if os.environ.get('VERBOSE', None):
print("Index: consolidating")
for row in res:
if row['raw']:
proof2banqueid[re.sub(r' *', ' ', row['raw']) + 'ø' + row['date']] = row['id'];
if row['label']:
proof2banqueid[re.sub(r' *', ' ', row['label']) + 'ø' + row['date']] = row['id'];
md52pid = {}
res = conn.execute("SELECT id, paiement_proof, paiement_date, fullpath, md5 FROM pdf_piece")
for row in res:
md52pid[row['md5']] = row['id']
banqueid = None
if not row['paiement_proof']:
continue
if not row['paiement_date']:
continue
proofs = row['paiement_proof'].split('|')
dates = row['paiement_date'].split('|')
if len(proofs) > 1 and len(proofs) != len(dates):
continue
for i in range(0,len(dates)):
paiement_date = dates[i]
try:
paiement_proof = proofs[i]
except:
paiement_proof = proofs[0]
paiement_proof = re.sub(r' *', ' ', paiement_proof)
paiement_proof = re.sub(r'^ *', '', paiement_proof)
paiement_proof = re.sub(r' *$', '', paiement_proof)
if not paiement_proof:
continue
if paiement_date:
banqueid = proof2banqueid.get(paiement_proof + 'ø' + paiement_date)
if not banqueid:
ids = []
for pkey in proof2banqueid:
(label, date) = pkey.split('ø')
label = re.sub(r' *', ' ', label)
label = re.sub(r'^ *', '', label)
label = re.sub(r' *$', '', label)
if label.find(paiement_proof) != -1 or paiement_proof.find(label) != -1:
if date == paiement_date:
ids.append(proof2banqueid[pkey])
if len(ids) == 1:
banqueid = ids[0]
if banqueid:
conn.execute("UPDATE pdf_piece SET banque_id = %d WHERE id = %d" % (banqueid, row['id']) )
conn.execute("UPDATE pdf_banque SET piece_id = %d WHERE id = %d" % (row['id'], banqueid) )
conn.execute("UPDATE pdf_file SET date = \"%s\" WHERE date IS NULL AND md5 = \"%s\"" % (paiement_date, row['md5']))
conn.commit()
res = conn.execute("SELECT id, md5 FROM pdf_file WHERE piece_id IS NULL")
for row in res:
if md52pid.get(row['md5']):
res = conn.execute("UPDATE pdf_file SET piece_id = '%s' WHERE id = %d" % (md52pid[row['md5']], row['id']))
conn.commit()
@staticmethod
def update_path(path, force):
with sqlite3.connect('db/database.sqlite') as conn:
conn.row_factory = sqlite3.Row
need_consolidate = False
last = 0
try:
res = conn.execute("SELECT mtime FROM pdf_file WHERE fullpath LIKE \"" + path + "%\" ORDER BY mtime DESC LIMIT 1;");
row = res.fetchone()
if row:
last = row[0]
except sqlite3.OperationalError:
if os.environ.get('VERBOSE', None):
print("Index: creating database")
conn.execute("CREATE TABLE pdf_file (id INTEGER PRIMARY KEY, filename TEXT, fullpath TEXT UNIQUE, extention TEXT, size INTEGER, date DATE, ctime INTEGER, mtime INTEGER, md5 TEXT, piece_id INTEGER);");
conn.execute("CREATE TABLE pdf_piece (id INTEGER PRIMARY KEY, filename TEXT, fullpath TEXT UNIQUE, extention TEXT, size INTEGER, ctime INTEGER, mtime INTEGER, md5 TEXT, facture_type TEXT, facture_author TEXT, facture_client TEXT, facture_identifier TEXT, facture_date DATE, facture_libelle TEXT, facture_prix_ht FLOAT, facture_prix_tax FLOAT, facture_prix_ttc FLOAT, facture_devise TEXT, paiement_comment TEXT, paiement_date DATE, paiement_proof TEXT, banque_id INTEGER, exercice_comptable TEXT, CONSTRAINT constraint_name UNIQUE (md5) );");
conn.execute("CREATE TABLE pdf_banque (id INTEGER PRIMARY KEY, date DATE, raw TEXT, amount FLOAT, type TEXT, banque_account TEXT, rdate DATE, vdate DATE, label TEXT, piece_id INTEGER, ctime INTEGER, mtime INTEGER, CONSTRAINT constraint_name UNIQUE (date, raw) );");
if os.environ.get('VERBOSE', None):
print("Index: indexing files from %s" % path)
for file in glob.glob(path+'/**/*pdf', recursive=True):
need_consolidate = Indexer.index_pdf(file, last, conn) or need_consolidate
if os.environ.get('VERBOSE', None):
print("Index: indexing banque")
need_consolidate = Indexer.index_banque('https://raw.githubusercontent.com/24eme/banque/master/data/history.csv', force, conn) or need_consolidate
if need_consolidate:
Indexer.consolidate(conn)
conn.commit()
@staticmethod
def update(force = False):
for subdir in os.environ.get('COMPTA_PDF_COMPTA_SUBDIR').split('|'):
Indexer.update_path(os.environ.get('COMPTA_PDF_BASE_PATH') + '/' + subdir, force)
def main():
Indexer.update_path(sys.argv[1], True)
if __name__ == "__main__":
main()