forked from arthurdejong/python-stdnum
-
Notifications
You must be signed in to change notification settings - Fork 0
/
getisbn.py
executable file
·90 lines (73 loc) · 2.96 KB
/
getisbn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python
# getisbn.py - script to get ISBN prefix data
#
# Copyright (C) 2010, 2011, 2014 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA
"""This script downloads XML data from the International ISBN Agency
website and provides a compact form of all group prefixes, and registrant
ranges for those prefixes suitable for the numdb module. This data is needed
to correctly split ISBNs into an EAN.UCC prefix, a group prefix, a registrant,
an item number and a check-digit."""
from xml.etree import ElementTree
import urllib
# the location of the ISBN Ranges XML file; get() downloads the document
# from this URL when it is not handed a local file
download_url = 'https://www.isbn-international.org/export_rangemessage.xml'
def ranges(group):
    """Generate the 'low-high' ranges described by the rules of an element.

    group is an XML element that may contain a <Rules> child holding <Rule>
    elements, each with a <Range> ('low-high') and a <Length>. For every
    rule with a non-zero length, both ends of the range are truncated to
    that many characters and yielded as one 'low-high' string. Rules with a
    length of zero are skipped.

    An element without a <Rules> child produces no ranges at all instead of
    failing on the missing element.
    """
    rules = group.find('Rules')
    if rules is None:
        # nothing defined for this prefix
        return
    for rule in rules.findall('Rule'):
        length = int(rule.find('Length').text.strip())
        if length:
            yield '-'.join(
                x[:length]
                for x in rule.find('Range').text.strip().split('-'))
def wrap(text):
    """Generate chunks of a comma-separated text, split at commas.

    Text of at most 73 characters is yielded as is. Longer text is broken at
    the last comma found between positions 20 and 72, so each chunk stays
    within 73 characters; the comma at the break is dropped.

    If there is no comma in that window the text is broken at the first
    comma after it, or yielded unbroken when no comma is left, so a value is
    never cut in half and no ValueError is raised.
    """
    while text:
        i = len(text)
        if i > 73:
            i = text.rfind(',', 20, 73)
            if i < 0:
                # no usable break point in the window: accept an overlong
                # chunk rather than fail or split inside a value
                i = text.find(',', 73)
                if i < 0:
                    i = len(text)
        yield text[:i]
        # skip the comma that was used as the break point
        text = text[i + 1:]
def get(f=None):
    """Generate the lines of the ISBN prefix data.

    The RangeMessage XML document is parsed and turned into text lines:
    '#' comment lines naming the source, the file serial and the file date,
    then for every EAN.UCC prefix the prefix itself, its group ranges, and
    each registration group (with its agency name) followed by the
    registrant ranges of that group. Nesting is expressed with one extra
    leading space per level.

    f is a file name or file object that ElementTree.parse() accepts. When
    it is None the document is downloaded from download_url.
    """
    if f is None:
        # urlopen lives in urllib on Python 2 and in urllib.request on
        # Python 3; imported here so the download works on both
        try:
            from urllib.request import urlopen
        except ImportError:
            from urllib import urlopen
        from contextlib import closing
        yield '# generated from RangeMessage.xml, downloaded from'
        yield '# %s' % download_url
        # parse XML document, making sure the connection is closed afterwards
        with closing(urlopen(download_url)) as response:
            msg = ElementTree.parse(response).getroot()
    else:
        yield '# generated from %r' % f
        # parse XML document
        msg = ElementTree.parse(f).getroot()
    # dump data from document
    yield '# file serial %s' % msg.find('MessageSerialNumber').text.strip()
    yield '# file date %s' % msg.find('MessageDate').text.strip()
    # map each EAN.UCC prefix to its element to look up its group ranges
    top_groups = dict(
        (x.find('Prefix').text.strip(), x)
        for x in msg.find('EAN.UCCPrefixes').findall('EAN.UCC'))
    prevtop = None
    for group in msg.find('RegistrationGroups').findall('Group'):
        # the group prefix has the form '<EAN.UCC prefix>-<group prefix>'
        top, prefix = group.find('Prefix').text.strip().split('-')
        agency = group.find('Agency').text.strip()
        if top != prevtop:
            # first group of a new EAN.UCC prefix: output the prefix and the
            # ranges that determine the length of its group prefixes
            yield top
            for line in wrap(','.join(ranges(top_groups[top]))):
                yield ' %s' % line
            prevtop = top
        yield ' %s agency="%s"' % (prefix, agency)
        for line in wrap(','.join(ranges(group))):
            # two spaces: the registrant ranges belong to the group line
            # above, which is itself indented by one space
            yield '  %s' % line
if __name__ == '__main__':
    import sys

    # Pass a file name, e.g. get('RangeMessage.xml'), to read a local copy
    # of the document instead of downloading it.
    # The rows are written as UTF-8 bytes: Python 3 needs the binary buffer
    # underneath sys.stdout for that, Python 2 writes bytes to sys.stdout
    # directly. This replaces the print statement, which is a syntax error
    # on Python 3.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    for row in get():
        out.write(row.encode('utf-8') + b'\n')