-
Notifications
You must be signed in to change notification settings - Fork 142
/
doctest_split.py
executable file
·30 lines (26 loc) · 927 Bytes
/
doctest_split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#!/usr/bin/env python
#
# Natural Language Toolkit: Split an RST file into sections for independent doctest checking
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Steven Bird <[email protected]>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
import os
import re
import sys
# Filename extension given to every generated chunk file.
EXT = "doctest"

# Regex used to split the input into sections.  It matches the single newline
# that is immediately followed by a three-line heading: a line of dashes, one
# non-empty title line, and another line of dashes.  The heading itself sits in
# a lookahead, so it is not consumed and stays at the start of the next chunk.
SEC = r"\n(?=-+\n.+\n-+\n)"

# Doctest preamble written at the top of each output file, ahead of the
# section text.
HDR = (
    "\n"
    ">>> import nltk, re, pprint\n"
    ">>> from nltk import word_tokenize\n"
)
def split_file(filename, section_pattern, header, extension):
    """Split one text file into numbered per-section output files.

    The file's contents are split with ``re.split(section_pattern, ...)``.
    Piece number ``k`` (counting from 1) is written to
    ``<filename without its final extension>-<k>.<extension>``; each output
    file starts with ``header`` followed by a newline, then the piece itself.

    :param filename: path of the file to split
    :param section_pattern: regular expression passed to ``re.split``
    :param header: text written at the top of every output file
    :param extension: extension (without the dot) for the output files
    :return: list of the output file names, in the order they were written
    :raises IOError/OSError: if the input cannot be read or an output file
        cannot be written
    """
    # splitext() strips only the final extension of the last path component,
    # so paths such as "./ch01.rst", "ch.01.rst" or "ch01" are handled; the
    # previous two-way unpacking of filename.split('.') raised ValueError for
    # any name that did not contain exactly one dot.
    basename, _suffix = os.path.splitext(filename)

    # Context managers guarantee the handles are closed, even on error.
    with open(filename) as source:
        contents = source.read()

    chunk_names = []
    for count, section in enumerate(re.split(section_pattern, contents), 1):
        chunk_name = "%s-%d.%s" % (basename, count, extension)
        with open(chunk_name, "w") as chunk_file:
            chunk_file.write(header + "\n")
            chunk_file.write(section)
        chunk_names.append(chunk_name)
    return chunk_names


def main(argv=None):
    """Split every file named on the command line.

    :param argv: list of file names; defaults to ``sys.argv[1:]``
    """
    if argv is None:
        argv = sys.argv[1:]
    for filename in argv:
        split_file(filename, SEC, HDR, EXT)


# Only split files when executed as a script, so importing this module has no
# side effects.
if __name__ == "__main__":
    main()