
Merge pull request #137 from vmenger/tab-spaces-as-token
Recognize multiple spaces as a token
vmenger authored Feb 15, 2024
2 parents 199bedd + 37fd838 commit 8d09277
Showing 5 changed files with 26 additions and 3 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -124,4 +124,6 @@ ENV/
 # mypy
 .mypy_cache/
 
-.idea
+# ide
+.idea
+.vscode
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## 3.0.2 (2024-02-15)
+
+### Changed
+- recognize 4+ spaces as a token, blocking annotations
+
 ## 3.0.1 (2023-12-20)
 
 ### Fixed
2 changes: 1 addition & 1 deletion deduce/tokenizer.py
@@ -3,7 +3,7 @@
 import docdeid as dd
 import regex
 
-_TOKENIZER_PATTERN = regex.compile(r"\w+|[\n\r\t]|.(?<! )", flags=regex.I | regex.M)
+_TOKENIZER_PATTERN = regex.compile(r"\w+|[\n\r\t]| {4,}|[^ ]", flags=regex.I | regex.M)
 
 
 class DeduceTokenizer(dd.tokenizer.Tokenizer):  # pylint: disable=R0903
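To illustrate the pattern change outside the diff (a quick sketch, not part of the commit; the sample string is made up): the old `.(?<! )` branch matched any single non-space character, so runs of spaces produced no tokens at all, while the new ` {4,}` branch keeps a run of four or more spaces as a single token and `[^ ]` still skips shorter runs.

```python
import regex

# Old and new tokenizer patterns, copied from the diff above.
OLD_PATTERN = regex.compile(r"\w+|[\n\r\t]|.(?<! )", flags=regex.I | regex.M)
NEW_PATTERN = regex.compile(r"\w+|[\n\r\t]| {4,}|[^ ]", flags=regex.I | regex.M)

text = "Zee     Bergen Op  Zoom"  # 5 spaces, then 2 spaces

print(OLD_PATTERN.findall(text))
# ['Zee', 'Bergen', 'Op', 'Zoom'] -- space runs vanish entirely
print(NEW_PATTERN.findall(text))
# ['Zee', '     ', 'Bergen', 'Op', 'Zoom'] -- the 5-space run is one token;
# the 2-space run (fewer than 4) still produces no token
```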
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "deduce"
-version = "3.0.1"
+version = "3.0.2"
 description = "Deduce: de-identification method for Dutch medical text"
 authors = ["Vincent Menger <[email protected]>"]
 maintainers = ["Vincent Menger <[email protected]>"]
16 changes: 16 additions & 0 deletions tests/unit/test_tokenizer.py
@@ -47,6 +47,22 @@ def test_split_nonalpha(self):
 
         assert tokenizer._split_text(text=text) == expected_tokens
 
+    def test_split_multiple_spaces(self):
+        tokenizer = DeduceTokenizer()
+        text = "Pieter van der Zee     Bergen Op  Zoom"
+        expected_tokens = [
+            dd.Token(text="Pieter", start_char=0, end_char=6),
+            dd.Token(text="van", start_char=7, end_char=10),
+            dd.Token(text="der", start_char=11, end_char=14),
+            dd.Token(text="Zee", start_char=15, end_char=18),
+            dd.Token(text="     ", start_char=18, end_char=23),
+            dd.Token(text="Bergen", start_char=23, end_char=29),
+            dd.Token(text="Op", start_char=30, end_char=32),
+            dd.Token(text="Zoom", start_char=34, end_char=38),
+        ]
+
+        assert tokenizer._split_text(text=text) == expected_tokens
+
     def test_split_newline(self):
         tokenizer = DeduceTokenizer()
         text = "regel 1 \n gevolgd door regel 2"
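And a minimal usage sketch of the new behavior through the tokenizer itself (assuming `deduce` is installed and the `dd.Token` fields are attribute-accessible as in docdeid; `_split_text` is a private helper, called here only because the unit tests above do the same):

```python
from deduce.tokenizer import DeduceTokenizer

tokenizer = DeduceTokenizer()
tokens = tokenizer._split_text(text="Pieter van der Zee     Bergen Op  Zoom")

for token in tokens:
    print(repr(token.text), token.start_char, token.end_char)
# The 5-space run between "Zee" and "Bergen" comes out as a single token,
# which downstream annotators treat as a boundary ("blocking annotations").
```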
