From 1c06c5bb218e626ebe10fdf3d09dfcfbd4db39ea Mon Sep 17 00:00:00 2001 From: Ori Avtalion Date: Tue, 16 Jul 2024 13:25:48 +0300 Subject: [PATCH 1/5] feat: add `load_iana_tlds_to_memory` --- docs/api/domain.rst | 1 + src/validators/__init__.py | 3 ++- src/validators/domain.py | 32 ++++++++++++++++++++++++-------- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/docs/api/domain.rst b/docs/api/domain.rst index 33f17f19..88e21876 100644 --- a/docs/api/domain.rst +++ b/docs/api/domain.rst @@ -3,3 +3,4 @@ domain .. module:: validators.domain .. autofunction:: domain +.. autofunction:: load_iana_tlds_to_memory diff --git a/src/validators/__init__.py b/src/validators/__init__.py index d5e6de83..84da953a 100644 --- a/src/validators/__init__.py +++ b/src/validators/__init__.py @@ -6,7 +6,7 @@ from .country import calling_code, country_code, currency from .cron import cron from .crypto_addresses import bsc_address, btc_address, eth_address, trx_address -from .domain import domain +from .domain import domain, load_iana_tlds_to_memory from .email import email from .encoding import base16, base32, base58, base64 from .finance import cusip, isin, sedol @@ -58,6 +58,7 @@ "cron", # ... "domain", + "load_iana_tlds_to_memory", # ... "email", # encodings diff --git a/src/validators/domain.py b/src/validators/domain.py index ecca605a..f860045e 100644 --- a/src/validators/domain.py +++ b/src/validators/domain.py @@ -3,18 +3,34 @@ # standard from pathlib import Path import re +from typing import Optional, Set # local from .utils import validator -def _iana_tld(): - """Load IANA TLDs as a Generator.""" - # source: https://data.iana.org/TLD/tlds-alpha-by-domain.txt - with Path(__file__).parent.joinpath("_tld.txt").open() as tld_f: - _ = next(tld_f) # ignore the first line - for line in tld_f: - yield line.strip() +class _TLDList: + + preloaded: Optional[Set[str]] = None + + @classmethod + def read_tlds_from_file(cls): + with Path(__file__).parent.joinpath("_tld.txt").open() as tld_f: + _ = next(tld_f) # ignore the first line + for line in tld_f: + yield line.strip() + + @classmethod + def tlds(cls): + if cls.preloaded: + return cls.preloaded + + return cls.read_tlds_from_file() + + +def load_iana_tlds_to_memory(): + """Loads the IANA TLD list into memory, for faster lookup with ``consider_tld=True``.""" + _TLDList.preloaded = set(_TLDList.read_tlds_from_file()) @validator @@ -56,7 +72,7 @@ def domain( if not value: return False - if consider_tld and value.rstrip(".").rsplit(".", 1)[-1].upper() not in _iana_tld(): + if consider_tld and value.rstrip(".").rsplit(".", 1)[-1].upper() not in _TLDList.tlds(): return False try: From 999dfac8859401dfa841e3758e89c0ec3ac90da3 Mon Sep 17 00:00:00 2001 From: Ori Avtalion Date: Wed, 17 Jul 2024 19:04:44 +0300 Subject: [PATCH 2/5] Updates based on discussion --- docs/api/domain.rst | 1 - src/validators/__init__.py | 3 +-- src/validators/domain.py | 23 ++++++++++++----------- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/docs/api/domain.rst b/docs/api/domain.rst index 88e21876..33f17f19 100644 --- a/docs/api/domain.rst +++ b/docs/api/domain.rst @@ -3,4 +3,3 @@ domain .. module:: validators.domain .. autofunction:: domain -.. autofunction:: load_iana_tlds_to_memory diff --git a/src/validators/__init__.py b/src/validators/__init__.py index 84da953a..d5e6de83 100644 --- a/src/validators/__init__.py +++ b/src/validators/__init__.py @@ -6,7 +6,7 @@ from .country import calling_code, country_code, currency from .cron import cron from .crypto_addresses import bsc_address, btc_address, eth_address, trx_address -from .domain import domain, load_iana_tlds_to_memory +from .domain import domain from .email import email from .encoding import base16, base32, base58, base64 from .finance import cusip, isin, sedol @@ -58,7 +58,6 @@ "cron", # ... "domain", - "load_iana_tlds_to_memory", # ... "email", # encodings diff --git a/src/validators/domain.py b/src/validators/domain.py index f860045e..5854c482 100644 --- a/src/validators/domain.py +++ b/src/validators/domain.py @@ -1,38 +1,39 @@ """Domain.""" # standard +import os from pathlib import Path import re -from typing import Optional, Set +from typing import Generator, Optional, Set, Union # local from .utils import validator class _TLDList: + """Read IANA TLDs, and optionally cache them.""" - preloaded: Optional[Set[str]] = None + cache: Optional[Set[str]] = None @classmethod - def read_tlds_from_file(cls): + def read_tlds_from_file(cls) -> Generator[str, None, None]: + # Try the most common TLDs before opening the file + yield from ("COM", "ORG", "RU", "DE", "NET", "BR", "UK", "JP", "FR", "IT") with Path(__file__).parent.joinpath("_tld.txt").open() as tld_f: _ = next(tld_f) # ignore the first line for line in tld_f: yield line.strip() @classmethod - def tlds(cls): - if cls.preloaded: - return cls.preloaded + def tlds(cls) -> Union[Set[str], Generator[str, None, None]]: + if not cls.cache and os.environ.get("PYVLD_LOAD_TLD_TO_MEMORY") == "True": + cls.cache = set(_TLDList.read_tlds_from_file()) + if cls.cache: + return cls.cache return cls.read_tlds_from_file() -def load_iana_tlds_to_memory(): - """Loads the IANA TLD list into memory, for faster lookup with ``consider_tld=True``.""" - _TLDList.preloaded = set(_TLDList.read_tlds_from_file()) - - @validator def domain( value: str, /, *, consider_tld: bool = False, rfc_1034: bool = False, rfc_2782: bool = False From 027319283566ce45326c3ecabad64718ed246d5f Mon Sep 17 00:00:00 2001 From: Ori Avtalion Date: Fri, 19 Jul 2024 08:25:32 +0300 Subject: [PATCH 3/5] More updates --- src/validators/domain.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/validators/domain.py b/src/validators/domain.py index 5854c482..c3d3f023 100644 --- a/src/validators/domain.py +++ b/src/validators/domain.py @@ -1,37 +1,39 @@ """Domain.""" # standard -import os +from os import environ from pathlib import Path import re -from typing import Generator, Optional, Set, Union +from typing import Optional, Set # local from .utils import validator -class _TLDList: +class _IanaTLD: """Read IANA TLDs, and optionally cache them.""" - cache: Optional[Set[str]] = None + _full_cache: Optional[Set[str]] = None + # source: https://www.statista.com/statistics/265677 + _popular_cache = {"COM", "ORG", "RU", "DE", "NET", "BR", "UK", "JP", "FR", "IT"} @classmethod - def read_tlds_from_file(cls) -> Generator[str, None, None]: - # Try the most common TLDs before opening the file - yield from ("COM", "ORG", "RU", "DE", "NET", "BR", "UK", "JP", "FR", "IT") + def _retrieve(cls): with Path(__file__).parent.joinpath("_tld.txt").open() as tld_f: _ = next(tld_f) # ignore the first line for line in tld_f: yield line.strip() @classmethod - def tlds(cls) -> Union[Set[str], Generator[str, None, None]]: - if not cls.cache and os.environ.get("PYVLD_LOAD_TLD_TO_MEMORY") == "True": - cls.cache = set(_TLDList.read_tlds_from_file()) - if cls.cache: - return cls.cache - - return cls.read_tlds_from_file() + def check(cls, tld: str): + if tld in cls._popular_cache: + return True + if not cls._full_cache: + if environ.get("PYVLD_CACHE_TLD", "False") == "True": + cls._full_cache = set(cls._retrieve()) + else: + return tld in cls._retrieve() + return tld in cls._full_cache @validator @@ -73,7 +75,7 @@ def domain( if not value: return False - if consider_tld and value.rstrip(".").rsplit(".", 1)[-1].upper() not in _TLDList.tlds(): + if consider_tld and not _IanaTLD.check(value.rstrip(".").rsplit(".", 1)[-1].upper()): return False try: From 1fa4bb52b27c537eb32ad1ad46bcb7d523b3cd2c Mon Sep 17 00:00:00 2001 From: Ori Avtalion Date: Fri, 19 Jul 2024 09:43:16 +0300 Subject: [PATCH 4/5] Update _full_cache null test --- src/validators/domain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/validators/domain.py b/src/validators/domain.py index c3d3f023..5a85a59a 100644 --- a/src/validators/domain.py +++ b/src/validators/domain.py @@ -28,7 +28,7 @@ def _retrieve(cls): def check(cls, tld: str): if tld in cls._popular_cache: return True - if not cls._full_cache: + if cls._full_cache is None: if environ.get("PYVLD_CACHE_TLD", "False") == "True": cls._full_cache = set(cls._retrieve()) else: From 4618dd634f1c08650cdc63086c8ee9da1e5452a7 Mon Sep 17 00:00:00 2001 From: Ori Avtalion Date: Fri, 19 Jul 2024 09:48:34 +0300 Subject: [PATCH 5/5] Simplify PYVLD_CACHE_TLD check --- src/validators/domain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/validators/domain.py b/src/validators/domain.py index 5a85a59a..23ae263d 100644 --- a/src/validators/domain.py +++ b/src/validators/domain.py @@ -29,7 +29,7 @@ def check(cls, tld: str): if tld in cls._popular_cache: return True if cls._full_cache is None: - if environ.get("PYVLD_CACHE_TLD", "False") == "True": + if environ.get("PYVLD_CACHE_TLD") == "True": cls._full_cache = set(cls._retrieve()) else: return tld in cls._retrieve()