diff --git a/bin/pyntlm_auth/app.py b/bin/pyntlm_auth/app.py
deleted file mode 100644
index 735cb806083a..000000000000
--- a/bin/pyntlm_auth/app.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from threading import Thread
-
-import t_api
-import t_sdnotify
-
-if __name__ == '__main__':
-    t1 = Thread(target=t_api.api)
-    t2 = Thread(target=t_sdnotify.sd_notify)
-    t1.start()
-    t2.start()
-    t1.join()
-    t2.join()
diff --git a/bin/pyntlm_auth/config_loader.py b/bin/pyntlm_auth/config_loader.py
index 54f7a00beefa..6c3bf4fd20d8 100644
--- a/bin/pyntlm_auth/config_loader.py
+++ b/bin/pyntlm_auth/config_loader.py
@@ -15,9 +15,9 @@ def expand_machine_account_list():
-    r = [global_vars.c_username]
+    r = [global_vars.s_computer_account_base]
 
-    m = global_vars.c_username.replace("$", "")
+    m = global_vars.s_computer_account_base.replace("$", "")
 
     for i in range(global_vars.c_additional_machine_accounts):
         r.append(f"{m}-{i}$")
@@ -381,3 +381,16 @@ def config_load():
     global_vars.c_db_pass = c_db_pass
     global_vars.c_db = c_db
     global_vars.c_db_unix_socket = c_db_unix_socket
+
+    global_vars.s_computer_account_base = username
+
+
+def reload_worker_config():
+    computer_account = global_vars.s_bind_account.replace("$", "")
+
+    global_vars.c_username = computer_account.upper() + "$"
+    global_vars.c_netbios_name = computer_account.upper()
+    global_vars.c_workstation = computer_account.upper()
+    global_vars.c_server_string = computer_account
+
+    global_vars.s_password_ro = global_vars.c_password
diff --git a/bin/pyntlm_auth/entrypoint.py b/bin/pyntlm_auth/entrypoint.py
index a802171cb1f1..8881d00729f5 100644
--- a/bin/pyntlm_auth/entrypoint.py
+++ b/bin/pyntlm_auth/entrypoint.py
@@ -1,7 +1,9 @@
 import logging
 import os
 import time
+from threading import Thread
 
+import pymysql
 from flask import Flask, g, request
 from flaskext.mysql import MySQL
 
@@ -9,6 +11,9 @@
 import global_vars
 import handlers
 
+import t_health_checker
+import t_async_job
+
 app = Flask(__name__)
 
 time.sleep(1)
@@ -20,12 +25,23 @@
 while True:
     m = config_loader.bind_machine_account(worker_pid)
     if m is not None:
+        global_vars.s_bind_account = m
         break
-    print(f"---- worker {worker_pid} failed to bind machine account: no available accounts, retrying.")
+    global_vars.s_worker.log.warning("failed to bind machine account: no available accounts, retrying.")
     time.sleep(1)
 
-print(f"---- worker {worker_pid} successfully registered with machine account '{m}', ready to handle requests.")
+config_loader.reload_worker_config()
+global_vars.s_worker.log.info(f"successfully registered with machine account '{m}', ready to handle requests.")
+
+flask_jobs = (
+    Thread(target=t_async_job.async_auth, daemon=False, args=(global_vars.s_worker,)),
+    Thread(target=t_async_job.async_test, daemon=False, args=(global_vars.s_worker,)),
+    Thread(target=t_health_checker.health_check, daemon=True, args=(global_vars.s_worker,))
+)
+
+for job in flask_jobs:
+    job.start()
 
 werkzeug_logger = logging.getLogger('werkzeug')
 
@@ -78,7 +94,7 @@ def teardown_request(exception=None):
 app.route('/ntlm/auth', methods=['POST'])(handlers.ntlm_auth_handler)
 app.route('/ntlm/expire', methods=['POST'])(handlers.ntlm_expire_handler)
 app.route('/event/report', methods=['POST'])(handlers.event_report_handler)
 app.route('/ntlm/connect', methods=['GET'])(handlers.ntlm_connect_handler)
-app.route('/ntlm/connect', methods=['POST'])(handlers.test_password_handler)
+app.route('/ntlm/connect', methods=['POST'])(handlers.ntlm_connect_handler_with_password)
 app.route('/ping', methods=['GET'])(handlers.ping_handler)
 
 if __name__ == '__main__':
diff --git a/bin/pyntlm_auth/global_vars.py b/bin/pyntlm_auth/global_vars.py
index b56a8fe7d3d3..296cfb32eafb 100644
--- a/bin/pyntlm_auth/global_vars.py
+++ b/bin/pyntlm_auth/global_vars.py
@@ -29,6 +29,11 @@ def get_value(key, value=None):
 
 s_lock = threading.Lock()
 
+s_worker = None  # gunicorn worker object.
+s_bind_account = None  # machine account bound to a specific worker
+s_computer_account_base = None
+s_password_ro = None  # machine account password loaded from config file
+
 # config for domain.conf - AD
 c_netbios_name = None
 c_realm = None
@@ -56,10 +61,9 @@ def get_value(key, value=None):
 c_nt_key_cache_enabled = None
 c_nt_key_cache_expire = None
 
-c_ad_account_lockout_threshold = 0  # 0..999. Default=0, never locks
-c_ad_account_lockout_duration = None  # Default not set
-c_ad_reset_account_lockout_counter_after = None  # Default not set
-c_ad_old_password_allowed_period = None  # Windows 2003+, Default not set, if not set, 60
+c_ad_account_lockout_threshold = 0                  # 0..999. Default=0, never locks
+c_ad_account_lockout_duration = None                # Default not set
+c_ad_reset_account_lockout_counter_after = None     # Default not set
+c_ad_old_password_allowed_period = None             # Windows 2003+, Default not set, if not set, 60
 
 c_max_allowed_password_attempts_per_device = None
-
diff --git a/bin/pyntlm_auth/gunicorn.conf.py b/bin/pyntlm_auth/gunicorn.conf.py
index d5aeeeb1c017..33ab0c2bfdf6 100644
--- a/bin/pyntlm_auth/gunicorn.conf.py
+++ b/bin/pyntlm_auth/gunicorn.conf.py
@@ -4,13 +4,13 @@
 
 import config_loader
 import global_vars
+import t_async_job
+import t_health_checker
 import t_sdnotify
 import t_worker_register
 
 NAME = "NTLM Auth API"
 
-config_loader.config_load()
-
 try:
     LISTEN = os.getenv("LISTEN")
     bind_port = int(LISTEN)
@@ -21,16 +21,17 @@
     print(f"failed to extract parameter 'LISTEN' from environment variable: {str(e)}. {NAME} terminated.")
     sys.exit(1)
 
+config_loader.config_load()
 worker_num = global_vars.c_additional_machine_accounts + 1
 
 wsgi_app = 'entrypoint:app'
 
-bind = f"127.0.0.1:{bind_port}"
+bind = f"0.0.0.0:{bind_port}"
 backlog = 2048
 
 workers = worker_num
 worker_class = 'sync'  # use sync, do not use 'gevent' or 'eventlet' due to blocking operations.
 timeout = 30
-graceful_timeout = 30
+graceful_timeout = 10
 
 accesslog = '-'  # to stdout
 errorlog = '-'  # to stdout/err
@@ -40,7 +41,7 @@
 access_log_format = '%(h)s %(l)s %(u)s %(p)s %(t)s "%(r)s" %(s)s %(b)s "%(f)s" "%(a)s"'
 keepalive = 2
 
-max_requests = 100000
+max_requests = 10000
 max_requests_jitter = 50
 
 daemon = False
@@ -54,10 +55,23 @@
 reload = True  # reload apps when the source code changes, for debugging purposes only.
 
 
+def on_exit(server):
+    server.log.info("master process on-exit clean up started...")
+    server.log.info("master process on-exit clean up done...")
+
+
+def worker_exit(server, worker):
+    worker.log.info("worker process pre-exit clean up started, sending thread stop event...")
+
+    t_async_job.stop_event.set()
+    t_health_checker.stop_event.set()
+    t_worker_register.stop_event.set()
+    t_sdnotify.stop_event.set()
+
+
 def post_fork(server, worker):
-    master_pid = os.getppid()
-    worker_pid = os.getpid()
-    worker.log.info(f" worker spawned with PID of {worker_pid} by master process {master_pid}")
+    worker.log.info(f"post fork hook: worker spawned with PID {worker.pid} by master {server.pid}")
+    global_vars.s_worker = worker
 
     background_jobs = (
         Thread(target=t_worker_register.primary_worker_register, daemon=True, args=(worker,)),
diff --git a/bin/pyntlm_auth/handlers.py b/bin/pyntlm_auth/handlers.py
index 7eb35c8b116b..41bf332fa664 100644
--- a/bin/pyntlm_auth/handlers.py
+++ b/bin/pyntlm_auth/handlers.py
@@ -1,17 +1,21 @@
-from http import HTTPStatus
-from flask import Flask, request, g
-import ncache
-import re
-import hashlib
 import binascii
-import json
-import utils
-import ms_event
+import hashlib
+import re
+import time
+from http import HTTPStatus
+
+import redis
+from flask import request, g
+from samba import ntstatus
+
+import config_loader
+import flags
 import global_vars
+import ms_event
+import ncache
+import redis_client
 import rpc
-import flags
-from samba import param, NTSTATUSError, ntstatus
 
 # For NTSTATUS, see:
 # https://github.com/samba-team/samba/blob/master/libcli/util/ntstatus_err_table.txt
@@ -42,48 +46,201 @@ def ping_handler():
 
 
 def ntlm_connect_handler():
-    with global_vars.s_lock:
-        global_vars.s_reconnect_id = global_vars.s_connection_id
+    machine_accounts = config_loader.expand_machine_account_list()
 
-    global_vars.s_secure_channel_connection, global_vars.s_machine_cred, global_vars.s_connection_id, error_code, error_message = rpc.get_secure_channel_connection()
+    mapping, code, msg = _build_machine_account_bind_mapping(machine_accounts)
+    if code != 0:
+        return msg, code
 
-    if error_code == ntstatus.NT_STATUS_ACCESS_DENIED:
-        return "Test machine account failed. Access Denied", HTTPStatus.UNAUTHORIZED
-    if error_code != 0:
-        return "Error while establishing secure channel connection: " + error_message, HTTPStatus.INTERNAL_SERVER_ERROR
+    job_id, code, msg = _submit_machine_account_test_job(machine_accounts)
+    if code != 0:
+        return msg, code
 
-    return "OK", HTTPStatus.OK
+    results = _poll_machine_account_test_job_results(job_id, machine_accounts)
+    return _aggregate_results(results, machine_accounts)
 
 
-def test_password_handler():
+def ntlm_connect_handler_with_password():
     data = request.get_json()
     if data is None:
-        return 'No JSON payload found in request', HTTPStatus.BAD_REQUEST
+        return "Invalid JSON payload: decoding failed\n", HTTPStatus.BAD_REQUEST
+
     if 'password' not in data:
-        return 'Invalid JSON payload format, missing required key: password', HTTPStatus.UNPROCESSABLE_ENTITY
+        return "Invalid JSON payload: missing required field 'password'\n", HTTPStatus.UNPROCESSABLE_ENTITY
 
-    test_password = data['password'].strip()
+    nt_hash = ""
+    password = data['password'].strip()
+    if password:
+        if re.search(r'^[a-fA-F0-9]{32}$', password):
+            nt_hash = password
+        else:
+            nt4_digest = hashlib.new('md4', password.encode('utf-16le')).digest()
+            nt_hash = binascii.hexlify(nt4_digest).decode('utf-8')
 
-    if re.search(r'^[a-fA-F0-9]{32}$', test_password):
-        nt_hash = test_password
-    else:
-        nt4_digest = hashlib.new('md4', test_password.encode('utf-16le')).digest()
-        nt_hash = binascii.hexlify(nt4_digest).decode('utf-8')
+    machine_account = ""
+    if ('machine_account' in data) and data["machine_account"].strip():
+        machine_account = data["machine_account"].strip()
+        if "$" not in machine_account:
+            machine_account = f"{machine_account}$"
+
+    machine_accounts = config_loader.expand_machine_account_list()
+    if machine_account and (machine_account not in machine_accounts):
+        msg = f"Machine account '{machine_account}' does not match current machine account name patterns"
+        return msg, HTTPStatus.NOT_FOUND
+
+    if machine_account:
+        machine_accounts = [machine_account]
+
+    mapping, code, msg = _build_machine_account_bind_mapping(machine_accounts)
+    if code != 0:
+        return msg, code
+
+    job_id, code, msg = _submit_machine_account_test_job(machine_accounts, nt_hash)
+    if code != 0:
+        return msg, code
+
+    results = _poll_machine_account_test_job_results(job_id, machine_accounts)
+    return _aggregate_results(results, machine_accounts)
+
+
+def _build_machine_account_bind_mapping(machine_accounts):
+    mapping = {}
+
+    for m in machine_accounts:
+        key = f"{redis_client.namespace}:machine-account-bind:{m}"
+
+        try:
+            res = redis_client.r.get(name=key)
+            if res is None:
+                msg = f"error fetching machine account binding. machine account test failed: no binding for account {m}"
+                code = HTTPStatus.INTERNAL_SERVER_ERROR
+                return {}, code, msg
+            else:
+                mapping[m] = res
+
+        except redis.ConnectionError:
+            msg = "error fetching machine account binding. machine account test failed: redis connection error."
+            code = HTTPStatus.INTERNAL_SERVER_ERROR
+            return {}, code, msg
+
+        except Exception as e:
+            msg = f"error fetching machine account binding. machine account test failed: {str(e)}."
+            code = HTTPStatus.INTERNAL_SERVER_ERROR
+            return {}, code, msg
+
+    return mapping, 0, ""
 
-    global_vars.c_password = nt_hash
 
-    with global_vars.s_lock:
-        global_vars.s_reconnect_id = global_vars.s_connection_id
+def _submit_machine_account_test_job(machine_accounts, password=""):
+    job_id = time.time()
 
-    global_vars.s_secure_channel_connection, global_vars.s_machine_cred, global_vars.s_connection_id, error_code, error_message = rpc.get_secure_channel_connection()
+    for m in machine_accounts:
+        key = f"{redis_client.namespace}:async-test:jobs:{m}"
 
-    if error_code == ntstatus.NT_STATUS_ACCESS_DENIED:
-        return "Test machine account failed. Access Denied", HTTPStatus.UNAUTHORIZED
-    if error_code != 0:
-        return "Error while establishing secure channel connection: " + error_message, HTTPStatus.INTERNAL_SERVER_ERROR
+        if password.strip() == "":
+            value = job_id
+        else:
+            value = f"{job_id}:{password.strip()}"
 
-    return "OK", HTTPStatus.OK
+        try:
+            redis_client.r.lpush(key, value)
+
+        except redis.ConnectionError:
+            msg = "error submitting machine account test job: redis connection error."
+            code = HTTPStatus.INTERNAL_SERVER_ERROR
+            return None, code, msg
+
+        except Exception as e:
+            msg = f"error submitting machine account test job: {str(e)}."
+            code = HTTPStatus.INTERNAL_SERVER_ERROR
+            return None, code, msg
+
+    return job_id, 0, ""
+
+
+def _poll_machine_account_test_job_results(job_id, machine_accounts):
+    exp_time = job_id + 2
+    results = {}
+
+    while time.time() < exp_time:
+        time.sleep(0.3)
+
+        for m in machine_accounts:
+            if m in results and results[m]["status"] in ("OK", "Failed"):
+                continue
+
+            key = f"{redis_client.namespace}:async-test:results:{job_id}:{m}"
+
+            try:
+                res = redis_client.r.get(key)
+
+                if res is None:
+                    continue
+
+                if res == "OK":
+                    results[m] = {"status": "OK", "reason": None}
+                    continue
+
+                if isinstance(res, str) and res != "OK":
+                    results[m] = {"status": "Failed", "reason": res}
+                    continue
+
+            except redis.ConnectionError:
+                results[m] = {"status": "Exception", "reason": "redis connection issue when fetching job result"}
+            except Exception as e:
+                results[m] = {"status": "Exception", "reason": f"redis error '{str(e)}' when fetching job result"}
+    return results
+
+
+def _aggregate_results(results, machine_accounts):
+    timeout = []
+    successful = []
+    failed = []
+    exception = []
+
+    for m in machine_accounts:
+        if m not in results:
+            timeout.append(m)
+            continue
+
+        if results[m]["status"] == "OK":
+            successful.append(m)
+            continue
+
+        if results[m]["status"] == "Failed":
+            failed.append(m)
+            continue
+
+        if results[m]["status"] == "Exception":
+            exception.append(m)
+            continue
+
+    if (not timeout) and (not failed) and (not exception):
+        return "OK", HTTPStatus.OK
+
+    s_successful = ""
+    if successful:
+        s_successful = "successful: " + ", ".join(successful) + "\n"
+
+    s_timeout = ""
+    if timeout:
+        s_timeout = "timeout: " + ", ".join(timeout) + "\n"
+
+    s_failed = ""
+    if failed:
+        s_failed = "failed:\n" + \
+                   "\n".join(f"  {i}: {results[i]['status']}: {results[i]['reason']}" for i in failed) + \
+                   "\n"
+
+    s_exception = ""
+    if exception:
+        s_exception = "exceptions:\n" + \
+                      "\n".join(f"  {i}: {results[i]['status']}: {results[i]['reason']}" for i in exception) + \
+                      "\n"
+
+    msg = f"machine account test (partially) failed:\n{s_successful}{s_timeout}{s_failed}{s_exception}"
+    return f"{msg}\n", HTTPStatus.UNPROCESSABLE_ENTITY
 
 
 def ntlm_auth_handler():
@@ -162,4 +319,3 @@ def ntlm_expire_handler():
             return "OK", HTTPStatus.OK
     except Exception as e:
         return f"Error processing JSON payload, {str(e)}", HTTPStatus.UNPROCESSABLE_ENTITY
-
diff --git a/bin/pyntlm_auth/t_api.py b/bin/pyntlm_auth/t_api.py
deleted file mode 100644
index e38e756e6e65..000000000000
--- a/bin/pyntlm_auth/t_api.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import logging
-
-import pymysql
-from flask import Flask, g, request
-from flaskext.mysql import MySQL
-
-import config_loader
-import global_vars
-import handlers
-
-
-def api():
-    config_loader.config_load()
-
-    app = Flask(__name__)
-
-    werkzeug_logger = logging.getLogger('werkzeug')
-
-    @app.before_request
-    def register_logger():
-        if request.path.startswith("/ping"):
-            werkzeug_logger.setLevel(logging.CRITICAL)
-        else:
-            werkzeug_logger.setLevel(logging.INFO)
-
-    for i in range(1):
-        if not global_vars.c_nt_key_cache_enabled:
-            break
-
-        c_db_port, err = config_loader.get_int_value(global_vars.c_db_port)
-        if err is not None:
-            global_vars.c_nt_key_cache_enabled = False
-            break
-
-        app.config['MYSQL_DATABASE_HOST'] = global_vars.c_db_host
-        app.config['MYSQL_DATABASE_PORT'] = int(global_vars.c_db_port)
-        app.config['MYSQL_DATABASE_USER'] = global_vars.c_db_user
-        app.config['MYSQL_DATABASE_PASSWORD'] = global_vars.c_db_pass
-        app.config['MYSQL_DATABASE_DB'] = global_vars.c_db
-        app.config['MYSQL_DATABASE_CHARSET'] = 'utf8mb4'
-        app.config['MYSQL_DATABASE_SOCKET'] = global_vars.c_db_unix_socket
-
-        mysql = MySQL(autocommit=True, cursorclass=pymysql.cursors.DictCursor)
-        mysql.init_app(app)
-
-    @app.before_request
-    def before_request():
-        try:
-            g.db = mysql.get_db().cursor()
-        except Exception as e:
-            e_code = e.args[0]
-            e_msg = str(e)
-            print(f" error while init database: {e_code}, {e_msg}. Started without NT Key cache capability.")
-
-    @app.teardown_request
-    def teardown_request(exception=None):
-        if hasattr(g, 'db'):
-            g.db.close()
-
-    app.route('/ntlm/auth', methods=['POST'])(handlers.ntlm_auth_handler)
-    app.route('/ntlm/expire', methods=['POST'])(handlers.ntlm_expire_handler)
-    app.route('/event/report', methods=['POST'])(handlers.event_report_handler)
-    app.route('/ntlm/connect', methods=['GET'])(handlers.ntlm_connect_handler)
-    app.route('/ntlm/connect', methods=['POST'])(handlers.test_password_handler)
-    app.route('/ping', methods=['GET'])(handlers.ping_handler)
-
-    app.run(threaded=True, host='0.0.0.0', port=int(global_vars.c_listen_port))
diff --git a/bin/pyntlm_auth/t_async_job.py b/bin/pyntlm_auth/t_async_job.py
new file mode 100644
index 000000000000..54921d65f9c6
--- /dev/null
+++ b/bin/pyntlm_auth/t_async_job.py
@@ -0,0 +1,103 @@
+import datetime
+import time
+from threading import Event
+
+import redis
+
+import global_vars
+import redis_client
+import rpc
+
+stop_event = Event()
+
+
+def async_auth(worker):
+    key = f"{redis_client.namespace}:async-auth:{worker.pid}"
+
+    while not stop_event.is_set():
+        try:
+            res = redis_client.r.lpop(name=key)
+            if res is None:
+                time.sleep(0.5)
+            else:
+                worker.log.info(f"Data is: {res}")
+
+        except redis.ConnectionError:
+            worker.log.warning(f"failed fetching async auth job: key = '{key}': redis connectivity issue.")
+            time.sleep(1)
+        except Exception as e:
+            worker.log.warning(f"failed fetching async auth job: key = '{key}': error = {str(e)}")
+            time.sleep(1)
+    worker.log.info("Thread 'async_auth' is done.")
+
+
+def async_test(worker):
+    bind_account = global_vars.s_bind_account
+    key = f"{redis_client.namespace}:async-test:jobs:{bind_account}"
+
+    while not stop_event.is_set():
+        try:
+            res = redis_client.r.rpop(name=key)
+            if res is None:
+                time.sleep(0.5)
+            else:
+                s = res.split(":", 1)
+                job_time = s[0]
+                password = ""
+                if len(s) == 2:
+                    password = s[1]
+
+                try:
+                    job_time_f = float(job_time)
+                    if job_time_f + 2 < time.time():
+                        job_time_fmt = datetime.datetime.fromtimestamp(job_time_f).strftime("%Y-%m-%d %H:%M:%S.%f")
+                        worker.log.warning(f"stale job submitted at {job_time_fmt}, dropped. payload: {res}")
+                    else:
+                        worker.log.info(f"handling machine account test for: {bind_account} with password '{password}'")
+                        _test_schannel(job_time, bind_account, password)
+                except Exception as e:
+                    worker.log.warning(f"cannot convert job_id '{job_time}' to float: {str(e)}, payload: {res}")
+
+        except redis.ConnectionError:
+            worker.log.warning(f"failed fetching async test job: key = '{key}': redis connectivity issue.")
+            time.sleep(1)
+        except Exception as e:
+            worker.log.warning(f"failed fetching async test job: key = '{key}': error = {str(e)}")
+            time.sleep(1)
+    worker.log.info("Thread 'async_test' is done.")
+
+
+def _test_schannel(job_id, machine_account, password=""):
+    if not password:
+        password = global_vars.s_password_ro
+
+    with global_vars.s_lock:
+        global_vars.s_reconnect_id = global_vars.s_connection_id
+        global_vars.c_password = password
+
+    (
+        global_vars.s_secure_channel_connection,
+        global_vars.s_machine_cred,
+        global_vars.s_connection_id,
+        error_code, error_message
+    ) = rpc.get_secure_channel_connection()
+
+    with global_vars.s_lock:
+        global_vars.c_password = global_vars.s_password_ro
+
+    if error_code == 0:
+        result = "OK"
+    else:
+        result = f"error code: {error_code}, error message: {error_message}"
+        # typically, we'll get an NT_STATUS_ACCESS_DENIED error if the password is wrong.
+
+    key = f"{redis_client.namespace}:async-test:results:{job_id}:{machine_account}"
+
+    try:
+        redis_client.r.set(name=key, value=result, ex=5)
+    except redis.ConnectionError:
+        msg = f"redis connection error occurred when writing async test job (id = '{job_id}') results: {result}"
+        global_vars.s_worker.log.warning(msg)
+    except Exception as e:
+        msg = f"error '{str(e)}' occurred when writing async test job (id = '{job_id}') results: {result}"
+        global_vars.s_worker.log.warning(msg)
diff --git a/bin/pyntlm_auth/t_health_checker.py b/bin/pyntlm_auth/t_health_checker.py
new file mode 100644
index 000000000000..24f3bc196a24
--- /dev/null
+++ b/bin/pyntlm_auth/t_health_checker.py
@@ -0,0 +1,11 @@
+import time
+from threading import Event
+
+stop_event = Event()
+
+
+# we'll do health checks and schannel keep-alive here.
+def health_check(worker):
+    while not stop_event.is_set():
+        print(f"health check on {worker.pid}")
+        time.sleep(30)
diff --git a/bin/pyntlm_auth/t_sdnotify.py b/bin/pyntlm_auth/t_sdnotify.py
index b9928c1e8452..879fe58fb7bc 100644
--- a/bin/pyntlm_auth/t_sdnotify.py
+++ b/bin/pyntlm_auth/t_sdnotify.py
@@ -1,8 +1,9 @@
 import time
+from threading import Event
 
 import sdnotify
 
-done = False
+stop_event = Event()
 
 
 def sd_notify(worker):
@@ -10,9 +11,7 @@ def sd_notify(worker):
     n.notify("READY=1")
 
     count = 0
-    while True:
-        if done is True:
-            break
+    while not stop_event.is_set():
 
         if count % 30 == 0:
             message = "WATCHDOG=1"
diff --git a/bin/pyntlm_auth/t_worker_register.py b/bin/pyntlm_auth/t_worker_register.py
index 047a027351ce..290b3168f78e 100644
--- a/bin/pyntlm_auth/t_worker_register.py
+++ b/bin/pyntlm_auth/t_worker_register.py
@@ -1,31 +1,26 @@
-import os
 import time
+from threading import Event
 
-worker_pid = os.getpid()
+import redis
 
 import redis_client
-import redis
 
-done = False
+stop_event = Event()
 
 
 def primary_worker_register(worker):
-    while True:
-        if done is True:
-            break
-
-        key = f"{redis_client.namespace}:primary_worker"
-
+    key = f"{redis_client.namespace}:primary_worker"
+    while not stop_event.is_set():
         try:
-            res = redis_client.r.set(name=key, value=worker_pid, nx=True, ex=5, get=True)
+            res = redis_client.r.set(name=key, value=worker.pid, nx=True, ex=5, get=True)
             if res is None:
-                print(f"primary worker is registered on PID: {worker_pid}.")
-            if str(worker_pid) == res:
+                worker.log.info(f"primary worker is registered on PID: {worker.pid}.")
+            if str(worker.pid) == res:
                 redis_client.r.expire(name=key, time=10, xx=True, gt=True)
         except redis.ConnectionError:
-            print("failed while trying to register primary worker: redis connection error.")
+            worker.log.warning("failed registering primary worker: redis connection error.")
        except Exception as e:
-            print(f"failed while trying to register primary worker: {str(e)}")
+            worker.log.warning(f"failed registering primary worker: {str(e)}")
 
         time.sleep(2)
diff --git a/containers/ntlm-auth-api/Dockerfile b/containers/ntlm-auth-api/Dockerfile
index 6eec15a8283f..5509152c9bda 100644
--- a/containers/ntlm-auth-api/Dockerfile
+++ b/containers/ntlm-auth-api/Dockerfile
@@ -7,7 +7,7 @@ WORKDIR /usr/local/pf/
 COPY bin bin
 
 RUN apt-get -qq update && \
-    apt-get -yqq install python3-pip python3-pymysql python3-sdnotify python3-tz python3-dev
+    apt-get -yqq install python3-pip python3-pymysql python3-sdnotify python3-tz python3-dev gunicorn3 python3-psutil
 
 RUN VER=`python3 -c 'import sys; val=sys.version_info;print(str(val.major)+"."+str(val.minor))'` ; \
     sudo rm -rf /usr/lib/python$VER/EXTERNALLY-MANAGED && \
@@ -15,4 +15,4 @@ RUN VER=`python3 -c 'import sys; val=sys.version_info;print(str(val.major)+"."+s
 
 COPY addons/ntlm-auth-api/openssl.cnf /usr/lib/ssl/openssl.cnf
 
-ENTRYPOINT /usr/bin/python3 /usr/local/pf/bin/pyntlm_auth/app.py
+ENTRYPOINT /usr/bin/gunicorn -c /usr/local/pf/bin/pyntlm_auth/gunicorn.conf.py
diff --git a/docs/installation/performance_optimizations.asciidoc b/docs/installation/performance_optimizations.asciidoc
index 876cd8e77b29..414e1ca7525f 100644
--- a/docs/installation/performance_optimizations.asciidoc
+++ b/docs/installation/performance_optimizations.asciidoc
@@ -18,6 +18,74 @@ endif::[]
 
 //== Performance Optimizations
 
+
+=== Multi Machine Account Support ===
+
+Starting with PacketFence 14.1, the ntlm-auth-api backend can process NTLM requests in a multi-process model.
+Previously, only one machine account was created, and each new NTLM auth request had to wait until the previous one
+finished. That single-threaded model can hurt performance in heavily loaded scenarios, e.g., a huge enterprise with
+a giant Windows AD database and many devices.
+
+Now PacketFence can create multiple machine accounts and register each of them with a dedicated process when the
+NTLM auth API starts.
+
+To enable this feature, set `additional_machine_accounts` to a non-zero value; the corresponding machine accounts
+will be created based on the existing machine account name.
+
+The naming pattern for additional machine accounts is `base_machine_account_name-N`, where N ranges from 0 to 9.
+E.g., if the machine account you created before was NODE-PF and `additional_machine_accounts` is set to 2, then 2
+additional accounts named `NODE-PF-0` and `NODE-PF-1` will be created in Windows AD.
+
+NOTE: There's a hard limit on the number of characters in a machine account name. On Windows the limit is 14, so to
+enable this feature you must make sure the base machine account name is at most 12 characters long, leaving room for
+PacketFence to append the "-N" suffix.
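+
+The account name expansion itself is simple. Here is a simplified, self-contained sketch of the naming logic (the
+actual implementation is `expand_machine_account_list()` in `bin/pyntlm_auth/config_loader.py`, which reads the base
+name and count from `global_vars`):
+
+```
+def expand_machine_accounts(base_account, additional):
+    # base_account is the trailing-$ form used by Samba, e.g. "NODE-PF$"
+    accounts = [base_account]
+    stem = base_account.replace("$", "")
+    for i in range(additional):
+        accounts.append(f"{stem}-{i}$")
+    return accounts
+
+print(expand_machine_accounts("NODE-PF$", 2))
+# ['NODE-PF$', 'NODE-PF-0$', 'NODE-PF-1$']
+```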
+
+==== How does this work ====
+
+PacketFence regenerates and loads the configuration file and determines how many worker processes are needed to use
+all of the machine accounts; the gunicorn master process then launches that many workers to handle requests.
+Each worker takes a dedicated machine account to perform authentication.
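+
+The worker count is derived directly from the configuration. This is an abridged view of the relevant pieces of
+`bin/pyntlm_auth/gunicorn.conf.py` from this change:
+
+```
+config_loader.config_load()
+worker_num = global_vars.c_additional_machine_accounts + 1  # base account + N additional
+
+workers = worker_num
+worker_class = 'sync'  # Samba RPC calls block, so gevent/eventlet are not suitable
+
+
+def post_fork(server, worker):
+    # each freshly forked worker keeps a handle to itself and later binds
+    # one dedicated machine account in entrypoint.py
+    global_vars.s_worker = worker
+```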
+
+==== Limitations ====
+
+1. You cannot create more than 10 additional machine accounts per domain.
+2. The machine account name cannot exceed 12 characters if you want to enable `additional_machine_accounts`.
+3. If you use %h as the machine account name, the parsed value of %h also needs to be at most 12 characters long to
+enable additional machine accounts.
+
+==== Benchmarks and suggested settings ====
+
+We benchmarked this feature; here are some results for reference.
+The test ran on a 4-CPU, 32 GB DDR4 VM hosted on ESXi (CPU: Intel D2123-IT 2.2 GHz).
+
+We used ab (Apache Benchmark) directly against the NTLM auth API, with 2 additional machine accounts added
+(3 in total):
+
+```
+ab -n 20000 -c 1 -p ~/eapol_test/payload.admin-akam.json -T 'application/json' http://127.0.0.1:5002/ntlm/auth
+
+Requests per second:    448.22 [#/sec] (mean)
+Time per request:       2.231 [ms] (mean)
+Time per request:       2.231 [ms] (mean, across all concurrent requests)
+
+
+ab -n 20000 -c 2 -p ~/eapol_test/payload.admin-akam.json -T 'application/json' http://127.0.0.1:5002/ntlm/auth
+
+Requests per second:    721.03 [#/sec] (mean)
+Time per request:       2.774 [ms] (mean)
+Time per request:       1.387 [ms] (mean, across all concurrent requests)
+
+
+ab -n 20000 -c 3 -p ~/eapol_test/payload.admin-akam.json -T 'application/json' http://127.0.0.1:5002/ntlm/auth
+
+Requests per second:    932.21 [#/sec] (mean)
+Time per request:       3.218 [ms] (mean)
+Time per request:       1.073 [ms] (mean, across all concurrent requests)
+```
+
+
 === NT Key Caching ===
 
 NOTE: This section assumes that you already have an Active Directory domain configuration both in _Configuration -> Policies and Access Control -> Domains -> Active Directory Domains_ and _Configuration -> Policies and Access Control -> Authentication Sources_. If you don't, you need to first configure those. Refer to the appropriate sections of this guide for details on how to configure these two components.