diff --git a/setup.py b/setup.py
index 1e1e948..3893c3a 100644
--- a/setup.py
+++ b/setup.py
@@ -26,6 +26,7 @@
     },
     install_requires=[
         'internetarchive',
+        'urllib3==1.26.13',
         'docopt==0.6.2',
         'yt-dlp',
     ]
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 8c409d8..813271a 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,6 +1,14 @@
 import unittest
 import os
-from tubeup.utils import sanitize_identifier, check_is_file_empty
+import json
+from tubeup.utils import sanitize_identifier, check_is_file_empty, strip_ip_from_meta
+
+current_path = os.path.dirname(os.path.realpath(__file__))
+
+
+def get_testfile_path(name):
+    """Return the path of a file in the test_tubeup_files directory."""
+    return os.path.join(current_path, 'test_tubeup_files', name)
 
 
 class UtilsTest(unittest.TestCase):
@@ -48,3 +56,17 @@ def test_check_is_file_empty_when_file_doesnt_exist(self):
                 FileNotFoundError,
                 r"^Path 'file_that_doesnt_exist.txt' doesn't exist$"):
             check_is_file_empty('file_that_doesnt_exist.txt')
+
+    def test_strip_ip_from_meta(self):
+        with open(get_testfile_path(
+                'Mountain_3_-_Video_Background_HD_1080p-6iRV8liah8A.'
+                'info.json'), encoding='utf-8') as f:
+            original_text = f.read()
+
+        # strip_ip_from_meta() mutates its argument, so parse the text twice
+        # to keep an untouched copy to compare against.
+        original_meta = json.loads(original_text)
+        mod, new_meta = strip_ip_from_meta(json.loads(original_text))
+        self.assertTrue(mod)
+        self.assertNotEqual(original_meta, new_meta)
+        self.assertNotRegex(json.dumps(new_meta), r'36\.73\.93\.234')
diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py
index 7dd6801..98bdae7 100644
--- a/tubeup/TubeUp.py
+++ b/tubeup/TubeUp.py
@@ -10,7 +10,7 @@
 from internetarchive.config import parse_config_file
 from datetime import datetime
 from yt_dlp import YoutubeDL
-from .utils import (get_itemname, check_is_file_empty,
+from .utils import (get_itemname, check_is_file_empty, strip_ip_from_meta,
                     EMPTY_ANNOTATION_FILE)
 from logging import getLogger
 from urllib.parse import urlparse
@@ -324,6 +324,13 @@ def upload_ia(self, videobasename, custom_meta=None):
         with open(json_metadata_filepath, 'r', encoding='utf-8') as f:
             vid_meta = json.load(f)
 
+        # Redact IP addresses embedded in the metadata URLs; the file is
+        # rewritten (same encoding as the read above) only if something changed.
+        mod, new_meta = strip_ip_from_meta(vid_meta)
+        if mod:
+            with open(json_metadata_filepath, 'w', encoding='utf-8') as f:
+                json.dump(new_meta, f)
+
         # Exit if video download did not complete, don't upload .part files to IA
         for ext in ['*.part', '*.f303.*', '*.f302.*', '*.ytdl', '*.f251.*', '*.248.*', '*.f247.*', '*.temp']:
             if glob.glob(videobasename + ext):
diff --git a/tubeup/utils.py b/tubeup/utils.py
index bc12845..2be5b86 100644
--- a/tubeup/utils.py
+++ b/tubeup/utils.py
@@ -1,5 +1,6 @@
 import os
 import re
+from urllib.parse import urlparse, parse_qs, urlencode
 
 
 EMPTY_ANNOTATION_FILE = (''
@@ -29,3 +30,70 @@ def check_is_file_empty(filepath):
         return os.stat(filepath).st_size == 0
     else:
         raise FileNotFoundError("Path '%s' doesn't exist" % filepath)
+
+
+def strip_ip_from_url(url):
+    """
+    Strip occurrences of an IP address as found in path segments like in
+    /ip/1.2.3.4/ or in an "ip" query-parameter, like in ?ip=1.2.3.4
+
+    The path segment following /ip/ is replaced with REDACTED, while the
+    "ip" query-parameter is removed. The query string is only re-encoded
+    when it actually contains an "ip" parameter.
+
+    :param url:  URL string to clean.
+    :return:     The URL with the IP address stripped.
+    """
+    u = urlparse(url)
+    u = u._replace(path=re.sub(r'/ip/[^/]+', r'/ip/REDACTED', u.path))
+    if u.query != '':
+        # keep_blank_values: otherwise re-encoding the query would silently
+        # drop unrelated parameters with an empty value (e.g. "?a=&ip=x")
+        qs = parse_qs(u.query, keep_blank_values=True)
+        if qs.pop('ip', None) is not None:
+            u = u._replace(query=urlencode(qs, True))
+    return u.geturl()
+
+
+def _redact_url_fields(container, fields):
+    """
+    Run strip_ip_from_url() over the given fields of a dict, updating the
+    dict in place.
+
+    :param container:  Dict whose URL fields should be cleaned.
+    :param fields:     Names of the keys to inspect.
+    :return:           True if at least one value was changed.
+    """
+    changed = False
+    for field in fields:
+        value = container.get(field)
+        # Missing keys and non-string values (e.g. None) hold no URL to clean
+        if not isinstance(value, str):
+            continue
+        redacted_url = strip_ip_from_url(value)
+        if redacted_url != value:
+            container[field] = redacted_url
+            changed = True
+    return changed
+
+
+def strip_ip_from_meta(meta):
+    """
+    Strip IP addresses from the URLs found in a video metadata dict.
+
+    The top-level 'url' and, for every entry of 'formats', the
+    'manifest_url', 'fragment_base_url' and 'url' fields are cleaned with
+    strip_ip_from_url(). The dict is modified in place.
+
+    :param meta:  Metadata dict, e.g. as loaded from an .info.json file.
+    :return:      Tuple (modified, meta); modified is True if at least
+                  one URL was changed.
+    """
+    modified = _redact_url_fields(meta, ['url'])
+    # Tolerate a missing or null 'formats' entry instead of raising
+    for _format in meta.get('formats') or []:
+        if _redact_url_fields(_format,
+                              ['manifest_url', 'fragment_base_url', 'url']):
+            modified = True
+
+    return modified, meta