Clean up URL-unsafe characters in filenames

ID

2234ced

date

2025-12-17 08:36:11+00:00

author

Alex Chan <alex@alexwlchan.net>

parent

1b907c1

message

Clean up URL-unsafe characters in filenames

Fixes #27

changed files

3 files, 37 additions, 1 deletion

.github/workflows/test.yml
test_yt-dlp_alexwlchan.py
yt-dlp_alexwlchan.py

Changed files

.github/workflows/test.yml (952) → .github/workflows/test.yml (952)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 60df6c2..f7216c0 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       matrix:
         python-version:
-          - "3.13"
+          - "3.14"
 
     runs-on: ubuntu-latest

test_yt-dlp_alexwlchan.py (1793) → test_yt-dlp_alexwlchan.py (2310)

diff --git a/test_yt-dlp_alexwlchan.py b/test_yt-dlp_alexwlchan.py
index 0194045..99e4ffa 100644
--- a/test_yt-dlp_alexwlchan.py
+++ b/test_yt-dlp_alexwlchan.py
@@ -32,6 +32,21 @@ def test_youtube_video() -> None:
     assert video_info["video_path"].endswith(" [TUQaGhPdlxs].mp4")
 
 
+def test_youtube_path_is_cleaned_up() -> None:
+    """
+    Paths of YouTube videos get cleaned up during the download.
+    """
+    video = download_video("https://www.youtube.com/shorts/eso8JB7q0a0")
+    assert (
+        video["title"]
+        == "3D Printing Everyday for 365 Days 176/365  #stem #3dprinting #3dprint #ideas #useful"
+    )
+    assert (
+        os.path.basename(video["video_path"])
+        == "3D Printing Everyday for 365 Days 176-365 stem 3dprinting 3dprint ideas useful [eso8JB7q0a0].mp4"
+    )
+
+
 def test_instagram_video() -> None:
     """
     Download an Instagram video and check we get the expected output.

yt-dlp_alexwlchan.py (6246) → yt-dlp_alexwlchan.py (6777)

diff --git a/yt-dlp_alexwlchan.py b/yt-dlp_alexwlchan.py
index 1d0912d..fc76ef8 100755
--- a/yt-dlp_alexwlchan.py
+++ b/yt-dlp_alexwlchan.py
@@ -3,6 +3,7 @@
 from datetime import datetime, timezone
 import json
 from pathlib import Path
+import re
 import subprocess
 import sys
 import tempfile
@@ -141,6 +142,24 @@ class VideoInfo(TypedDict):
     site: str
 
 
+def cleanup_paths(dir_path: Path) -> None:
+    """
+    For every file in `dir_path`, remove URL-unsafe characters from
+    the filenames.
+    """
+    for p in dir_path.iterdir():
+        old_name = p.name
+
+        new_name = p.name.replace("#", " ").replace("？", " ").replace("⧸", "-")
+        new_name = re.sub(r"\s+", " ", new_name)
+
+        if old_name == new_name:
+            continue
+
+        assert not (dir_path / new_name).exists(), new_name
+        p.move(dir_path / new_name)
+
+
 def download_video(url: str) -> VideoInfo:
     # Download all the videos to a temp directory; this allows the caller
     # to decide exactly where they want the video later.
@@ -150,6 +169,8 @@ def download_video(url: str) -> VideoInfo:
     with YoutubeDL(ydl_opts) as ydl:
         video_info: Any = ydl.extract_info(url)
 
+    cleanup_paths(tmp_dir)
+
     video_path = next(p for p in tmp_dir.iterdir() if p.suffix == ".mp4")
     thumbnail_path = next(p for p in tmp_dir.iterdir() if p.suffix == ".jpg")
     try: