Skip to main content

Improve the way we detect already-downloaded files

ID
0d77cb6
date
2024-02-15 21:51:07+00:00
author
Alex Chan <alex@alexwlchan.net>
parent
77e00da
message
Improve the way we detect already-downloaded files
changed files
2 files, 76 additions, 25 deletions

Changed files

web/save_youtube_videos.py (4282) → web/save_youtube_videos.py (4881)

diff --git a/web/save_youtube_videos.py b/web/save_youtube_videos.py
index c795e64..e74a884 100755
--- a/web/save_youtube_videos.py
+++ b/web/save_youtube_videos.py
@@ -84,6 +84,43 @@ def log_result(format_template):
     return decorator
 
 
+def classify_file_type(
+    video_id: str, filename: str
+) -> Literal["video", "info", "thumbnail"] | None:
+    """
+    Given an already-downloaded file, work out what sort of file it is.
+    """
+    if filename.endswith(".part"):
+        return None
+
+    if filename.endswith(
+        (
+            f"-{video_id}.mp4",
+            f"-{video_id}.webm",
+            f"-{video_id}.mkv",
+            f" [{video_id}].mp4",
+            f" [{video_id}].mkv",
+            f" [{video_id}].webm",
+        )
+    ):
+        return "video"
+
+    if filename.endswith(
+        (
+            f"-{video_id}.jpg",
+            f"-{video_id}.webp",
+            f" [{video_id}].jpg",
+            f" [{video_id}].webp",
+        )
+    ):
+        return "thumbnail"
+
+    if filename.endswith((f"-{video_id}.info.json", f" [{video_id}].info.json")):
+        return "info"
+
+    raise ValueError(f"Unrecognised filename: {filename}")
+
+
 @log_result("https://youtube.com/watch?v={video_id}")
 def download_video(*, video_id, download_root):
     uploader = get_uploader(video_id=video_id, db_path=download_root / "uploaders.db")
@@ -97,31 +134,15 @@ def download_video(*, video_id, download_root):
 
     # Look to see if this video has been downloaded before.  If it has, skip any
     # further processing.
-    matching_filenames = [
-        filename for filename in os.listdir(download_dir) if video_id in filename
-    ]
-
-    has_video = any(
-        f.endswith(
-            (
-                f"-{video_id}.mp4",
-                f"-{video_id}.webm",
-                f"-{video_id}.mkv",
-                f" [{video_id}].webm",
-            )
-        )
-        for f in matching_filenames
-    )
-
-    has_info = any(
-        f.endswith((f"-{video_id}.info.json", f" [{video_id}].info.json"))
-        for f in matching_filenames
-    )
-
-    has_thumbnail = any(
-        f.endswith((f"-{video_id}.jpg", f" [{video_id}].jpg"))
-        for f in matching_filenames
-    )
+    matching_filenames = {
+        filename: classify_file_type(video_id, filename)
+        for filename in os.listdir(download_dir)
+        if video_id in filename
+    }
+
+    has_video = "video" in matching_filenames.values()
+    has_info = "info" in matching_filenames.values()
+    has_thumbnail = "thumbnail" in matching_filenames.values()
 
     if has_video and has_thumbnail and has_info:
         return

web/test_save_youtube_videos.py (0) → web/test_save_youtube_videos.py (861)

diff --git a/web/test_save_youtube_videos.py b/web/test_save_youtube_videos.py
new file mode 100644
index 0000000..d16a7e3
--- /dev/null
+++ b/web/test_save_youtube_videos.py
@@ -0,0 +1,30 @@
+import pytest
+
+from save_youtube_videos import classify_file_type
+
+
+@pytest.mark.parametrize(
+    ["video_id", "filename", "file_type"],
+    [
+        ("3VvioE0ziPk", "Who is the loudest sea lion? [3VvioE0ziPk].mkv", "video"),
+        (
+            "TE8KMnGm2Xw",
+            "Warning, biters ! - A Factorio Short [TE8KMnGm2Xw].webp",
+            "thumbnail",
+        ),
+        ("AfsnHVaScjg", "Ravens can talk! [AfsnHVaScjg].info.json", "info"),
+        (
+            "X1ynZm1wI18",
+            "Rami Ismail, Vlambeer - XOXO Festival (2015)-X1ynZm1wI18.webp",
+            "thumbnail",
+        ),
+        (
+        "wGS53t8ZbO8",
+        "The World's Most Remote Buildings-wGS53t8ZbO8.f251.webm.part",
+        None
+
+        )
+    ],
+)
+def test_classify_file_type(video_id, filename, file_type):
+    assert classify_file_type(video_id, filename) == file_type