Improve the way we detect already-downloaded files
- ID: 0d77cb6
- date: 2024-02-15 21:51:07+00:00
- author: Alex Chan <alex@alexwlchan.net>
- parent: 77e00da
- message: Improve the way we detect already-downloaded files
- changed files: 2 files, 76 additions, 25 deletions
Changed files
web/save_youtube_videos.py (4282) → web/save_youtube_videos.py (4881)
diff --git a/web/save_youtube_videos.py b/web/save_youtube_videos.py
index c795e64..e74a884 100755
--- a/web/save_youtube_videos.py
+++ b/web/save_youtube_videos.py
@@ -84,6 +84,43 @@ def log_result(format_template):
return decorator
+def classify_file_type(
+ video_id: str, filename: str
+) -> Literal["video", "info", "thumbnail"] | None:
+ """
+ Given an already-downloaded file, work out what sort of file it is.
+ """
+ if filename.endswith(".part"):
+ return None
+
+ if filename.endswith(
+ (
+ f"-{video_id}.mp4",
+ f"-{video_id}.webm",
+ f"-{video_id}.mkv",
+ f" [{video_id}].mp4",
+ f" [{video_id}].mkv",
+ f" [{video_id}].webm",
+ )
+ ):
+ return "video"
+
+ if filename.endswith(
+ (
+ f"-{video_id}.jpg",
+ f"-{video_id}.webp",
+ f" [{video_id}].jpg",
+ f" [{video_id}].webp",
+ )
+ ):
+ return "thumbnail"
+
+ if filename.endswith((f"-{video_id}.info.json", f" [{video_id}].info.json")):
+ return "info"
+
+ raise ValueError(f"Unrecognised filename: {filename}")
+
+
@log_result("https://youtube.com/watch?v={video_id}")
def download_video(*, video_id, download_root):
uploader = get_uploader(video_id=video_id, db_path=download_root / "uploaders.db")
@@ -97,31 +134,15 @@ def download_video(*, video_id, download_root):
# Look to see if this video has been downloaded before. If it has, skip any
# further processing.
- matching_filenames = [
- filename for filename in os.listdir(download_dir) if video_id in filename
- ]
-
- has_video = any(
- f.endswith(
- (
- f"-{video_id}.mp4",
- f"-{video_id}.webm",
- f"-{video_id}.mkv",
- f" [{video_id}].webm",
- )
- )
- for f in matching_filenames
- )
-
- has_info = any(
- f.endswith((f"-{video_id}.info.json", f" [{video_id}].info.json"))
- for f in matching_filenames
- )
-
- has_thumbnail = any(
- f.endswith((f"-{video_id}.jpg", f" [{video_id}].jpg"))
- for f in matching_filenames
- )
+ matching_filenames = {
+ filename: classify_file_type(video_id, filename)
+ for filename in os.listdir(download_dir)
+ if video_id in filename
+ }
+
+ has_video = "video" in matching_filenames.values()
+ has_info = "info" in matching_filenames.values()
+ has_thumbnail = "thumbnail" in matching_filenames.values()
if has_video and has_thumbnail and has_info:
return
web/test_save_youtube_videos.py (0) → web/test_save_youtube_videos.py (861)
diff --git a/web/test_save_youtube_videos.py b/web/test_save_youtube_videos.py
new file mode 100644
index 0000000..d16a7e3
--- /dev/null
+++ b/web/test_save_youtube_videos.py
@@ -0,0 +1,30 @@
+import pytest
+
+from save_youtube_videos import classify_file_type
+
+
+@pytest.mark.parametrize(
+ ["video_id", "filename", "file_type"],
+ [
+ ("3VvioE0ziPk", "Who is the loudest sea lion? [3VvioE0ziPk].mkv", "video"),
+ (
+ "TE8KMnGm2Xw",
+ "Warning, biters ! - A Factorio Short [TE8KMnGm2Xw].webp",
+ "thumbnail",
+ ),
+ ("AfsnHVaScjg", "Ravens can talk! [AfsnHVaScjg].info.json", "info"),
+ (
+ "X1ynZm1wI18",
+ "Rami Ismail, Vlambeer - XOXO Festival (2015)-X1ynZm1wI18.webp",
+ "thumbnail",
+ ),
+ (
+ "wGS53t8ZbO8",
+ "The World's Most Remote Buildings-wGS53t8ZbO8.f251.webm.part",
+ None
+
+ )
+ ],
+)
+def test_classify_file_type(video_id, filename, file_type):
+ assert classify_file_type(video_id, filename) == file_type