Merge pull request #16 from alexwlchan/dont-save-youtube-description

ID

20ff716

date

2024-02-15 22:13:21+00:00

author

Alex Chan <alex@alexwlchan.net>

parents

047d549, 12f7ace

message

Merge pull request #16 from alexwlchan/dont-save-youtube-description

Improve my scripts for saving from YouTube

changed files

3 files, 84 additions, 35 deletions

.github/workflows/test.yml
web/save_youtube_videos.py
web/test_save_youtube_videos.py

Changed files

.github/workflows/test.yml (1233) → .github/workflows/test.yml (1281)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 6356a05..44a57af 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -45,3 +45,4 @@ jobs:
         py.test text/test_fix_twitter_thread.py
         py.test textexpander/test_get_mastodon_text.py
         py.test web/test_save_ao3_links.py
+        py.test web/test_save_youtube_videos.py

web/save_youtube_videos.py (4444) → web/save_youtube_videos.py (4983)

diff --git a/web/save_youtube_videos.py b/web/save_youtube_videos.py
index f82b811..5ae3104 100755
--- a/web/save_youtube_videos.py
+++ b/web/save_youtube_videos.py
@@ -10,6 +10,7 @@ import pathlib
 import subprocess
 import sys
 import textwrap
+from typing import Literal
 
 import hyperlink
 from sqlite_utils import Database
@@ -76,7 +77,8 @@ def log_result(format_template):
                 print(termcolor.colored(f"✘ {description}\n{wrapped_error}", "red"))
                 raise
             else:
-                print(termcolor.colored(f"✔ {description}", "green"))
+                if result == "downloaded":
+                    print(termcolor.colored(f"✔ {description}", "green"))
                 return result
 
         return wrapper
@@ -84,6 +86,43 @@ def log_result(format_template):
     return decorator
 
 
+def classify_file_type(
+    video_id: str, filename: str
+) -> Literal["video", "info", "thumbnail"] | None:
+    """
+    Given an already-downloaded file, work out what sort of file it is.
+    """
+    if filename.endswith(".part"):
+        return None
+
+    if filename.endswith(
+        (
+            f"-{video_id}.mp4",
+            f"-{video_id}.webm",
+            f"-{video_id}.mkv",
+            f" [{video_id}].mp4",
+            f" [{video_id}].mkv",
+            f" [{video_id}].webm",
+        )
+    ):
+        return "video"
+
+    if filename.endswith(
+        (
+            f"-{video_id}.jpg",
+            f"-{video_id}.webp",
+            f" [{video_id}].jpg",
+            f" [{video_id}].webp",
+        )
+    ):
+        return "thumbnail"
+
+    if filename.endswith((f"-{video_id}.info.json", f" [{video_id}].info.json")):
+        return "info"
+
+    raise ValueError(f"Unrecognised filename: {filename}")
+
+
 @log_result("https://youtube.com/watch?v={video_id}")
 def download_video(*, video_id, download_root):
     uploader = get_uploader(video_id=video_id, db_path=download_root / "uploaders.db")
@@ -97,38 +136,17 @@ def download_video(*, video_id, download_root):
 
     # Look to see if this video has been downloaded before.  If it has, skip any
     # further processing.
-    matching_filenames = [
-        filename for filename in os.listdir(download_dir) if video_id in filename
-    ]
-
-    has_video = any(
-        f.endswith(
-            (
-                f"-{video_id}.mp4",
-                f"-{video_id}.webm",
-                f"-{video_id}.mkv",
-                f" [{video_id}].webm",
-            )
-        )
-        for f in matching_filenames
-    )
-
-    has_description = any(
-        f.endswith((f"-{video_id}.description", f" [{video_id}].description"))
-        for f in matching_filenames
-    )
+    matching_filenames = {
+        filename: classify_file_type(video_id, filename)
+        for filename in os.listdir(download_dir)
+        if video_id in filename
+    }
 
-    has_info = any(
-        f.endswith((f"-{video_id}.info.json", f" [{video_id}].info.json"))
-        for f in matching_filenames
-    )
-
-    has_thumbnail = any(
-        f.endswith((f"-{video_id}.jpg", f" [{video_id}].jpg"))
-        for f in matching_filenames
-    )
+    has_video = "video" in matching_filenames.values()
+    has_info = "info" in matching_filenames.values()
+    has_thumbnail = "thumbnail" in matching_filenames.values()
 
-    if has_video and has_description and has_info:
+    if has_video and has_thumbnail and has_info:
         return
 
     # Construct the command.  The expensive bit is redownloading the
@@ -139,9 +157,6 @@ def download_video(*, video_id, download_root):
     if has_video:
         cmd.append("--skip-download")
 
-    if not has_description:
-        cmd.append("--write-description")
-
     if not has_info:
         cmd.append("--write-info-json")
 
@@ -151,6 +166,7 @@ def download_video(*, video_id, download_root):
     try:
         youtube_dl(*cmd, cwd=download_dir)
         print(download_dir)
+        return "downloaded"
     except subprocess.CalledProcessError as err:  # pragma: no cover
         print(f"Unable to download {video_url}: {err}", file=sys.stderr)
         raise
@@ -159,4 +175,7 @@ def download_video(*, video_id, download_root):
 if __name__ == "__main__":
     for url in sys.argv[1:]:
         video_id = get_video_id(url)
-        download_video(video_id=video_id, download_root=BACKUP_ROOT)
+        try:
+            download_video(video_id=video_id, download_root=BACKUP_ROOT)
+        except Exception:
+            pass

web/test_save_youtube_videos.py (0) → web/test_save_youtube_videos.py (874)

diff --git a/web/test_save_youtube_videos.py b/web/test_save_youtube_videos.py
new file mode 100644
index 0000000..c182a8a
--- /dev/null
+++ b/web/test_save_youtube_videos.py
@@ -0,0 +1,29 @@
+import pytest
+
+from save_youtube_videos import classify_file_type
+
+
+@pytest.mark.parametrize(
+    ["video_id", "filename", "file_type"],
+    [
+        ("3VvioE0ziPk", "Who is the loudest sea lion？ [3VvioE0ziPk].mkv", "video"),
+        (
+            "TE8KMnGm2Xw",
+            "Warning, biters ! - A Factorio Short [TE8KMnGm2Xw].webp",
+            "thumbnail",
+        ),
+        ("AfsnHVaScjg", "Ravens can talk! [AfsnHVaScjg].info.json", "info"),
+        (
+            "X1ynZm1wI18",
+            "Rami Ismail, Vlambeer - XOXO Festival (2015)-X1ynZm1wI18.webp",
+            "thumbnail",
+        ),
+        (
+            "wGS53t8ZbO8",
+            "The World's Most Remote Buildings-wGS53t8ZbO8.f251.webm.part",
+            None,
+        ),
+    ],
+)
+def test_classify_file_type(video_id, filename, file_type):
+    assert classify_file_type(video_id, filename) == file_type