Skip to main content

Revert “Remove my yt-dlp.py script from this repo”

ID
9e40bda
date
2024-05-14 17:36:06+00:00
author
Alex Chan <alex@alexwlchan.net>
parent
046cf4d
message
Revert "Remove my yt-dlp.py script from this repo"

046cf4d4e4e0d51e8171686817ee004dc258f4cb
changed files
2 files, 124 additions, 1 deletion

Changed files

web/README.md (4722) → web/README.md (5225)

diff --git a/web/README.md b/web/README.md
index 3e4518a..ba7e757 100644
--- a/web/README.md
+++ b/web/README.md
@@ -71,6 +71,12 @@ scripts = [
         scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.<br/><br/><img src="really_useful_boxes.png">
         """
     },
+    {
+        "name": "yt-dlp.py",
+        "description": """
+        this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
+        """
+    },
 ]
 
 cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
@@ -157,5 +163,14 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
   <dd>
     scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.<br/><br/><img src="really_useful_boxes.png">
   </dd>
+
+  <dt>
+    <a href="https://github.com/alexwlchan/scripts/blob/main/web/yt-dlp.py">
+      <code>yt-dlp.py</code>
+    </a>
+  </dt>
+  <dd>
+    this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
+  </dd>
 </dl>
-<!-- [[[end]]] (checksum: 43e1e90a4f72b89531ee5307b85e6385) -->
+<!-- [[[end]]] (checksum: 54f08d9f43084b85f2d56d5caced4e10) -->

web/yt-dlp.py (0) → web/yt-dlp.py (3425)

diff --git a/web/yt-dlp.py b/web/yt-dlp.py
new file mode 100755
index 0000000..11202ff
--- /dev/null
+++ b/web/yt-dlp.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""
+This is a wrapper around yt-dlp that has a couple of special behaviours:
+
+*   It does parallel downloads for YouTube playlists, which is much
+    faster than vanilla yt-dlp.
+
+*   It enforces a couple of rules around downloading subtitles, to ensure
+    I always remember to download them in a consistent way.
+
+The goal is that this is a drop-in replacement for vanilla yt-dlp: if it
+downloads something, it downloads the exact same set of files.  You could
+copy any command that uses this script onto a machine running the regular
+tool and it would work as-is.  It might check extra rules or run faster,
+but it should never download something different to the regular tool.
+"""
+
+import os
+import subprocess
+import sys
+
+import hyperlink
+
+
+def is_youtube_playlist(url: str) -> bool:
+    """
+    Returns True if a YouTube URL is a playlist, False otherwise.
+    """
+    u = hyperlink.DecodedURL.from_text(url)
+    assert "youtube.com" in u.host
+    return bool(u.get("list"))
+
+
+def download_parallel_playlist(youtube_url: str, remaining_args: list[str]) -> None:
+    """
+    Download a YouTube playlist in parallel.
+
+    See https://alexwlchan.net/2020/how-to-do-parallel-downloads-with-youtube-dl/
+    """
+    get_ids_proc = subprocess.Popen(
+        [yt_dlp_path, "--get-id", youtube_url], stdout=subprocess.PIPE
+    )
+
+    subprocess.check_call(
+        ["xargs", "-I", "{}", "-P", "5", yt_dlp_path]
+        + remaining_args
+        + ["https://youtube.com/watch?v={}"],
+        stdin=get_ids_proc.stdout,
+    )
+
+    get_ids_proc.wait()
+
+
+def check_arguments(argv: list[str]) -> None:
+    """
+    Validate the arguments I'm using.
+
+    This will never modify the arguments, but it might give an error
+    message telling me to use arguments differently.
+    """
+    # I always want subtitles in srt format, so make sure that if I'm
+    # downloading subtitles I'm doing that conversion.
+    #
+    # I could do this after the fact, but it's slightly quicker to do
+    # it on the initial download, especially if I'm invoking `yt-dlp`
+    # with some sort of dynamic variable e.g. `pbpaste` or `furl`.
+    download_subtitle_args = (
+        "--write-sub",
+        "--write-subs",
+        "--write-auto-sub",
+        "--write-auto-subs",
+    )
+
+    if (
+        any(dsa in argv for dsa in download_subtitle_args)
+        and "--convert-subtitles=srt" not in argv
+    ):
+        sys.exit("Did you forget to add --convert-subtitles=srt?")
+
+
+if __name__ == "__main__":
+    argv = sys.argv[1:]
+
+    check_arguments(argv)
+
+    # Where is yt-dlp?
+    #
+    # sys.executable returns the path to the currently running Python,
+    # and we can go from there to get the path to yt-dlp.
+    yt_dlp_path = os.path.join(os.path.dirname(sys.executable), "yt-dlp")
+
+    # Look for a YouTube URL in the argument list.  If we don't find one,
+    # assume we're downloading some other source and call yt-dlp as usual.
+    youtube_url_matches = [a for a in argv if "youtube.com" in a]
+    remaining_args = [a for a in argv if "youtube.com" not in a]
+
+    if len(youtube_url_matches) != 1:
+        subprocess.check_call([yt_dlp_path] + argv)
+        sys.exit(0)
+
+    youtube_url = youtube_url_matches[0]
+
+    if is_youtube_playlist(youtube_url):
+        download_parallel_playlist(
+            youtube_url=youtube_url, remaining_args=remaining_args
+        )
+    else:
+        subprocess.check_call([yt_dlp_path] + argv)