Skip to main content

Add a script for downloading YouTube videos

ID
06901a5
date
2024-02-15 08:32:20+00:00
author
Alex Chan <alex@alexwlchan.net>
parent
a116e29
message
Add a script for downloading YouTube videos
changed files
4 files, 220 additions, 2 deletions

Changed files

requirements.in (268) → requirements.in (328)

diff --git a/requirements.in b/requirements.in
index 2ff08a0..7f46249 100644
--- a/requirements.in
+++ b/requirements.in
@@ -8,6 +8,8 @@ datasette-render-image-tags
 flake8
 flickr-photos-api
 flickr-url-parser
+google-api-python-client==1.7.2
+google-auth-oauthlib==0.4.1
 httpx
 humanize
 hyperlink

requirements.txt (4085) → requirements.txt (4907)

diff --git a/requirements.txt b/requirements.txt
index 6f80336..b9af2e5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,6 +28,8 @@ brotli==1.1.0
     # via yt-dlp
 build==1.0.3
     # via pip-tools
+cachetools==5.3.2
+    # via google-auth
 certifi==2023.11.17
     # via
     #   httpcore
@@ -64,12 +66,27 @@ flickr-url-parser==1.7.1
     # via
     #   -r requirements.in
     #   flickr-photos-api
+google-api-python-client==1.7.2
+    # via -r requirements.in
+google-auth==2.27.0
+    # via
+    #   google-api-python-client
+    #   google-auth-httplib2
+    #   google-auth-oauthlib
+google-auth-httplib2==0.2.0
+    # via google-api-python-client
+google-auth-oauthlib==0.4.1
+    # via -r requirements.in
 h11==0.14.0
     # via
     #   httpcore
     #   uvicorn
 httpcore==1.0.2
     # via httpx
+httplib2==0.22.0
+    # via
+    #   google-api-python-client
+    #   google-auth-httplib2
 httpx==0.25.2
     # via
     #   -r requirements.in
@@ -124,6 +141,8 @@ mypy-extensions==1.0.0
     # via black
 naturalsort==1.5.1
     # via -r requirements.in
+oauthlib==3.2.2
+    # via requests-oauthlib
 packaging==23.2
     # via
     #   black
@@ -148,12 +167,20 @@ pluggy==1.3.0
     #   datasette
     #   pytest
     #   sqlite-utils
+pyasn1==0.5.1
+    # via
+    #   pyasn1-modules
+    #   rsa
+pyasn1-modules==0.3.0
+    # via google-auth
 pycodestyle==2.11.1
     # via flake8
 pycryptodomex==3.19.0
     # via yt-dlp
 pyflakes==3.1.0
     # via flake8
+pyparsing==3.1.1
+    # via httplib2
 pypdf==3.17.2
     # via -r requirements.in
 pyproject-hooks==1.0.0
@@ -173,11 +200,18 @@ regex==2023.12.25
 requests==2.31.0
     # via
     #   instaloader
+    #   requests-oauthlib
     #   yt-dlp
+requests-oauthlib==1.3.1
+    # via google-auth-oauthlib
+rsa==4.9
+    # via google-auth
 s3transfer==0.8.2
     # via boto3
 six==1.16.0
-    # via python-dateutil
+    # via
+    #   google-api-python-client
+    #   python-dateutil
 sniffio==1.3.0
     # via
     #   anyio
@@ -200,6 +234,8 @@ typing-extensions==4.9.0
     # via
     #   janus
     #   pint
+uritemplate==3.0.1
+    # via google-api-python-client
 urllib3==2.0.7
     # via
     #   botocore

web/README.md (4832) → web/README.md (5225)

diff --git a/web/README.md b/web/README.md
index d05aede..ba7e757 100644
--- a/web/README.md
+++ b/web/README.md
@@ -60,6 +60,12 @@ scripts = [
         """
     },
     {
+        "usage": "save_youtube_videos.py [URL...]",
+        "description": """
+        save a local copy of one or more YouTube videos
+        """
+    },
+    {
         "name": "scrape_really_useful_boxes.py",
         "description": """
         scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.<br/><br/><img src="really_useful_boxes.png">
@@ -141,6 +147,15 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
   </dd>
 
   <dt>
+    <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_youtube_videos.py">
+      <code>save_youtube_videos.py [URL...]</code>
+    </a>
+  </dt>
+  <dd>
+    save a local copy of one or more YouTube videos
+  </dd>
+
+  <dt>
     <a href="https://github.com/alexwlchan/scripts/blob/main/web/scrape_really_useful_boxes.py">
       <code>scrape_really_useful_boxes.py</code>
     </a>
@@ -158,4 +173,4 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
     this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
   </dd>
 </dl>
-<!-- [[[end]]] (checksum: 1bdadef9eb851d3d98d6e7d78e37aa11) -->
+<!-- [[[end]]] (checksum: 54f08d9f43084b85f2d56d5caced4e10) -->

web/save_youtube_videos.py (0) → web/save_youtube_videos.py (4484)

diff --git a/web/save_youtube_videos.py b/web/save_youtube_videos.py
new file mode 100755
index 0000000..26f2144
--- /dev/null
+++ b/web/save_youtube_videos.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+"""
+Make a local copy of one or more YouTube videos.
+"""
+
+import functools
+import json
+import os
+import re
+import pathlib
+import sqlite3
+import subprocess
+import sys
+import textwrap
+import zipfile
+
+import hyperlink
+from sqlite_utils import Database
+from sqlite_utils.db import NotFoundError
+import termcolor
+
+
+BACKUP_ROOT = pathlib.Path("/Volumes/Media (Sapphire)/backups/youtube/videos")
+
+
+def youtube_dl(*args, **kwargs):
+    return (
+        subprocess.check_output(["yt-dlp"] + list(args), **kwargs)
+        .strip()
+        .decode("utf8")
+    )
+
+
+def get_video_id(url):
+    """
+    Given the URL of a YouTube video, return the video ID.
+    """
+    parsed_url = hyperlink.URL.from_text(url)
+
+    if parsed_url.host != "www.youtube.com":
+        raise ValueError(f"Not the URL of a YouTube video: {url!r}")
+
+    video_id = parsed_url.get("v")
+
+    if len(video_id) == 1:
+        return video_id[0]
+    else:
+        raise ValueError(f"Not the URL of a YouTube video: {url!r}")
+
+
+def get_uploader(*, video_id, db_path):
+    db = Database(db_path)
+
+    try:
+        return db["youtube_uploaders"].get(video_id)["uploader"]
+    except NotFoundError:
+        uploader = json.loads(
+            youtube_dl("--dump-json", f"https://www.youtube.com/watch?v={video_id}")
+        )["uploader"]
+
+        db["youtube_uploaders"].insert(
+            {"video_id": video_id, "uploader": uploader}, pk="video_id"
+        )
+
+        return uploader
+
+
+def log_result(format_template):
+    def decorator(inner_fn):
+        @functools.wraps(inner_fn)
+        def wrapper(**kwargs):
+            description = format_template.format(**kwargs)
+            try:
+                result = inner_fn(**kwargs)
+            except Exception as exc:
+                wrapped_error = textwrap.indent(
+                    textwrap.fill(str(exc), width=85), prefix=" " * 4
+                )
+                print(termcolor.colored(f"✘ {description}\n{wrapped_error}", "red"))
+                raise
+            else:
+                print(termcolor.colored(f"✔ {description}", "green"))
+                return result
+
+        return wrapper
+
+    return decorator
+
+
+@log_result("https://youtube.com/watch?v={video_id}")
+def download_video(*, video_id, download_root):
+    uploader = get_uploader(video_id=video_id, db_path=download_root / "uploaders.db")
+
+    # I save enough videos that saving them all into a single directory is
+    # impractical.  Instead, sort videos by the first character of their uploader.
+    #
+    download_dir = download_root / uploader.lower()[0] / uploader
+
+    download_dir.mkdir(exist_ok=True, parents=True)
+
+    # Look to see if this video has been downloaded before.  If it has, skip any
+    # further processing.
+    matching_filenames = [
+        filename for filename in os.listdir(download_dir) if video_id in filename
+    ]
+
+    has_video = any(
+        f.endswith(
+            (
+                f"-{video_id}.mp4",
+                f"-{video_id}.webm",
+                f"-{video_id}.mkv",
+                f" [{video_id}].webm",
+            )
+        )
+        for f in matching_filenames
+    )
+
+    has_description = any(
+        f.endswith((f"-{video_id}.description", f" [{video_id}].description"))
+        for f in matching_filenames
+    )
+
+    has_info = any(
+        f.endswith((f"-{video_id}.info.json", f" [{video_id}].info.json"))
+        for f in matching_filenames
+    )
+
+    has_thumbnail = any(
+        f.endswith((f"-{video_id}.jpg", f" [{video_id}].jpg"))
+        for f in matching_filenames
+    )
+
+    if has_video and has_description and has_info:
+        return
+
+    # Construct the command.  The expensive bit is redownloading the
+    # video file, so don't do that if it's already downloaded.
+    video_url = f"https://youtube.com/watch?v={video_id}"
+    cmd = [video_url]
+
+    if has_video:
+        cmd.append("--skip-download")
+
+    if not has_description:
+        cmd.append("--write-description")
+
+    if not has_info:
+        cmd.append("--write-info-json")
+
+    if not has_thumbnail:
+        cmd.append("--write-thumbnail")
+
+    try:
+        youtube_dl(*cmd, cwd=download_dir)
+        print(download_dir)
+    except subprocess.CalledProcessError as err:  # pragma: no cover
+        print(f"Unable to download {video_url}: {err}", file=sys.stderr)
+        raise
+
+
+if __name__ == "__main__":
+    for url in sys.argv[1:]:
+        video_id = get_video_id(url)
+        download_video(video_id=video_id, download_root=BACKUP_ROOT)