Add a script for downloading YouTube videos
- ID: 06901a5
- date: 2024-02-15 08:32:20+00:00
- author: Alex Chan <alex@alexwlchan.net>
- parent: a116e29
- message: Add a script for downloading YouTube videos
- changed files: 4 files, 220 additions, 2 deletions
Changed files
requirements.in (268) → requirements.in (328)
diff --git a/requirements.in b/requirements.in
index 2ff08a0..7f46249 100644
--- a/requirements.in
+++ b/requirements.in
@@ -8,6 +8,8 @@ datasette-render-image-tags
flake8
flickr-photos-api
flickr-url-parser
+google-api-python-client==1.7.2
+google-auth-oauthlib==0.4.1
httpx
humanize
hyperlink
requirements.txt (4085) → requirements.txt (4907)
diff --git a/requirements.txt b/requirements.txt
index 6f80336..b9af2e5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,6 +28,8 @@ brotli==1.1.0
# via yt-dlp
build==1.0.3
# via pip-tools
+cachetools==5.3.2
+ # via google-auth
certifi==2023.11.17
# via
# httpcore
@@ -64,12 +66,27 @@ flickr-url-parser==1.7.1
# via
# -r requirements.in
# flickr-photos-api
+google-api-python-client==1.7.2
+ # via -r requirements.in
+google-auth==2.27.0
+ # via
+ # google-api-python-client
+ # google-auth-httplib2
+ # google-auth-oauthlib
+google-auth-httplib2==0.2.0
+ # via google-api-python-client
+google-auth-oauthlib==0.4.1
+ # via -r requirements.in
h11==0.14.0
# via
# httpcore
# uvicorn
httpcore==1.0.2
# via httpx
+httplib2==0.22.0
+ # via
+ # google-api-python-client
+ # google-auth-httplib2
httpx==0.25.2
# via
# -r requirements.in
@@ -124,6 +141,8 @@ mypy-extensions==1.0.0
# via black
naturalsort==1.5.1
# via -r requirements.in
+oauthlib==3.2.2
+ # via requests-oauthlib
packaging==23.2
# via
# black
@@ -148,12 +167,20 @@ pluggy==1.3.0
# datasette
# pytest
# sqlite-utils
+pyasn1==0.5.1
+ # via
+ # pyasn1-modules
+ # rsa
+pyasn1-modules==0.3.0
+ # via google-auth
pycodestyle==2.11.1
# via flake8
pycryptodomex==3.19.0
# via yt-dlp
pyflakes==3.1.0
# via flake8
+pyparsing==3.1.1
+ # via httplib2
pypdf==3.17.2
# via -r requirements.in
pyproject-hooks==1.0.0
@@ -173,11 +200,18 @@ regex==2023.12.25
requests==2.31.0
# via
# instaloader
+ # requests-oauthlib
# yt-dlp
+requests-oauthlib==1.3.1
+ # via google-auth-oauthlib
+rsa==4.9
+ # via google-auth
s3transfer==0.8.2
# via boto3
six==1.16.0
- # via python-dateutil
+ # via
+ # google-api-python-client
+ # python-dateutil
sniffio==1.3.0
# via
# anyio
@@ -200,6 +234,8 @@ typing-extensions==4.9.0
# via
# janus
# pint
+uritemplate==3.0.1
+ # via google-api-python-client
urllib3==2.0.7
# via
# botocore
web/README.md (4832) → web/README.md (5225)
diff --git a/web/README.md b/web/README.md
index d05aede..ba7e757 100644
--- a/web/README.md
+++ b/web/README.md
@@ -60,6 +60,12 @@ scripts = [
"""
},
{
+ "usage": "save_youtube_videos.py [URL...]",
+ "description": """
+ save a local copy of one or more YouTube videos
+ """
+ },
+ {
"name": "scrape_really_useful_boxes.py",
"description": """
scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.<br/><br/><img src="really_useful_boxes.png">
@@ -141,6 +147,15 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
</dd>
<dt>
+ <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_youtube_videos.py">
+ <code>save_youtube_videos.py [URL...]</code>
+ </a>
+ </dt>
+ <dd>
+ save a local copy of one or more YouTube videos
+ </dd>
+
+ <dt>
<a href="https://github.com/alexwlchan/scripts/blob/main/web/scrape_really_useful_boxes.py">
<code>scrape_really_useful_boxes.py</code>
</a>
@@ -158,4 +173,4 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
</dd>
</dl>
-<!-- [[[end]]] (checksum: 1bdadef9eb851d3d98d6e7d78e37aa11) -->
+<!-- [[[end]]] (checksum: 54f08d9f43084b85f2d56d5caced4e10) -->
web/save_youtube_videos.py (0) → web/save_youtube_videos.py (4484)
diff --git a/web/save_youtube_videos.py b/web/save_youtube_videos.py
new file mode 100755
index 0000000..26f2144
--- /dev/null
+++ b/web/save_youtube_videos.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+"""
+Make a local copy of one or more YouTube videos.
+"""
+
+import functools
+import json
+import os
+import re
+import pathlib
+import sqlite3
+import subprocess
+import sys
+import textwrap
+import zipfile
+
+import hyperlink
+from sqlite_utils import Database
+from sqlite_utils.db import NotFoundError
+import termcolor
+
+
+# Root directory of the video archive.  It is passed to ``download_video``
+# as ``download_root``, so the per-uploader folders and the ``uploaders.db``
+# cache are all created underneath it.
+BACKUP_ROOT = pathlib.Path("/Volumes/Media (Sapphire)/backups/youtube/videos")
+
+
+def youtube_dl(*args, **kwargs):
+    """
+    Run ``yt-dlp`` with the given command-line arguments and return
+    everything it wrote to stdout, stripped of surrounding whitespace and
+    decoded as UTF-8.
+
+    Keyword arguments (e.g. ``cwd``) are forwarded unchanged to
+    ``subprocess.check_output``, which raises
+    ``subprocess.CalledProcessError`` if the command exits with a
+    non-zero status.
+    """
+    command = ["yt-dlp", *args]
+    raw_output = subprocess.check_output(command, **kwargs)
+    return raw_output.strip().decode("utf8")
+
+
+def get_video_id(url):
+    """
+    Given the URL of a YouTube video, return the video ID.
+
+    Two URL shapes are recognised:
+
+    *   ``https://www.youtube.com/watch?v=ID`` -- also without the ``www.``
+        prefix, or with the mobile ``m.`` prefix
+    *   ``https://youtu.be/ID`` short links
+
+    Raises ``ValueError`` if the URL is on any other host, or if it doesn't
+    contain exactly one non-empty video ID.
+    """
+    parsed_url = hyperlink.URL.from_text(url)
+
+    # Hostnames are case-insensitive, so compare them in lowercase.
+    host = (parsed_url.host or "").lower()
+
+    if host in {"youtube.com", "www.youtube.com", "m.youtube.com"}:
+        # ``get`` returns one entry per occurrence of the ``v`` parameter.
+        candidates = parsed_url.get("v")
+    elif host == "youtu.be":
+        # Short links carry the ID as the path; drop the empty segments
+        # left behind by a bare or trailing slash.
+        candidates = [segment for segment in parsed_url.path if segment]
+    else:
+        raise ValueError(f"Not the URL of a YouTube video: {url!r}")
+
+    # An empty or valueless ``v`` (``?v=`` or ``?v``) isn't a usable ID.
+    if len(candidates) == 1 and candidates[0]:
+        return candidates[0]
+
+    raise ValueError(f"Not the URL of a YouTube video: {url!r}")
+
+
+def get_uploader(*, video_id, db_path):
+    """
+    Return the name of the uploader of the given YouTube video.
+
+    Names are cached in the ``youtube_uploaders`` table of the SQLite
+    database at ``db_path``, keyed by video ID.  On a cache miss the name
+    is read from the ``uploader`` field of ``yt-dlp --dump-json`` and
+    stored in the table before being returned.
+    """
+    uploaders = Database(db_path)["youtube_uploaders"]
+
+    try:
+        cached_row = uploaders.get(video_id)
+        return cached_row["uploader"]
+    except NotFoundError:
+        # Nothing cached for this video yet, so ask yt-dlp for its metadata.
+        video_url = f"https://www.youtube.com/watch?v={video_id}"
+        metadata = json.loads(youtube_dl("--dump-json", video_url))
+        uploader = metadata["uploader"]
+
+    new_row = {"video_id": video_id, "uploader": uploader}
+    uploaders.insert(new_row, pk="video_id")
+
+    return uploader
+
+
+def log_result(format_template):
+    """
+    Decorator factory: print a coloured one-line report each time the
+    decorated function is called.
+
+    ``format_template`` is filled in with the keyword arguments of the call
+    (via ``str.format``) to describe it.  A green ``✔`` line is printed if
+    the function returns; a red ``✘`` line followed by the wrapped,
+    indented error message is printed if it raises, and the exception is
+    then re-raised.
+
+    The decorated function can only be called with keyword arguments.
+    """
+
+    def decorator(inner_fn):
+        @functools.wraps(inner_fn)
+        def wrapper(**kwargs):
+            label = format_template.format(**kwargs)
+
+            try:
+                outcome = inner_fn(**kwargs)
+            except Exception as exc:
+                folded_message = textwrap.fill(str(exc), width=85)
+                error_details = textwrap.indent(folded_message, prefix=" " * 4)
+                failure_report = f"✘ {label}\n{error_details}"
+                print(termcolor.colored(failure_report, "red"))
+                raise
+
+            # Only reached if ``inner_fn`` returned without raising.
+            success_report = f"✔ {label}"
+            print(termcolor.colored(success_report, "green"))
+            return outcome
+
+        return wrapper
+
+    return decorator
+
+
+def _has_saved_file(filenames, video_id, extensions):
+    """
+    Return True if any of ``filenames`` ends with the video ID followed by
+    one of ``extensions``, in either of the two naming styles found in the
+    archive: ``TITLE-ID.EXT`` or ``TITLE [ID].EXT``.
+    """
+    suffixes = tuple(
+        suffix
+        for ext in extensions
+        for suffix in (f"-{video_id}.{ext}", f" [{video_id}].{ext}")
+    )
+    return any(name.endswith(suffixes) for name in filenames)
+
+
+@log_result("https://youtube.com/watch?v={video_id}")
+def download_video(*, video_id, download_root):
+    """
+    Save a YouTube video, plus its description, info JSON and thumbnail,
+    into ``download_root / FIRST-LETTER-OF-UPLOADER / UPLOADER``.
+
+    Files that are already present in that folder are not fetched again;
+    if the video, description and info JSON are all present, nothing is
+    downloaded at all.
+
+    Returns ``None``.  Raises ``subprocess.CalledProcessError`` if yt-dlp
+    fails.
+    """
+    uploader = get_uploader(video_id=video_id, db_path=download_root / "uploaders.db")
+
+    # I save enough videos that saving them all into a single directory is
+    # impractical.  Instead, sort videos by the first character of their uploader.
+    #
+    download_dir = download_root / uploader.lower()[0] / uploader
+
+    download_dir.mkdir(exist_ok=True, parents=True)
+
+    # Look to see if this video has been downloaded before.  If it has, skip any
+    # further processing.
+    matching_filenames = [
+        filename for filename in os.listdir(download_dir) if video_id in filename
+    ]
+
+    # Every extension is checked in both naming styles, so that e.g. a
+    # "TITLE [ID].mp4" file counts as downloaded and isn't fetched again.
+    has_video = _has_saved_file(matching_filenames, video_id, ("mp4", "webm", "mkv"))
+    has_description = _has_saved_file(matching_filenames, video_id, ("description",))
+    has_info = _has_saved_file(matching_filenames, video_id, ("info.json",))
+    has_thumbnail = _has_saved_file(matching_filenames, video_id, ("jpg",))
+
+    # NOTE(review): a missing thumbnail on its own doesn't trigger a run of
+    # yt-dlp -- presumably deliberate; confirm.
+    if has_video and has_description and has_info:
+        return
+
+    # Construct the command.  The expensive bit is redownloading the
+    # video file, so don't do that if it's already downloaded.
+    video_url = f"https://youtube.com/watch?v={video_id}"
+    cmd = [video_url]
+
+    if has_video:
+        cmd.append("--skip-download")
+
+    if not has_description:
+        cmd.append("--write-description")
+
+    if not has_info:
+        cmd.append("--write-info-json")
+
+    if not has_thumbnail:
+        cmd.append("--write-thumbnail")
+
+    try:
+        # Run inside the target folder so yt-dlp writes its files there.
+        youtube_dl(*cmd, cwd=download_dir)
+        print(download_dir)
+    except subprocess.CalledProcessError as err:  # pragma: no cover
+        print(f"Unable to download {video_url}: {err}", file=sys.stderr)
+        raise
+
+
+if __name__ == "__main__":
+    # Every command-line argument is the URL of a video to save.  ``map`` is
+    # lazy, so each URL is only parsed right before its own download starts.
+    for video_id in map(get_video_id, sys.argv[1:]):
+        download_video(video_id=video_id, download_root=BACKUP_ROOT)