Remove a now-unused script
- ID
a2282f8- date
2025-01-28 07:31:07+00:00- author
Alex Chan <alex@alexwlchan.net>- parent
b37068a- message
Remove a now-unused script- changed files
2 files, 1 addition, 233 deletions
Changed files
web/README.md (4300) → web/README.md (3907)
diff --git a/web/README.md b/web/README.md
index 79e56b6..efd7941 100644
--- a/web/README.md
+++ b/web/README.md
@@ -48,12 +48,6 @@ scripts = [
"""
},
{
- "usage": "save_youtube_videos.py [URL...]",
- "description": """
- save a local copy of one or more YouTube videos
- """
- },
- {
"name": "scrape_really_useful_boxes.py",
"description": """
scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.<br/><br/><img src="really_useful_boxes.png">
@@ -117,15 +111,6 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
</dd>
<dt>
- <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_youtube_videos.py">
- <code>save_youtube_videos.py [URL...]</code>
- </a>
- </dt>
- <dd>
- save a local copy of one or more YouTube videos
- </dd>
-
- <dt>
<a href="https://github.com/alexwlchan/scripts/blob/main/web/scrape_really_useful_boxes.py">
<code>scrape_really_useful_boxes.py</code>
</a>
@@ -143,4 +128,4 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
</dd>
</dl>
-<!-- [[[end]]] (checksum: 044d860fd40ad2bb36ed34ee0bf20fc5) -->
+<!-- [[[end]]] (checksum: 4741614413c679d66cec750e6cb1ca7e) -->
web/save_youtube_videos.py (5836) → web/save_youtube_videos.py (0)
diff --git a/web/save_youtube_videos.py b/web/save_youtube_videos.py
deleted file mode 100755
index be37cb0..0000000
--- a/web/save_youtube_videos.py
+++ /dev/null
@@ -1,217 +0,0 @@
-#!/usr/bin/env python3
-"""
-Make a local copy of one or more YouTube videos.
-"""
-
-import functools
-import json
-import os
-from pathlib import Path
-import subprocess
-import sys
-import textwrap
-from typing import Literal
-
-import hyperlink
-from sqlite_utils import Database
-from sqlite_utils.db import NotFoundError
-import termcolor
-
-
-BACKUP_ROOT = Path("/Volumes/Media (Sapphire)/backups/youtube/videos")
-
-
-def youtube_dl(*args, **kwargs):
- return (
- subprocess.check_output(["yt-dlp"] + list(args), **kwargs)
- .strip()
- .decode("utf8")
- )
-
-
-def get_video_id(url):
- """
- Given the URL of a YouTube video, return the video ID.
- """
- parsed_url = hyperlink.URL.from_text(url)
-
- if parsed_url.host != "www.youtube.com":
- raise ValueError(f"Not the URL of a YouTube video: {url!r}")
-
- video_id = parsed_url.get("v")
-
- if len(video_id) == 1:
- return video_id[0]
- else:
- raise ValueError(f"Not the URL of a YouTube video: {url!r}")
-
-
-def get_uploader(*, video_id, db_path):
- db = Database(db_path)
-
- try:
- return db["youtube_uploaders"].get(video_id)["uploader"]
- except NotFoundError:
- uploader = json.loads(
- youtube_dl("--dump-json", f"https://www.youtube.com/watch?v={video_id}")
- )["uploader"]
-
- db["youtube_uploaders"].insert(
- {"video_id": video_id, "uploader": uploader}, pk="video_id"
- )
-
- return uploader
-
-
-def log_result(format_template):
- def decorator(inner_fn):
- @functools.wraps(inner_fn)
- def wrapper(**kwargs):
- description = format_template.format(**kwargs)
- try:
- result = inner_fn(**kwargs)
- except Exception as exc:
- wrapped_error = textwrap.indent(
- textwrap.fill(str(exc), width=85), prefix=" " * 4
- )
- print(termcolor.colored(f"✘ {description}\n{wrapped_error}", "red"))
- raise
- else:
- if result == "downloaded":
- print(termcolor.colored(f"✔ {description}", "green"))
- return result
-
- return wrapper
-
- return decorator
-
-
-def classify_file_type(
- video_id: str, filename: Path
-) -> Literal["video", "info", "thumbnail", "subtitles"] | None:
- """
- Given an already-downloaded file, work out what sort of file it is.
- """
- if filename.name.endswith(".part"):
- os.unlink(filename)
- return None
-
- if filename.name.endswith(
- (
- f" [{video_id}].mp4",
- f" [{video_id}].mkv",
- f" [{video_id}].webm",
- )
- ):
- return "video"
-
- if filename.name.endswith(
- (
- f" [{video_id}].jpg",
- f" [{video_id}].webp",
- )
- ):
- return "thumbnail"
-
- if filename.name.endswith(f" [{video_id}].info.json"):
- return "info"
-
- if filename.name.endswith(
- (
- ".vtt",
- f" [{video_id}].live_chat.json",
- )
- ):
- return "subtitles"
-
- raise ValueError(f"Unrecognised filename: {filename}")
-
-
-def fix_info_json(path: Path) -> None:
- """
- Tidy up the contents of the info.json fie.
- """
- with open(path) as in_file:
- data = json.load(in_file)
-
- # These are a couple of fields which are very large, don't contain
- # much useful metadata, and point to transient URLs that don't work
- # later.
- for key in (
- "formats",
- "automatic_captions",
- "thumbnails",
- "heatmap",
- "_format_sort_fields",
- "subtitles",
- ):
- if key in data:
- del data[key]
-
- json_string = json.dumps(data, indent=2)
-
- with open(path, "w") as out_file:
- out_file.write(json_string)
-
-
-@log_result("https://youtube.com/watch?v={video_id}")
-def download_video(*, video_id, download_root):
- uploader = get_uploader(video_id=video_id, db_path=download_root / "uploaders.db")
-
- # I save enough videos that saving them all into a single directory is
- # impractical. Instead, sort videos by the first character of their uploader.
- #
- download_dir = download_root / uploader.lower()[0] / uploader
-
- download_dir.mkdir(exist_ok=True, parents=True)
-
- # Look to see if this video has been downloaded before. If it has, skip any
- # further processing.
- matching_filenames = {
- filename: classify_file_type(video_id, download_dir / filename)
- for filename in os.listdir(download_dir)
- if video_id in filename
- }
-
- has_video = "video" in matching_filenames.values()
- has_info = "info" in matching_filenames.values()
- has_thumbnail = "thumbnail" in matching_filenames.values()
-
- if has_video and has_thumbnail and has_info:
- return
-
- # Construct the command. The expensive bit is redownloading the
- # video file, so don't do that if it's already downloaded.
- video_url = f"https://youtube.com/watch?v={video_id}"
- cmd = [video_url, "--write-sub"]
-
- if has_video:
- cmd.append("--skip-download")
-
- if not has_info:
- cmd.append("--write-info-json")
-
- if not has_thumbnail:
- cmd.append("--write-thumbnail")
-
- try:
- youtube_dl(*cmd, cwd=download_dir)
- print(download_dir)
-
- for f in os.listdir(download_dir):
- if f.endswith(".info.json"):
- fix_info_json(download_dir / f)
-
- return "downloaded"
- except subprocess.CalledProcessError as err: # pragma: no cover
- print(f"Unable to download {video_url}: {err}", file=sys.stderr)
- raise
-
-
-if __name__ == "__main__":
- for url in sys.argv[1:]:
- video_id = get_video_id(url)
- try:
- download_video(video_id=video_id, download_root=BACKUP_ROOT)
- except Exception:
- pass