# yt-dlp_alexwlchan.py

1#!/usr/bin/env python3
2"""
3yt-dlp_alexwlchan is a personal wrapper around yt-dlp that downloads a video
4with my preferred settings.
5"""
7from datetime import datetime, timezone
8import json
9from pathlib import Path
10import re
11import subprocess
12import sys
13import tempfile
14from typing import Any, TypedDict
15import urllib.parse
17from chives.fetch import download_image
18from chives.media import create_video_entity, VideoEntity
19from yt_dlp import YoutubeDL
20from yt_dlp.networking.exceptions import HTTPError as YouTubeDLHTTPError
21from yt_dlp.utils import DownloadError
# Shared yt-dlp options used by download_video().
#
# NOTE: this is module-level mutable state — download_video() adds an
# "outtmpl" key to it at runtime.
ydl_opts: Any = {
    # Print progress output to stderr, not stdout
    "logtostderr": True,
    #
    # Download the thumbnail
    "writethumbnail": True,
    #
    # Download subtitles, or YouTube's automatic subtitles if there
    # aren't any.
    "writesubtitles": True,
    #
    # Ignore AI-upscaled videos from YouTube.
    # See https://alexwlchan.net/til/2025/ignore-ai-scaled-videos/
    "format": "bestvideo*[format_id!*=-sr]+bestaudio/best[format_id!*=-sr]",
    #
    # Download video files as MP4 and thumbnails as JPEG, or convert
    # to those formats if they aren't the best available.
    "format_sort": ["res", "ext:mp4:m4a"],
    "postprocessors": [
        {
            "key": "FFmpegVideoConvertor",
            "preferedformat": "mp4",
        },
        {
            "key": "FFmpegThumbnailsConvertor",
            "format": "jpg",
            "when": "before_dl",
        },
    ],
}
def normalise_url(url: str) -> str:
    """
    Remove unnecessary tracking parameters from a URL.
    """
    parts = urllib.parse.urlsplit(url)

    # Anything that isn't a YouTube URL passes through untouched.
    if parts.netloc != "www.youtube.com":
        return url

    # For YouTube, keep only the video ID (the "v" parameter) and drop
    # the fragment and every other query parameter.
    kept = [
        (key, value)
        for key, value in urllib.parse.parse_qsl(parts.query)
        if key == "v"
    ]
    query = urllib.parse.urlencode(kept)

    return urllib.parse.urlunsplit((parts.scheme, parts.netloc, parts.path, query, ""))
def get_youtube_avatar(tmp_dir: Path, channel_url: str) -> Path:
    """
    Download the avatar of a YouTube channel.
    """
    opts: Any = {
        # Print progress output to stderr, not stdout
        "logtostderr": True,
        #
        # Don't download every page of results for the channel.
        #
        # Telling yt-dlp we only care about the first video is
        # technically a lie -- we don't care about any videos -- but it
        # stops it paginating through the whole channel.
        "playlist_items": "0",
    }

    # Fetch the channel metadata (without downloading anything) and pick
    # out the URL of the uncropped avatar thumbnail.
    with YoutubeDL(opts) as ydl:
        channel_info: Any = ydl.extract_info(channel_url, download=False)

    avatar = next(
        t for t in channel_info["thumbnails"] if t["id"] == "avatar_uncropped"
    )

    # Derive the base filename from the channel handle, e.g.
    # "https://www.youtube.com/@networkrail" becomes "networkrail".
    channel_path = urllib.parse.urlsplit(channel_url).path
    basename = channel_path.split("/")[1].replace("@", "")

    return download_image(url=avatar["url"], out_prefix=tmp_dir / basename)
def get_instagram_avatar(tmp_dir: Path, uploader_name: str) -> Path:
    """
    Download the avatar of an Instagram channel.
    """
    # gallery-dl caches avatars somewhere, so the URL it returns can
    # carry an expired signature.  For now, flush the whole cache before
    # every run to get rid of any stale data.
    subprocess.check_call(["gallery-dl", "--clear-cache", "ALL"])

    # Ask gallery-dl for the avatar URL (it prints it to stdout).
    avatar_url = subprocess.check_output(
        [
            "gallery-dl",
            "--get-urls",
            f"https://www.instagram.com/{uploader_name}/avatar",
            "--cookies-from-browser",
            "firefox",
        ],
        text=True,
    ).strip()

    return download_image(url=avatar_url, out_prefix=tmp_dir / uploader_name)
class UploaderInfo(TypedDict):
    """
    Information about a video's uploader.
    """

    # Uploader ID as reported by the yt-dlp extractor.
    id: str
    # Human-readable uploader name.
    name: str
    # URL of the uploader's channel/profile page.
    url: str
    # Local path to the downloaded avatar image.
    avatar_path: Path
class VideoInfo(TypedDict):
    """
    Information about a downloaded video.
    """

    # Video ID as reported by the yt-dlp extractor.
    id: str
    # The (normalised) URL the video was downloaded from.
    url: str
    title: str
    description: str
    # Upload time as an ISO 8601 string in UTC, with a "Z" suffix.
    date_uploaded: str
    # Local path to the downloaded .mp4 file.
    video_path: Path
    # Local path to the downloaded .jpg thumbnail.
    thumbnail_path: Path
    # Local path to the .vtt subtitles, if any were downloaded.
    subtitle_path: Path | None
    # The temporary directory that holds all the downloaded files.
    folder_path: Path
    uploader: UploaderInfo
    entity: VideoEntity
    # Which site the video came from: "youtube" or "instagram".
    site: str
def cleanup_paths(dir_path: Path) -> None:
    """
    For every file in `dir_path`, remove URL-unsafe characters from
    the filenames.

    Raises FileExistsError if a cleaned-up name would collide with an
    existing file in `dir_path`.
    """
    # Characters to replace or strip.  "\u29f8" (big solidus) is what
    # yt-dlp substitutes for "/" in titles; "\u201d" is a curly right
    # double quote (the literal was garbled in transit -- confirm the
    # exact character against the original source).
    replacements = [
        ("#", " "),
        ("?", " "),
        ("\u29f8", "-"),
        (":", "-"),
        ("|", "-"),
        ("\u201d", ""),
    ]

    for p in dir_path.iterdir():
        old_name = p.name
        new_name = p.name

        for old, new in replacements:
            new_name = new_name.replace(old, new)

        # Collapse any runs of whitespace left behind by the replacements.
        new_name = re.sub(r"\s+", " ", new_name).strip()

        if old_name == new_name:
            continue

        # Fail loudly rather than silently overwrite another file.
        # (An explicit raise, unlike `assert`, survives `python -O`.)
        target = dir_path / new_name
        if target.exists():
            raise FileExistsError(target)

        # Path.rename is the portable same-directory rename; Path.move
        # only exists on Python 3.14+.
        p.rename(target)
def download_video(url: str) -> VideoInfo:
    """
    Download a video with yt-dlp and return metadata about the video.

    Supports YouTube and Instagram; exits the process for any other
    extractor.
    """
    # Download all the videos to a temp directory; this allows the caller
    # to decide exactly where they want the video later.
    tmp_dir = Path(tempfile.mkdtemp())

    # NOTE(review): this mutates the module-level ``ydl_opts`` dict.
    ydl_opts["outtmpl"] = str(tmp_dir / "%(title)s [%(id)s].%(ext)s")

    with YoutubeDL(ydl_opts) as ydl:
        video_info: Any = ydl.extract_info(url)

    # Try to download automatic subtitles for a YouTube video, but only
    # if no regular subtitles (.vtt) were downloaded above.
    #
    # If you try to download autosubs for a video which doesn't have any,
    # YouTube returns an HTTP 429 "Too Many Requests" error. Ignore this
    # error, but raise all others.
    if video_info["extractor"] == "youtube" and not any(
        p.suffix == ".vtt" for p in tmp_dir.iterdir()
    ):
        ydl_auto_subtitle_opts: Any = {
            "logtostderr": True,
            "writeautomaticsub": True,
            "skip_download": True,
            "outtmpl": ydl_opts["outtmpl"],
        }
        with YoutubeDL(ydl_auto_subtitle_opts) as ydl:
            try:
                ydl.extract_info(url)
            except DownloadError as e:
                if (
                    e.exc_info is not None
                    and isinstance(e.exc_info[1], YouTubeDLHTTPError)
                    and e.exc_info[1].status == 429
                ):
                    pass
                else:
                    raise

    # Make the downloaded filenames URL-safe.
    cleanup_paths(tmp_dir)

    # Locate the downloaded files: the video and thumbnail must exist,
    # but subtitles are optional.
    video_path = next(p for p in tmp_dir.iterdir() if p.suffix == ".mp4")
    thumbnail_path = next(p for p in tmp_dir.iterdir() if p.suffix == ".jpg")
    try:
        subtitle_path = next(p for p in tmp_dir.iterdir() if p.suffix == ".vtt")
    except StopIteration:
        subtitle_path = None

    uploader: UploaderInfo

    # Build the uploader metadata, including a downloaded avatar image,
    # in a site-specific way.
    if video_info["extractor"] == "youtube":
        site = "youtube"
        uploader = {
            "id": video_info["uploader_id"],
            "name": video_info["uploader"],
            "url": video_info["uploader_url"],
            "avatar_path": get_youtube_avatar(tmp_dir, video_info["uploader_url"]),
        }
    elif video_info["extractor"] == "Instagram":
        site = "instagram"
        uploader = {
            "id": video_info["uploader_id"],
            "name": video_info["uploader"],
            "url": f"https://www.instagram.com/{video_info['channel']}/",
            "avatar_path": get_instagram_avatar(
                tmp_dir, uploader_name=video_info["channel"]
            ),
        }
    else:
        sys.exit(f"Unsupported extractor: {video_info['extractor']}")

    # Render the upload time as UTC; the "+00:00" offset is swapped for
    # a "Z" suffix when serialised below.
    date_uploaded = datetime.fromtimestamp(video_info["timestamp"], tz=timezone.utc)

    return {
        "id": video_info["id"],
        "url": url,
        "title": video_info["title"],
        "description": video_info["description"],
        "date_uploaded": date_uploaded.isoformat().replace("+00:00", "Z"),
        "video_path": video_path,
        "thumbnail_path": thumbnail_path,
        "subtitle_path": subtitle_path,
        "uploader": uploader,
        "entity": create_video_entity(
            video_path,
            poster_path=thumbnail_path,
            subtitles_path=subtitle_path,
            source_url=url,
            background="#222222",
        ),
        "folder_path": tmp_dir,
        "site": site,
    }
class PathEncoder(json.JSONEncoder):
    """
    Custom JSON encoder that encodes paths as a string.
    """

    def default(self, o: Any) -> Any:
        """
        Encode paths as a string; everything else uses the default encoder.
        """
        if not isinstance(o, Path):
            # Let the base encoder raise TypeError for anything we
            # don't know how to serialise.
            return super().default(o)

        return str(o.absolute())
if __name__ == "__main__":
    # Exactly one argument (the URL) is required.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {__file__} URL")

    info = download_video(normalise_url(sys.argv[1]))

    # Print the metadata as JSON, with paths rendered as strings.
    output = json.dumps(info, indent=2, cls=PathEncoder, ensure_ascii=False)
    print(output)