Merge pull request #49 from alexwlchan/remove-web-archiving-scripts
- ID: 457d975
- date: 2024-06-05 19:01:26+00:00
- author: Alex Chan <alex@alexwlchan.net>
- parents: 786e819, 43b13ed
- message: Merge pull request #49 from alexwlchan/remove-web-archiving-scripts Remove a couple of web archiving scripts that have "graduated"
- changed files: 6 files, 1 addition, 509 deletions
Changed files
web/.gitattributes (47) → web/.gitattributes (0)
diff --git a/web/.gitattributes b/web/.gitattributes
deleted file mode 100644
index c226feb..0000000
--- a/web/.gitattributes
+++ /dev/null
@@ -1 +0,0 @@
-save_safari_webarchive linguist-language=Swift
web/README.md (5642) → web/README.md (4339)
diff --git a/web/README.md b/web/README.md
index a33c788..bdf266a 100644
--- a/web/README.md
+++ b/web/README.md
@@ -42,18 +42,6 @@ scripts = [
"""
},
{
- "usage": "save_ao3_links.py [URL...]",
- "description": """
- save a copy of a story on AO3, including exports in every available format.
- """
- },
- {
- "name": "save_pinboard_bookmarks.py",
- "description": """
- save a complete copy of all my Pinboard bookmarks, including my archive backups.
- """
- },
- {
"name": "save_tumblr_likes.py",
"description": """
save a copy of all the posts I've liked on Tumblr to my backup drive.
@@ -66,12 +54,6 @@ scripts = [
"""
},
{
- "usage": "save_safari_webarchive [URL] [OUTPUT_PATH]",
- "description": """
- save a copy of a web page as a Safari webarchive
- """
- },
- {
"name": "scrape_really_useful_boxes.py",
"description": """
scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.<br/><br/><img src="really_useful_boxes.png">
@@ -126,24 +108,6 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
</dd>
<dt>
- <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_ao3_links.py">
- <code>save_ao3_links.py [URL...]</code>
- </a>
- </dt>
- <dd>
- save a copy of a story on AO3, including exports in every available format.
- </dd>
-
- <dt>
- <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_pinboard_bookmarks.py">
- <code>save_pinboard_bookmarks.py</code>
- </a>
- </dt>
- <dd>
- save a complete copy of all my Pinboard bookmarks, including my archive backups.
- </dd>
-
- <dt>
<a href="https://github.com/alexwlchan/scripts/blob/main/web/save_tumblr_likes.py">
<code>save_tumblr_likes.py</code>
</a>
@@ -162,15 +126,6 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
</dd>
<dt>
- <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_safari_webarchive">
- <code>save_safari_webarchive [URL] [OUTPUT_PATH]</code>
- </a>
- </dt>
- <dd>
- save a copy of a web page as a Safari webarchive
- </dd>
-
- <dt>
<a href="https://github.com/alexwlchan/scripts/blob/main/web/scrape_really_useful_boxes.py">
<code>scrape_really_useful_boxes.py</code>
</a>
@@ -188,4 +143,4 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
</dd>
</dl>
-<!-- [[[end]]] (checksum: a31c60eca24c6488caaa93e8bb5f6b44) -->
+<!-- [[[end]]] (checksum: 93b152a3a4162f174022195ee107ad46) -->
web/save_ao3_links.py (2891) → web/save_ao3_links.py (0)
diff --git a/web/save_ao3_links.py b/web/save_ao3_links.py
deleted file mode 100755
index dd210ed..0000000
--- a/web/save_ao3_links.py
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/usr/bin/env python3
-
-import os
-import pathlib
-import shutil
-import subprocess
-import sys
-
-import hyperlink
-
-
-BACKUP_ROOT = pathlib.Path("/Volumes/Media (Sapphire)/backups/ao3")
-
-
-def get_ao3_id(url: str) -> str:
- # e.g. 'https://archiveofourown.org/works/1234' ~> '1234'
- u = hyperlink.DecodedURL.from_text(url)
-
- if u.path[0] == "works" and u.path[1].isnumeric():
- return u.path[1]
- elif (
- len(u.path) >= 4
- and u.path[0] == "collections"
- and u.path[2] == "works"
- and u.path[3].isnumeric()
- ):
- return u.path[3]
- else:
- raise ValueError(url)
-
-
-def save_ao3_url(url: str):
- ao3_id = get_ao3_id(url)
-
- # Check if the fic is already downloaded -- if it is, nothing to do.
- if any(
- name.startswith(f"{ao3_id}-") and os.path.isdir(BACKUP_ROOT / name)
- for name in os.listdir(BACKUP_ROOT)
- ):
- return
-
- print(f"Saving {url}...")
-
- # Otherwise, create a temporary directory for the download.
- #
- # Delete any partial downloads first.
- tmp_dir = BACKUP_ROOT / f"{ao3_id}.tmp"
-
- try:
- shutil.rmtree(tmp_dir)
- except FileNotFoundError:
- pass
-
- for ext in ["azw", "epub", "mobi", "pdf", "html"]:
- wget(
- "--no-verbose",
- "--output-file",
- "-",
- # The Content-Disposition header is sent by the server to say
- # what the file "should" be called. By telling wget to respect this,
- # it means we can request "a.html", the header from AO3 will specify
- # the correct filename (including the fic title), and the file will
- # be named correctly.
- "--content-disposition",
- "--directory-prefix",
- tmp_dir,
- f"https://archiveofourown.org/downloads/{ao3_id}/a.{ext}",
- )
-
- try:
- title = os.listdir(tmp_dir)[0].rsplit(".")[0]
- except FileNotFoundError:
- return
-
- out_dir = BACKUP_ROOT / f"{ao3_id}-{title}"
-
- os.rename(tmp_dir, out_dir)
-
- print(f" ~> {out_dir}")
-
-
-def wget(*args):
- subprocess.call(["wget"] + list(args), stdout=subprocess.DEVNULL)
-
-
-if __name__ == "__main__":
- for url in sys.argv[1:]:
- if url == "https://archiveofourown.org/series/136245":
- for story_url in [
- "https://archiveofourown.org/works/1854957",
- "https://archiveofourown.org/works/2089398",
- "https://archiveofourown.org/works/2218554",
- "https://archiveofourown.org/works/2249544",
- "https://archiveofourown.org/works/2330390",
- "https://archiveofourown.org/works/2399867",
- "https://archiveofourown.org/works/2467277",
- "https://archiveofourown.org/works/2802287",
- ]:
- save_ao3_url(story_url)
- else:
- save_ao3_url(url)
web/save_pinboard_bookmarks.py (6225) → web/save_pinboard_bookmarks.py (0)
diff --git a/web/save_pinboard_bookmarks.py b/web/save_pinboard_bookmarks.py
deleted file mode 100755
index ed0c061..0000000
--- a/web/save_pinboard_bookmarks.py
+++ /dev/null
@@ -1,228 +0,0 @@
-#!/usr/bin/env python3
-
-import contextlib
-import datetime
-import json
-import os
-import pathlib
-import shutil
-import subprocess
-import tarfile
-import tempfile
-
-import bs4
-import httpx
-import keyring
-
-
-BACKUP_ROOT = pathlib.Path("/Volumes/Media (Sapphire)/backups/pinboard")
-
-
-def write_to_file(name: str, contents: str) -> None:
- """
- Write a string to a text file, and log that you're doing it.
- """
- path = BACKUP_ROOT / name
- print(f" ~> {path}")
- path.write_text(contents)
-
-
-def get_bookmarks_data(username: str, password: str) -> str:
- """
- Call the Pinboard API to get a complete list of my bookmarks.
-
- Return the result as a pretty-printed JSON string.
- """
- resp = httpx.get(
- "https://api.pinboard.in/v1/posts/all",
- params={"format": "json"},
- auth=(username, password),
- )
-
- resp.raise_for_status()
-
- return resp.json()
-
-
-def get_cache_ids(username: str, password: str) -> dict[str, str]:
- """
- Get a list of cache IDs for bookmarks in my account.
-
- These are the URLs where Pinboard takes archived snapshots of
- web pages, e.g. https://pinboard.in/cached/1234567890/
-
- Returns a dict (bookmarked URL) -> (cache ID).
- """
- # Start by logging in to Pinbaord, so we have the appropriate
- # cookies in our session.
- client = httpx.Client(follow_redirects=True)
-
- resp = client.post(
- "https://pinboard.in/auth/", data={"username": username, "password": password}
- )
- resp.raise_for_status()
-
- # Now start fetching cache IDs from my account.
- cache_ids: dict[str, str] = {}
-
- url = f"https://pinboard.in/u:{username}"
-
- while True:
- print(f" ... fetching cache IDs from {url}")
- resp = client.get(url, params={"per_page": "160"})
- resp.raise_for_status()
-
- soup = bs4.BeautifulSoup(resp.text, "html.parser")
-
- # The structure of the page is of the form:
- #
- # <div id="bookmarks">
- # <div class="bookmark">
- # <a class="bookmark_title" href="http://example.net">Example</a>
- # <a class="cached" href="/cached/1234567890/">☑</a>
- # …
- #
- bookmarks_div = soup.find("div", attrs={"id": "bookmarks"})
- bookmarks = bookmarks_div.find_all("div", attrs={"class": "bookmark"})
-
- for b in bookmarks:
- href = b.find("a", attrs={"class": "bookmark_title"}).attrs["href"]
- cache_link = b.find("a", attrs={"class": "cached"})
-
- if cache_link is None:
- continue
-
- this_cache_id = cache_link.attrs["href"].split("/")[-2]
-
- cache_ids[href] = this_cache_id
-
- # The pagination link, if present, will be something like:
- #
- # <a class="next_prev" href="/u:alexwlchan/before:1234">« earlier</a>
- #
- pagination_link = soup.find("a", attrs={"class": "next_prev"})
-
- if "earlier" not in pagination_link.text:
- break
-
- url = "https://pinboard.in" + pagination_link.attrs["href"]
-
- return cache_ids
-
-
-def wget(*args):
- subprocess.call(["wget"] + list(args), stdout=subprocess.DEVNULL)
-
-
-@contextlib.contextmanager
-def wget_context(username: str, password: str):
- with tempfile.TemporaryDirectory() as tmpdir:
- os.chdir(tmpdir)
-
- wget(
- "--save-cookies",
- "pinboard-cookies.txt",
- "--keep-session-cookies",
- "--delete-after",
- "--output-file",
- "-",
- "--post-data",
- f"username={username}&password={password}",
- "https://pinboard.in/auth/",
- )
-
- yield
-
- os.unlink("pinboard-cookies.txt")
-
-
-def download_single_archive(url: str, cache_id: str):
- cache_dir = BACKUP_ROOT / "archive" / cache_id[0] / cache_id
- cache_path = cache_dir.with_suffix(".tar.gz")
-
- # If the archive is already downloaded, there's nothing to do.
- if cache_path.exists():
- return
-
- print(f" ... saving https://pinboard.in/cached/{cache_id}/")
- print(f" {url}")
-
- # Otherwise, start downloading the archive into a temporary directory.
- # Clear any pending downloads first.
- tmp_dir = cache_dir.with_suffix(".tmp")
- tmp_dir.mkdir(parents=True, exist_ok=True)
-
- shutil.rmtree(tmp_dir)
-
- wget(
- "--adjust-extension",
- "--span-hosts",
- "--no-verbose",
- "--convert-links",
- "--page-requisites",
- "--no-directories",
- "-e",
- "robots=off",
- "--load-cookies",
- "pinboard-cookies.txt",
- "--output-file",
- "-",
- "--directory-prefix",
- str(tmp_dir),
- f"https://pinboard.in/cached/{cache_id}/",
- )
-
- with tarfile.open(cache_path, "w:gz") as tf:
- tf.add(tmp_dir, arcname=cache_id)
-
- print(f" {cache_path}")
-
- shutil.rmtree(tmp_dir)
-
-
-if __name__ == "__main__":
- username = "alexwlchan"
-
- password = keyring.get_password("pinboard", "password")
- assert password is not None
-
- now = datetime.date.today().strftime("%Y-%m-%d")
-
- print("*** Getting a JSON copy of my bookmarks data")
- bookmarks = get_bookmarks_data(username, password)
- json_string = json.dumps(bookmarks, indent=2, sort_keys=True)
-
- for name in (f"bookmarks.{now}.json", "bookmarks.json"):
- write_to_file(name, contents=json_string)
-
- print("")
-
- print("*** Getting a list of cache IDs")
- all_cache_ids = get_cache_ids(username, password)
-
- for name in (f"cache_ids.{now}.json", "cache_ids.json"):
- write_to_file(name, contents=json.dumps(all_cache_ids))
-
- all_cache_ids = json.load(open(BACKUP_ROOT / "cache_ids.json"))
-
- print("")
-
- print("*** Saving archive files using wget")
-
- with wget_context(username, password):
- for url, cache_id in all_cache_ids.items():
- download_single_archive(url, cache_id)
-
- print("")
-
- print("*** Saving stories from AO3")
-
- ao3_urls = [b["href"] for b in bookmarks if "archiveofourown.org" in b["href"]]
-
- subprocess.check_call(
- [
- "python3",
- "/Users/alexwlchan/repos/scripts/web/save_ao3_links.py",
- ]
- + ao3_urls
- )
web/save_safari_webarchive (3217) → web/save_safari_webarchive (0)
diff --git a/web/save_safari_webarchive b/web/save_safari_webarchive
deleted file mode 100755
index 5ca3556..0000000
--- a/web/save_safari_webarchive
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/env swift
-/// Save a web page as a Safari webarchive.
-///
-/// Usage: save_safari_webarchive [URL] [OUTPUT_PATH]
-///
-/// This will save the page to the desired file, but may fail for
-/// several reasons:
-///
-/// - the web page can't be loaded
-/// - the web page returns a non-200 status code
-/// - there's already a file at that path (it won't overwrite an existing
-/// webarchive)
-///
-/// For a detailed explanation of the code in this script, see
-/// https://alexwlchan.net/2024/creating-a-safari-webarchive/
-
-import WebKit
-
-/// Print an error message and terminate the process if there are
-/// any errors while loading a page.
-class ExitOnFailureDelegate: NSObject, WKNavigationDelegate {
- func webView(_: WKWebView, didFail: WKNavigation!, withError error: Error) {
- fputs("Failed to load web page: \(error.localizedDescription)\n", stderr)
- exit(1)
- }
-
- func webView(
- _: WKWebView,
- didFailProvisionalNavigation: WKNavigation!,
- withError error: Error
- ) {
- fputs("Failed to load web page: \(error.localizedDescription)\n", stderr)
- exit(1)
- }
-
- func webView(
- _: WKWebView,
- decidePolicyFor navigationResponse: WKNavigationResponse,
- decisionHandler: (WKNavigationResponsePolicy) -> Void
- ) {
- if let httpUrlResponse = (navigationResponse.response as? HTTPURLResponse) {
- if httpUrlResponse.statusCode != 200 {
- fputs("Loading web page failed with status code \(httpUrlResponse.statusCode)\n", stderr)
- exit(1)
- }
- }
-
- decisionHandler(.allow)
- }
-}
-
-let webView = WKWebView()
-
-let delegate = ExitOnFailureDelegate()
-webView.navigationDelegate = delegate
-
-extension WKWebView {
-
- /// Load the given URL in the web view.
- ///
- /// This method will block until the URL has finished loading.
- func load(_ urlString: String) {
- if let url = URL(string: urlString) {
- let request = URLRequest(url: url)
- self.load(request)
-
- while (self.isLoading) {
- RunLoop.main.run(until: Date(timeIntervalSinceNow: 0.1))
- }
- } else {
- fputs("Unable to use \(urlString) as a URL\n", stderr)
- exit(1)
- }
- }
-
- /// Save a copy of the web view's contents as a webarchive file.
- ///
- /// This method will block until the webarchive has been saved,
- /// or the save has failed for some reason.
- func saveAsWebArchive(savePath: URL) {
- var isSaving = true
-
- self.createWebArchiveData(completionHandler: { result in
- do {
- let data = try result.get()
- try data.write(
- to: savePath,
- options: [Data.WritingOptions.withoutOverwriting]
- )
- isSaving = false
- } catch {
- fputs("Unable to save webarchive file: \(error.localizedDescription)\n", stderr)
- exit(1)
- }
- })
-
- while (isSaving) {
- RunLoop.main.run(until: Date(timeIntervalSinceNow: 0.1))
- }
- }
-}
-
-guard CommandLine.arguments.count == 3 else {
- print("Usage: \(CommandLine.arguments[0]) <URL> <OUTPUT_PATH>")
- exit(1)
-}
-
-let urlString = CommandLine.arguments[1]
-let savePath = URL(fileURLWithPath: CommandLine.arguments[2])
-
-webView.load(urlString)
-webView.saveAsWebArchive(savePath: savePath)
-
-print("Saved webarchive to \(savePath)")
web/test_save_ao3_links.py (536) → web/test_save_ao3_links.py (0)
diff --git a/web/test_save_ao3_links.py b/web/test_save_ao3_links.py
deleted file mode 100644
index 72916a7..0000000
--- a/web/test_save_ao3_links.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import pytest
-
-from save_ao3_links import get_ao3_id
-
-
-@pytest.mark.parametrize(
- ["url", "ao3_id"],
- [
- ("https://archiveofourown.org/works/1234", "1234"),
- ("https://archiveofourown.org/works/1234?view_adult=true", "1234"),
- (
- "https://archiveofourown.org/works/1234/chapters/5678?view_adult=true",
- "1234",
- ),
- ("https://archiveofourown.org/collections/yuletide2022/works/1234", "1234"),
- ],
-)
-def test_get_ao3_id(url, ao3_id):
- assert get_ao3_id(url) == ao3_id