Skip to main content

Merge pull request #49 from alexwlchan/remove-web-archiving-scripts

ID
457d975
date
2024-06-05 19:01:26+00:00
author
Alex Chan <alex@alexwlchan.net>
parents
786e819, 43b13ed
message
Merge pull request #49 from alexwlchan/remove-web-archiving-scripts

Remove a couple of web archiving scripts that have "graduated"
changed files
6 files, 1 addition, 509 deletions

Changed files

web/.gitattributes (47) → web/.gitattributes (0)

diff --git a/web/.gitattributes b/web/.gitattributes
deleted file mode 100644
index c226feb..0000000
--- a/web/.gitattributes
+++ /dev/null
@@ -1 +0,0 @@
-save_safari_webarchive linguist-language=Swift

web/README.md (5642) → web/README.md (4339)

diff --git a/web/README.md b/web/README.md
index a33c788..bdf266a 100644
--- a/web/README.md
+++ b/web/README.md
@@ -42,18 +42,6 @@ scripts = [
         """
     },
     {
-        "usage": "save_ao3_links.py [URL...]",
-        "description": """
-        save a copy of a story on AO3, including exports in every available format.
-        """
-    },
-    {
-        "name": "save_pinboard_bookmarks.py",
-        "description": """
-        save a complete copy of all my Pinboard bookmarks, including my archive backups.
-        """
-    },
-    {
         "name": "save_tumblr_likes.py",
         "description": """
         save a copy of all the posts I've liked on Tumblr to my backup drive.
@@ -66,12 +54,6 @@ scripts = [
         """
     },
     {
-        "usage": "save_safari_webarchive [URL] [OUTPUT_PATH]",
-        "description": """
-        save a copy of a web page as a Safari webarchive
-        """
-    },
-    {
         "name": "scrape_really_useful_boxes.py",
         "description": """
         scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.<br/><br/><img src="really_useful_boxes.png">
@@ -126,24 +108,6 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
   </dd>
 
   <dt>
-    <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_ao3_links.py">
-      <code>save_ao3_links.py [URL...]</code>
-    </a>
-  </dt>
-  <dd>
-    save a copy of a story on AO3, including exports in every available format.
-  </dd>
-
-  <dt>
-    <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_pinboard_bookmarks.py">
-      <code>save_pinboard_bookmarks.py</code>
-    </a>
-  </dt>
-  <dd>
-    save a complete copy of all my Pinboard bookmarks, including my archive backups.
-  </dd>
-
-  <dt>
     <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_tumblr_likes.py">
       <code>save_tumblr_likes.py</code>
     </a>
@@ -162,15 +126,6 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
   </dd>
 
   <dt>
-    <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_safari_webarchive">
-      <code>save_safari_webarchive [URL] [OUTPUT_PATH]</code>
-    </a>
-  </dt>
-  <dd>
-    save a copy of a web page as a Safari webarchive
-  </dd>
-
-  <dt>
     <a href="https://github.com/alexwlchan/scripts/blob/main/web/scrape_really_useful_boxes.py">
       <code>scrape_really_useful_boxes.py</code>
     </a>
@@ -188,4 +143,4 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
     this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
   </dd>
 </dl>
-<!-- [[[end]]] (checksum: a31c60eca24c6488caaa93e8bb5f6b44) -->
+<!-- [[[end]]] (checksum: 93b152a3a4162f174022195ee107ad46) -->

web/save_ao3_links.py (2891) → web/save_ao3_links.py (0)

diff --git a/web/save_ao3_links.py b/web/save_ao3_links.py
deleted file mode 100755
index dd210ed..0000000
--- a/web/save_ao3_links.py
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/usr/bin/env python3
-
-import os
-import pathlib
-import shutil
-import subprocess
-import sys
-
-import hyperlink
-
-
-BACKUP_ROOT = pathlib.Path("/Volumes/Media (Sapphire)/backups/ao3")
-
-
-def get_ao3_id(url: str) -> str:
-    # e.g. 'https://archiveofourown.org/works/1234' ~> '1234'
-    u = hyperlink.DecodedURL.from_text(url)
-
-    if u.path[0] == "works" and u.path[1].isnumeric():
-        return u.path[1]
-    elif (
-        len(u.path) >= 4
-        and u.path[0] == "collections"
-        and u.path[2] == "works"
-        and u.path[3].isnumeric()
-    ):
-        return u.path[3]
-    else:
-        raise ValueError(url)
-
-
-def save_ao3_url(url: str):
-    ao3_id = get_ao3_id(url)
-
-    # Check if the fic is already downloaded -- if it is, nothing to do.
-    if any(
-        name.startswith(f"{ao3_id}-") and os.path.isdir(BACKUP_ROOT / name)
-        for name in os.listdir(BACKUP_ROOT)
-    ):
-        return
-
-    print(f"Saving {url}...")
-
-    # Otherwise, create a temporary directory for the download.
-    #
-    # Delete any partial downloads first.
-    tmp_dir = BACKUP_ROOT / f"{ao3_id}.tmp"
-
-    try:
-        shutil.rmtree(tmp_dir)
-    except FileNotFoundError:
-        pass
-
-    for ext in ["azw", "epub", "mobi", "pdf", "html"]:
-        wget(
-            "--no-verbose",
-            "--output-file",
-            "-",
-            # The Content-Disposition header is sent by the server to say
-            # what the file "should" be called.  By telling wget to respect this,
-            # it means we can request "a.html", the header from AO3 will specify
-            # the correct filename (including the fic title), and the file will
-            # be named correctly.
-            "--content-disposition",
-            "--directory-prefix",
-            tmp_dir,
-            f"https://archiveofourown.org/downloads/{ao3_id}/a.{ext}",
-        )
-
-    try:
-        title = os.listdir(tmp_dir)[0].rsplit(".")[0]
-    except FileNotFoundError:
-        return
-
-    out_dir = BACKUP_ROOT / f"{ao3_id}-{title}"
-
-    os.rename(tmp_dir, out_dir)
-
-    print(f" ~> {out_dir}")
-
-
-def wget(*args):
-    subprocess.call(["wget"] + list(args), stdout=subprocess.DEVNULL)
-
-
-if __name__ == "__main__":
-    for url in sys.argv[1:]:
-        if url == "https://archiveofourown.org/series/136245":
-            for story_url in [
-                "https://archiveofourown.org/works/1854957",
-                "https://archiveofourown.org/works/2089398",
-                "https://archiveofourown.org/works/2218554",
-                "https://archiveofourown.org/works/2249544",
-                "https://archiveofourown.org/works/2330390",
-                "https://archiveofourown.org/works/2399867",
-                "https://archiveofourown.org/works/2467277",
-                "https://archiveofourown.org/works/2802287",
-            ]:
-                save_ao3_url(story_url)
-        else:
-            save_ao3_url(url)

web/save_pinboard_bookmarks.py (6225) → web/save_pinboard_bookmarks.py (0)

diff --git a/web/save_pinboard_bookmarks.py b/web/save_pinboard_bookmarks.py
deleted file mode 100755
index ed0c061..0000000
--- a/web/save_pinboard_bookmarks.py
+++ /dev/null
@@ -1,228 +0,0 @@
-#!/usr/bin/env python3
-
-import contextlib
-import datetime
-import json
-import os
-import pathlib
-import shutil
-import subprocess
-import tarfile
-import tempfile
-
-import bs4
-import httpx
-import keyring
-
-
-BACKUP_ROOT = pathlib.Path("/Volumes/Media (Sapphire)/backups/pinboard")
-
-
-def write_to_file(name: str, contents: str) -> None:
-    """
-    Write a string to a text file, and log that you're doing it.
-    """
-    path = BACKUP_ROOT / name
-    print(f"    ~> {path}")
-    path.write_text(contents)
-
-
-def get_bookmarks_data(username: str, password: str) -> str:
-    """
-    Call the Pinboard API to get a complete list of my bookmarks.
-
-    Return the result as a pretty-printed JSON string.
-    """
-    resp = httpx.get(
-        "https://api.pinboard.in/v1/posts/all",
-        params={"format": "json"},
-        auth=(username, password),
-    )
-
-    resp.raise_for_status()
-
-    return resp.json()
-
-
-def get_cache_ids(username: str, password: str) -> dict[str, str]:
-    """
-    Get a list of cache IDs for bookmarks in my account.
-
-    These are the URLs where Pinboard takes archived snapshots of
-    web pages, e.g. https://pinboard.in/cached/1234567890/
-
-    Returns a dict (bookmarked URL) -> (cache ID).
-    """
-    # Start by logging in to Pinbaord, so we have the appropriate
-    # cookies in our session.
-    client = httpx.Client(follow_redirects=True)
-
-    resp = client.post(
-        "https://pinboard.in/auth/", data={"username": username, "password": password}
-    )
-    resp.raise_for_status()
-
-    # Now start fetching cache IDs from my account.
-    cache_ids: dict[str, str] = {}
-
-    url = f"https://pinboard.in/u:{username}"
-
-    while True:
-        print(f"    ... fetching cache IDs from {url}")
-        resp = client.get(url, params={"per_page": "160"})
-        resp.raise_for_status()
-
-        soup = bs4.BeautifulSoup(resp.text, "html.parser")
-
-        # The structure of the page is of the form:
-        #
-        #     <div id="bookmarks">
-        #       <div class="bookmark">
-        #         <a class="bookmark_title" href="http://example.net">Example</a>
-        #         <a class="cached" href="/cached/1234567890/">☑</a>
-        #         …
-        #
-        bookmarks_div = soup.find("div", attrs={"id": "bookmarks"})
-        bookmarks = bookmarks_div.find_all("div", attrs={"class": "bookmark"})
-
-        for b in bookmarks:
-            href = b.find("a", attrs={"class": "bookmark_title"}).attrs["href"]
-            cache_link = b.find("a", attrs={"class": "cached"})
-
-            if cache_link is None:
-                continue
-
-            this_cache_id = cache_link.attrs["href"].split("/")[-2]
-
-            cache_ids[href] = this_cache_id
-
-        # The pagination link, if present, will be something like:
-        #
-        #      <a class="next_prev" href="/u:alexwlchan/before:1234">« earlier</a>
-        #
-        pagination_link = soup.find("a", attrs={"class": "next_prev"})
-
-        if "earlier" not in pagination_link.text:
-            break
-
-        url = "https://pinboard.in" + pagination_link.attrs["href"]
-
-    return cache_ids
-
-
-def wget(*args):
-    subprocess.call(["wget"] + list(args), stdout=subprocess.DEVNULL)
-
-
-@contextlib.contextmanager
-def wget_context(username: str, password: str):
-    with tempfile.TemporaryDirectory() as tmpdir:
-        os.chdir(tmpdir)
-
-        wget(
-            "--save-cookies",
-            "pinboard-cookies.txt",
-            "--keep-session-cookies",
-            "--delete-after",
-            "--output-file",
-            "-",
-            "--post-data",
-            f"username={username}&password={password}",
-            "https://pinboard.in/auth/",
-        )
-
-        yield
-
-        os.unlink("pinboard-cookies.txt")
-
-
-def download_single_archive(url: str, cache_id: str):
-    cache_dir = BACKUP_ROOT / "archive" / cache_id[0] / cache_id
-    cache_path = cache_dir.with_suffix(".tar.gz")
-
-    # If the archive is already downloaded, there's nothing to do.
-    if cache_path.exists():
-        return
-
-    print(f"    ... saving https://pinboard.in/cached/{cache_id}/")
-    print(f"        {url}")
-
-    # Otherwise, start downloading the archive into a temporary directory.
-    # Clear any pending downloads first.
-    tmp_dir = cache_dir.with_suffix(".tmp")
-    tmp_dir.mkdir(parents=True, exist_ok=True)
-
-    shutil.rmtree(tmp_dir)
-
-    wget(
-        "--adjust-extension",
-        "--span-hosts",
-        "--no-verbose",
-        "--convert-links",
-        "--page-requisites",
-        "--no-directories",
-        "-e",
-        "robots=off",
-        "--load-cookies",
-        "pinboard-cookies.txt",
-        "--output-file",
-        "-",
-        "--directory-prefix",
-        str(tmp_dir),
-        f"https://pinboard.in/cached/{cache_id}/",
-    )
-
-    with tarfile.open(cache_path, "w:gz") as tf:
-        tf.add(tmp_dir, arcname=cache_id)
-
-    print(f"        {cache_path}")
-
-    shutil.rmtree(tmp_dir)
-
-
-if __name__ == "__main__":
-    username = "alexwlchan"
-
-    password = keyring.get_password("pinboard", "password")
-    assert password is not None
-
-    now = datetime.date.today().strftime("%Y-%m-%d")
-
-    print("*** Getting a JSON copy of my bookmarks data")
-    bookmarks = get_bookmarks_data(username, password)
-    json_string = json.dumps(bookmarks, indent=2, sort_keys=True)
-
-    for name in (f"bookmarks.{now}.json", "bookmarks.json"):
-        write_to_file(name, contents=json_string)
-
-    print("")
-
-    print("*** Getting a list of cache IDs")
-    all_cache_ids = get_cache_ids(username, password)
-
-    for name in (f"cache_ids.{now}.json", "cache_ids.json"):
-        write_to_file(name, contents=json.dumps(all_cache_ids))
-
-    all_cache_ids = json.load(open(BACKUP_ROOT / "cache_ids.json"))
-
-    print("")
-
-    print("*** Saving archive files using wget")
-
-    with wget_context(username, password):
-        for url, cache_id in all_cache_ids.items():
-            download_single_archive(url, cache_id)
-
-    print("")
-
-    print("*** Saving stories from AO3")
-
-    ao3_urls = [b["href"] for b in bookmarks if "archiveofourown.org" in b["href"]]
-
-    subprocess.check_call(
-        [
-            "python3",
-            "/Users/alexwlchan/repos/scripts/web/save_ao3_links.py",
-        ]
-        + ao3_urls
-    )

web/save_safari_webarchive (3217) → web/save_safari_webarchive (0)

diff --git a/web/save_safari_webarchive b/web/save_safari_webarchive
deleted file mode 100755
index 5ca3556..0000000
--- a/web/save_safari_webarchive
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/env swift
-/// Save a web page as a Safari webarchive.
-///
-/// Usage: save_safari_webarchive [URL] [OUTPUT_PATH]
-///
-/// This will save the page to the desired file, but may fail for
-/// several reasons:
-///
-///   - the web page can't be loaded
-///   - the web page returns a non-200 status code
-///   - there's already a file at that path (it won't overwrite an existing
-///     webarchive)
-///
-/// For a detailed explanation of the code in this script, see
-/// https://alexwlchan.net/2024/creating-a-safari-webarchive/
-
-import WebKit
-
-/// Print an error message and terminate the process if there are
-/// any errors while loading a page.
-class ExitOnFailureDelegate: NSObject, WKNavigationDelegate {
-  func webView(_: WKWebView, didFail: WKNavigation!, withError error: Error) {
-    fputs("Failed to load web page: \(error.localizedDescription)\n", stderr)
-    exit(1)
-  }
-
-  func webView(
-    _: WKWebView,
-    didFailProvisionalNavigation: WKNavigation!,
-    withError error: Error
-  ) {
-    fputs("Failed to load web page: \(error.localizedDescription)\n", stderr)
-    exit(1)
-  }
-
-  func webView(
-    _: WKWebView,
-    decidePolicyFor navigationResponse: WKNavigationResponse,
-    decisionHandler: (WKNavigationResponsePolicy) -> Void
-  ) {
-    if let httpUrlResponse = (navigationResponse.response as? HTTPURLResponse) {
-      if httpUrlResponse.statusCode != 200 {
-        fputs("Loading web page failed with status code \(httpUrlResponse.statusCode)\n", stderr)
-        exit(1)
-      }
-    }
-
-    decisionHandler(.allow)
-  }
-}
-
-let webView = WKWebView()
-
-let delegate = ExitOnFailureDelegate()
-webView.navigationDelegate = delegate
-
-extension WKWebView {
-
-  /// Load the given URL in the web view.
-  ///
-  /// This method will block until the URL has finished loading.
-  func load(_ urlString: String) {
-    if let url = URL(string: urlString) {
-      let request = URLRequest(url: url)
-      self.load(request)
-
-      while (self.isLoading) {
-        RunLoop.main.run(until: Date(timeIntervalSinceNow: 0.1))
-      }
-    } else {
-      fputs("Unable to use \(urlString) as a URL\n", stderr)
-      exit(1)
-    }
-  }
-
-  /// Save a copy of the web view's contents as a webarchive file.
-  ///
-  /// This method will block until the webarchive has been saved,
-  /// or the save has failed for some reason.
-  func saveAsWebArchive(savePath: URL) {
-    var isSaving = true
-
-    self.createWebArchiveData(completionHandler: { result in
-      do {
-        let data = try result.get()
-        try data.write(
-          to: savePath,
-          options: [Data.WritingOptions.withoutOverwriting]
-        )
-        isSaving = false
-      } catch {
-        fputs("Unable to save webarchive file: \(error.localizedDescription)\n", stderr)
-        exit(1)
-      }
-    })
-
-    while (isSaving) {
-      RunLoop.main.run(until: Date(timeIntervalSinceNow: 0.1))
-    }
-  }
-}
-
-guard CommandLine.arguments.count == 3 else {
-    print("Usage: \(CommandLine.arguments[0]) <URL> <OUTPUT_PATH>")
-    exit(1)
-}
-
-let urlString = CommandLine.arguments[1]
-let savePath = URL(fileURLWithPath: CommandLine.arguments[2])
-
-webView.load(urlString)
-webView.saveAsWebArchive(savePath: savePath)
-
-print("Saved webarchive to \(savePath)")

web/test_save_ao3_links.py (536) → web/test_save_ao3_links.py (0)

diff --git a/web/test_save_ao3_links.py b/web/test_save_ao3_links.py
deleted file mode 100644
index 72916a7..0000000
--- a/web/test_save_ao3_links.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import pytest
-
-from save_ao3_links import get_ao3_id
-
-
-@pytest.mark.parametrize(
-    ["url", "ao3_id"],
-    [
-        ("https://archiveofourown.org/works/1234", "1234"),
-        ("https://archiveofourown.org/works/1234?view_adult=true", "1234"),
-        (
-            "https://archiveofourown.org/works/1234/chapters/5678?view_adult=true",
-            "1234",
-        ),
-        ("https://archiveofourown.org/collections/yuletide2022/works/1234", "1234"),
-    ],
-)
-def test_get_ao3_id(url, ao3_id):
-    assert get_ao3_id(url) == ao3_id