Merge pull request #13 from alexwlchan/save-pinboard-bookmarks
- ID: 56686e1
- date: 2024-02-14 23:28:21+00:00
- author: Alex Chan <alex@alexwlchan.net>
- parents: e95f0ad, 0deb9c3
- message: Merge pull request #13 from alexwlchan/save-pinboard-bookmarks [WIP] Add my script for backing up Pinboard bookmarks
- changed files: 5 files, 363 additions, 1 deletion
Changed files
.github/workflows/test.yml (1190) → .github/workflows/test.yml (1233)
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 3171640..6356a05 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -44,3 +44,4 @@ jobs:
py.test aws/test_s3tree.py
py.test text/test_fix_twitter_thread.py
py.test textexpander/test_get_mastodon_text.py
+ py.test web/test_save_ao3_links.py
web/README.md (3467) → web/README.md (4353)
diff --git a/web/README.md b/web/README.md
index 5ada617..491c4a6 100644
--- a/web/README.md
+++ b/web/README.md
@@ -36,6 +36,18 @@ scripts = [
"""
},
{
+ "usage": "save_ao3_links.py [URL...]",
+ "description": """
+ save a copy of a story on AO3, including exports in every available format.
+ """
+ },
+ {
+ "name": "save_pinboard_bookmarks.py",
+ "description": """
+ save a complete copy of all my Pinboard bookmarks, including my archive backups.
+ """
+ },
+ {
"name": "save_tumblr_likes.py",
"description": """
save a copy of all the posts I've liked on Tumblr to my backup drive.
@@ -87,6 +99,24 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
</dd>
<dt>
+ <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_ao3_links.py">
+ <code>save_ao3_links.py [URL...]</code>
+ </a>
+ </dt>
+ <dd>
+ save a copy of a story on AO3, including exports in every available format.
+ </dd>
+
+ <dt>
+ <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_pinboard_bookmarks.py">
+ <code>save_pinboard_bookmarks.py</code>
+ </a>
+ </dt>
+ <dd>
+ save a complete copy of all my Pinboard bookmarks, including my archive backups.
+ </dd>
+
+ <dt>
<a href="https://github.com/alexwlchan/scripts/blob/main/web/save_tumblr_likes.py">
<code>save_tumblr_likes.py</code>
</a>
@@ -113,4 +143,4 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
</dd>
</dl>
-<!-- [[[end]]] (checksum: 0d41d07b707085af6913de92b942a152) -->
+<!-- [[[end]]] (checksum: a4f4aaedc92d2ce7e499f50a87c39d22) -->
web/save_ao3_links.py (0) → web/save_ao3_links.py (2160)
diff --git a/web/save_ao3_links.py b/web/save_ao3_links.py
new file mode 100755
index 0000000..b1d9f37
--- /dev/null
+++ b/web/save_ao3_links.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+
+import os
+import pathlib
+import shutil
+import subprocess
+import sys
+import tarfile
+
+import hyperlink
+
+
+BACKUP_ROOT = pathlib.Path("/Volumes/Media (Sapphire)/backups/ao3")
+
+
+ def get_ao3_id(url: str) -> str:
+     """
+     Return the numeric work ID from the URL of a work on AO3.
+
+     e.g. 'https://archiveofourown.org/works/1234' ~> '1234'
+
+     Raises ValueError if the URL path does not start with /works/<number>.
+     """
+     u = hyperlink.DecodedURL.from_text(url)
+
+     # Check the number of path segments before indexing into them, so a
+     # URL that is too short (e.g. '/works' with no ID, or no path at all)
+     # is rejected with ValueError like any other unrecognised URL,
+     # rather than escaping as an IndexError.
+     if len(u.path) >= 2 and u.path[0] == "works" and u.path[1].isnumeric():
+         return u.path[1]
+
+     raise ValueError(url)
+
+
+ def save_ao3_url(url: str):
+     """
+     Download every export format of an AO3 work and bundle the files
+     into BACKUP_ROOT/<ao3_id>-<title>.tar.gz.
+
+     Does nothing if an archive for this work is already in BACKUP_ROOT,
+     or if wget did not manage to download anything.
+
+     Raises ValueError (from get_ao3_id) if the URL isn't a work URL.
+     """
+     ao3_id = get_ao3_id(url)
+
+     # Check if the fic is already downloaded -- if it is, nothing to do.
+     if any(
+         name.startswith(f"{ao3_id}-") and name.endswith(".tar.gz")
+         for name in os.listdir(BACKUP_ROOT)
+     ):
+         return
+
+     print(f"Saving {url}...")
+
+     # Otherwise, create a temporary directory for the download.
+     #
+     # Delete any partial downloads first.
+     tmp_dir = BACKUP_ROOT / f"{ao3_id}.tmp"
+
+     try:
+         shutil.rmtree(tmp_dir)
+     except FileNotFoundError:
+         pass
+
+     for ext in ["azw", "epub", "mobi", "pdf", "html"]:
+         wget(
+             "--no-verbose",
+             "--output-file",
+             "-",
+             # The Content-Disposition header is sent by the server to say
+             # what the file "should" be called.  By telling wget to respect this,
+             # it means we can request "a.html", the header from AO3 will specify
+             # the correct filename (including the fic title), and the file will
+             # be named correctly.
+             "--content-disposition",
+             "--directory-prefix",
+             str(tmp_dir),
+             f"https://archiveofourown.org/downloads/{ao3_id}/a.{ext}",
+         )
+
+     # If wget never created the directory, or left it empty, there is
+     # nothing to archive.
+     try:
+         downloaded = os.listdir(tmp_dir)
+     except FileNotFoundError:
+         return
+
+     if not downloaded:
+         shutil.rmtree(tmp_dir)
+         return
+
+     # Strip only the final extension: titles can themselves contain dots,
+     # and splitting on every dot would truncate them at the first one.
+     title = downloaded[0].rsplit(".", 1)[0]
+
+     out_path = BACKUP_ROOT / f"{ao3_id}-{title}.tar.gz"
+
+     # Write the tarball under a different name and rename it when it's
+     # complete.  The "already downloaded" check above looks for names
+     # ending in ".tar.gz", so an interrupted run must not leave a
+     # half-written file with that suffix.
+     partial_path = out_path.with_name(out_path.name + ".partial")
+
+     with tarfile.open(partial_path, "w:gz") as tf:
+         tf.add(tmp_dir, arcname=ao3_id)
+
+     partial_path.replace(out_path)
+
+     shutil.rmtree(tmp_dir)
+
+     print(f" ~> {out_path}")
+
+
+ def wget(*args):
+     """
+     Run ``wget`` with the given command-line arguments, discarding
+     anything it writes to stdout.  The exit code is not checked.
+     """
+     command = ["wget", *args]
+     subprocess.call(command, stdout=subprocess.DEVNULL)
+
+
+ if __name__ == "__main__":  # script entry point
+ for url in sys.argv[1:]:  # every command-line argument is an AO3 URL
+ save_ao3_url(url)
web/save_pinboard_bookmarks.py (0) → web/save_pinboard_bookmarks.py (6277)
diff --git a/web/save_pinboard_bookmarks.py b/web/save_pinboard_bookmarks.py
new file mode 100755
index 0000000..1d41c80
--- /dev/null
+++ b/web/save_pinboard_bookmarks.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+
+import contextlib
+import datetime
+import json
+import os
+import pathlib
+import shutil
+import subprocess
+import tarfile
+import tempfile
+
+import bs4
+import httpx
+import keyring
+
+
+BACKUP_ROOT = pathlib.Path("/Volumes/Media (Sapphire)/backups/pinboard")
+
+
+ def write_to_file(name: str, contents: str) -> None:
+     """
+     Save ``contents`` as a text file called ``name`` inside BACKUP_ROOT,
+     printing the destination path before writing.
+     """
+     destination = BACKUP_ROOT / name
+     print(f" ~> {destination}")
+     destination.write_text(contents)
+
+
+ def get_bookmarks_data(username: str, password: str) -> list[dict]:
+     """
+     Call the Pinboard API to get a complete list of my bookmarks.
+
+     Return the decoded JSON response: a list with one dict per bookmark
+     (the caller reads the "href" key of each).  This is the parsed data,
+     not a string -- serialising it is left to the caller.
+
+     Raises httpx.HTTPStatusError if the API returns an error status.
+     """
+     resp = httpx.get(
+         "https://api.pinboard.in/v1/posts/all",
+         params={"format": "json"},
+         auth=(username, password),
+     )
+
+     resp.raise_for_status()
+
+     return resp.json()
+
+
+ def get_cache_ids(username: str, password: str) -> dict[str, str]:
+     """
+     Get a list of cache IDs for bookmarks in my account.
+
+     These are the URLs where Pinboard takes archived snapshots of
+     web pages, e.g. https://pinboard.in/cached/1234567890/
+
+     Returns a dict (bookmarked URL) -> (cache ID).  Bookmarks with no
+     cached snapshot are omitted.
+
+     Raises httpx.HTTPStatusError if any request returns an error status.
+     """
+     cache_ids: dict[str, str] = {}
+
+     # Use the client as a context manager so its connections are closed
+     # when we're finished, even if one of the requests fails.
+     with httpx.Client(follow_redirects=True) as client:
+         # Start by logging in to Pinboard, so we have the appropriate
+         # cookies in our session.
+         resp = client.post(
+             "https://pinboard.in/auth/",
+             data={"username": username, "password": password},
+         )
+         resp.raise_for_status()
+
+         # Now start fetching cache IDs from my account.
+         url = f"https://pinboard.in/u:{username}"
+
+         while True:
+             print(f" ... fetching cache IDs from {url}")
+             resp = client.get(url, params={"per_page": "160"})
+             resp.raise_for_status()
+
+             soup = bs4.BeautifulSoup(resp.text, "html.parser")
+
+             # The structure of the page is of the form:
+             #
+             #     <div id="bookmarks">
+             #       <div class="bookmark">
+             #         <a class="bookmark_title" href="http://example.net">Example</a>
+             #         <a class="cached" href="/cached/1234567890/">☑</a>
+             #         …
+             #
+             bookmarks_div = soup.find("div", attrs={"id": "bookmarks"})
+             bookmarks = bookmarks_div.find_all("div", attrs={"class": "bookmark"})
+
+             for b in bookmarks:
+                 href = b.find("a", attrs={"class": "bookmark_title"}).attrs["href"]
+                 cache_link = b.find("a", attrs={"class": "cached"})
+
+                 if cache_link is None:
+                     continue
+
+                 # '/cached/1234567890/' ~> '1234567890'
+                 cache_ids[href] = cache_link.attrs["href"].split("/")[-2]
+
+             # The pagination link, if present, will be something like:
+             #
+             #     <a class="next_prev" href="/u:alexwlchan/before:1234">« earlier</a>
+             #
+             # Look through every next/prev link rather than just the first:
+             # a page may have no such link at all (a single page of
+             # bookmarks), or more than one of them.
+             earlier_link = next(
+                 (
+                     link
+                     for link in soup.find_all("a", attrs={"class": "next_prev"})
+                     if "earlier" in link.text
+                 ),
+                 None,
+             )
+
+             if earlier_link is None:
+                 break
+
+             url = "https://pinboard.in" + earlier_link.attrs["href"]
+
+     return cache_ids
+
+
+ def wget(*args):
+     """
+     Run ``wget`` with the given command-line arguments, discarding
+     anything it writes to stdout.  The exit code is not checked.
+     """
+     command = ["wget", *args]
+     subprocess.call(command, stdout=subprocess.DEVNULL)
+
+
+ @contextlib.contextmanager
+ def wget_context(username: str, password: str):
+     """
+     Log in to Pinboard using wget, and save the session cookies to
+     ``pinboard-cookies.txt`` in a temporary directory.
+
+     While the context is active, the temporary directory is the current
+     working directory, so wget calls inside it can pass
+     ``--load-cookies pinboard-cookies.txt``.  On exit the cookie file is
+     deleted and the previous working directory is restored, even if the
+     body of the ``with`` block raised.
+     """
+     from urllib.parse import urlencode
+
+     original_cwd = os.getcwd()
+
+     with tempfile.TemporaryDirectory() as tmpdir:
+         cookie_path = os.path.join(tmpdir, "pinboard-cookies.txt")
+         os.chdir(tmpdir)
+
+         try:
+             wget(
+                 "--save-cookies",
+                 "pinboard-cookies.txt",
+                 "--keep-session-cookies",
+                 "--delete-after",
+                 "--output-file",
+                 "-",
+                 "--post-data",
+                 # wget sends --post-data verbatim, so the values must be
+                 # URL-encoded here; otherwise a password containing "&",
+                 # "=" or "%" would be submitted incorrectly.
+                 urlencode({"username": username, "password": password}),
+                 "https://pinboard.in/auth/",
+             )
+
+             yield
+         finally:
+             # The cookie file won't exist if the login request failed.
+             with contextlib.suppress(FileNotFoundError):
+                 os.unlink(cookie_path)
+
+             # Leave the temporary directory before it gets deleted, so the
+             # process isn't left with a working directory that's gone.
+             os.chdir(original_cwd)
+
+
+ def download_single_archive(url: str, cache_id: str):
+     """
+     Save Pinboard's cached copy of a bookmark as a gzipped tarball at
+     BACKUP_ROOT/archive/<first char of cache_id>/<cache_id>.tar.gz.
+
+     ``url`` is the bookmarked URL; it is only used for logging.
+
+     Does nothing if the tarball already exists.  wget is told to load
+     cookies from the relative path ``pinboard-cookies.txt``, so this
+     has to be called with that file in the current working directory
+     (i.e. inside ``wget_context``).
+     """
+     cache_dir = BACKUP_ROOT / "archive" / cache_id[0] / cache_id
+     cache_path = cache_dir.with_suffix(".tar.gz")
+
+     # If the archive is already downloaded, there's nothing to do.
+     if cache_path.exists():
+         return
+
+     print(f" ... saving https://pinboard.in/cached/{cache_id}/")
+     print(f" {url}")
+
+     # Otherwise, start downloading the archive into a temporary directory.
+     # Clear any pending downloads first.
+     #
+     # The mkdir-then-rmtree pair makes sure the parent directories exist
+     # and that any leftover download is gone; wget is left to create the
+     # directory again.
+     tmp_dir = cache_dir.with_suffix(".tmp")
+     tmp_dir.mkdir(parents=True, exist_ok=True)
+
+     shutil.rmtree(tmp_dir)
+
+     wget(
+         "--adjust-extension",
+         "--span-hosts",
+         "--no-verbose",
+         "--convert-links",
+         "--page-requisites",
+         "--no-directories",
+         "-e",
+         "robots=off",
+         "--load-cookies",
+         "pinboard-cookies.txt",
+         "--output-file",
+         "-",
+         "--directory-prefix",
+         str(tmp_dir),
+         f"https://pinboard.in/cached/{cache_id}/",
+     )
+
+     # If wget saved nothing, don't create an archive at all: an empty
+     # tarball at cache_path would make every later run think this
+     # snapshot had already been backed up.
+     if not tmp_dir.exists():
+         print(f" !!! nothing was downloaded for {cache_id}")
+         return
+
+     # Build the tarball under a different name and rename it once it's
+     # complete, so an interrupted run can't leave a half-written file
+     # at cache_path.
+     partial_path = cache_path.with_name(cache_path.name + ".partial")
+
+     with tarfile.open(partial_path, "w:gz") as tf:
+         tf.add(tmp_dir, arcname=cache_id)
+
+     partial_path.replace(cache_path)
+
+     print(f" {cache_path}")
+
+     shutil.rmtree(tmp_dir)
+
+
+ if __name__ == "__main__":  # script entry point
+ username = "alexwlchan"
+
+ password = keyring.get_password("pinboard", "password")  # may be None; checked on the next line
+ assert password is not None
+
+ now = datetime.date.today().strftime("%Y-%m-%d")  # date stamp for the dated backup filenames
+
+ print("*** Getting a JSON copy of my bookmarks data")
+ bookmarks = get_bookmarks_data(username, password)
+ json_string = json.dumps(bookmarks, indent=2, sort_keys=True)
+
+ for name in (f"bookmarks.{now}.json", "bookmarks.json"):  # a dated snapshot, plus an undated copy
+ write_to_file(name, contents=json_string)
+
+ print("")
+
+ # print("*** Getting a list of cache IDs")
+ # all_cache_ids = get_cache_ids(username, password)
+ #
+ # for name in (f"cache_ids.{now}.json", "cache_ids.json"):
+ # write_to_file(name, contents=json.dumps(all_cache_ids))
+ #
+ # all_cache_ids = json.load(open(BACKUP_ROOT / "cache_ids.json"))
+ #
+ # print("")
+ #
+ # print("*** Saving archive files using wget")
+ #
+ # with wget_context(username, password):
+ # for url, cache_id in all_cache_ids.items():
+ # download_single_archive(url, cache_id)
+ #
+ # print("")
+
+ print("*** Saving stories from AO3")
+
+ ao3_urls = [b["href"] for b in bookmarks if "archiveofourown.org" in b["href"]]  # every bookmark whose URL mentions AO3
+
+ subprocess.check_call(  # raises CalledProcessError if the AO3 script exits non-zero
+ [
+ "python3",
+ "/Users/alexwlchan/repos/scripts/web/save_ao3_links.py",  # NOTE(review): hard-coded absolute path to one checkout
+ ]
+ + ao3_urls
+ )
web/test_save_ao3_links.py (0) → web/test_save_ao3_links.py (451)
diff --git a/web/test_save_ao3_links.py b/web/test_save_ao3_links.py
new file mode 100644
index 0000000..b3b5b92
--- /dev/null
+++ b/web/test_save_ao3_links.py
@@ -0,0 +1,18 @@
+import pytest
+
+from save_ao3_links import get_ao3_id
+
+
+ @pytest.mark.parametrize(
+ ["url", "ao3_id"],
+ [
+ ("https://archiveofourown.org/works/1234", "1234"),  # plain work URL
+ ("https://archiveofourown.org/works/1234?view_adult=true", "1234"),  # query string is ignored
+ (
+ "https://archiveofourown.org/works/1234/chapters/5678?view_adult=true",  # chapter URL ~> work ID
+ "1234",
+ ),
+ ],
+ )
+ def test_get_ao3_id(url, ao3_id):  # each URL form should yield the work ID
+ assert get_ao3_id(url) == ao3_id