Remove my “save Pinboard bookmarks” script
- ID
d18d973- date
2024-06-05 18:57:04+00:00- author
Alex Chan <alex@alexwlchan.net>- parent
84732da- message
Remove my "save Pinboard bookmarks" script- changed files
2 files, 1 addition, 244 deletions
Changed files
web/README.md (5208) → web/README.md (4756)
diff --git a/web/README.md b/web/README.md
index 6859670..0eb9a1c 100644
--- a/web/README.md
+++ b/web/README.md
@@ -42,12 +42,6 @@ scripts = [
"""
},
{
- "name": "save_pinboard_bookmarks.py",
- "description": """
- save a complete copy of all my Pinboard bookmarks, including my archive backups.
- """
- },
- {
"name": "save_tumblr_likes.py",
"description": """
save a copy of all the posts I've liked on Tumblr to my backup drive.
@@ -120,15 +114,6 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
</dd>
<dt>
- <a href="https://github.com/alexwlchan/scripts/blob/main/web/save_pinboard_bookmarks.py">
- <code>save_pinboard_bookmarks.py</code>
- </a>
- </dt>
- <dd>
- save a complete copy of all my Pinboard bookmarks, including my archive backups.
- </dd>
-
- <dt>
<a href="https://github.com/alexwlchan/scripts/blob/main/web/save_tumblr_likes.py">
<code>save_tumblr_likes.py</code>
</a>
@@ -173,4 +158,4 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
</dd>
</dl>
-<!-- [[[end]]] (checksum: d5cc17500a34414cf4107db0dcc7b0e1) -->
+<!-- [[[end]]] (checksum: 7df3c46289e99cda17cd36adb479b0f0) -->
web/save_pinboard_bookmarks.py (6225) → web/save_pinboard_bookmarks.py (0)
diff --git a/web/save_pinboard_bookmarks.py b/web/save_pinboard_bookmarks.py
deleted file mode 100755
index ed0c061..0000000
--- a/web/save_pinboard_bookmarks.py
+++ /dev/null
@@ -1,228 +0,0 @@
-#!/usr/bin/env python3
-
-import contextlib
-import datetime
-import json
-import os
-import pathlib
-import shutil
-import subprocess
-import tarfile
-import tempfile
-
-import bs4
-import httpx
-import keyring
-
-
-BACKUP_ROOT = pathlib.Path("/Volumes/Media (Sapphire)/backups/pinboard")
-
-
-def write_to_file(name: str, contents: str) -> None:
- """
- Write a string to a text file, and log that you're doing it.
- """
- path = BACKUP_ROOT / name
- print(f" ~> {path}")
- path.write_text(contents)
-
-
-def get_bookmarks_data(username: str, password: str) -> str:
- """
- Call the Pinboard API to get a complete list of my bookmarks.
-
- Return the result as a pretty-printed JSON string.
- """
- resp = httpx.get(
- "https://api.pinboard.in/v1/posts/all",
- params={"format": "json"},
- auth=(username, password),
- )
-
- resp.raise_for_status()
-
- return resp.json()
-
-
-def get_cache_ids(username: str, password: str) -> dict[str, str]:
- """
- Get a list of cache IDs for bookmarks in my account.
-
- These are the URLs where Pinboard takes archived snapshots of
- web pages, e.g. https://pinboard.in/cached/1234567890/
-
- Returns a dict (bookmarked URL) -> (cache ID).
- """
- # Start by logging in to Pinbaord, so we have the appropriate
- # cookies in our session.
- client = httpx.Client(follow_redirects=True)
-
- resp = client.post(
- "https://pinboard.in/auth/", data={"username": username, "password": password}
- )
- resp.raise_for_status()
-
- # Now start fetching cache IDs from my account.
- cache_ids: dict[str, str] = {}
-
- url = f"https://pinboard.in/u:{username}"
-
- while True:
- print(f" ... fetching cache IDs from {url}")
- resp = client.get(url, params={"per_page": "160"})
- resp.raise_for_status()
-
- soup = bs4.BeautifulSoup(resp.text, "html.parser")
-
- # The structure of the page is of the form:
- #
- # <div id="bookmarks">
- # <div class="bookmark">
- # <a class="bookmark_title" href="http://example.net">Example</a>
- # <a class="cached" href="/cached/1234567890/">☑</a>
- # …
- #
- bookmarks_div = soup.find("div", attrs={"id": "bookmarks"})
- bookmarks = bookmarks_div.find_all("div", attrs={"class": "bookmark"})
-
- for b in bookmarks:
- href = b.find("a", attrs={"class": "bookmark_title"}).attrs["href"]
- cache_link = b.find("a", attrs={"class": "cached"})
-
- if cache_link is None:
- continue
-
- this_cache_id = cache_link.attrs["href"].split("/")[-2]
-
- cache_ids[href] = this_cache_id
-
- # The pagination link, if present, will be something like:
- #
- # <a class="next_prev" href="/u:alexwlchan/before:1234">« earlier</a>
- #
- pagination_link = soup.find("a", attrs={"class": "next_prev"})
-
- if "earlier" not in pagination_link.text:
- break
-
- url = "https://pinboard.in" + pagination_link.attrs["href"]
-
- return cache_ids
-
-
-def wget(*args):
- subprocess.call(["wget"] + list(args), stdout=subprocess.DEVNULL)
-
-
-@contextlib.contextmanager
-def wget_context(username: str, password: str):
- with tempfile.TemporaryDirectory() as tmpdir:
- os.chdir(tmpdir)
-
- wget(
- "--save-cookies",
- "pinboard-cookies.txt",
- "--keep-session-cookies",
- "--delete-after",
- "--output-file",
- "-",
- "--post-data",
- f"username={username}&password={password}",
- "https://pinboard.in/auth/",
- )
-
- yield
-
- os.unlink("pinboard-cookies.txt")
-
-
-def download_single_archive(url: str, cache_id: str):
- cache_dir = BACKUP_ROOT / "archive" / cache_id[0] / cache_id
- cache_path = cache_dir.with_suffix(".tar.gz")
-
- # If the archive is already downloaded, there's nothing to do.
- if cache_path.exists():
- return
-
- print(f" ... saving https://pinboard.in/cached/{cache_id}/")
- print(f" {url}")
-
- # Otherwise, start downloading the archive into a temporary directory.
- # Clear any pending downloads first.
- tmp_dir = cache_dir.with_suffix(".tmp")
- tmp_dir.mkdir(parents=True, exist_ok=True)
-
- shutil.rmtree(tmp_dir)
-
- wget(
- "--adjust-extension",
- "--span-hosts",
- "--no-verbose",
- "--convert-links",
- "--page-requisites",
- "--no-directories",
- "-e",
- "robots=off",
- "--load-cookies",
- "pinboard-cookies.txt",
- "--output-file",
- "-",
- "--directory-prefix",
- str(tmp_dir),
- f"https://pinboard.in/cached/{cache_id}/",
- )
-
- with tarfile.open(cache_path, "w:gz") as tf:
- tf.add(tmp_dir, arcname=cache_id)
-
- print(f" {cache_path}")
-
- shutil.rmtree(tmp_dir)
-
-
-if __name__ == "__main__":
- username = "alexwlchan"
-
- password = keyring.get_password("pinboard", "password")
- assert password is not None
-
- now = datetime.date.today().strftime("%Y-%m-%d")
-
- print("*** Getting a JSON copy of my bookmarks data")
- bookmarks = get_bookmarks_data(username, password)
- json_string = json.dumps(bookmarks, indent=2, sort_keys=True)
-
- for name in (f"bookmarks.{now}.json", "bookmarks.json"):
- write_to_file(name, contents=json_string)
-
- print("")
-
- print("*** Getting a list of cache IDs")
- all_cache_ids = get_cache_ids(username, password)
-
- for name in (f"cache_ids.{now}.json", "cache_ids.json"):
- write_to_file(name, contents=json.dumps(all_cache_ids))
-
- all_cache_ids = json.load(open(BACKUP_ROOT / "cache_ids.json"))
-
- print("")
-
- print("*** Saving archive files using wget")
-
- with wget_context(username, password):
- for url, cache_id in all_cache_ids.items():
- download_single_archive(url, cache_id)
-
- print("")
-
- print("*** Saving stories from AO3")
-
- ao3_urls = [b["href"] for b in bookmarks if "archiveofourown.org" in b["href"]]
-
- subprocess.check_call(
- [
- "python3",
- "/Users/alexwlchan/repos/scripts/web/save_ao3_links.py",
- ]
- + ao3_urls
- )