Skip to main content

web: scrape Really Useful Boxes to a CSV, not SQLite

ID
4fb0bd2
date
2026-04-20 04:31:21+00:00
author
Alex Chan <alex@alexwlchan.net>
parent
4983db7
message
web: scrape Really Useful Boxes to a CSV, not SQLite

This lets me drop my sqlite-utils dependency.
changed files
6 files, 28 additions, 45 deletions

Changed files

.gitignore (6) → .gitignore (31)

diff --git a/.gitignore b/.gitignore
index 0d20b64..dd93418 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
 *.pyc
+
+really_useful_boxes.csv

requirements.in (176) → requirements.in (180)

diff --git a/requirements.in b/requirements.in
index 891ae15..5b0dfcd 100644
--- a/requirements.in
+++ b/requirements.in
@@ -1,5 +1,5 @@
+alexwlchan-chives[fetch]
 beautifulsoup4
-certifi
 cogapp
 humanize
 iterm2
@@ -9,6 +9,5 @@ pygments  # Used as 'pygmentize' in 'pp_xml.sh'
 pypdf
 pytest
 ruff
-sqlite-utils
 tqdm
 yt-dlp[default]>=2024.3.10

requirements.txt (1866) → requirements.txt (1510)

diff --git a/requirements.txt b/requirements.txt
index 0b7a915..994a70c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,22 +1,18 @@
 # This file was autogenerated by uv via the following command:
 #    uv pip compile requirements.in --output-file=requirements.txt --exclude-newer=P7D --exclude-newer-package alexwlchan-chives=false
+alexwlchan-chives==34
+    # via -r requirements.in
 beautifulsoup4==4.14.3
     # via -r requirements.in
 brotli==1.2.0
     # via yt-dlp
 certifi==2026.2.25
     # via
-    #   -r requirements.in
+    #   alexwlchan-chives
     #   requests
     #   yt-dlp
 charset-normalizer==3.4.7
     # via requests
-click==8.3.2
-    # via
-    #   click-default-group
-    #   sqlite-utils
-click-default-group==1.2.4
-    # via sqlite-utils
 cogapp==3.6.0
     # via -r requirements.in
 humanize==4.15.0
@@ -37,12 +33,8 @@ pillow==12.2.0
     #   pillow-heif
 pillow-heif==1.3.0
     # via -r requirements.in
-pip==26.0.1
-    # via sqlite-utils
 pluggy==1.6.0
-    # via
-    #   pytest
-    #   sqlite-utils
+    # via pytest
 protobuf==7.34.1
     # via iterm2
 pycryptodomex==3.23.0
@@ -55,22 +47,12 @@ pypdf==6.9.2
     # via -r requirements.in
 pytest==9.0.3
     # via -r requirements.in
-python-dateutil==2.9.0.post0
-    # via sqlite-utils
 requests==2.33.1
     # via yt-dlp
 ruff==0.15.9
     # via -r requirements.in
-six==1.17.0
-    # via python-dateutil
 soupsieve==2.8.3
     # via beautifulsoup4
-sqlite-fts4==1.0.3
-    # via sqlite-utils
-sqlite-utils==3.39
-    # via -r requirements.in
-tabulate==0.10.0
-    # via sqlite-utils
 tqdm==4.67.3
     # via -r requirements.in
 typing-extensions==4.15.0

web/README.md (2544) → web/README.md (2454)

diff --git a/web/README.md b/web/README.md
index 4ff6efa..a7b5244 100644
--- a/web/README.md
+++ b/web/README.md
@@ -30,7 +30,7 @@ scripts = [
     {
         "name": "scrape_really_useful_boxes.py",
         "description": """
-        scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.<br/><br/><img src="really_useful_boxes.png">
+        scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.
         """
     },
     {
@@ -69,7 +69,7 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
     </a>
   </dt>
   <dd>
-    scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.<br/><br/><img src="really_useful_boxes.png">
+    scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.
   </dd>
 
   <dt>
@@ -81,4 +81,4 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
     this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
   </dd>
 </dl>
-<!-- [[[end]]] (sum: vPvQ8JA/hi) -->
+<!-- [[[end]]] (sum: 2K2olXGTrf) -->

web/really_useful_boxes.png (842347) → web/really_useful_boxes.png (0)

diff --git a/web/really_useful_boxes.png b/web/really_useful_boxes.png
deleted file mode 100644
index 2678284..0000000
Binary files a/web/really_useful_boxes.png and /dev/null differ

web/scrape_really_useful_boxes.py (3933) → web/scrape_really_useful_boxes.py (3959)

diff --git a/web/scrape_really_useful_boxes.py b/web/scrape_really_useful_boxes.py
index b78c35e..3b6b322 100755
--- a/web/scrape_really_useful_boxes.py
+++ b/web/scrape_really_useful_boxes.py
@@ -12,26 +12,21 @@ See https://social.alexwlchan.net/@alex/111750446474991705
 """
 
 from collections.abc import Iterator
+import csv
 import time
-import ssl
 from typing import TypedDict
 from urllib.parse import urljoin
-import urllib.request
 
 import bs4
-import certifi
-from sqlite_utils import Database
-import tqdm
+from chives.fetch import fetch_url
 
 
 def get_soup(url: str) -> bs4.BeautifulSoup:
     """
     Fetch the contents of a URL, parse it as HTML, and return the parsed soup.
     """
-    ssl_context = ssl.create_default_context(cafile=certifi.where())
-    with urllib.request.urlopen(url, context=ssl_context) as resp:
-        html = resp.read()
-    
+    html = fetch_url(url)
+
     soup = bs4.BeautifulSoup(html, "html.parser")
 
     return soup
@@ -130,12 +125,17 @@ def get_box_info(url: str) -> BoxInfo:
 
 
 if __name__ == "__main__":
-    db = Database("really_useful_boxes.db")
-
-    product_urls = list(get_product_page_urls())
-
-    for url in tqdm.tqdm(product_urls):
-        db["boxes"].insert(get_box_info(url))
-
-        # This is to avoid getting rate-limited or upsetting the website
-        time.sleep(1)
+    with open("really_useful_boxes.csv", "x") as out_file:
+        writer = csv.DictWriter(
+            out_file,
+            fieldnames=["name", "url", "length", "width", "depth", "image_url"],
+        )
+        writer.writeheader()
+
+        for url in get_product_page_urls():
+            print(f"fetching {url}...")
+            row = get_box_info(url)
+            writer.writerow(row)
+
+            # This is to avoid getting rate-limited or upsetting the website
+            time.sleep(1)