web: scrape Really Useful Boxes to a CSV, not SQLite
- ID: 4fb0bd2
- date: 2026-04-20 04:31:21+00:00
- author: Alex Chan <alex@alexwlchan.net>
- parent: 4983db7
- message: web: scrape Really Useful Boxes to a CSV, not SQLite

  This lets me drop my sqlite-utils dependency.
- changed files: 6 files, 28 additions, 45 deletions
Changed files
.gitignore (6) → .gitignore (31)
diff --git a/.gitignore b/.gitignore
index 0d20b64..dd93418 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
*.pyc
+
+really_useful_boxes.csv
requirements.in (176) → requirements.in (180)
diff --git a/requirements.in b/requirements.in
index 891ae15..5b0dfcd 100644
--- a/requirements.in
+++ b/requirements.in
@@ -1,5 +1,5 @@
+alexwlchan-chives[fetch]
beautifulsoup4
-certifi
cogapp
humanize
iterm2
@@ -9,6 +9,5 @@ pygments # Used as 'pygmentize' in 'pp_xml.sh'
pypdf
pytest
ruff
-sqlite-utils
tqdm
yt-dlp[default]>=2024.3.10
requirements.txt (1866) → requirements.txt (1510)
diff --git a/requirements.txt b/requirements.txt
index 0b7a915..994a70c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,22 +1,18 @@
# This file was autogenerated by uv via the following command:
# uv pip compile requirements.in --output-file=requirements.txt --exclude-newer=P7D --exclude-newer-package alexwlchan-chives=false
+alexwlchan-chives==34
+ # via -r requirements.in
beautifulsoup4==4.14.3
# via -r requirements.in
brotli==1.2.0
# via yt-dlp
certifi==2026.2.25
# via
- # -r requirements.in
+ # alexwlchan-chives
# requests
# yt-dlp
charset-normalizer==3.4.7
# via requests
-click==8.3.2
- # via
- # click-default-group
- # sqlite-utils
-click-default-group==1.2.4
- # via sqlite-utils
cogapp==3.6.0
# via -r requirements.in
humanize==4.15.0
@@ -37,12 +33,8 @@ pillow==12.2.0
# pillow-heif
pillow-heif==1.3.0
# via -r requirements.in
-pip==26.0.1
- # via sqlite-utils
pluggy==1.6.0
- # via
- # pytest
- # sqlite-utils
+ # via pytest
protobuf==7.34.1
# via iterm2
pycryptodomex==3.23.0
@@ -55,22 +47,12 @@ pypdf==6.9.2
# via -r requirements.in
pytest==9.0.3
# via -r requirements.in
-python-dateutil==2.9.0.post0
- # via sqlite-utils
requests==2.33.1
# via yt-dlp
ruff==0.15.9
# via -r requirements.in
-six==1.17.0
- # via python-dateutil
soupsieve==2.8.3
# via beautifulsoup4
-sqlite-fts4==1.0.3
- # via sqlite-utils
-sqlite-utils==3.39
- # via -r requirements.in
-tabulate==0.10.0
- # via sqlite-utils
tqdm==4.67.3
# via -r requirements.in
typing-extensions==4.15.0
web/README.md (2544) → web/README.md (2454)
diff --git a/web/README.md b/web/README.md
index 4ff6efa..a7b5244 100644
--- a/web/README.md
+++ b/web/README.md
@@ -30,7 +30,7 @@ scripts = [
{
"name": "scrape_really_useful_boxes.py",
"description": """
- scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.<br/><br/><img src="really_useful_boxes.png">
+ scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.
"""
},
{
@@ -69,7 +69,7 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
</a>
</dt>
<dd>
- scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.<br/><br/><img src="really_useful_boxes.png">
+ scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.
</dd>
<dt>
@@ -81,4 +81,4 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
</dd>
</dl>
-<!-- [[[end]]] (sum: vPvQ8JA/hi) -->
+<!-- [[[end]]] (sum: 2K2olXGTrf) -->
web/really_useful_boxes.png (842347) → web/really_useful_boxes.png (0)
diff --git a/web/really_useful_boxes.png b/web/really_useful_boxes.png
deleted file mode 100644
index 2678284..0000000
Binary files a/web/really_useful_boxes.png and /dev/null differ
web/scrape_really_useful_boxes.py (3933) → web/scrape_really_useful_boxes.py (3959)
diff --git a/web/scrape_really_useful_boxes.py b/web/scrape_really_useful_boxes.py
index b78c35e..3b6b322 100755
--- a/web/scrape_really_useful_boxes.py
+++ b/web/scrape_really_useful_boxes.py
@@ -12,26 +12,21 @@ See https://social.alexwlchan.net/@alex/111750446474991705
"""
from collections.abc import Iterator
+import csv
import time
-import ssl
from typing import TypedDict
from urllib.parse import urljoin
-import urllib.request
import bs4
-import certifi
-from sqlite_utils import Database
-import tqdm
+from chives.fetch import fetch_url
def get_soup(url: str) -> bs4.BeautifulSoup:
"""
Fetch the contents of a URL, parse it as HTML, and return the parsed soup.
"""
- ssl_context = ssl.create_default_context(cafile=certifi.where())
- with urllib.request.urlopen(url, context=ssl_context) as resp:
- html = resp.read()
-
+ html = fetch_url(url)
+
soup = bs4.BeautifulSoup(html, "html.parser")
return soup
@@ -130,12 +125,17 @@ def get_box_info(url: str) -> BoxInfo:
if __name__ == "__main__":
- db = Database("really_useful_boxes.db")
-
- product_urls = list(get_product_page_urls())
-
- for url in tqdm.tqdm(product_urls):
- db["boxes"].insert(get_box_info(url))
-
- # This is to avoid getting rate-limited or upsetting the website
- time.sleep(1)
+ with open("really_useful_boxes.csv", "x") as out_file:
+ writer = csv.DictWriter(
+ out_file,
+ fieldnames=["name", "url", "length", "width", "depth", "image_url"],
+ )
+ writer.writeheader()
+
+ for url in get_product_page_urls():
+ print(f"fetching {url}...")
+ row = get_box_info(url)
+ writer.writerow(row)
+
+ # This is to avoid getting rate-limited or upsetting the website
+ time.sleep(1)