Add my script for scraping the Really Useful Boxes catalogue
- ID: 1879223
- date: 2024-01-13 22:49:57+00:00
- author: Alex Chan <alex@alexwlchan.net>
- parent: 918abd9
- message: Add my script for scraping the Really Useful Boxes catalogue
- changed files: 7 files, 184 additions, 3 deletions
Changed files
cog_helpers.py (2926) → cog_helpers.py (2988)
diff --git a/cog_helpers.py b/cog_helpers.py
index 77140d2..7a828b9 100644
--- a/cog_helpers.py
+++ b/cog_helpers.py
@@ -109,6 +109,9 @@ def create_description_table(
if f.startswith(("test_", "_", ".")):
continue
+ if f.endswith((".png", ".db")):
+ continue
+
if f in ignore_files:
continue
requirements.in (210) → requirements.in (256)
diff --git a/requirements.in b/requirements.in
index 1a4dd9d..268fe8b 100644
--- a/requirements.in
+++ b/requirements.in
@@ -4,6 +4,7 @@ boto3
beautifulsoup4
cogapp
datasette
+datasette-render-image-tags
flake8
flickr-photos-api
flickr-url-parser
@@ -18,5 +19,7 @@ pip-tools
pypdf
pytest
regex
+sqlite-utils
termcolor
+tqdm
yt-dlp
requirements.txt (3631) → requirements.txt (4008)
diff --git a/requirements.txt b/requirements.txt
index c64c463..de5ce86 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -42,12 +42,19 @@ click==8.1.7
# click-default-group
# datasette
# pip-tools
+ # sqlite-utils
# uvicorn
click-default-group==1.2.4
- # via datasette
+ # via
+ # datasette
+ # sqlite-utils
cogapp==3.3.0
# via -r requirements.in
datasette==0.64.6
+ # via
+ # -r requirements.in
+ # datasette-render-image-tags
+datasette-render-image-tags==0.1
# via -r requirements.in
flake8==6.1.0
# via -r requirements.in
@@ -138,6 +145,7 @@ pluggy==1.3.0
# via
# datasette
# pytest
+ # sqlite-utils
pycodestyle==2.11.1
# via flake8
pycryptodomex==3.19.0
@@ -151,7 +159,9 @@ pyproject-hooks==1.0.0
pytest==7.4.3
# via -r requirements.in
python-dateutil==2.8.2
- # via botocore
+ # via
+ # botocore
+ # sqlite-utils
python-multipart==0.0.6
# via asgi-csrf
pyyaml==6.0.1
@@ -170,10 +180,18 @@ sniffio==1.3.0
# httpx
soupsieve==2.5
# via beautifulsoup4
+sqlite-fts4==1.0.3
+ # via sqlite-utils
+sqlite-utils==3.36
+ # via -r requirements.in
+tabulate==0.9.0
+ # via sqlite-utils
tenacity==8.2.3
# via flickr-photos-api
termcolor==2.4.0
# via -r requirements.in
+tqdm==4.66.1
+ # via -r requirements.in
typing-extensions==4.9.0
# via
# janus
web/.gitignore (0) → web/.gitignore (5)
diff --git a/web/.gitignore b/web/.gitignore
new file mode 100644
index 0000000..98e6ef6
--- /dev/null
+++ b/web/.gitignore
@@ -0,0 +1 @@
+*.db
web/README.md (1432) → web/README.md (2215)
diff --git a/web/README.md b/web/README.md
index 6a4eab5..2a8b3bd 100644
--- a/web/README.md
+++ b/web/README.md
@@ -24,6 +24,12 @@ scripts = [
"""
},
{
+ "name": "scrape_really_useful_boxes.py",
+ "description": """
+ scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.<br/><br/><img src="really_useful_boxes.png">
+ """
+ },
+ {
"name": "yt-dlp.py",
"description": """
this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
@@ -45,6 +51,15 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
</dd>
<dt>
+ <a href="https://github.com/alexwlchan/scripts/blob/main/web/scrape_really_useful_boxes.py">
+ <code>scrape_really_useful_boxes.py</code>
+ </a>
+ </dt>
+ <dd>
+ scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.<br/><br/><img src="really_useful_boxes.png">
+ </dd>
+
+ <dt>
<a href="https://github.com/alexwlchan/scripts/blob/main/web/yt-dlp.py">
<code>yt-dlp.py</code>
</a>
@@ -53,4 +68,4 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
</dd>
</dl>
-<!-- [[[end]]] (checksum: ccfcee43f421ad3a8a3409045e8f5250) -->
+<!-- [[[end]]] (checksum: 248e56c72ea624b8450e3e39e63663e1) -->
web/really_useful_boxes.png (0) → web/really_useful_boxes.png (842347)
diff --git a/web/really_useful_boxes.png b/web/really_useful_boxes.png
new file mode 100644
index 0000000..2678284
Binary files /dev/null and b/web/really_useful_boxes.png differ
web/scrape_really_useful_boxes.py (0) → web/scrape_really_useful_boxes.py (3817)
diff --git a/web/scrape_really_useful_boxes.py b/web/scrape_really_useful_boxes.py
new file mode 100755
index 0000000..b73daad
--- /dev/null
+++ b/web/scrape_really_useful_boxes.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""
+Scrape the Really Useful Boxes catalogue, in particular their product page
+at https://www.reallyusefulproducts.co.uk/uk/html/onlineshop/fullrange_rub.php
+
+This sucks down information about the dimensions of the boxes, which
+I use to do searches of boxes that need to fit into specific spaces,
+e.g. boxes that are between 20 and 30cm wide. It creates a SQLite
+database that I throw into datasette.
+
+See https://social.alexwlchan.net/@alex/111750446474991705
+"""
+
+from collections.abc import Iterator
+import time
+from typing import TypedDict
+from urllib.parse import urljoin
+
+import bs4
+import httpx
+from sqlite_utils import Database
+import tqdm
+
+
+client = httpx.Client()
+
+
+def get_soup(url: str) -> bs4.BeautifulSoup:
+    """
+    Download ``url`` with the shared HTTP client and return the body
+    parsed as HTML.
+
+    An error status is raised by ``raise_for_status()`` rather than
+    being parsed.
+    """
+    response = client.get(url)
+    response.raise_for_status()
+
+    return bs4.BeautifulSoup(response.text, "html.parser")
+
+
+def get_product_page_urls() -> Iterator[str]:
+    """
+    Generate URLs to individual product pages.
+
+    Only plain boxes are yielded: gallery entries whose link text doesn't
+    mention "Really Useful Box", or which look like trays, sets, packs
+    or inserts, are skipped, as are entries without a link target.
+
+    Raises RuntimeError if the product gallery can't be found on the page.
+    """
+    base_url = (
+        "https://www.reallyusefulproducts.co.uk/uk/html/onlineshop/fullrange_rub.php"
+    )
+
+    # Substrings (matched against the lowercased link text) that mark
+    # a listing as something other than a single box.
+    excluded_words = ("tray", "set", "pack", "bauble insert", "bonus pack")
+
+    soup = get_soup(base_url)
+
+    # The page is arranged something like:
+    #
+    #     <ul class="productgallery">
+    #       <li>
+    #         <form action="https://www.romancart.com/cart.asp" method="post">
+    #           <a href="./rub/b00_07litre.php">
+    #             <img src="…">
+    #             0.07 litre<br>Really Useful Box
+    #           </a>
+    #           …
+    #       </li>
+    #
+    product_gallery = soup.find("ul", attrs={"class": "productgallery"})
+
+    # Fail with a readable message instead of an AttributeError on None
+    if product_gallery is None:
+        raise RuntimeError(
+            f"Could not find the product gallery on {base_url}; "
+            "has the page layout changed?"
+        )
+
+    for li_elem in product_gallery.find_all("li"):
+        link = li_elem.find("a")
+
+        if link is None:
+            continue
+
+        name = link.text
+
+        if "Really Useful Box" not in name:
+            continue
+
+        lowered_name = name.lower()
+
+        if any(w in lowered_name for w in excluded_words):
+            continue
+
+        href = link.get("href")
+
+        # An anchor without a target can't lead to a product page
+        if href is None:
+            continue
+
+        yield urljoin(base_url, href)
+
+
+class BoxInfo(TypedDict):
+    """
+    The details recorded for a single box.
+
+    The fields are declared in the same order as the dict built by
+    get_box_info(), which is the order the columns appear in the
+    SQLite database.
+    """
+
+    # Absolute URL of the product photo
+    image_url: str
+    # Product name, taken from the heading of the product page
+    name: str
+    # External dimensions, in millimetres
+    length: int
+    width: int
+    depth: int
+    # URL of the product page these details were read from
+    url: str
+
+
+def get_box_info(url: str) -> BoxInfo:
+    """
+    Fetch a single product page and extract the box's name, photo
+    and external dimensions.
+
+    Raises ValueError if the page doesn't have the expected structure.
+    """
+    soup = get_soup(url)
+
+    heading = soup.find("h2")
+
+    if heading is None:
+        raise ValueError(f"Could not find the product name on {url}")
+
+    name = heading.text
+
+    # The dimensions are in a single paragraph like:
+    #
+    #     <p><em class="type1">Dimensions</em><br>
+    #     External: 120 x 85 x 45<br>
+    #     Internal: 90 x 65 x 32<br>
+    #     (length x width x depth in mm)</p>
+    #
+    dimensions = next(
+        (p for p in soup.find_all("p") if "Dimensions" in p.text), None
+    )
+
+    if dimensions is None:
+        raise ValueError(f"Could not find the dimensions on {url}")
+
+    lines = dimensions.text.splitlines()
+
+    # Explicit checks rather than `assert`, which is skipped when
+    # Python runs with -O and gives no hint about which page failed.
+    if (
+        len(lines) != 4
+        or not lines[0].startswith("Dimensions")
+        or not lines[1].startswith("External: ")
+        or lines[-1] != "(length x width x depth in mm)"
+    ):
+        raise ValueError(f"Unexpected dimensions paragraph on {url}: {lines!r}")
+
+    # Keep only the leading number of each measurement, and drop any
+    # comma used as a thousands separator.
+    try:
+        length, width, depth = (
+            int(part.split()[0].replace(",", ""))
+            for part in lines[1].replace("External: ", "").split(" x ")
+        )
+    except (IndexError, ValueError) as err:
+        raise ValueError(
+            f"Could not parse the external dimensions on {url}: {lines[1]!r}"
+        ) from err
+
+    image = soup.find("img", attrs={"class": "rhsimage"})
+
+    if image is None:
+        raise ValueError(f"Could not find the product image on {url}")
+
+    image_url = urljoin(url, image.attrs["src"])
+
+    # The order of this dict will become the order of columns in
+    # the SQLite database, which in turn will be used by datasette --
+    # make it the convenient order for viewing.
+    return {
+        "image_url": image_url,
+        "name": name,
+        "length": length,
+        "width": width,
+        "depth": depth,
+        "url": url,
+    }
+
+
+if __name__ == "__main__":
+    db = Database("really_useful_boxes.db")
+
+    # Collect every URL up front so tqdm knows the total for its progress bar
+    all_product_urls = list(get_product_page_urls())
+    boxes_table = db["boxes"]
+
+    for product_url in tqdm.tqdm(all_product_urls):
+        box = get_box_info(product_url)
+        boxes_table.insert(box)
+
+        # Pause between requests, to avoid getting rate-limited or
+        # upsetting the website
+        time.sleep(1)