Skip to main content

Add my script for scraping the Really Useful Boxes catalogue

ID
1879223
date
2024-01-13 22:49:57+00:00
author
Alex Chan <alex@alexwlchan.net>
parent
918abd9
message
Add my script for scraping the Really Useful Boxes catalogue
changed files
7 files, 184 additions, 3 deletions

Changed files

cog_helpers.py (2926) → cog_helpers.py (2988)

diff --git a/cog_helpers.py b/cog_helpers.py
index 77140d2..7a828b9 100644
--- a/cog_helpers.py
+++ b/cog_helpers.py
@@ -109,6 +109,9 @@ def create_description_table(
         if f.startswith(("test_", "_", ".")):
             continue
 
+        if f.endswith((".png", ".db")):
+            continue
+
         if f in ignore_files:
             continue
 

requirements.in (210) → requirements.in (256)

diff --git a/requirements.in b/requirements.in
index 1a4dd9d..268fe8b 100644
--- a/requirements.in
+++ b/requirements.in
@@ -4,6 +4,7 @@ boto3
 beautifulsoup4
 cogapp
 datasette
+datasette-render-image-tags
 flake8
 flickr-photos-api
 flickr-url-parser
@@ -18,5 +19,7 @@ pip-tools
 pypdf
 pytest
 regex
+sqlite-utils
 termcolor
+tqdm
 yt-dlp

requirements.txt (3631) → requirements.txt (4008)

diff --git a/requirements.txt b/requirements.txt
index c64c463..de5ce86 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -42,12 +42,19 @@ click==8.1.7
     #   click-default-group
     #   datasette
     #   pip-tools
+    #   sqlite-utils
     #   uvicorn
 click-default-group==1.2.4
-    # via datasette
+    # via
+    #   datasette
+    #   sqlite-utils
 cogapp==3.3.0
     # via -r requirements.in
 datasette==0.64.6
+    # via
+    #   -r requirements.in
+    #   datasette-render-image-tags
+datasette-render-image-tags==0.1
     # via -r requirements.in
 flake8==6.1.0
     # via -r requirements.in
@@ -138,6 +145,7 @@ pluggy==1.3.0
     # via
     #   datasette
     #   pytest
+    #   sqlite-utils
 pycodestyle==2.11.1
     # via flake8
 pycryptodomex==3.19.0
@@ -151,7 +159,9 @@ pyproject-hooks==1.0.0
 pytest==7.4.3
     # via -r requirements.in
 python-dateutil==2.8.2
-    # via botocore
+    # via
+    #   botocore
+    #   sqlite-utils
 python-multipart==0.0.6
     # via asgi-csrf
 pyyaml==6.0.1
@@ -170,10 +180,18 @@ sniffio==1.3.0
     #   httpx
 soupsieve==2.5
     # via beautifulsoup4
+sqlite-fts4==1.0.3
+    # via sqlite-utils
+sqlite-utils==3.36
+    # via -r requirements.in
+tabulate==0.9.0
+    # via sqlite-utils
 tenacity==8.2.3
     # via flickr-photos-api
 termcolor==2.4.0
     # via -r requirements.in
+tqdm==4.66.1
+    # via -r requirements.in
 typing-extensions==4.9.0
     # via
     #   janus

web/.gitignore (0) → web/.gitignore (5)

diff --git a/web/.gitignore b/web/.gitignore
new file mode 100644
index 0000000..98e6ef6
--- /dev/null
+++ b/web/.gitignore
@@ -0,0 +1 @@
+*.db

web/README.md (1432) → web/README.md (2215)

diff --git a/web/README.md b/web/README.md
index 6a4eab5..2a8b3bd 100644
--- a/web/README.md
+++ b/web/README.md
@@ -24,6 +24,12 @@ scripts = [
         """
     },
     {
+        "name": "scrape_really_useful_boxes.py",
+        "description": """
+        scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.<br/><br/><img src="really_useful_boxes.png">
+        """
+    },
+    {
         "name": "yt-dlp.py",
         "description": """
         this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
@@ -45,6 +51,15 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
   </dd>
 
   <dt>
+    <a href="https://github.com/alexwlchan/scripts/blob/main/web/scrape_really_useful_boxes.py">
+      <code>scrape_really_useful_boxes.py</code>
+    </a>
+  </dt>
+  <dd>
+    scrape the Really Useful Boxes product catalogue, so I can search for boxes in ways their website doesn't allow – in particular, by dimensions, so I can find boxes that fit into specific spaces.<br/><br/><img src="really_useful_boxes.png">
+  </dd>
+
+  <dt>
     <a href="https://github.com/alexwlchan/scripts/blob/main/web/yt-dlp.py">
       <code>yt-dlp.py</code>
     </a>
@@ -53,4 +68,4 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
     this is a wrapper around <a href="https://github.com/yt-dlp/yt-dlp">yt-dlp</a> that does parallel downloads of videos in playlists.
   </dd>
 </dl>
-<!-- [[[end]]] (checksum: ccfcee43f421ad3a8a3409045e8f5250) -->
+<!-- [[[end]]] (checksum: 248e56c72ea624b8450e3e39e63663e1) -->

web/really_useful_boxes.png (0) → web/really_useful_boxes.png (842347)

diff --git a/web/really_useful_boxes.png b/web/really_useful_boxes.png
new file mode 100644
index 0000000..2678284
Binary files /dev/null and b/web/really_useful_boxes.png differ

web/scrape_really_useful_boxes.py (0) → web/scrape_really_useful_boxes.py (3817)

diff --git a/web/scrape_really_useful_boxes.py b/web/scrape_really_useful_boxes.py
new file mode 100755
index 0000000..b73daad
--- /dev/null
+++ b/web/scrape_really_useful_boxes.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""
+Scrape the Really Useful Boxes catalogue, in particular their product page
+at https://www.reallyusefulproducts.co.uk/uk/html/onlineshop/fullrange_rub.php
+
+This sucks down information about the dimensions of the boxes, which
+I use to do searches of boxes that need to fit into specific spaces,
+e.g. boxes that are between 20 and 30cm wide.  It creates a SQLite
+database that I throw into datasette.
+
+See https://social.alexwlchan.net/@alex/111750446474991705
+"""
+
+from collections.abc import Iterator
+import time
+from typing import TypedDict
+from urllib.parse import urljoin
+
+import bs4
+import httpx
+from sqlite_utils import Database
+import tqdm
+
+
+client = httpx.Client()
+
+
+def get_soup(url: str) -> bs4.BeautifulSoup:
+    """
+    Fetch the contents of a URL, parse it as HTML, and return the parsed soup.
+    """
+    resp = client.get(url)
+    resp.raise_for_status()
+
+    soup = bs4.BeautifulSoup(resp.text, "html.parser")
+
+    return soup
+
+
+def get_product_page_urls() -> Iterator[str]:
+    """
+    Generate URLs to individual product pages.
+    """
+    base_url = (
+        "https://www.reallyusefulproducts.co.uk/uk/html/onlineshop/fullrange_rub.php"
+    )
+
+    soup = get_soup(base_url)
+
+    # The page is arranged something like:
+    #
+    #     <ul class="productgallery">
+    #     <li>
+    #       <form action="https://www.romancart.com/cart.asp" method="post">
+    #         <a href="./rub/b00_07litre.php">
+    #           <img src="…">
+    #           0.07 litre<br>Really Useful Box
+    #         </a>
+    #         …
+    #     </li>
+    #
+    product_gallery = soup.find("ul", attrs={"class": "productgallery"})
+
+    for li_elem in product_gallery.find_all("li"):
+        link = li_elem.find("a")
+
+        if link is None:
+            continue
+
+        name = link.text
+
+        if "Really Useful Box" not in name:
+            continue
+
+        if any(
+            w in name.lower()
+            for w in {"tray", "set", "pack", "bauble insert", "bonus pack"}
+        ):
+            continue
+
+        yield urljoin(base_url, link.attrs["href"])
+
+
+class BoxInfo(TypedDict):
+    image_url: str
+    url: str
+    name: str
+    length: int
+    width: int
+    depth: int
+
+
+def get_box_info(url: str) -> BoxInfo:
+    soup = get_soup(url)
+
+    name = soup.find("h2").text
+
+    # The dimensions are in a single paragraph like:
+    #
+    #     <p><em class="type1">Dimensions</em><br>
+    #     External: 120 x 85 x 45<br>
+    #     Internal: 90 x 65 x 32<br>
+    #     (length x width x depth in mm)</p>
+    #
+    dimensions = next(p for p in soup.find_all("p") if "Dimensions" in p.text)
+
+    lines = dimensions.text.splitlines()
+    assert len(lines) == 4
+    assert lines[0].startswith("Dimensions")
+    assert lines[-1] == "(length x width x depth in mm)"
+    assert lines[1].startswith("External: ")
+    length, width, depth = lines[1].replace("External: ", "").split(" x ")
+    length = int(length.split()[0].replace(",", ""))
+    width = int(width.split()[0].replace(",", ""))
+    depth = int(depth.split()[0].replace(",", ""))
+
+    image_url = urljoin(url, soup.find("img", attrs={"class": "rhsimage"}).attrs["src"])
+
+    # The order of this dict will become the order of columns in
+    # the SQLite database, which in turn will be used by datasette --
+    # make it a convenient order for viewing.
+    return {
+        "image_url": image_url,
+        "name": name,
+        "length": length,
+        "width": width,
+        "depth": depth,
+        "url": url,
+    }
+
+
+if __name__ == "__main__":
+    db = Database("really_useful_boxes.db")
+
+    product_urls = list(get_product_page_urls())
+
+    for url in tqdm.tqdm(product_urls):
+        db["boxes"].insert(get_box_info(url))
+
+        # This is to avoid getting rate-limited or upsetting the website
+        time.sleep(1)