Skip to main content

web/scrape_really_useful_boxes.py

#!/usr/bin/env python3
"""
Scrape the Really Useful Boxes catalogue, in particular their product page
at https://www.reallyusefulproducts.co.uk/uk/html/onlineshop/fullrange_rub.php

This sucks down information about the dimensions of the boxes, which
I use to do searches of boxes that need to fit into specific spaces,
e.g. boxes that are between 20 and 30cm wide.  It writes a CSV file,
which I turn into a SQLite database that I throw into datasette.

See https://social.alexwlchan.net/@alex/111750446474991705
"""

14from collections.abc import Iterator
15import csv
16import time
17from typing import TypedDict
18from urllib.parse import urljoin
20import bs4
21from chives.fetch import fetch_url
def get_soup(url: str) -> bs4.BeautifulSoup:
    """
    Download the page at ``url`` and hand back a BeautifulSoup tree
    built with Python's built-in ``html.parser``.
    """
    return bs4.BeautifulSoup(fetch_url(url), "html.parser")


def get_product_page_urls() -> Iterator[str]:
    """
    Yield the absolute URL of each individual box product page linked
    from the "full range" catalogue page.

    Only gallery entries whose link text mentions "Really Useful Box"
    are considered, and entries that look like accessories or bundles
    (trays, sets, packs, inserts) are skipped.
    """
    catalogue_url = (
        "https://www.reallyusefulproducts.co.uk/uk/html/onlineshop/fullrange_rub.php"
    )

    # Any link whose (lower-cased) text contains one of these is not
    # a single box, so we don't want its dimensions.
    excluded_words = ("tray", "set", "pack", "bauble insert", "bonus pack")

    # The catalogue page looks roughly like this:
    #
    #     <ul class="productgallery">
    #       <li>
    #         <form action="https://www.romancart.com/cart.asp" method="post">
    #           <a href="./rub/b00_07litre.php">
    #             <img src="…">
    #             0.07 litre<br>Really Useful Box
    #           </a>
    #           …
    #       </li>
    #
    gallery = get_soup(catalogue_url).find("ul", attrs={"class": "productgallery"})

    for item in gallery.find_all("li"):
        anchor = item.find("a")
        if anchor is None:
            continue

        label = anchor.text
        if "Really Useful Box" not in label:
            continue

        lowered_label = label.lower()
        if any(word in lowered_label for word in excluded_words):
            continue

        # The hrefs are relative (e.g. "./rub/…"), so resolve them
        # against the catalogue page.
        yield urljoin(catalogue_url, anchor.attrs["href"])


class BoxInfo(TypedDict):
    """
    The details recorded for a single box, as returned by get_box_info().
    """

    # Absolute URL of the product image
    image_url: str
    # URL of the product page the details were read from
    url: str
    # Product name, taken from the page heading
    name: str
    # External dimensions; the product pages quote these in mm
    length: int
    width: int
    depth: int


def _parse_external_dimensions(text: str) -> tuple[int, int, int]:
    """
    Parse the text of a "Dimensions" paragraph and return the external
    (length, width, depth) as integers.

    Raises ValueError if the text doesn't have the expected four-line
    layout, or if a dimension isn't a whole number.
    """
    lines = text.splitlines()

    # These are explicit checks rather than `assert` statements, so they
    # still run when Python is started with -O.
    if len(lines) != 4:
        raise ValueError(f"expected 4 lines of dimensions, got {len(lines)}: {text!r}")
    if not lines[0].startswith("Dimensions"):
        raise ValueError(f"first line should start with 'Dimensions': {lines[0]!r}")
    if lines[-1] != "(length x width x depth in mm)":
        raise ValueError(f"unexpected units line: {lines[-1]!r}")
    if not lines[1].startswith("External: "):
        raise ValueError(f"second line should start with 'External: ': {lines[1]!r}")

    parts = lines[1].replace("External: ", "").split(" x ")
    if len(parts) != 3:
        raise ValueError(f"expected 3 external dimensions: {lines[1]!r}")

    # Keep only the first token of each part and strip any commas
    # (presumably thousands separators) before converting to int.
    length, width, depth = (
        int(part.split()[0].replace(",", "")) for part in parts
    )
    return length, width, depth


def get_box_info(url: str) -> BoxInfo:
    """
    Fetch a single product page and extract the box's name, image URL
    and external dimensions.

    Raises ValueError (naming the offending URL) if the page is missing
    the heading, the dimensions paragraph or the product image, or if
    the dimensions aren't laid out as expected.
    """
    soup = get_soup(url)

    heading = soup.find("h2")
    if heading is None:
        raise ValueError(f"Could not find an <h2> heading on {url}")
    name = heading.text

    # The dimensions are in a single paragraph like:
    #
    #     <p><em class="type1">Dimensions</em><br>
    #     External: 120 x 85 x 45<br>
    #     Internal: 90 x 65 x 32<br>
    #     (length x width x depth in mm)</p>
    #
    dimensions = next(
        (p for p in soup.find_all("p") if "Dimensions" in p.text), None
    )
    if dimensions is None:
        raise ValueError(f"Could not find a 'Dimensions' paragraph on {url}")

    try:
        length, width, depth = _parse_external_dimensions(dimensions.text)
    except ValueError as err:
        raise ValueError(f"Unexpected dimensions on {url}: {err}") from err

    image = soup.find("img", attrs={"class": "rhsimage"})
    if image is None:
        raise ValueError(f"Could not find the product image on {url}")
    image_url = urljoin(url, image.attrs["src"])

    # The keys are listed in the order that's convenient for viewing.
    # Note that the column order of the CSV is set by the `fieldnames`
    # passed to csv.DictWriter where the rows are written, not by the
    # order of the keys here.
    return {
        "image_url": image_url,
        "name": name,
        "length": length,
        "width": width,
        "depth": depth,
        "url": url,
    }


if __name__ == "__main__":
    # Mode "x" fails if the file already exists, so an earlier scrape
    # is never overwritten.
    #
    # newline="" is what the csv module asks for when it's given a file
    # object; without it, line endings can be mangled on platforms that
    # translate "\n" on write.  The explicit encoding keeps the output
    # the same regardless of the platform's default.
    with open(
        "really_useful_boxes.csv", "x", newline="", encoding="utf-8"
    ) as out_file:
        writer = csv.DictWriter(
            out_file,
            fieldnames=["name", "url", "length", "width", "depth", "image_url"],
        )
        writer.writeheader()

        for url in get_product_page_urls():
            print(f"fetching {url}...")
            row = get_box_info(url)
            writer.writerow(row)

            # This is to avoid getting rate-limited or upsetting the website
            time.sleep(1)