"""
Scrape the Really Useful Boxes catalogue, in particular their product page
at https://www.reallyusefulproducts.co.uk/uk/html/onlineshop/fullrange_rub.php

This sucks down information about the dimensions of the boxes, which
I use to do searches of boxes that need to fit into specific spaces,
e.g. boxes that are between 20 and 30cm wide.  It creates a SQLite
database that I throw into datasette.

See https://social.alexwlchan.net/@alex/111750446474991705
"""
import csv
import time
from collections.abc import Iterator
from typing import TypedDict
from urllib.parse import urljoin

import bs4
from chives.fetch import fetch_url
def get_soup(url: str) -> bs4.BeautifulSoup:
    """
    Fetch the contents of a URL, parse it as HTML, and return the parsed soup.
    """
    # NOTE(review): assumes fetch_url(url) returns the response body in a
    # form BeautifulSoup accepts (str or bytes) -- confirm against chives.fetch.
    html = fetch_url(url)

    soup = bs4.BeautifulSoup(html, "html.parser")

    return soup
def get_product_page_urls() -> Iterator[str]:
    """
    Generate URLs to individual product pages.

    Walks the product gallery on the "full range" page and yields the
    absolute URL of every entry whose name mentions "Really Useful Box",
    skipping entries whose name contains one of the excluded words
    (trays, sets, packs, bauble inserts).
    """
    base_url = (
        "https://www.reallyusefulproducts.co.uk/uk/html/onlineshop/fullrange_rub.php"
    )

    soup = get_soup(base_url)

    # The page is arranged something like:
    #
    #     <ul class="productgallery">
    #       <li>
    #         <form action="https://www.romancart.com/cart.asp" method="post">
    #           <a href="./rub/b00_07litre.php">
    #             …
    #             0.07 litre<br>Really Useful Box
    #           </a>
    #           …
    #
    product_gallery = soup.find("ul", attrs={"class": "productgallery"})

    for li_elem in product_gallery.find_all("li"):
        link = li_elem.find("a")

        # A gallery entry with no link has no product page to visit.
        if link is None:
            continue

        # NOTE(review): assumes the product name is the text of the link,
        # as in the markup sketched above -- confirm against the live page.
        name = link.text

        if "Really Useful Box" not in name:
            continue

        # NOTE(review): the excluded words are all lowercase, so the name
        # is presumably lowercased before matching -- confirm.
        if any(
            w in name.lower()
            for w in {"tray", "set", "pack", "bauble insert", "bonus pack"}
        ):
            continue

        # The hrefs are relative ("./rub/…"), so resolve them against
        # the URL of the page they came from.
        yield urljoin(base_url, link.attrs["href"])
class BoxInfo(TypedDict):
    """
    Details of a single box, as scraped from its product page.

    The dimensions are the external measurements, in millimetres.
    """

    # Heading of the product page.
    name: str

    # URL of the product page this information was scraped from.
    url: str

    # External dimensions in mm.
    length: int
    width: int
    depth: int

    # Absolute URL of the product image.
    image_url: str
def get_box_info(url: str) -> BoxInfo:
    """
    Scrape a single product page and return the box's name, its
    external dimensions in mm, and the URL of its product image.

    Raises StopIteration if no paragraph on the page mentions
    "Dimensions", and AssertionError if that paragraph isn't laid out
    in the expected four-line format.
    """
    soup = get_soup(url)

    name = soup.find("h2").text

    # The dimensions are in a single paragraph like:
    #
    #     <p><em class="type1">Dimensions</em><br>
    #     External: 120 x 85 x 45<br>
    #     Internal: 90 x 65 x 32<br>
    #     (length x width x depth in mm)</p>
    #
    dimensions = next(p for p in soup.find_all("p") if "Dimensions" in p.text)

    # Check the paragraph has exactly the shape shown above before
    # pulling numbers out of it, so a page redesign fails loudly.
    lines = dimensions.text.splitlines()
    assert len(lines) == 4
    assert lines[0].startswith("Dimensions")
    assert lines[-1] == "(length x width x depth in mm)"
    assert lines[1].startswith("External: ")
    length, width, depth = lines[1].replace("External: ", "").split(" x ")

    # Take only the first whitespace-separated token of each value and
    # drop thousands separators before converting to an integer.
    length = int(length.split()[0].replace(",", ""))
    width = int(width.split()[0].replace(",", ""))
    depth = int(depth.split()[0].replace(",", ""))

    # The image src may be relative, so resolve it against the page URL.
    image_url = urljoin(url, soup.find("img", attrs={"class": "rhsimage"}).attrs["src"])

    # The order of this dict will become the order of columns in
    # the SQLite database, which in turn will be used by datasette --
    # make it the convenient order for viewing.
    return {
        "image_url": image_url,
        "name": name,
        "url": url,
        "length": length,
        "width": width,
        "depth": depth,
    }
if __name__ == "__main__":
    # Mode "x" makes open() fail if the CSV already exists, so an earlier
    # scrape is never overwritten.  newline="" is what the csv module asks
    # for, so that it controls the line endings itself.
    with open("really_useful_boxes.csv", "x", newline="") as out_file:
        writer = csv.DictWriter(
            out_file,
            fieldnames=["name", "url", "length", "width", "depth", "image_url"],
        )
        writer.writeheader()

        for url in get_product_page_urls():
            print(f"fetching {url}...")
            row = get_box_info(url)
            writer.writerow(row)

            # This is to avoid getting rate-limited or upsetting the website
            # NOTE(review): pause length assumed to be one second -- confirm.
            time.sleep(1)