web/scrape_really_useful_boxes.py – scripts

`web/scrape_really_useful_boxes.py`

4.0 kB
Python
View raw
1#!/usr/bin/env python3
2"""
3Scrape the Really Useful Boxes catalogue, in particular their product page
4at https://www.reallyusefulproducts.co.uk/uk/html/onlineshop/fullrange_rub.php
5
This sucks down information about the dimensions of the boxes, which
I use to do searches of boxes that need to fit into specific spaces,
e.g. boxes that are between 20 and 30cm wide.  It writes the results
to a CSV file, which I turn into a SQLite database and throw into
datasette.
10
11See https://social.alexwlchan.net/@alex/111750446474991705
12"""
13
14from collections.abc import Iterator
15import csv
16import time
17from typing import TypedDict
18from urllib.parse import urljoin
19
20import bs4
21from chives.fetch import fetch_url
22
23
def get_soup(url: str) -> bs4.BeautifulSoup:
    """
    Download whatever ``fetch_url`` returns for ``url`` and hand back
    a BeautifulSoup tree built with Python's built-in ``html.parser``.
    """
    return bs4.BeautifulSoup(fetch_url(url), "html.parser")
33
34
def get_product_page_urls() -> Iterator[str]:
    """
    Generate absolute URLs to individual product pages for boxes.

    Only gallery entries whose link text mentions "Really Useful Box" are
    yielded, and entries whose name contains one of the excluded words
    (trays, sets, packs, inserts) are skipped.

    Raises ValueError if the catalogue page has no product gallery,
    which most likely means the site layout has changed.
    """
    base_url = (
        "https://www.reallyusefulproducts.co.uk/uk/html/onlineshop/fullrange_rub.php"
    )

    soup = get_soup(base_url)

    # The page is arranged something like:
    #
    #     <ul class="productgallery">
    #     <li>
    #       <form action="https://www.romancart.com/cart.asp" method="post">
    #         <a href="./rub/b00_07litre.php">
    #           <img src="…">
    #           0.07 litre<br>Really Useful Box
    #         </a>
    #         …
    #     </li>
    #
    product_gallery = soup.find("ul", attrs={"class": "productgallery"})

    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the .find_all() call below.
    if product_gallery is None:
        raise ValueError(f'no <ul class="productgallery"> found on {base_url}')

    # These are matched as substrings of the lowercased link text, so
    # "pack" already covers "bonus pack"; both are kept to record intent.
    excluded_words = ("tray", "set", "pack", "bauble insert", "bonus pack")

    for li_elem in product_gallery.find_all("li"):
        link = li_elem.find("a")

        if link is None:
            continue

        name = link.text

        if "Really Useful Box" not in name:
            continue

        lowered_name = name.lower()
        if any(w in lowered_name for w in excluded_words):
            continue

        # An anchor without a usable href can't point at a product page.
        href = link.get("href")
        if not href:
            continue

        # The hrefs are relative (e.g. "./rub/b00_07litre.php"), so
        # resolve them against the catalogue page.
        yield urljoin(base_url, href)
77
78
class BoxInfo(TypedDict):
    """
    Details of a single box, as scraped from its product page.

    The fields are listed in the same order as the dict that
    ``get_box_info`` returns.
    """

    # Absolute URL of the product image
    image_url: str

    # Product name, taken from the page heading
    name: str

    # External dimensions, in millimetres
    length: int
    width: int
    depth: int

    # The product page this information was scraped from
    url: str
86
87
def _parse_external_dimensions(text: str) -> tuple[int, int, int]:
    """
    Parse the external (length, width, depth) from the text of the
    "Dimensions" paragraph on a product page.

    Raises ValueError if the text isn't laid out the way we expect.
    """
    lines = text.splitlines()

    # These are explicit checks rather than `assert` statements, because
    # asserts are removed when Python runs with -O and we'd then silently
    # parse whatever happened to be on the second line.
    if len(lines) != 4:
        raise ValueError(f"expected 4 lines in dimensions text, got {text!r}")
    if not lines[0].startswith("Dimensions"):
        raise ValueError(f"unexpected first line in dimensions: {lines[0]!r}")
    if lines[-1] != "(length x width x depth in mm)":
        raise ValueError(f"unexpected last line in dimensions: {lines[-1]!r}")
    if not lines[1].startswith("External: "):
        raise ValueError(f"unexpected external dimensions: {lines[1]!r}")

    parts = lines[1].removeprefix("External: ").split(" x ")
    if len(parts) != 3:
        raise ValueError(f"expected 3 external dimensions, got {lines[1]!r}")

    try:
        # Take the first whitespace-separated token of each part and
        # drop any commas before converting, e.g. "1,200" ~> 1200.
        length, width, depth = (
            int(part.split()[0].replace(",", "")) for part in parts
        )
    except (IndexError, ValueError) as err:
        raise ValueError(f"could not parse dimensions from {lines[1]!r}") from err

    return length, width, depth


def get_box_info(url: str) -> BoxInfo:
    """
    Fetch a product page and extract the box's name, image URL and
    external dimensions (in mm).

    Raises ValueError if the page is missing the heading, dimensions
    paragraph or product image, or if the dimensions can't be parsed.
    """
    soup = get_soup(url)

    heading = soup.find("h2")
    if heading is None:
        raise ValueError(f"no <h2> heading found on {url}")
    name = heading.text

    # The dimensions are in a single paragraph like:
    #
    #     <p><em class="type1">Dimensions</em><br>
    #     External: 120 x 85 x 45<br>
    #     Internal: 90 x 65 x 32<br>
    #     (length x width x depth in mm)</p>
    #
    dimensions = next(
        (p for p in soup.find_all("p") if "Dimensions" in p.text), None
    )
    if dimensions is None:
        raise ValueError(f"no dimensions paragraph found on {url}")

    try:
        length, width, depth = _parse_external_dimensions(dimensions.text)
    except ValueError as err:
        # Add the URL so a failure partway through a scrape is traceable.
        raise ValueError(f"{url}: {err}") from err

    image = soup.find("img", attrs={"class": "rhsimage"})
    if image is None:
        raise ValueError(f'no <img class="rhsimage"> found on {url}')
    image_url = urljoin(url, image.attrs["src"])

    # The key order here is only for readability: the column order of
    # the CSV is decided by the `fieldnames` passed to csv.DictWriter
    # in the __main__ block, not by the order of this dict.
    return {
        "image_url": image_url,
        "name": name,
        "length": length,
        "width": width,
        "depth": depth,
        "url": url,
    }
125
126
def main() -> None:
    """
    Scrape every box product page and write one row per box to
    ``really_useful_boxes.csv`` in the current directory.

    Raises FileExistsError if the CSV already exists, because the file
    is opened in exclusive-creation ("x") mode.
    """
    # newline="" is what the csv module asks for on files it writes to;
    # without it, platforms that translate "\n" end up with "\r\r\n"
    # line endings.  The encoding is explicit so the output doesn't
    # depend on the locale of the machine running the script.
    with open(
        "really_useful_boxes.csv", "x", newline="", encoding="utf-8"
    ) as out_file:
        writer = csv.DictWriter(
            out_file,
            fieldnames=["name", "url", "length", "width", "depth", "image_url"],
        )
        writer.writeheader()

        for url in get_product_page_urls():
            print(f"fetching {url}...")
            row = get_box_info(url)
            writer.writerow(row)

            # This is to avoid getting rate-limited or upsetting the website
            time.sleep(1)


if __name__ == "__main__":
    main()