Skip to main content

Merge pull request #1 from alexwlchan/replace-with-node

ID
ffe50b2
date
2024-06-19 22:01:49+00:00
author
Alex Chan <alex@alexwlchan.net>
parents
6b22e6b, 5287b6a
message
Merge pull request #1 from alexwlchan/replace-with-node

Write a Node version of the script
changed files
5 files, 158 additions, 48 deletions

Changed files

.github/workflows/test.yml (0) → .github/workflows/test.yml (812)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..2b45792
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,34 @@
+# Runs the script end-to-end on every push to `main`, and on every pull request
+# that targets `main`.
+name: Test
+
+on:
+  push:
+    branches:
+    - main
+
+  pull_request:
+    branches:
+    - main
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    # NOTE(review): no `node-version` is set here, so the Node version is not
+    # pinned -- confirm the default is acceptable.
+    - uses: actions/setup-node@v4
+
+    # This is a crude test harness that checks the script is working correctly: it
+    # runs the script, and checks the generated files are created as expected.
+    #
+    # The script is run twice: first with an explicit label (`collections`), then
+    # with no label, in which case the files are expected to be named `export`.
+    - name: Run tests
+      run: |
+        node measure https://wellcomecollection.org/collections collections
+
+        ls out
+        if [ ! -f "out/collections.html" ]; then exit 1; fi
+        if [ ! -f "out/collections.json" ]; then exit 1; fi
+
+        node measure https://wellcomecollection.org/collections
+
+        ls out
+        if [ ! -f "out/export.html" ]; then exit 1; fi
+        if [ ! -f "out/export.json" ]; then exit 1; fi

.gitignore (5) → .gitignore (4)

diff --git a/.gitignore b/.gitignore
index c1d18d8..1fcb152 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1 @@
-_out
+out

README.md (1835) → README.md (1876)

diff --git a/README.md b/README.md
index b4adfca..e9fed71 100644
--- a/README.md
+++ b/README.md
@@ -36,17 +36,21 @@ I use this script to analyse the props, to identify ways we can reduce the size 
 
 ## Usage
 
-You need Python 3, then download the script from this repo.
+You need Node installed.
+Download `measure.js` from this repo.
 
-The script takes two arguments: a URL, and a name to identify the results.
+The script takes two arguments:
+
+* the URL to fetch (required)
+* a label for the downloaded files (optional)
 
 Example:
 
 ```console
-$ python3 measure.py https://wellcomecollection.org/collections collections
-html      = 210.22 kB
-next_data =  35.71 kB (16%)
+$ node measure.js https://wellcomecollection.org/collections collections
+HTML       =  196.45 kB
+NEXT_DATA =   52.17 kB (26.6%)
 
-Saved HTML to _out/collections.html
-Saved JSON to _out/collections.json
+Saved HTML to out/collections.html
+Saved JSON to out/collections.json
 ```

measure.js (0) → measure.js (2906)

diff --git a/measure.js b/measure.js
new file mode 100644
index 0000000..f2ceafd
--- /dev/null
+++ b/measure.js
@@ -0,0 +1,112 @@
+const fs = require('fs');
+const https = require('https');
+
+// Write text to a file in the `out` directory.
+//
+// This takes an `options` object with two parameters:
+//
+//    - `filename` -- the name of the file to write
+//    - `contents` -- the text to write to the file
+//
+function writeToFile(options) {
+  let filePath = `out/${options.filename}`;
+
+  fs.mkdir('out', { recursive: true }, (err) => {
+    if (err) {
+      console.error('Error creating `out` directory:', err);
+      process.exit(1);
+    }
+  });
+
+  fs.writeFile(filePath, options.contents, (err) => {
+    if (err) {
+      console.error('Error writing file:', err);
+      process.exit(1);
+    }
+  });
+}
+
+// Format a number of bytes as a human-readable string.
+//
+// Example: naturalsize(1234) ~> "1.21 kB"
+function naturalSize(byteCount) {
+  return `${(byteCount / 1024).toFixed(2)} kB`;
+}
+
+// Left-pad a string with spaces for consistent indentation.
+function leftPad(str, length) {
+  while (str.length < length) {
+    str = ' ' + str;
+  }
+
+  return str;
+}
+
+// Parse command-line arguments.
+//
+// The script takes one or two arguments:
+//
+//  * the URL to fetch (required)
+//  * a label for the downloaded files (optional)
+//
+const args = process.argv.slice(2);
+
+let url = '';
+let label = '';
+
+if (args.length === 0) {
+  console.error("Usage: measure.js URL [LABEL]");
+  process.exit(1);
+} else if (args.length === 1) {
+  url = args[0];
+  label = "export";
+} else if (args.length === 2) {
+  url = args[0];
+  label = args[1];
+} else {
+  console.error("Usage: measure.js URL [LABEL]");
+  process.exit(1);
+}
+
+// Actually fetch the URL, and save the HTML
+//
+// Note: I add a custom User-Agent because CloudFront seems to reject fetches that
+// come from Node's builtin HTTP library.
+const options = {
+  headers: {
+    'User-Agent': 'Mozilla/5.0 (Android 4.4; Mobile; rv:41.0) Gecko/41.0 Firefox/41.0',
+  }
+};
+
+https.get(url, options, (res) => {
+  let html = '';
+
+  res.on('data', (chunk) => {
+    html += chunk;
+  });
+
+  // We've got the whole HTML file.  Parse it, and save the results.
+  res.on('end', () => {
+    let htmlByteCount = Buffer.byteLength(html, 'utf8');
+    console.log(`HTML       = ${leftPad(naturalSize(htmlByteCount), 10)}`);
+
+    let nextData = html
+        .split('<script id="__NEXT_DATA__" type="application/json">')[1]
+        .split("</script>")[0];
+
+    let nextDataByteCount = Buffer.byteLength(nextData, 'utf8');
+    console.log(`NEXT_DATA = ${leftPad(naturalSize(nextDataByteCount), 10)} (${(nextDataByteCount / htmlByteCount * 100).toFixed(1)}%)`);
+
+    console.log();
+
+    writeToFile({ filename: `${label}.html`, contents: html });
+    console.log(`Saved HTML to out/${label}.html`);
+
+    writeToFile({ filename: `${label}.json`, contents: nextData });
+    console.log(`Saved JSON to out/${label}.json`);
+  });
+
+}).on('error', (err) => {
+  console.error('Error fetching the URL: ', err);
+  process.exit(1);
+});

measure.py (1035) → measure.py (0)

diff --git a/measure.py b/measure.py
deleted file mode 100755
index 9e8559d..0000000
--- a/measure.py
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env python3
-
-import json
-import os
-import sys
-import urllib.request
-
-
-def naturalsize(bytes):
-    return "%3.2f kB" % (bytes / 1024)
-
-
-def analyse(*, url, name):
-    os.makedirs("_out", exist_ok=True)
-    urllib.request.urlretrieve(url, f"_out/{name}.html")
-
-    html = open(f"_out/{name}.html").read()
-
-    next_data = html.split('<script id="__NEXT_DATA__" type="application/json">')[
-        1
-    ].split("</script>")[0]
-
-    with open(f'_out/{name}.json', 'w') as outfile:
-        outfile.write(json.dumps(json.loads(next_data), indent=2, sort_keys=True))
-
-    print(f"html      = {naturalsize(len(html))}")
-    print(f"next_data = {naturalsize(len(next_data)).rjust(9)} ({int(len(next_data) / len(html) * 100)}%)")
-    print("")
-    print(f"Saved HTML to _out/{name}.html")
-    print(f"Saved JSON to _out/{name}.json")
-
-
-if __name__ == "__main__":
-    try:
-        url = sys.argv[1]
-        name = sys.argv[2]
-    except IndexError:
-        sys.exit(f"Usage: {__file__} <URL> <NAME>")
-
-    analyse(url=url, name=name)