Skip to main content

get my AWS credentials first

ID
c3a8fcd
date
2023-04-30 08:18:27+00:00
author
Alex Chan <alex@alexwlchan.net>
parent
0f9d749
message
get my AWS credentials first
changed files
7 files, 378 additions, 362 deletions

Changed files

aws/_common.py (1803) → aws/_common.py (1924)

diff --git a/aws/_common.py b/aws/_common.py
index 69be857..95e9424 100755
--- a/aws/_common.py
+++ b/aws/_common.py
@@ -7,6 +7,7 @@ import hyperlink
 ACCOUNT_NAMES = {
     "760097843905": "platform",
     "299497370133": "workflow",
+    "975596993436": "storage",
 }
 
 
@@ -40,7 +41,9 @@ def guess_account(s3_identifier):
 
     """
     if "wellcomedigitalworkflow" in s3_identifier:
-        account_id = '299497370133'
+        account_id = "299497370133"
+    if "wellcomecollection-storage" in s3_identifier:
+        account_id = "975596993436"
     else:
         return None
 

aws/bag (171) → aws/bag (106)

diff --git a/aws/bag b/aws/bag
index c400fcf..a2d1bda 100755
--- a/aws/bag
+++ b/aws/bag
@@ -3,4 +3,4 @@
 set -o errexit
 set -o nounset
 
-AWS_PROFILE=storage-read_only python3 ~/repos/pathscripts/aws/s3tree.py "s3://wellcomecollection-storage/digitised/$@"
+s3tree "s3://wellcomecollection-storage/digitised/$@"

aws/s3ls (2056) → aws/s3ls (99)

diff --git a/aws/s3ls b/aws/s3ls
index 5ab9569..4b3dfbd 100755
--- a/aws/s3ls
+++ b/aws/s3ls
@@ -1,82 +1,7 @@
-#!/usr/bin/env python3
-"""
-A script for listing all the objects in an S3 prefix.
+#!/usr/bin/env bash
 
-Objects are printed to stdout as JSON, one object per line.
-"""
+set -o errexit
+set -o nounset
 
-import argparse
-import datetime
-import json
-import sys
-
-import tqdm
-
-from _common import create_s3_session, parse_s3_uri
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        prog="s3ls", description="List all the objects in an S3 prefix"
-    )
-
-    parser.add_argument("S3_URI")
-    parser.add_argument(
-        "--with-versions",
-        action="store_true",
-        help="List every version of the objects in S3, not just the latest version",
-    )
-    parser.add_argument(
-        "--start-after", help="Start listing objects at the given key", default=""
-    )
-
-    return parser.parse_args()
-
-
-class DatetimeEncoder(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, datetime.datetime):
-            return obj.isoformat()
-
-
-def get_objects(sess, **kwargs):
-    """
-    Generates every object in an S3 bucket.
-    """
-    paginator = sess.client("s3").get_paginator("list_objects_v2")
-
-    for page in paginator.paginate(**kwargs):
-        yield from page["Contents"]
-
-
-def get_object_versions(sess, **kwargs):
-    """
-    Generates every version of an object in an S3 bucket.
-    """
-    s3_client = sess.client("s3")
-    paginator = s3_client.get_paginator("list_object_versions")
-
-    for page in paginator.paginate(**kwargs):
-        for key in ("Versions", "DeleteMarkers"):
-            try:
-                yield from page[key]
-            except KeyError:
-                pass
-
-
-if __name__ == "__main__":
-    args = parse_args()
-
-    s3_list_args = parse_s3_uri(args.S3_URI)
-
-    sess = create_s3_session(args.S3_URI)
-
-    if "--with-versions" in sys.argv:
-        iterator = get_object_versions
-        s3_list_args["KeyMarker"] = args.start_after
-    else:
-        iterator = get_objects
-        s3_list_args["StartAfter"] = args.start_after
-
-    for s3_obj in tqdm.tqdm(iterator(sess, **s3_list_args)):
-        print(json.dumps(s3_obj, cls=DatetimeEncoder))
+_ensure_aws_credentials_are_fresh
+s3ls.py "$@"

aws/s3ls.py (0) → aws/s3ls.py (2056)

diff --git a/aws/s3ls.py b/aws/s3ls.py
new file mode 100755
index 0000000..5ab9569
--- /dev/null
+++ b/aws/s3ls.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""
+A script for listing all the objects in an S3 prefix.
+
+Objects are printed to stdout as JSON, one object per line.
+"""
+
+import argparse
+import datetime
+import json
+import sys
+
+import tqdm
+
+from _common import create_s3_session, parse_s3_uri
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        prog="s3ls", description="List all the objects in an S3 prefix"
+    )
+
+    parser.add_argument("S3_URI")
+    parser.add_argument(
+        "--with-versions",
+        action="store_true",
+        help="List every version of the objects in S3, not just the latest version",
+    )
+    parser.add_argument(
+        "--start-after", help="Start listing objects at the given key", default=""
+    )
+
+    return parser.parse_args()
+
+
+class DatetimeEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, datetime.datetime):
+            return obj.isoformat()
+
+
+def get_objects(sess, **kwargs):
+    """
+    Generates every object in an S3 bucket.
+    """
+    paginator = sess.client("s3").get_paginator("list_objects_v2")
+
+    for page in paginator.paginate(**kwargs):
+        yield from page["Contents"]
+
+
+def get_object_versions(sess, **kwargs):
+    """
+    Generates every version of an object in an S3 bucket.
+    """
+    s3_client = sess.client("s3")
+    paginator = s3_client.get_paginator("list_object_versions")
+
+    for page in paginator.paginate(**kwargs):
+        for key in ("Versions", "DeleteMarkers"):
+            try:
+                yield from page[key]
+            except KeyError:
+                pass
+
+
+if __name__ == "__main__":
+    args = parse_args()
+
+    s3_list_args = parse_s3_uri(args.S3_URI)
+
+    sess = create_s3_session(args.S3_URI)
+
+    if "--with-versions" in sys.argv:
+        iterator = get_object_versions
+        s3_list_args["KeyMarker"] = args.start_after
+    else:
+        iterator = get_objects
+        s3_list_args["StartAfter"] = args.start_after
+
+    for s3_obj in tqdm.tqdm(iterator(sess, **s3_list_args)):
+        print(json.dumps(s3_obj, cls=DatetimeEncoder))

aws/s3tree (8144) → aws/s3tree (101)

diff --git a/aws/s3tree b/aws/s3tree
index d6a722e..63289f9 100755
--- a/aws/s3tree
+++ b/aws/s3tree
@@ -1,281 +1,7 @@
-#!/usr/bin/env python3
-"""
-Prints a tree showing the structure of an S3 prefix.
+#!/usr/bin/env bash
 
-This is meant to give me an overview of what's in a prefix, not
-a complete listing.  Here's an example of what the output looks like:
+set -o errexit
+set -o nounset
 
-    .
-    └─ digitised/
-        └─ b12840889/
-            └─ v1/
-                ├─ bag-info.txt
-                ├─ bagit.txt
-                ├─ manifest-sha256.txt
-                ├─ tagmanifest-sha256.txt
-                └─ data/
-                    ├─ b12840889.xml
-                    ├─ b12840889_0001.xml
-                    └─ objects/
-                        ├─ b12840889_0001_0001.jp2
-                        ├─ b12840889_0001_0002.jp2
-                        ├─ b12840889_0001_0003.jp2
-                        └─ ...2785 other objects
-
-The folder names link to the S3 console, so I can jump into exploring the
-objects in more detail if useful.
-
-"""
-
-import argparse
-import collections
-import datetime
-import os
-import sys
-from typing import List
-
-import attr
-import boto3
-import humanize
-import natsort
-import termcolor
-
-from _common import create_s3_session, parse_s3_uri
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        prog="s3tree", description="Print a summary tree of an S3 prefix"
-    )
-
-    parser.add_argument("S3_URI")
-
-    return parser.parse_args()
-
-
-def list_s3_objects(sess, **kwargs):
-    s3 = sess.client("s3")
-
-    for page in s3.get_paginator("list_objects_v2").paginate(**kwargs):
-        yield from page.get("Contents", [])
-
-
-def create_link_text(*, url, label):
-    # Based on https://stackoverflow.com/a/71309268/1558022
-
-    # OSC 8 ; params ; URI ST <name> OSC 8 ;; ST
-    return f"\033]8;;{url}\033\\{label}\033]8;;\033\\"
-
-
-def pprint_nested_tree(bucket, tree, folder_counts, parents=None):
-    lines = []
-    parents = parents or []
-
-    if not parents:
-        lines.append(".")
-
-    entries = sorted(tree.items())
-
-    for i, (key, nested_tree) in enumerate(entries, start=1):
-        if parents:
-            full_path = f'{"/".join(parents)}/{key}'
-        else:
-            full_path = key
-        if isinstance(key, str):
-            label = create_link_text(
-                url=f"https://eu-west-1.console.aws.amazon.com/s3/buckets/{bucket}?prefix={full_path}/&showversions=false",
-                label=f"{key}/",
-            )
-        else:
-            label = key
-
-        if full_path in folder_counts:
-            obj_count_line = termcolor.colored(
-                f"...plus {folder_counts[full_path]} object{'s' if folder_counts[full_path] > 1 else ''}",
-                "blue",
-            )
-
-            if i == len(entries) and nested_tree:
-                obj_count_line = f"    ├── {obj_count_line}"
-            elif i == len(entries):
-                obj_count_line = f"    └── {obj_count_line}"
-            elif nested_tree:
-                obj_count_line = f"│   ├── {obj_count_line}"
-            else:
-                obj_count_line = f"│   └── {obj_count_line}"
-        else:
-            obj_count_line = None
-
-        if i == len(entries):
-            lines.append("└── " + label)
-
-            if obj_count_line is not None:
-                lines.append(obj_count_line)
-
-            lines.extend(
-                [
-                    "    " + l
-                    for l in pprint_nested_tree(
-                        bucket,
-                        nested_tree,
-                        folder_counts=folder_counts,
-                        parents=parents + [key],
-                    )
-                ]
-            )
-        else:
-            lines.append("├── " + label)
-
-            if obj_count_line is not None:
-                lines.append(obj_count_line)
-
-            lines.extend(
-                [
-                    "│   " + l
-                    for l in pprint_nested_tree(
-                        bucket,
-                        nested_tree,
-                        folder_counts=folder_counts,
-                        parents=parents + [key],
-                    )
-                ]
-            )
-
-    return lines
-
-
-@attr.s
-class S3Folder:
-    path: str = attr.ib()
-    objects: List[str] = attr.ib(factory=list)
-    folders = attr.ib(factory=dict)  # Mapping[str, S3Folder]
-
-
-def build_s3_tree(keys, path=None):
-    path = path or []
-
-    tree = S3Folder(path="/".join(path))
-
-    per_folder_keys = collections.defaultdict(list)
-
-    for k in keys:
-        if "/" in k:
-            folder_name, entry_name = k.split("/", 1)
-            per_folder_keys[folder_name].append(entry_name)
-        else:
-            per_folder_keys["."].append(k)
-
-    assert sum(len(entries) for entries in per_folder_keys.values()) == len(keys)
-
-    tree.objects = natsort.natsort(per_folder_keys.pop(".", []))
-
-    for folder_name, folder_keys in per_folder_keys.items():
-        tree.folders[folder_name] = build_s3_tree(
-            folder_keys, path=path + [folder_name]
-        )
-
-    return tree
-
-
-def pprint_s3tree(*, bucket, tree):
-    lines = []
-
-    # If we're at the top of the tree, we want to print a '.'
-    if tree.path == "":
-        lines.append(".")
-
-    # Start by printing any objects that are in this folder.  Print up to
-    # 4 objects, otherwise print 3 and then '...X other objects'
-    if len(tree.objects) == 4:
-        tree_object_count = 4
-    else:
-        tree_object_count = 3
-
-    for i, object_key in enumerate(sorted(tree.objects[:tree_object_count]), start=1):
-        if tree.folders or len(tree.objects) > i:
-            prefix_char = "├─"
-        else:
-            prefix_char = "└─"
-
-        lines.append(f"{prefix_char} {termcolor.colored(object_key, 'blue')}")
-
-    if len(tree.objects) > tree_object_count:
-        if tree.folders:
-            prefix_char = "├─"
-        else:
-            prefix_char = "└─"
-
-        extra_objects = f"...{len(tree.objects) - 3} other objects"
-        lines.append(f"{prefix_char} {termcolor.colored(extra_objects, 'blue')}")
-
-    for i, (folder_name, folder_tree) in enumerate(
-        sorted(tree.folders.items()), start=1
-    ):
-        if tree.path == "":
-            full_path = folder_name
-        else:
-            full_path = "/".join([tree.path, folder_name])
-
-        if len(tree.folders) > i:
-            folder_prefix_char = "├─"
-            sub_prefix_char = "│   "
-        else:
-            folder_prefix_char = "└─"
-            sub_prefix_char = "    "
-
-        lines.append(
-            folder_prefix_char
-            + " "
-            + create_link_text(
-                url=f"https://eu-west-1.console.aws.amazon.com/s3/buckets/{bucket}?prefix={full_path}/&showversions=false",
-                label=f"{folder_name}/",
-            )
-        )
-        lines.extend(
-            [
-                f"{sub_prefix_char}{ln}"
-                for ln in pprint_s3tree(bucket=bucket, tree=folder_tree)
-            ]
-        )
-
-    return lines
-
-
-if __name__ == "__main__":
-    args = parse_args()
-
-    s3_prefix = parse_s3_uri(args.S3_URI)
-
-    sess = create_s3_session(args.S3_URI)
-
-    s3_objects = list(list_s3_objects(sess, **s3_prefix))
-
-    if not s3_objects:
-        print("(no objects)")
-        sys.exit(1)
-
-    keys = [s3_obj["Key"] for s3_obj in s3_objects if s3_obj["Size"] > 0]
-
-    tree = build_s3_tree(keys)
-
-    print("\n".join(pprint_s3tree(bucket=s3_prefix["Bucket"], tree=tree)))
-
-    print("")
-    total_size = sum(s3_obj["Size"] for s3_obj in s3_objects)
-    last_modified = max(s3_obj["LastModified"] for s3_obj in s3_objects)
-
-    if last_modified.date() == datetime.date.today():
-        last_modified_message = "today"
-    elif last_modified.year != datetime.date.today().year:
-        last_modified_message = f"in {last_modified.strftime('%B %Y')}"
-    else:
-        last_modified_message = last_modified.strftime("%d %B")
-
-    print(
-        termcolor.colored(
-            f'{humanize.intcomma(len(s3_objects))} object{"s" if len(s3_objects) > 1 else ""}, '
-            f"totalling {humanize.naturalsize(total_size)}, "
-            f"last modified {last_modified_message}",
-            "green",
-        )
-    )
+_ensure_aws_credentials_are_fresh
+s3tree.py "$@"

aws/s3tree.py (6) → aws/s3tree.py (0)

diff --git a/aws/s3tree.py b/aws/s3tree.py
deleted file mode 120000
index 086f1fa..0000000
--- a/aws/s3tree.py
+++ /dev/null
@@ -1 +0,0 @@
-s3tree
\ No newline at end of file

aws/s3tree.py (0) → aws/s3tree.py (8144)

diff --git a/aws/s3tree.py b/aws/s3tree.py
new file mode 100755
index 0000000..d6a722e
--- /dev/null
+++ b/aws/s3tree.py
@@ -0,0 +1,281 @@
+#!/usr/bin/env python3
+"""
+Prints a tree showing the structure of an S3 prefix.
+
+This is meant to give me an overview of what's in a prefix, not
+a complete listing.  Here's an example of what the output looks like:
+
+    .
+    └─ digitised/
+        └─ b12840889/
+            └─ v1/
+                ├─ bag-info.txt
+                ├─ bagit.txt
+                ├─ manifest-sha256.txt
+                ├─ tagmanifest-sha256.txt
+                └─ data/
+                    ├─ b12840889.xml
+                    ├─ b12840889_0001.xml
+                    └─ objects/
+                        ├─ b12840889_0001_0001.jp2
+                        ├─ b12840889_0001_0002.jp2
+                        ├─ b12840889_0001_0003.jp2
+                        └─ ...2785 other objects
+
+The folder names link to the S3 console, so I can jump into exploring the
+objects in more detail if useful.
+
+"""
+
+import argparse
+import collections
+import datetime
+import os
+import sys
+from typing import List
+
+import attr
+import boto3
+import humanize
+import natsort
+import termcolor
+
+from _common import create_s3_session, parse_s3_uri
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        prog="s3tree", description="Print a summary tree of an S3 prefix"
+    )
+
+    parser.add_argument("S3_URI")
+
+    return parser.parse_args()
+
+
+def list_s3_objects(sess, **kwargs):
+    s3 = sess.client("s3")
+
+    for page in s3.get_paginator("list_objects_v2").paginate(**kwargs):
+        yield from page.get("Contents", [])
+
+
+def create_link_text(*, url, label):
+    # Based on https://stackoverflow.com/a/71309268/1558022
+
+    # OSC 8 ; params ; URI ST <name> OSC 8 ;; ST
+    return f"\033]8;;{url}\033\\{label}\033]8;;\033\\"
+
+
+def pprint_nested_tree(bucket, tree, folder_counts, parents=None):
+    lines = []
+    parents = parents or []
+
+    if not parents:
+        lines.append(".")
+
+    entries = sorted(tree.items())
+
+    for i, (key, nested_tree) in enumerate(entries, start=1):
+        if parents:
+            full_path = f'{"/".join(parents)}/{key}'
+        else:
+            full_path = key
+        if isinstance(key, str):
+            label = create_link_text(
+                url=f"https://eu-west-1.console.aws.amazon.com/s3/buckets/{bucket}?prefix={full_path}/&showversions=false",
+                label=f"{key}/",
+            )
+        else:
+            label = key
+
+        if full_path in folder_counts:
+            obj_count_line = termcolor.colored(
+                f"...plus {folder_counts[full_path]} object{'s' if folder_counts[full_path] > 1 else ''}",
+                "blue",
+            )
+
+            if i == len(entries) and nested_tree:
+                obj_count_line = f"    ├── {obj_count_line}"
+            elif i == len(entries):
+                obj_count_line = f"    └── {obj_count_line}"
+            elif nested_tree:
+                obj_count_line = f"│   ├── {obj_count_line}"
+            else:
+                obj_count_line = f"│   └── {obj_count_line}"
+        else:
+            obj_count_line = None
+
+        if i == len(entries):
+            lines.append("└── " + label)
+
+            if obj_count_line is not None:
+                lines.append(obj_count_line)
+
+            lines.extend(
+                [
+                    "    " + l
+                    for l in pprint_nested_tree(
+                        bucket,
+                        nested_tree,
+                        folder_counts=folder_counts,
+                        parents=parents + [key],
+                    )
+                ]
+            )
+        else:
+            lines.append("├── " + label)
+
+            if obj_count_line is not None:
+                lines.append(obj_count_line)
+
+            lines.extend(
+                [
+                    "│   " + l
+                    for l in pprint_nested_tree(
+                        bucket,
+                        nested_tree,
+                        folder_counts=folder_counts,
+                        parents=parents + [key],
+                    )
+                ]
+            )
+
+    return lines
+
+
+@attr.s
+class S3Folder:
+    path: str = attr.ib()
+    objects: List[str] = attr.ib(factory=list)
+    folders = attr.ib(factory=dict)  # Mapping[str, S3Folder]
+
+
+def build_s3_tree(keys, path=None):
+    path = path or []
+
+    tree = S3Folder(path="/".join(path))
+
+    per_folder_keys = collections.defaultdict(list)
+
+    for k in keys:
+        if "/" in k:
+            folder_name, entry_name = k.split("/", 1)
+            per_folder_keys[folder_name].append(entry_name)
+        else:
+            per_folder_keys["."].append(k)
+
+    assert sum(len(entries) for entries in per_folder_keys.values()) == len(keys)
+
+    tree.objects = natsort.natsort(per_folder_keys.pop(".", []))
+
+    for folder_name, folder_keys in per_folder_keys.items():
+        tree.folders[folder_name] = build_s3_tree(
+            folder_keys, path=path + [folder_name]
+        )
+
+    return tree
+
+
+def pprint_s3tree(*, bucket, tree):
+    lines = []
+
+    # If we're at the top of the tree, we want to print a '.'
+    if tree.path == "":
+        lines.append(".")
+
+    # Start by printing any objects that are in this folder.  Print up to
+    # 4 objects, otherwise print 3 and then '...X other objects'
+    if len(tree.objects) == 4:
+        tree_object_count = 4
+    else:
+        tree_object_count = 3
+
+    for i, object_key in enumerate(sorted(tree.objects[:tree_object_count]), start=1):
+        if tree.folders or len(tree.objects) > i:
+            prefix_char = "├─"
+        else:
+            prefix_char = "└─"
+
+        lines.append(f"{prefix_char} {termcolor.colored(object_key, 'blue')}")
+
+    if len(tree.objects) > tree_object_count:
+        if tree.folders:
+            prefix_char = "├─"
+        else:
+            prefix_char = "└─"
+
+        extra_objects = f"...{len(tree.objects) - 3} other objects"
+        lines.append(f"{prefix_char} {termcolor.colored(extra_objects, 'blue')}")
+
+    for i, (folder_name, folder_tree) in enumerate(
+        sorted(tree.folders.items()), start=1
+    ):
+        if tree.path == "":
+            full_path = folder_name
+        else:
+            full_path = "/".join([tree.path, folder_name])
+
+        if len(tree.folders) > i:
+            folder_prefix_char = "├─"
+            sub_prefix_char = "│   "
+        else:
+            folder_prefix_char = "└─"
+            sub_prefix_char = "    "
+
+        lines.append(
+            folder_prefix_char
+            + " "
+            + create_link_text(
+                url=f"https://eu-west-1.console.aws.amazon.com/s3/buckets/{bucket}?prefix={full_path}/&showversions=false",
+                label=f"{folder_name}/",
+            )
+        )
+        lines.extend(
+            [
+                f"{sub_prefix_char}{ln}"
+                for ln in pprint_s3tree(bucket=bucket, tree=folder_tree)
+            ]
+        )
+
+    return lines
+
+
+if __name__ == "__main__":
+    args = parse_args()
+
+    s3_prefix = parse_s3_uri(args.S3_URI)
+
+    sess = create_s3_session(args.S3_URI)
+
+    s3_objects = list(list_s3_objects(sess, **s3_prefix))
+
+    if not s3_objects:
+        print("(no objects)")
+        sys.exit(1)
+
+    keys = [s3_obj["Key"] for s3_obj in s3_objects if s3_obj["Size"] > 0]
+
+    tree = build_s3_tree(keys)
+
+    print("\n".join(pprint_s3tree(bucket=s3_prefix["Bucket"], tree=tree)))
+
+    print("")
+    total_size = sum(s3_obj["Size"] for s3_obj in s3_objects)
+    last_modified = max(s3_obj["LastModified"] for s3_obj in s3_objects)
+
+    if last_modified.date() == datetime.date.today():
+        last_modified_message = "today"
+    elif last_modified.year != datetime.date.today().year:
+        last_modified_message = f"in {last_modified.strftime('%B %Y')}"
+    else:
+        last_modified_message = last_modified.strftime("%d %B")
+
+    print(
+        termcolor.colored(
+            f'{humanize.intcomma(len(s3_objects))} object{"s" if len(s3_objects) > 1 else ""}, '
+            f"totalling {humanize.naturalsize(total_size)}, "
+            f"last modified {last_modified_message}",
+            "green",
+        )
+    )