get my AWS credentials first
- ID: c3a8fcd
- date: 2023-04-30 08:18:27+00:00
- author: Alex Chan <alex@alexwlchan.net>
- parent: 0f9d749
- message: get my AWS credentials first
- changed files: 7 files, 378 additions, 362 deletions
Changed files
aws/_common.py (1803) → aws/_common.py (1924)
diff --git a/aws/_common.py b/aws/_common.py
index 69be857..95e9424 100755
--- a/aws/_common.py
+++ b/aws/_common.py
@@ -7,6 +7,7 @@ import hyperlink
ACCOUNT_NAMES = {
"760097843905": "platform",
"299497370133": "workflow",
+ "975596993436": "storage",
}
@@ -40,7 +41,9 @@ def guess_account(s3_identifier):
"""
if "wellcomedigitalworkflow" in s3_identifier:
- account_id = '299497370133'
+ account_id = "299497370133"
+ if "wellcomecollection-storage" in s3_identifier:
+ account_id = "975596993436"
else:
return None
aws/bag (171) → aws/bag (106)
diff --git a/aws/bag b/aws/bag
index c400fcf..a2d1bda 100755
--- a/aws/bag
+++ b/aws/bag
@@ -3,4 +3,4 @@
set -o errexit
set -o nounset
-AWS_PROFILE=storage-read_only python3 ~/repos/pathscripts/aws/s3tree.py "s3://wellcomecollection-storage/digitised/$@"
+s3tree "s3://wellcomecollection-storage/digitised/$@"
aws/s3ls (2056) → aws/s3ls (99)
diff --git a/aws/s3ls b/aws/s3ls
index 5ab9569..4b3dfbd 100755
--- a/aws/s3ls
+++ b/aws/s3ls
@@ -1,82 +1,7 @@
-#!/usr/bin/env python3
-"""
-A script for listing all the objects in an S3 prefix.
+#!/usr/bin/env bash
-Objects are printed to stdout as JSON, one object per line.
-"""
+set -o errexit
+set -o nounset
-import argparse
-import datetime
-import json
-import sys
-
-import tqdm
-
-from _common import create_s3_session, parse_s3_uri
-
-
-def parse_args():
- parser = argparse.ArgumentParser(
- prog="s3ls", description="List all the objects in an S3 prefix"
- )
-
- parser.add_argument("S3_URI")
- parser.add_argument(
- "--with-versions",
- action="store_true",
- help="List every version of the objects in S3, not just the latest version",
- )
- parser.add_argument(
- "--start-after", help="Start listing objects at the given key", default=""
- )
-
- return parser.parse_args()
-
-
-class DatetimeEncoder(json.JSONEncoder):
- def default(self, obj):
- if isinstance(obj, datetime.datetime):
- return obj.isoformat()
-
-
-def get_objects(sess, **kwargs):
- """
- Generates every object in an S3 bucket.
- """
- paginator = sess.client("s3").get_paginator("list_objects_v2")
-
- for page in paginator.paginate(**kwargs):
- yield from page["Contents"]
-
-
-def get_object_versions(sess, **kwargs):
- """
- Generates every version of an object in an S3 bucket.
- """
- s3_client = sess.client("s3")
- paginator = s3_client.get_paginator("list_object_versions")
-
- for page in paginator.paginate(**kwargs):
- for key in ("Versions", "DeleteMarkers"):
- try:
- yield from page[key]
- except KeyError:
- pass
-
-
-if __name__ == "__main__":
- args = parse_args()
-
- s3_list_args = parse_s3_uri(args.S3_URI)
-
- sess = create_s3_session(args.S3_URI)
-
- if "--with-versions" in sys.argv:
- iterator = get_object_versions
- s3_list_args["KeyMarker"] = args.start_after
- else:
- iterator = get_objects
- s3_list_args["StartAfter"] = args.start_after
-
- for s3_obj in tqdm.tqdm(iterator(sess, **s3_list_args)):
- print(json.dumps(s3_obj, cls=DatetimeEncoder))
+_ensure_aws_credentials_are_fresh
+s3ls.py "$@"
aws/s3ls.py (0) → aws/s3ls.py (2056)
diff --git a/aws/s3ls.py b/aws/s3ls.py
new file mode 100755
index 0000000..5ab9569
--- /dev/null
+++ b/aws/s3ls.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""
+A script for listing all the objects in an S3 prefix.
+
+Objects are printed to stdout as JSON, one object per line.
+"""
+
+import argparse
+import datetime
+import json
+import sys
+
+import tqdm
+
+from _common import create_s3_session, parse_s3_uri
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ prog="s3ls", description="List all the objects in an S3 prefix"
+ )
+
+ parser.add_argument("S3_URI")
+ parser.add_argument(
+ "--with-versions",
+ action="store_true",
+ help="List every version of the objects in S3, not just the latest version",
+ )
+ parser.add_argument(
+ "--start-after", help="Start listing objects at the given key", default=""
+ )
+
+ return parser.parse_args()
+
+
+class DatetimeEncoder(json.JSONEncoder):
+ def default(self, obj):
+ if isinstance(obj, datetime.datetime):
+ return obj.isoformat()
+
+
+def get_objects(sess, **kwargs):
+ """
+ Generates every object in an S3 bucket.
+ """
+ paginator = sess.client("s3").get_paginator("list_objects_v2")
+
+ for page in paginator.paginate(**kwargs):
+ yield from page["Contents"]
+
+
+def get_object_versions(sess, **kwargs):
+ """
+ Generates every version of an object in an S3 bucket.
+ """
+ s3_client = sess.client("s3")
+ paginator = s3_client.get_paginator("list_object_versions")
+
+ for page in paginator.paginate(**kwargs):
+ for key in ("Versions", "DeleteMarkers"):
+ try:
+ yield from page[key]
+ except KeyError:
+ pass
+
+
+if __name__ == "__main__":
+ args = parse_args()
+
+ s3_list_args = parse_s3_uri(args.S3_URI)
+
+ sess = create_s3_session(args.S3_URI)
+
+ if "--with-versions" in sys.argv:
+ iterator = get_object_versions
+ s3_list_args["KeyMarker"] = args.start_after
+ else:
+ iterator = get_objects
+ s3_list_args["StartAfter"] = args.start_after
+
+ for s3_obj in tqdm.tqdm(iterator(sess, **s3_list_args)):
+ print(json.dumps(s3_obj, cls=DatetimeEncoder))
aws/s3tree (8144) → aws/s3tree (101)
diff --git a/aws/s3tree b/aws/s3tree
index d6a722e..63289f9 100755
--- a/aws/s3tree
+++ b/aws/s3tree
@@ -1,281 +1,7 @@
-#!/usr/bin/env python3
-"""
-Prints a tree showing the structure of an S3 prefix.
+#!/usr/bin/env bash
-This is meant to give me an overview of what's in a prefix, not
-a complete listing. Here's an example of what the output looks like:
+set -o errexit
+set -o nounset
- .
- └─ digitised/
- └─ b12840889/
- └─ v1/
- ├─ bag-info.txt
- ├─ bagit.txt
- ├─ manifest-sha256.txt
- ├─ tagmanifest-sha256.txt
- └─ data/
- ├─ b12840889.xml
- ├─ b12840889_0001.xml
- └─ objects/
- ├─ b12840889_0001_0001.jp2
- ├─ b12840889_0001_0002.jp2
- ├─ b12840889_0001_0003.jp2
- └─ ...2785 other objects
-
-The folder names link to the S3 console, so I can jump into exploring the
-objects in more detail if useful.
-
-"""
-
-import argparse
-import collections
-import datetime
-import os
-import sys
-from typing import List
-
-import attr
-import boto3
-import humanize
-import natsort
-import termcolor
-
-from _common import create_s3_session, parse_s3_uri
-
-
-def parse_args():
- parser = argparse.ArgumentParser(
- prog="s3tree", description="Print a summary tree of an S3 prefix"
- )
-
- parser.add_argument("S3_URI")
-
- return parser.parse_args()
-
-
-def list_s3_objects(sess, **kwargs):
- s3 = sess.client("s3")
-
- for page in s3.get_paginator("list_objects_v2").paginate(**kwargs):
- yield from page.get("Contents", [])
-
-
-def create_link_text(*, url, label):
- # Based on https://stackoverflow.com/a/71309268/1558022
-
- # OSC 8 ; params ; URI ST <name> OSC 8 ;; ST
- return f"\033]8;;{url}\033\\{label}\033]8;;\033\\"
-
-
-def pprint_nested_tree(bucket, tree, folder_counts, parents=None):
- lines = []
- parents = parents or []
-
- if not parents:
- lines.append(".")
-
- entries = sorted(tree.items())
-
- for i, (key, nested_tree) in enumerate(entries, start=1):
- if parents:
- full_path = f'{"/".join(parents)}/{key}'
- else:
- full_path = key
- if isinstance(key, str):
- label = create_link_text(
- url=f"https://eu-west-1.console.aws.amazon.com/s3/buckets/{bucket}?prefix={full_path}/&showversions=false",
- label=f"{key}/",
- )
- else:
- label = key
-
- if full_path in folder_counts:
- obj_count_line = termcolor.colored(
- f"...plus {folder_counts[full_path]} object{'s' if folder_counts[full_path] > 1 else ''}",
- "blue",
- )
-
- if i == len(entries) and nested_tree:
- obj_count_line = f" ├── {obj_count_line}"
- elif i == len(entries):
- obj_count_line = f" └── {obj_count_line}"
- elif nested_tree:
- obj_count_line = f"│ ├── {obj_count_line}"
- else:
- obj_count_line = f"│ └── {obj_count_line}"
- else:
- obj_count_line = None
-
- if i == len(entries):
- lines.append("└── " + label)
-
- if obj_count_line is not None:
- lines.append(obj_count_line)
-
- lines.extend(
- [
- " " + l
- for l in pprint_nested_tree(
- bucket,
- nested_tree,
- folder_counts=folder_counts,
- parents=parents + [key],
- )
- ]
- )
- else:
- lines.append("├── " + label)
-
- if obj_count_line is not None:
- lines.append(obj_count_line)
-
- lines.extend(
- [
- "│ " + l
- for l in pprint_nested_tree(
- bucket,
- nested_tree,
- folder_counts=folder_counts,
- parents=parents + [key],
- )
- ]
- )
-
- return lines
-
-
-@attr.s
-class S3Folder:
- path: str = attr.ib()
- objects: List[str] = attr.ib(factory=list)
- folders = attr.ib(factory=dict) # Mapping[str, S3Folder]
-
-
-def build_s3_tree(keys, path=None):
- path = path or []
-
- tree = S3Folder(path="/".join(path))
-
- per_folder_keys = collections.defaultdict(list)
-
- for k in keys:
- if "/" in k:
- folder_name, entry_name = k.split("/", 1)
- per_folder_keys[folder_name].append(entry_name)
- else:
- per_folder_keys["."].append(k)
-
- assert sum(len(entries) for entries in per_folder_keys.values()) == len(keys)
-
- tree.objects = natsort.natsort(per_folder_keys.pop(".", []))
-
- for folder_name, folder_keys in per_folder_keys.items():
- tree.folders[folder_name] = build_s3_tree(
- folder_keys, path=path + [folder_name]
- )
-
- return tree
-
-
-def pprint_s3tree(*, bucket, tree):
- lines = []
-
- # If we're at the top of the tree, we want to print a '.'
- if tree.path == "":
- lines.append(".")
-
- # Start by printing any objects that are in this folder. Print up to
- # 4 objects, otherwise print 3 and then '...X other objects'
- if len(tree.objects) == 4:
- tree_object_count = 4
- else:
- tree_object_count = 3
-
- for i, object_key in enumerate(sorted(tree.objects[:tree_object_count]), start=1):
- if tree.folders or len(tree.objects) > i:
- prefix_char = "├─"
- else:
- prefix_char = "└─"
-
- lines.append(f"{prefix_char} {termcolor.colored(object_key, 'blue')}")
-
- if len(tree.objects) > tree_object_count:
- if tree.folders:
- prefix_char = "├─"
- else:
- prefix_char = "└─"
-
- extra_objects = f"...{len(tree.objects) - 3} other objects"
- lines.append(f"{prefix_char} {termcolor.colored(extra_objects, 'blue')}")
-
- for i, (folder_name, folder_tree) in enumerate(
- sorted(tree.folders.items()), start=1
- ):
- if tree.path == "":
- full_path = folder_name
- else:
- full_path = "/".join([tree.path, folder_name])
-
- if len(tree.folders) > i:
- folder_prefix_char = "├─"
- sub_prefix_char = "│ "
- else:
- folder_prefix_char = "└─"
- sub_prefix_char = " "
-
- lines.append(
- folder_prefix_char
- + " "
- + create_link_text(
- url=f"https://eu-west-1.console.aws.amazon.com/s3/buckets/{bucket}?prefix={full_path}/&showversions=false",
- label=f"{folder_name}/",
- )
- )
- lines.extend(
- [
- f"{sub_prefix_char}{ln}"
- for ln in pprint_s3tree(bucket=bucket, tree=folder_tree)
- ]
- )
-
- return lines
-
-
-if __name__ == "__main__":
- args = parse_args()
-
- s3_prefix = parse_s3_uri(args.S3_URI)
-
- sess = create_s3_session(args.S3_URI)
-
- s3_objects = list(list_s3_objects(sess, **s3_prefix))
-
- if not s3_objects:
- print("(no objects)")
- sys.exit(1)
-
- keys = [s3_obj["Key"] for s3_obj in s3_objects if s3_obj["Size"] > 0]
-
- tree = build_s3_tree(keys)
-
- print("\n".join(pprint_s3tree(bucket=s3_prefix["Bucket"], tree=tree)))
-
- print("")
- total_size = sum(s3_obj["Size"] for s3_obj in s3_objects)
- last_modified = max(s3_obj["LastModified"] for s3_obj in s3_objects)
-
- if last_modified.date() == datetime.date.today():
- last_modified_message = "today"
- elif last_modified.year != datetime.date.today().year:
- last_modified_message = f"in {last_modified.strftime('%B %Y')}"
- else:
- last_modified_message = last_modified.strftime("%d %B")
-
- print(
- termcolor.colored(
- f'{humanize.intcomma(len(s3_objects))} object{"s" if len(s3_objects) > 1 else ""}, '
- f"totalling {humanize.naturalsize(total_size)}, "
- f"last modified {last_modified_message}",
- "green",
- )
- )
+_ensure_aws_credentials_are_fresh
+s3tree.py "$@"
aws/s3tree.py (6) → aws/s3tree.py (0)
diff --git a/aws/s3tree.py b/aws/s3tree.py
deleted file mode 120000
index 086f1fa..0000000
--- a/aws/s3tree.py
+++ /dev/null
@@ -1 +0,0 @@
-s3tree
\ No newline at end of file
aws/s3tree.py (0) → aws/s3tree.py (8144)
diff --git a/aws/s3tree.py b/aws/s3tree.py
new file mode 100755
index 0000000..d6a722e
--- /dev/null
+++ b/aws/s3tree.py
@@ -0,0 +1,281 @@
+#!/usr/bin/env python3
+"""
+Prints a tree showing the structure of an S3 prefix.
+
+This is meant to give me an overview of what's in a prefix, not
+a complete listing. Here's an example of what the output looks like:
+
+ .
+ └─ digitised/
+ └─ b12840889/
+ └─ v1/
+ ├─ bag-info.txt
+ ├─ bagit.txt
+ ├─ manifest-sha256.txt
+ ├─ tagmanifest-sha256.txt
+ └─ data/
+ ├─ b12840889.xml
+ ├─ b12840889_0001.xml
+ └─ objects/
+ ├─ b12840889_0001_0001.jp2
+ ├─ b12840889_0001_0002.jp2
+ ├─ b12840889_0001_0003.jp2
+ └─ ...2785 other objects
+
+The folder names link to the S3 console, so I can jump into exploring the
+objects in more detail if useful.
+
+"""
+
+import argparse
+import collections
+import datetime
+import os
+import sys
+from typing import List
+
+import attr
+import boto3
+import humanize
+import natsort
+import termcolor
+
+from _common import create_s3_session, parse_s3_uri
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ prog="s3tree", description="Print a summary tree of an S3 prefix"
+ )
+
+ parser.add_argument("S3_URI")
+
+ return parser.parse_args()
+
+
+def list_s3_objects(sess, **kwargs):
+ s3 = sess.client("s3")
+
+ for page in s3.get_paginator("list_objects_v2").paginate(**kwargs):
+ yield from page.get("Contents", [])
+
+
+def create_link_text(*, url, label):
+ # Based on https://stackoverflow.com/a/71309268/1558022
+
+ # OSC 8 ; params ; URI ST <name> OSC 8 ;; ST
+ return f"\033]8;;{url}\033\\{label}\033]8;;\033\\"
+
+
+def pprint_nested_tree(bucket, tree, folder_counts, parents=None):
+ lines = []
+ parents = parents or []
+
+ if not parents:
+ lines.append(".")
+
+ entries = sorted(tree.items())
+
+ for i, (key, nested_tree) in enumerate(entries, start=1):
+ if parents:
+ full_path = f'{"/".join(parents)}/{key}'
+ else:
+ full_path = key
+ if isinstance(key, str):
+ label = create_link_text(
+ url=f"https://eu-west-1.console.aws.amazon.com/s3/buckets/{bucket}?prefix={full_path}/&showversions=false",
+ label=f"{key}/",
+ )
+ else:
+ label = key
+
+ if full_path in folder_counts:
+ obj_count_line = termcolor.colored(
+ f"...plus {folder_counts[full_path]} object{'s' if folder_counts[full_path] > 1 else ''}",
+ "blue",
+ )
+
+ if i == len(entries) and nested_tree:
+ obj_count_line = f" ├── {obj_count_line}"
+ elif i == len(entries):
+ obj_count_line = f" └── {obj_count_line}"
+ elif nested_tree:
+ obj_count_line = f"│ ├── {obj_count_line}"
+ else:
+ obj_count_line = f"│ └── {obj_count_line}"
+ else:
+ obj_count_line = None
+
+ if i == len(entries):
+ lines.append("└── " + label)
+
+ if obj_count_line is not None:
+ lines.append(obj_count_line)
+
+ lines.extend(
+ [
+ " " + l
+ for l in pprint_nested_tree(
+ bucket,
+ nested_tree,
+ folder_counts=folder_counts,
+ parents=parents + [key],
+ )
+ ]
+ )
+ else:
+ lines.append("├── " + label)
+
+ if obj_count_line is not None:
+ lines.append(obj_count_line)
+
+ lines.extend(
+ [
+ "│ " + l
+ for l in pprint_nested_tree(
+ bucket,
+ nested_tree,
+ folder_counts=folder_counts,
+ parents=parents + [key],
+ )
+ ]
+ )
+
+ return lines
+
+
+@attr.s
+class S3Folder:
+ path: str = attr.ib()
+ objects: List[str] = attr.ib(factory=list)
+ folders = attr.ib(factory=dict) # Mapping[str, S3Folder]
+
+
+def build_s3_tree(keys, path=None):
+ path = path or []
+
+ tree = S3Folder(path="/".join(path))
+
+ per_folder_keys = collections.defaultdict(list)
+
+ for k in keys:
+ if "/" in k:
+ folder_name, entry_name = k.split("/", 1)
+ per_folder_keys[folder_name].append(entry_name)
+ else:
+ per_folder_keys["."].append(k)
+
+ assert sum(len(entries) for entries in per_folder_keys.values()) == len(keys)
+
+ tree.objects = natsort.natsort(per_folder_keys.pop(".", []))
+
+ for folder_name, folder_keys in per_folder_keys.items():
+ tree.folders[folder_name] = build_s3_tree(
+ folder_keys, path=path + [folder_name]
+ )
+
+ return tree
+
+
+def pprint_s3tree(*, bucket, tree):
+ lines = []
+
+ # If we're at the top of the tree, we want to print a '.'
+ if tree.path == "":
+ lines.append(".")
+
+ # Start by printing any objects that are in this folder. Print up to
+ # 4 objects, otherwise print 3 and then '...X other objects'
+ if len(tree.objects) == 4:
+ tree_object_count = 4
+ else:
+ tree_object_count = 3
+
+ for i, object_key in enumerate(sorted(tree.objects[:tree_object_count]), start=1):
+ if tree.folders or len(tree.objects) > i:
+ prefix_char = "├─"
+ else:
+ prefix_char = "└─"
+
+ lines.append(f"{prefix_char} {termcolor.colored(object_key, 'blue')}")
+
+ if len(tree.objects) > tree_object_count:
+ if tree.folders:
+ prefix_char = "├─"
+ else:
+ prefix_char = "└─"
+
+ extra_objects = f"...{len(tree.objects) - 3} other objects"
+ lines.append(f"{prefix_char} {termcolor.colored(extra_objects, 'blue')}")
+
+ for i, (folder_name, folder_tree) in enumerate(
+ sorted(tree.folders.items()), start=1
+ ):
+ if tree.path == "":
+ full_path = folder_name
+ else:
+ full_path = "/".join([tree.path, folder_name])
+
+ if len(tree.folders) > i:
+ folder_prefix_char = "├─"
+ sub_prefix_char = "│ "
+ else:
+ folder_prefix_char = "└─"
+ sub_prefix_char = " "
+
+ lines.append(
+ folder_prefix_char
+ + " "
+ + create_link_text(
+ url=f"https://eu-west-1.console.aws.amazon.com/s3/buckets/{bucket}?prefix={full_path}/&showversions=false",
+ label=f"{folder_name}/",
+ )
+ )
+ lines.extend(
+ [
+ f"{sub_prefix_char}{ln}"
+ for ln in pprint_s3tree(bucket=bucket, tree=folder_tree)
+ ]
+ )
+
+ return lines
+
+
+if __name__ == "__main__":
+ args = parse_args()
+
+ s3_prefix = parse_s3_uri(args.S3_URI)
+
+ sess = create_s3_session(args.S3_URI)
+
+ s3_objects = list(list_s3_objects(sess, **s3_prefix))
+
+ if not s3_objects:
+ print("(no objects)")
+ sys.exit(1)
+
+ keys = [s3_obj["Key"] for s3_obj in s3_objects if s3_obj["Size"] > 0]
+
+ tree = build_s3_tree(keys)
+
+ print("\n".join(pprint_s3tree(bucket=s3_prefix["Bucket"], tree=tree)))
+
+ print("")
+ total_size = sum(s3_obj["Size"] for s3_obj in s3_objects)
+ last_modified = max(s3_obj["LastModified"] for s3_obj in s3_objects)
+
+ if last_modified.date() == datetime.date.today():
+ last_modified_message = "today"
+ elif last_modified.year != datetime.date.today().year:
+ last_modified_message = f"in {last_modified.strftime('%B %Y')}"
+ else:
+ last_modified_message = last_modified.strftime("%d %B")
+
+ print(
+ termcolor.colored(
+ f'{humanize.intcomma(len(s3_objects))} object{"s" if len(s3_objects) > 1 else ""}, '
+ f"totalling {humanize.naturalsize(total_size)}, "
+ f"last modified {last_modified_message}",
+ "green",
+ )
+ )