Prints a tree showing the structure of an S3 prefix.

This is meant to give me an overview of what's in a prefix, not
a complete listing. Here's an example of what the output looks like:
14 ├─ manifest-sha256.txt
15 ├─ tagmanifest-sha256.txt
20 ├─ b12840889_0001_0001.jp2
21 ├─ b12840889_0001_0002.jp2
22 ├─ b12840889_0001_0003.jp2
23 └─ ...2785 other objects
The folder names link to the S3 console, so I can jump into exploring the
objects in more detail if useful.
== Interesting features ==

* It tries to pick an appropriate IAM role based on the bucket name
  (this only works for some buckets, and only for ones I have access to).
* The folder names are all clickable links that go to the S3 console,
  so I can jump into more detailed inspection.
* It does natural sorting of S3 keys rather than alphabetical sorting, which
  is useful when I have lots of numeric-esque keys like in the example.
45from typing import List
52from _common import create_link_text, create_s3_session, parse_s3_uri
# NOTE(review): the enclosing `def` line and the closing parenthesis of the
# ArgumentParser(...) call are not present in this extract; restore them from
# the original file before relying on this block.
#
# Build the command-line parser; the program is presented to users as "s3tree".
56 parser = argparse.ArgumentParser(
57 prog="s3tree", description="Print a summary tree of an S3 prefix"
# A single required positional argument, read by the script body as
# `args.S3_URI`.
60 parser.add_argument("S3_URI")
# Parse the process's command-line arguments and return the namespace.
62 return parser.parse_args()
def list_s3_objects(sess, **kwargs):
    """
    Lazily yield every object entry from a paginated S3 listing.

    ``sess`` must provide ``client("s3")`` (presumably a boto3 session).
    All keyword arguments are forwarded unchanged to the
    ``list_objects_v2`` paginator, e.g. ``Bucket=...`` and ``Prefix=...``.

    Yields the items of each page's ``"Contents"`` list, in page order.
    """
    s3 = sess.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    for page in paginator.paginate(**kwargs):
        # A page is allowed to have no "Contents" key; treat that as an
        # empty page rather than raising KeyError.
        yield from page.get("Contents", [])
# NOTE(review): the class header for these fields (and anything declared
# before them, such as the `path` attribute that other code reads and sets)
# is not present in this extract.
#
# Names of the objects that sit directly inside this folder.  `factory=list`
# gives each instance its own empty list by default.
75 objects: List[str] = attr.ib(factory=list)
# Sub-folders of this folder, keyed by folder name.  `factory=dict` gives each
# instance its own empty dict by default.
76 folders = attr.ib(factory=dict) # Mapping[str, S3Folder]
# Build an S3Folder describing `keys`, where `path` is the list of folder
# names leading to this level of the tree.
#
# NOTE(review): several original lines are missing from this extract: the
# handling of the `path=None` default, the loop over `keys`, the condition
# separating the two grouping branches below, the closing parenthesis of the
# recursive call, and the final return.  Restore them from the original file.
79def build_s3_tree(keys, path=None):
# The folder's own path is the "/"-joined list of names above it.
82 tree = S3Folder(path="/".join(path))
# Group keys by the first path component they live under.
84 per_folder_keys = collections.defaultdict(list)
# Split at the first "/" only, so the remainder keeps any deeper structure
# for the recursive call.
88 folder_name, entry_name = k.split("/", 1)
89 per_folder_keys[folder_name].append(entry_name)
# Keys filed under "." are treated as objects of this folder itself
# (presumably the keys that contain no "/"; the condition line is missing).
91 per_folder_keys["."].append(k)
# Sanity check: every key was filed into exactly one group.
93 assert sum(len(entries) for entries in per_folder_keys.values()) == len(keys)
# Objects directly in this folder; popping "." leaves only real sub-folders.
# NOTE(review): in the published natsort package `natsort.natsort` is a
# submodule and the sorting function is `natsort.natsorted`; confirm this
# name resolves to a callable here.
95 tree.objects = natsort.natsort(per_folder_keys.pop(".", []))
# Recurse into each sub-folder with the path extended by its name.
97 for folder_name, folder_keys in per_folder_keys.items():
98 tree.folders[folder_name] = build_s3_tree(
99 folder_keys, path=path + [folder_name]
# Render `tree` (an S3Folder) as a list of text lines in a `tree`-style
# layout, recursing into sub-folders.  Both parameters are keyword-only.
#
# NOTE(review): many original lines are missing from this extract, including
# the initialisation of `lines`, several `else:` lines, the assignments to
# `prefix_char`, the opening of the link-building call and the return.
# Restore them from the original file.
105def pprint_s3tree(*, bucket, tree):
108 # If we're at the top of the tree, we want to print a '.'
112 # Start by printing any objects that are in this folder. Print up to
113 # 4 objects, otherwise print 3 and then '...X other objects'
114 if len(tree.objects) == 4:
115 tree_object_count = 4
# Presumably the `else:` branch of the check above (that line is missing).
117 tree_object_count = 3
# NOTE(review): `sorted` here is plain lexicographic ordering applied to the
# slice; if `tree.objects` was stored in natural order this can reorder it
# (e.g. "10" before "2").  Confirm that is intended.
119 for i, object_key in enumerate(sorted(tree.objects[:tree_object_count]), start=1):
# True when something else is printed after this object at this level:
# either sub-folders or further objects.  Presumably selects `prefix_char`
# (the branch bodies are missing).
120 if tree.folders or len(tree.objects) > i:
125 lines.append(f"{prefix_char} {termcolor.colored(object_key, 'blue')}")
# More objects exist than were printed, so add a summary row.
127 if len(tree.objects) > tree_object_count:
133 # if there's only one more object left in the folder, we should
134 # just print it rather than '...1 other object'
# Holds because exactly 4 objects are printed in full above, so reaching
# this point means there are at least 5.
135 assert len(tree.objects) - 3 > 1
137 extra_objects = f"...{len(tree.objects) - 3} other objects"
138 lines.append(f"{prefix_char} {termcolor.colored(extra_objects, 'blue')}")
# Sub-folders follow the objects.
140 for i, folder_name in enumerate(natsort.natsort(tree.folders), start=1):
141 folder_tree = tree.folders[folder_name]
# Two alternative ways of forming the folder's full key prefix; presumably
# chosen by whether `tree.path` is empty (the condition lines are missing).
144 full_path = folder_name
146 full_path = "/".join([tree.path, folder_name])
# Every folder but the last gets a tee connector, and its children are
# indented behind a vertical bar; the last gets a corner and blank padding.
148 if len(tree.folders) > i:
149 folder_prefix_char = "├─"
150 sub_prefix_char = "│ "
152 folder_prefix_char = "└─"
153 sub_prefix_char = " "
# Keyword arguments of a link-building call whose opening line is missing
# (presumably `create_link_text`).
# NOTE(review): the console region is hard-coded to eu-west-1 and `full_path`
# is interpolated into the query string without URL-encoding.
159 url=f"https://eu-west-1.console.aws.amazon.com/s3/buckets/{bucket}?prefix={full_path}/&showversions=false",
160 label=f"{folder_name}/",
# Render the sub-folder recursively and indent each of its lines.
165 f"{sub_prefix_char}{ln}"
166 for ln in pprint_s3tree(bucket=bucket, tree=folder_tree)
# Script entry point: list the objects under the given S3 URI, print the
# tree, then print a one-line summary.
#
# NOTE(review): several original lines are missing from this extract,
# including the assignment of `args`, the emptiness check around the
# "(no objects)" message, the opening of the `keys` comprehension and the
# opening of the final print call.  Restore them from the original file.
173if __name__ == "__main__":
176 s3_location = parse_s3_uri(args.S3_URI)
# Keyword arguments passed to the listing call below.
177 s3_prefix = {"Bucket": s3_location["Bucket"], "Prefix": s3_location["Path"]}
179 sess = create_s3_session(args.S3_URI)
# Materialise the whole listing, because it is traversed several times below.
181 s3_objects = list(list_s3_objects(sess, **s3_prefix))
184 print("(no objects)")
# Keep an object unless it is both zero-sized and has a key ending in "/"
# (presumably placeholder "folder" objects).
189 for s3_obj in s3_objects
190 if s3_obj["Size"] > 0 or not s3_obj["Key"].endswith("/")
193 tree = build_s3_tree(keys)
195 print("\n".join(pprint_s3tree(bucket=s3_prefix["Bucket"], tree=tree)))
# The totals are computed over every listed object, including any that the
# filter above excluded from the tree.
198 total_size = sum(s3_obj["Size"] for s3_obj in s3_objects)
199 last_modified = max(s3_obj["LastModified"] for s3_obj in s3_objects)
# NOTE(review): `datetime.date.today()` is the local date, while
# `last_modified` is presumably a timezone-aware UTC datetime; the two can
# disagree around midnight.  Confirm.
201 if last_modified.date() == datetime.date.today():
202 last_modified_message = "today"
203 elif last_modified.year != datetime.date.today().year:
204 last_modified_message = f"in {last_modified.strftime('%B %Y')}"
# NOTE(review): "%-d" (day without zero padding) is a platform-specific
# strftime extension and is not available on every platform.
206 last_modified_message = last_modified.strftime("%-d %B")
# Summary line: object count (pluralised when more than one), total size and
# the last-modified description chosen above.
210 f"{humanize.intcomma(len(s3_objects))} object{'s' if len(s3_objects) > 1 else ''}, "
211 f"totalling {humanize.naturalsize(total_size)}, "
212 f"last modified {last_modified_message}",