Skip to main content

aws/s3tree.py

#!/usr/bin/env python3
"""
Prints a tree showing the structure of an S3 prefix.

This is meant to give me an overview of what's in a prefix, not
a complete listing.  Here's an example of what the output looks like:

    .
    └─ digitised/
       └─ b12840889/
          └─ v1/
             ├─ bag-info.txt
             ├─ bagit.txt
             ├─ manifest-sha256.txt
             ├─ tagmanifest-sha256.txt
             └─ data/
                ├─ b12840889.xml
                ├─ b12840889_0001.xml
                └─ objects/
                   ├─ b12840889_0001_0001.jp2
                   ├─ b12840889_0001_0002.jp2
                   ├─ b12840889_0001_0003.jp2
                   └─ ...2785 other objects

The folder names link to the S3 console, so I can jump into exploring the
objects in more detail if useful.

== Interesting features ==

* It tries to pick an appropriate IAM role based on the bucket name
  (this only works for some buckets, and ones I have access to).

* The folder names are all clickable links that go to the S3 console,
  so I can jump into more detailed inspection.

* It does natural sorting of S3 keys rather than alphabetical, which is
  useful when I have lots of numeric-esque keys like in the example.

"""
import argparse
import collections
import datetime
import sys
import urllib.parse
from typing import List

import attr
import humanize
import natsort
import termcolor

from _common import create_link_text, create_s3_session, parse_s3_uri
def parse_args():
    """
    Parse the command-line arguments for s3tree.

    The script takes a single positional argument, ``S3_URI``, which is
    available as the ``S3_URI`` attribute of the returned namespace.
    """
    cli = argparse.ArgumentParser(
        description="Print a summary tree of an S3 prefix",
        prog="s3tree",
    )
    cli.add_argument("S3_URI")

    parsed = cli.parse_args()
    return parsed
def list_s3_objects(sess, **kwargs):
    """
    Generate every object entry returned by a paginated ListObjectsV2 call.

    ``sess`` must provide ``client("s3")``; ``kwargs`` are passed straight
    through to the paginator's ``paginate()`` call.  Pages that have no
    "Contents" entry contribute nothing.
    """
    paginator = sess.client("s3").get_paginator("list_objects_v2")

    for response in paginator.paginate(**kwargs):
        for s3_obj in response.get("Contents", []):
            yield s3_obj
@attr.s
class S3Folder:
    """
    One "folder" (key prefix) in the tree built from an S3 listing.

    ``path`` is the slash-joined path of this folder ("" for the root),
    ``objects`` holds the names of the objects directly inside it, and
    ``folders`` maps each immediate sub-folder name to its own S3Folder.
    """

    path: str = attr.ib()
    objects: List[str] = attr.ib(default=attr.Factory(list))
    folders = attr.ib(default=attr.Factory(dict))  # Mapping[str, S3Folder]
def build_s3_tree(keys, path=None):
    """
    Group a flat collection of S3 keys into a nested S3Folder tree.

    Each key is split on its first "/": keys without a slash become objects
    of this folder, and the remainder of every other key is handed to a
    recursive call for the sub-folder named by the part before the slash.

    :param keys: iterable of key strings, relative to ``path``.
    :param path: list of folder names leading to this level; omit (or pass
        None) for the root of the tree.
    :returns: an S3Folder whose ``objects`` are naturally sorted and whose
        ``folders`` maps sub-folder names to their own S3Folder trees.
    """
    path = path or []

    tree = S3Folder(path="/".join(path))

    # Objects are collected in their own list rather than under a "."
    # sentinel in the folder mapping, so a real folder called "." (as in
    # the key "./foo") can't be mistaken for "objects at this level".
    object_names = []
    per_folder_keys = collections.defaultdict(list)

    for k in keys:
        folder_name, slash, entry_name = k.partition("/")
        if slash:
            per_folder_keys[folder_name].append(entry_name)
        else:
            object_names.append(k)

    # `natsort.natsort` is the package's submodule, not a callable;
    # the sorting function is `natsort.natsorted`.
    tree.objects = natsort.natsorted(object_names)

    for folder_name, folder_keys in per_folder_keys.items():
        tree.folders[folder_name] = build_s3_tree(
            folder_keys, path=[*path, folder_name]
        )

    return tree
def pprint_s3tree(*, bucket, tree):
    """
    Render an S3Folder tree as a list of text lines using box-drawing chars.

    Objects directly inside the folder are listed first, then each
    sub-folder is rendered recursively underneath a link to that prefix in
    the S3 console.

    :param bucket: name of the S3 bucket; only used to build console links.
    :param tree: the S3Folder to render.
    :returns: a list of strings, one per line of output, without newlines.
    """
    lines = []

    # If we're at the top of the tree, we want to print a '.'
    if tree.path == "":
        lines.append(".")

    # Natural sort, so "file2" is listed before "file10".  (A plain
    # `sorted()` here would undo the natural ordering.)
    objects = natsort.natsorted(tree.objects)
    total_objects = len(objects)

    # Start by printing any objects that are in this folder.  Print up to
    # 4 objects, otherwise print 3 and then '...X other objects'.  Showing
    # all 4 means we never print a pointless '...1 other objects' line.
    shown_count = 4 if total_objects == 4 else 3
    shown_objects = objects[:shown_count]
    hidden_count = total_objects - len(shown_objects)

    for i, object_key in enumerate(shown_objects, start=1):
        # Only the very last entry in a folder gets the '└─' corner.
        if tree.folders or total_objects > i:
            prefix_char = "├─"
        else:
            prefix_char = "└─"

        lines.append(f"{prefix_char} {termcolor.colored(object_key, 'blue')}")

    if hidden_count > 0:
        prefix_char = "├─" if tree.folders else "└─"
        extra_objects = f"...{hidden_count} other objects"
        lines.append(f"{prefix_char} {termcolor.colored(extra_objects, 'blue')}")

    folder_names = natsort.natsorted(tree.folders)

    for i, folder_name in enumerate(folder_names, start=1):
        folder_tree = tree.folders[folder_name]

        if tree.path == "":
            full_path = folder_name
        else:
            full_path = "/".join([tree.path, folder_name])

        # Both continuation prefixes are three columns wide, the same as
        # "├─ ", so that nested entries line up whichever one is used.
        if len(folder_names) > i:
            folder_prefix_char = "├─"
            sub_prefix_char = "│  "
        else:
            folder_prefix_char = "└─"
            sub_prefix_char = "   "

        # Percent-encode the prefix so keys containing characters such as
        # spaces, '&' or '#' don't corrupt the query string.
        quoted_prefix = urllib.parse.quote(f"{full_path}/", safe="/")
        console_url = (
            f"https://eu-west-1.console.aws.amazon.com/s3/buckets/{bucket}"
            f"?prefix={quoted_prefix}&showversions=false"
        )

        lines.append(
            folder_prefix_char
            + " "
            + create_link_text(url=console_url, label=f"{folder_name}/")
        )
        lines.extend(
            f"{sub_prefix_char}{ln}"
            for ln in pprint_s3tree(bucket=bucket, tree=folder_tree)
        )

    return lines
if __name__ == "__main__":
    # Script entry point: list everything under the given S3 URI, print the
    # summary tree, then a one-line footer with count, size and freshness.
    args = parse_args()

    s3_location = parse_s3_uri(args.S3_URI)
    s3_prefix = {"Bucket": s3_location["Bucket"], "Prefix": s3_location["Path"]}

    sess = create_s3_session(args.S3_URI)

    s3_objects = list(list_s3_objects(sess, **s3_prefix))

    if not s3_objects:
        print("(no objects)")
        sys.exit(1)

    # Leave zero-byte keys ending in "/" out of the tree; they are still
    # counted in the totals below, which are computed over `s3_objects`.
    keys = [
        s3_obj["Key"]
        for s3_obj in s3_objects
        if s3_obj["Size"] > 0 or not s3_obj["Key"].endswith("/")
    ]

    tree = build_s3_tree(keys)

    print("\n".join(pprint_s3tree(bucket=s3_prefix["Bucket"], tree=tree)))

    print("")
    object_count = len(s3_objects)
    total_size = sum(s3_obj["Size"] for s3_obj in s3_objects)

    # Convert to the local timezone before comparing against the local
    # calendar date; otherwise "today" can be wrong around midnight.
    # NOTE(review): assumes LastModified is a timezone-aware datetime, as
    # boto3 documents -- confirm if the listing source ever changes.
    last_modified = max(s3_obj["LastModified"] for s3_obj in s3_objects).astimezone()
    today = datetime.date.today()

    if last_modified.date() == today:
        last_modified_message = "today"
    elif last_modified.year != today.year:
        last_modified_message = f"in {last_modified.strftime('%B %Y')}"
    else:
        # "%-d" is a platform-specific strftime extension (it fails on
        # Windows), so format the unpadded day number directly.
        last_modified_message = f"{last_modified.day} {last_modified.strftime('%B')}"

    print(
        termcolor.colored(
            f"{humanize.intcomma(object_count)} object{'s' if object_count > 1 else ''}, "
            f"totalling {humanize.naturalsize(total_size)}, "
            f"last modified {last_modified_message}",
            "green",
        )
    )