# Print the line numbers of the longest lines in a file, and highlight
# lines that are duplicates of one another.
# This is useful when a large file needs to be reduced in size: the
# biggest lines can be targeted first.
11if __name__ == '__main__':
14 # This is a dict line_length -> dict[hash, line_nos]
15 line_hashes = collections.defaultdict(
16 lambda: collections.defaultdict(list)
19 for lineno, line in enumerate(open(path, "rb"), start=1):
20 line_hashes[len(line)][hashlib.md5(line).hexdigest()].append(lineno)
24 for length, lines in sorted(line_hashes.items(), reverse=True):
25 for lineset in sorted(lines.values(), key=lambda v: len(v), reverse=True):
26 print(f'L{lineset[0]}\t => {length} chars')
31 printed_lines += len(lineset)
33 if printed_lines >= 10:
36 if printed_lines >= 10: