Skip to main content

text/longlines

#!/usr/bin/env python3
# Print the line numbers of the longest lines in the file. Also highlight
# lines which are duplicates.
#
# This is useful when I have a large file that I need to reduce in size;
# I can target the biggest bits first.
import collections
import hashlib
import sys


def find_longest_lines(path, limit=10):
    """Return the longest lines of the file at *path*, grouped with duplicates.

    The file is read in binary mode, so each reported length is a byte count
    that includes the line terminator. Lines with identical content (matched
    by MD5 digest) are collected into one group.

    Args:
        path: Path of the file to scan.
        limit: Stop once at least this many line numbers have been reported.
            The group that crosses the limit is still reported in full.

    Returns:
        A list of ``(length, line_numbers)`` tuples, longest first. Within one
        length, groups with more duplicates come first. ``line_numbers`` is a
        list of 1-based line numbers in file order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    if limit <= 0:
        return []

    # Maps line length -> {md5 hex digest -> [line numbers]}.
    line_hashes = collections.defaultdict(
        lambda: collections.defaultdict(list)
    )

    # The context manager closes the file even if reading fails part-way.
    with open(path, "rb") as infile:
        for lineno, line in enumerate(infile, start=1):
            digest = hashlib.md5(line).hexdigest()
            line_hashes[len(line)][digest].append(lineno)

    groups = []
    reported = 0
    for length in sorted(line_hashes, reverse=True):
        same_length = line_hashes[length].values()
        # sorted() is stable, so equally sized groups keep first-seen order.
        for lineset in sorted(same_length, key=len, reverse=True):
            groups.append((length, lineset))
            reported += len(lineset)
            if reported >= limit:
                return groups
    return groups


def main(argv):
    """Print the longest lines of the file named by ``argv[1]``.

    Args:
        argv: Command-line arguments in ``sys.argv`` form. Arguments after the
            path are ignored.

    Returns:
        Process exit status: 0 on success, 2 if no path was given.
    """
    if len(argv) < 2:
        prog = argv[0] if argv else 'longlines'
        print(f'usage: {prog} PATH', file=sys.stderr)
        return 2

    for length, lineset in find_longest_lines(argv[1]):
        first, *duplicates = lineset
        print(f'L{first}\t => {length} chars')
        for dup in duplicates:
            print(f' -> L{dup}')
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))