#!/usr/bin/env python3
"""Print the line numbers of the longest lines in a file.

Lines whose content is identical are grouped together, so duplicates are
highlighted underneath the first occurrence.

This is useful when I have a large file that I need to reduce in size;
I can target the biggest bits first.

Usage: pass the path of the file to inspect as the first argument.
"""

import collections
import hashlib
import sys

# Reporting stops once at least this many line numbers have been listed.
DEFAULT_LIMIT = 10


def _group_lines(path):
    """Return a mapping of line length -> {content hash: [line numbers]}.

    The file is read in binary mode, so a line's length is its size in
    bytes, including its line terminator.  Line numbers start at 1.
    """
    groups = collections.defaultdict(lambda: collections.defaultdict(list))
    # The context manager guarantees the handle is closed, even on error.
    with open(path, "rb") as infile:
        for lineno, line in enumerate(infile, start=1):
            # Key on a digest rather than on the line itself so that very
            # long lines are not retained in memory.
            digest = hashlib.md5(line).hexdigest()
            groups[len(line)][digest].append(lineno)
    return groups


def _longest_linesets(groups, limit):
    """Yield (length, line_numbers) pairs, longest lines first.

    Within one length, the sets with the most duplicates come first.
    Iteration stops after the set that brings the running total of listed
    line numbers to ``limit`` or more; the first set is always yielded.
    """
    reported = 0
    for length in sorted(groups, reverse=True):
        for lineset in sorted(groups[length].values(), key=len, reverse=True):
            yield length, lineset
            reported += len(lineset)
            if reported >= limit:
                return


def main(argv=None):
    """Report the longest lines of the file named by ``argv[1]``.

    ``argv`` defaults to ``sys.argv``.  Arguments after the path are
    ignored.  Returns the process exit status: 0 on success, 2 when no
    path was supplied.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        prog = argv[0] if argv else "longest_lines"
        print(f'usage: {prog} PATH', file=sys.stderr)
        return 2

    groups = _group_lines(argv[1])
    for length, lineset in _longest_linesets(groups, DEFAULT_LIMIT):
        first, *duplicates = lineset
        # NOTE: the figure printed as "chars" is the byte length of the line.
        print(f'L{first}\t => {length} chars')
        for lineno in duplicates:
            print(f' -> L{lineno}')
    return 0


if __name__ == '__main__':
    sys.exit(main())