Add a script for highlighting long lines in a text file

ID

3762875

date

2024-01-11 09:43:32+00:00

author

Alex Chan <alex@alexwlchan.net>

parent

04c2fd6

message

Add a script for highlighting long lines in a text file

changed files

2 files, 51 additions, 1 deletion

text/README.md
text/longlines

Changed files

text/README.md (7303) → text/README.md (7650)

diff --git a/text/README.md b/text/README.md
index 0ff1035..3a0dc21 100644
--- a/text/README.md
+++ b/text/README.md
@@ -45,6 +45,10 @@ scripts = [
         """,
     },
     {
+        "usage": "longlines [PATH]",
+        "description": "print the line numbers of the longest lines in the file."
+    },
+    {
         "usage": "midline [PATH]",
         "description": "print the line in the middle of a file, e.g. if the file has 5 lines, it prints line 3"
     },
@@ -144,6 +148,15 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
   </dd>
 
   <dt>
+    <a href="https://github.com/alexwlchan/scripts/blob/main/text/longlines">
+      <code>longlines [PATH]</code>
+    </a>
+  </dt>
+  <dd>
+    print the line numbers of the longest lines in the file.
+  </dd>
+
+  <dt>
     <a href="https://github.com/alexwlchan/scripts/blob/main/text/midline">
       <code>midline [PATH]</code>
     </a>
@@ -237,4 +250,4 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
     "codepoints. This is a Docker wrapper around <a href="https://github.com/lunasorcery/utf8info">a tool of the same name</a> by @lunasorcery.
   </dd>
 </dl>
-<!-- [[[end]]] (checksum: bc564280bcf97df83fad1af796670312) -->
\ No newline at end of file
+<!-- [[[end]]] (checksum: ca2352d23f305660d76ce3000c8e98df) -->
\ No newline at end of file

text/longlines (0) → text/longlines (1120)

diff --git a/text/longlines b/text/longlines
new file mode 100755
index 0000000..7927d2f
--- /dev/null
+++ b/text/longlines
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+# Print the line numbers of the longest lines in the file.  Also highlight
+# lines which are duplicates.
+#
+# This is useful when I have a large file that I need to reduce in size;
+# I can target the biggest bits first.
+
+import collections, sys
+import hashlib
+
+if __name__ == '__main__':
+    path = sys.argv[1]
+
+    # Maps line_length -> dict[hash, line_nos]; identical lines share a hash.
+    line_hashes = collections.defaultdict(
+        lambda: collections.defaultdict(list)
+    )
+
+    # Read as bytes, so lengths count the newline; `with` closes the file.
+    with open(path, "rb") as infile:
+        for lineno, line in enumerate(infile, start=1):
+            line_hashes[len(line)][hashlib.md5(line).hexdigest()].append(lineno)
+
+    printed_lines = 0
+    # Longest lengths first; within a length, most-duplicated lines first.
+    for length, lines in sorted(line_hashes.items(), reverse=True):
+        for lineset in sorted(lines.values(), key=len, reverse=True):
+            print(f'L{lineset[0]}\t => {length} chars')
+            for dup in lineset[1:]:
+                print(f' -> L{dup}')
+
+            printed_lines += len(lineset)
+            if printed_lines >= 10:
+                break
+
+        if printed_lines >= 10:  # stop once ten or more line numbers are shown
+            break