Add a script for highlighting long lines in a text file
- ID: 3762875
- date: 2024-01-11 09:43:32+00:00
- author: Alex Chan <alex@alexwlchan.net>
- parent: 04c2fd6
- message: Add a script for highlighting long lines in a text file
- changed files: 2 files, 51 additions, 1 deletion
Changed files
text/README.md (7303) → text/README.md (7650)
diff --git a/text/README.md b/text/README.md
index 0ff1035..3a0dc21 100644
--- a/text/README.md
+++ b/text/README.md
@@ -45,6 +45,10 @@ scripts = [
""",
},
{
+ "usage": "longlines [PATH]",
+ "description": "print the line numbers of the longest lines in the file."
+ },
+ {
"usage": "midline [PATH]",
"description": "print the line in the middle of a file, e.g. if the file has 5 lines, it prints line 3"
},
@@ -144,6 +148,15 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
</dd>
<dt>
+ <a href="https://github.com/alexwlchan/scripts/blob/main/text/longlines">
+ <code>longlines [PATH]</code>
+ </a>
+ </dt>
+ <dd>
+ print the line numbers of the longest lines in the file.
+ </dd>
+
+ <dt>
<a href="https://github.com/alexwlchan/scripts/blob/main/text/midline">
<code>midline [PATH]</code>
</a>
@@ -237,4 +250,4 @@ cog_helpers.create_description_table(folder_name=folder_name, scripts=scripts)
"codepoints. This is a Docker wrapper around <a href="https://github.com/lunasorcery/utf8info">a tool of the same name</a> by @lunasorcery.
</dd>
</dl>
-<!-- [[[end]]] (checksum: bc564280bcf97df83fad1af796670312) -->
\ No newline at end of file
+<!-- [[[end]]] (checksum: ca2352d23f305660d76ce3000c8e98df) -->
\ No newline at end of file
text/longlines (0) → text/longlines (1120)
diff --git a/text/longlines b/text/longlines
new file mode 100755
index 0000000..7927d2f
--- /dev/null
+++ b/text/longlines
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+# Print the line numbers of the longest lines in the file. Also highlight
+# lines which are duplicates.
+#
+# This is useful when I have a large file that I need to reduce in size;
+# I can target the biggest bits first.
+
+import collections, sys
+import hashlib
+
+if __name__ == '__main__':
+ path = sys.argv[1]
+
+ # This is a dict line_length -> dict[hash, line_nos]
+ line_hashes = collections.defaultdict(
+ lambda: collections.defaultdict(list)
+ )
+
+ for lineno, line in enumerate(open(path, "rb"), start=1):
+ line_hashes[len(line)][hashlib.md5(line).hexdigest()].append(lineno)
+
+ printed_lines = 0
+
+ for length, lines in sorted(line_hashes.items(), reverse=True):
+ for lineset in sorted(lines.values(), key=lambda v: len(v), reverse=True):
+ print(f'L{lineset[0]}\t => {length} chars')
+ if len(lineset) > 1:
+ for l in lineset[1:]:
+ print(f' -> L{l}')
+
+ printed_lines += len(lineset)
+
+ if printed_lines >= 10:
+ break
+
+ if printed_lines >= 10:
+ break