#!/usr/bin/python3
# Provenance -- this script comes from the following Samba patch:
#
#   commit 7804570a379f29809a0b7540b6d94abc51d4046c
#   From: Douglas Bagnall
#   Date: Tue, 22 Nov 2022 08:35:14 +1300
#   Subject: [PATCH] lib/compression: script to test 3 byte hash
#
#   Compression uses a 3 byte hash to remember LZ77 matches in a 14-bit
#   table. This script runs the hash over all 16M combinations, then
#   again over all ASCII combinations, counting collisions to find
#   hot-spots.
#
#   If you think you have a better hash, you are probably right, but you
#   should try it here -- alter h() -- before committing to it. This one
#   is literally the first one I thought of.
#
#   Signed-off-by: Douglas Bagnall
#   Reviewed-by: Joseph Sutton
#
# The patch creates this file (mode 100755) as
# lib/compression/tests/scripts/three-byte-hash and adds that path to
# EXCLUDE_USAGE in python/samba/tests/usage.py.
"""Print statistics about a certain three byte hash.

USAGE: three_byte_hash
"""
import sys
from itertools import product
from statistics import mean, median, pstdev

# Byte values used for the "ASCII" pass: the printable range 32..126
# plus CR, LF, TAB and NUL.
ASCII_BYTES = frozenset(range(32, 127)) | frozenset(b'\r\n\t\0')


def h(*args, bits=12):
    """Return the hash of three byte values, truncated to `bits` bits.

    Only args[0], args[1] and args[2] are used. They are mixed with
    fixed constants using additions, shifts and XORs, and the low
    `bits` bits of the result are returned, so the value is always in
    range(1 << bits).
    """
    a = args[0]
    b = args[1] ^ 0x2e
    c = args[2] ^ 0x55
    d = ((a + b) << 8) ^ (((c - a) & 0xffff) << 5) ^ (c + b) ^ (0xcab + a)
    return d & ((1 << bits) - 1)


def count(fn, bits, filter=None):
    """Hash every three-byte combination with `fn` and print statistics.

    fn     -- called as fn(a, b, c, bits=bits); must return an index in
              range(1 << bits).
    bits   -- width of the hash; there are 1 << bits buckets.
    filter -- optional predicate on a single byte value. When given,
              only triples in which all three values satisfy it are
              hashed. (The name shadows the builtin, but it is part of
              the keyword interface, so it is kept.)

    Prints the bucket count, the expected load per bucket, and the
    median, mean, min, max and population standard deviation of the
    per-bucket counts. Returns None.
    """
    if filter:
        # Evaluate the predicate once per byte value rather than up to
        # three times for each of the 16M triples.
        byte_values = [v for v in range(256) if filter(v)]
    else:
        byte_values = range(256)

    counts = [0] * (1 << bits)
    for a, b, c in product(byte_values, repeat=3):
        counts[fn(a, b, c, bits=bits)] += 1

    # The expected load depends on how many triples were actually
    # hashed; it is 1 << 24 only when nothing is filtered out.
    total = len(byte_values) ** 3

    print(f" {bits} bits; {len(counts)} buckets, "
          f"expected {total / len(counts)}")
    print(f"median {median(counts)}")
    print(f"mean {mean(counts)}")
    print(f"min {min(counts)}")
    print(f"max {max(counts)}")
    print(f"stddev {pstdev(counts)}")


def main(argv):
    """Run the full report; return the process exit status.

    Any command line argument prints the usage text. The status is 0
    if -h or --help was among the arguments and 1 otherwise.
    """
    wants_help = '--help' in argv or '-h' in argv
    if wants_help or len(argv) > 1:
        print(__doc__)
        return 0 if wants_help else 1

    for bits in (12, 13, 14):
        count(h, bits)

        print("With ASCII filter")
        count(h, bits, filter=ASCII_BYTES.__contains__)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))