samba-mirror/lib/compression/tests/scripts/make-test-vectors
commit dadecede54 by Douglas Bagnall: lib/compression: helper script to make unbalanced data
Huffman tree re-quantisation and perhaps other code paths are only
triggered by pathological data like this.

Signed-off-by: Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
Reviewed-by: Joseph Sutton <josephsutton@catalyst.net.nz>
2022-12-01 22:56:39 +00:00
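
The commit message above points at Huffman tree re-quantisation. As a rough standalone illustration (not part of the script that follows, and assuming the 15-bit code-length cap used by the MS-XCA LZ77+Huffman format that lib/compression implements), Fibonacci-weighted frequencies like those produced by fib_shuffle() below give a maximally unbalanced tree whose deepest codes far exceed that cap, which is what forces the encoder to flatten the tree:

# Rough illustration only -- not part of make-test-vectors.  Build plain
# Huffman code lengths for Fibonacci-weighted symbol frequencies (the same
# shape fib_shuffle() below produces) and show that the deepest code is far
# longer than a 15-bit cap, which is what forces re-quantisation.
import heapq


def huffman_code_lengths(freqs):
    # Each heap entry is (total weight, [symbols under this subtree]).
    heap = [(f, [sym]) for sym, f in enumerate(freqs) if f]
    lengths = [0] * len(freqs)
    heapq.heapify(heap)
    while len(heap) > 1:
        fa, a = heapq.heappop(heap)
        fb, b = heapq.heappop(heap)
        for sym in a + b:
            lengths[sym] += 1       # every merge deepens these symbols
        heapq.heappush(heap, (fa + fb, a + b))
    return lengths


fib_freqs, a, b = [], 1, 1
while sum(fib_freqs) < 1000000:     # same cut-off as fib_shuffle()
    fib_freqs.append(a)
    a, b = a + b, a

print(max(huffman_code_lengths(fib_freqs)))   # well over 15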

#!/usr/bin/python3
"""Generate a few strings with unbalanced distributions to test the
regeneration of the Huffman tree when it gets too deep.

USAGE: make-test-vectors DIR

This will fill up DIR with test files.
"""
import sys
import random
from collections import defaultdict

if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2:
    print(__doc__)
    exit(len(sys.argv) != 2)

DIR = sys.argv[1]

SIZE = (1 << 17) + (23)  # two and a bit blocks
SIZE_NAME = "128k+"
# SIZE = (1 << 16)
# SIZE_NAME = "64"

random.seed(1)


def squares(n):
    # Product of two uniform randoms: strongly weighted towards small bytes.
    array = []
    for i in range(n):
        a = random.random()
        b = random.random()
        array.append(int(a * b * 256))
    return bytes(array)


def skewed_choices(n):
    # Byte value b occurs with probability proportional to b (0 never occurs).
    b = list(range(256))
    array = random.choices(b, weights=b, k=n)
    return bytes(array)


def fib_shuffle(n):
    # Fibonacci-weighted symbol frequencies: close to the worst case for
    # Huffman tree depth.
    array = []
    a, b = 1, 1
    for i in range(100):
        array.extend([i] * a)
        a, b = a + b, a
        if len(array) > 1000000:
            break
    random.shuffle(array)
    return bytes(array[:n])


def exp_shuffle(n):
    # Exponentially growing symbol frequencies (1.04 ** i).
    array = []
    for i in range(256):
        array.extend([i] * int(1.04 ** i))
        if len(array) > 1000000:
            break
    random.shuffle(array)
    return bytes(array[:n])


def and_rand(n):
    # Bitwise AND of two uniform bytes: biased towards bytes with few set bits.
    array = []
    for i in range(n):
        a = random.randrange(256)
        b = random.randrange(256)
        array.append(a & b)
    return bytes(array)


def betavar(n, a, b):
    # Bytes drawn from a beta distribution, scaled to 0-255.
    array = []
    for i in range(n):
        x = random.betavariate(a, b)
        array.append(int(x * 255.999999999999))
    return bytes(array)


def repeated_alphabet(n):
    # The lowercase alphabet repeated to length n.
    a = b'abcdefghijklmnopqrstuvwxyz'
    na = n // len(a) + 1
    s = a * na
    return s[:n]


def decayed_alphabet(n):
    # The repeated alphabet with each byte value 0-255 written to one
    # random position.
    s = list(repeated_alphabet(n))
    for i in range(256):
        j = random.randrange(n)
        s[j] = i
    return bytes(s)


def trigram_model(n):
    # An order-2 Markov model trained on this script's own bytes.
    with open(__file__, 'rb') as f:
        data = f.read()
    lut = defaultdict(list)
    for a, b, c in zip(data, data[1:], data[2:]):
        k = bytes([a, b])
        lut[k].append(c)

    k = random.choice(list(lut.keys()))
    s = []
    p = k[1]
    for i in range(n + 10):
        c = random.choice(lut[k])
        s.append(c)
        k = bytes([p, c])
        p = c

    return bytes(s[10:])


def trigram_sum_model(n):
    # Like trigram_model(), but keyed on the sum of the two preceding bytes,
    # with every bucket seeded so a successor always exists.
    with open(__file__, 'rb') as f:
        data = f.read()
    lut = [[random.randrange(256)] for i in range(512)]
    for a, b, c in zip(data, data[1:], data[2:]):
        lut[a + b].append(c)

    s = []
    i = random.randrange(len(data) - 1)
    a = data[i]
    b = data[i + 1]
    for i in range(n + 10):
        x = lut[a + b]
        c = random.choice(x)
        s.append(c)
        a = b
        b = c

    return bytes(s[10:])


def the_classics():
    # this used to be main()
    sq = squares(SIZE)
    ch = skewed_choices(SIZE)
    fs = fib_shuffle(SIZE)
    es = exp_shuffle(SIZE)
    ar = and_rand(SIZE)
    bv1 = betavar(SIZE, 0.1, 1.5)
    bv2 = betavar(SIZE, 0.5, 2.0)
    bv3 = betavar(SIZE, 0.05, 0.05)
    print("  n    sq    ch    fs    es    ar   bv1   bv2   bv3")
    for i in range(256):
        print(f"{i:3} {sq.count(i):5} {ch.count(i):5} "
              f"{fs.count(i):5} {es.count(i):5} "
              f"{ar.count(i):5} {bv1.count(i):5} "
              f"{bv2.count(i):5} {bv3.count(i):5}")

    for series, fn in ((sq, "square_series"),
                       (ch, "skewed_choices"),
                       (fs, "fib_shuffle"),
                       (es, "exp_shuffle"),
                       (ar, "and_rand"),
                       (bv1, "beta-variate1"),
                       (bv2, "beta-variate2"),
                       (bv3, "beta-variate3"),
                       ):
        with open(f"{DIR}/{fn}-{SIZE_NAME}", "wb") as f:
            f.write(series)


def main():
    if True:
        the_classics()

    for series, fn in ((decayed_alphabet(SIZE), "decayed_alphabet"),
                       (trigram_model(SIZE), "trigram"),
                       (trigram_sum_model(SIZE), "trigram_sum"),
                       ):
        with open(f"{DIR}/{fn}_{SIZE_NAME}", "wb") as f:
            f.write(series)


main()
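
After running the script (USAGE above), a quick way to confirm that a vector really is skewed is a byte histogram. This is a hypothetical post-run check, not part of the script; the example filename follows the hyphenated pattern the_classics() writes:

# Hypothetical post-run check, not part of make-test-vectors: show the most
# and least common byte values in one generated vector to confirm that the
# distribution is unbalanced.  Pass the file as the only argument,
# e.g. DIR/fib_shuffle-128k+ as written by the_classics().
import sys
from collections import Counter

with open(sys.argv[1], "rb") as f:
    counts = Counter(f.read())

print(counts.most_common(5))       # heavily over-represented bytes
print(counts.most_common()[-5:])   # the rarest bytes seen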