ovirt-imageio/test/blkhash_test.py

# SPDX-FileCopyrightText: Red Hat, Inc.
# SPDX-License-Identifier: GPL-2.0-or-later

import hashlib
import pytest
from functools import partial

from ovirt_imageio._internal import blkhash

default_hash = partial(hashlib.blake2b, digest_size=32)


def test_algorithm_basic():
    h = default_hash()
    for i in range(10):
        block = (b"%02d\n" % i).ljust(blkhash.BLOCK_SIZE, b"\0")
        block_digest = default_hash(block).digest()
        h.update(block_digest)
    assert h.hexdigest() == (
        "7934079f80b53142d738d2bb7efaedf696a3d34d76a7865a24130bc7b4a7acfe"
    )


def test_algorithm_zero_optimization():
    # Hash the entire image. This is the case when we don't have extent
    # information, and do not use zero detection.
    zero_block = b"\0" * blkhash.BLOCK_SIZE
    h1 = default_hash()
    for i in range(10):
        block_digest = default_hash(zero_block).digest()
        h1.update(block_digest)

    # Hash a pre-computed digest instead of the actual bytes. Here we either
    # have extent information, or we detected that the blocks are zero blocks.
    h2 = default_hash()
    block_digest = default_hash(zero_block).digest()
    for i in range(10):
        h2.update(block_digest)

    # We must get the same checksum in both cases.
    assert h1.hexdigest() == h2.hexdigest()


def test_hasher_data():
    h1 = blkhash.Hash()
    for i in range(10):
        block = (b"%02d\n" % i).ljust(blkhash.BLOCK_SIZE, b"\0")
        h1.update(block)

    h2 = default_hash()
    for i in range(10):
        block = (b"%02d\n" % i).ljust(blkhash.BLOCK_SIZE, b"\0")
        block_digest = default_hash(block).digest()
        h2.update(block_digest)

    assert h1.hexdigest() == h2.hexdigest()


def test_hasher_zero():
    block = b"\0" * blkhash.BLOCK_SIZE

    h1 = blkhash.Hash()
    h1.update(block)
    h1.update(block)

    h2 = blkhash.Hash()
    h2.zero(len(block))
    h2.zero(len(block))

    assert h1.hexdigest() == h2.hexdigest()


@pytest.mark.parametrize("size,algorithm,digest_size,checksum", [
    # Files aligned to block size.
    (4 * 1024**2, "blake2b", 32,
        "f426bb2cf1e1901fe4e87423950944ecfed6d9d18a09e6e802aa4912e1c9b2d6"),
    (4 * 1024**2, "sha1", None,
        "3ed03b375b6658d99b63ced1867a95aeef080b79"),
    # Files not aligned to block size.
    (3 * 1024**2, "blake2b", 32,
        "42f3b76772a6d3dcffae2a24697721687975e2c60ddfd4ba7831ea9ce772ca71"),
    (3 * 1024**2, "sha1", None,
        "6cba43b908381be45a55ab9b4361f8370b928354"),
    (5 * 1024**2, "blake2b", 32,
        "0da53b583fc1fbbac7edea14454c79f84a8107613e614f2c7a47071dfdcf41a6"),
    (5 * 1024**2, "sha1", None,
        "d3936edd8e3a8ff10e8257a9f460d8da67838549"),
    # Empty file.
    (0, "blake2b", 32,
        "0e5751c026e543b2e8ab2eb06099daa1d1e5df47778f7787faab45cdf12fe3a8"),
    (0, "sha1", None,
        "da39a3ee5e6b4b0d3255bfef95601890afd80709"),
])
def test_checksum(tmpdir, size, algorithm, digest_size, checksum):
    path = str(tmpdir.join("file"))

    with open(path, "wb") as f:
        f.write(b"data")
        f.truncate(size)

    actual = blkhash.checksum(
        path,
        block_size=blkhash.BLOCK_SIZE,
        algorithm=algorithm,
        digest_size=digest_size)

    assert actual == {
        "algorithm": algorithm,
        "block_size": blkhash.BLOCK_SIZE,
        "checksum": checksum,
    }
reuse: addheader test/*.py Add SPDX header to python files with the 'py' extension in the test directory. Signed-off-by: Albert Esteve <aesteve@redhat.com> 2022-10-06 13:33:45 +02:00			`# SPDX-FileCopyrightText: Red Hat, Inc.`
			`# SPDX-License-Identifier: GPL-2.0-or-later`
checksum: Optimize zero hashing Use block based hashing algorithm: H( H(block 1) + H(block 2) + ... + H(block N) ) This is basically creating a hash list[1], and using the root hash as the result. The algorithm is similar to eD2k hash algorithm[2], but we support any algorithm and block size. The default algorithm is blake2b[3] and block size is 4 MiB. blake2b is as fast as sha-1, but secure at least as sha-3. When we don't have extents information, for example when using preallocated image, or sparse image on storage that does not report sparseness information (e.g. NFS < 4.2, GlusterFS with sharding) we detect zero blocks and optimize hashing. Checksum calculation time is limited by storage read throughput. If we have extent information, we can compute the hash for zero blocks without reading anything from storage, speeding up the calculation dramatically. When hashing zero blocks, instead of hashing entire block (4 MiB) we hash a precomputed digest bytes (32 bytes). This is up to 131072 times faster. Since the checksum depends on the block size, the response includes now also the block size: $ curl -k https://localhost:54322/images/nbd/checksum \| jq { "checksum": "061bbe365935437440f7372204b85acc4bfb76fe3fc20347a20b788bf445c131", "algorithm": "blake2b", "block_size": 4194304 } To compare the checksum to a local file, you must use the same algorithm and block_size: >> from ovirt_imageio import client >> client.checksum("disk.img", block_size=4194304, algorithm="blake2b") "061bbe365935437440f7372204b85acc4bfb76fe3fc20347a20b788bf445c131" To compare to a pre-computed checksum, the caller can specify the block_size using q query parameter: $ curl -k https://localhost:54322/images/nbd/checksum?block_size=2097152 \| jq { "checksum": "777b9c2f6598d503d43c14a39b31cdd8aee9f48b475f2af0f4c668e33297016c", "algorithm": "blake2b", "block_size": 2097152 } Here are initial results: tool fedora-32[4] full-6g[5] empty-6g[6] empty-100g[7] empty-1t[8] ---------------------------------------------------------------------------- checksum 2.84 3.04 0.03 0.06 0.28 b2sum[9] 8.29 8.42 8.48 161.00 1648.64 [1] https://en.wikipedia.org/wiki/Hash_list [2] https://en.wikipedia.org/wiki/Ed2k_URI_scheme#eD2k_hash_algorithm [3] https://blake2.net/ [4] virt-builder fedora-32 -o fedora-32.raw --root-password=password:root [5] dd if=/zero/zero bs=8M count=768 of=full-6g.raw [6] truncate -s 6g empty-6g.raw [7] truncate -s 100g empty-100g.raw [8] truncate -s 1t empty-1t.raw [9] b2sum --length 256 {path} Change-Id: I0661daf6a36e3eee57ef54128782e2a9aa11943e Signed-off-by: Nir Soffer <nsoffer@redhat.com> 2020-08-12 03:07:43 +03:00
			`import hashlib`
			`import pytest`
			`from functools import partial`

			`from ovirt_imageio._internal import blkhash`

			`default_hash = partial(hashlib.blake2b, digest_size=32)`


			`def test_algorithm_basic():`
			`h = default_hash()`
			`for i in range(10):`
			`block = (b"%02d\n" % i).ljust(blkhash.BLOCK_SIZE, b"\0")`
			`block_digest = default_hash(block).digest()`
			`h.update(block_digest)`
			`assert h.hexdigest() == (`
			`"7934079f80b53142d738d2bb7efaedf696a3d34d76a7865a24130bc7b4a7acfe"`
			`)`


			`def test_algorithm_zero_optimization():`
			`# Hash the entire image. This is the case when we don't have extent`
			`# information, and do not use zero detection.`
			`zero_block = b"\0" * blkhash.BLOCK_SIZE`
			`h1 = default_hash()`
			`for i in range(10):`
			`block_digest = default_hash(zero_block).digest()`
			`h1.update(block_digest)`

			`# Hash a pre-computed digest instead of the actual bytes. Here we either`
			`# have extent information, or we detected that the blocks are zero blocks.`
			`h2 = default_hash()`
			`block_digest = default_hash(zero_block).digest()`
			`for i in range(10):`
			`h2.update(block_digest)`

			`# We must get the same checksum in both cases.`
			`assert h1.hexdigest() == h2.hexdigest()`


			`def test_hasher_data():`
			`h1 = blkhash.Hash()`
			`for i in range(10):`
			`block = (b"%02d\n" % i).ljust(blkhash.BLOCK_SIZE, b"\0")`
			`h1.update(block)`

			`h2 = default_hash()`
			`for i in range(10):`
			`block = (b"%02d\n" % i).ljust(blkhash.BLOCK_SIZE, b"\0")`
			`block_digest = default_hash(block).digest()`
			`h2.update(block_digest)`

			`assert h1.hexdigest() == h2.hexdigest()`


			`def test_hasher_zero():`
			`block = b"\0" * blkhash.BLOCK_SIZE`

			`h1 = blkhash.Hash()`
			`h1.update(block)`
			`h1.update(block)`

			`h2 = blkhash.Hash()`
			`h2.zero(len(block))`
			`h2.zero(len(block))`

			`assert h1.hexdigest() == h2.hexdigest()`


checksum: Always return a dict Computing a checksum is meaningless without the algorithm and block size. Change checksum.Operation to return a dict with all the details so we get dict both from server and client. This also make the tests easier since we can compare the checksum retuned from blkhash.checksum(), client.checksum() and server without reconstructing the dict. Example output with this change with sdk example script: $ ./checksum_image.py snap3-full.qcow2 { "algorithm": "blake2b", "block_size": 4194304, "checksum": "0814cf35f454b705af354f1a9e123d9b40a9b1291afc3f5d9c34d7a45d86c87f" } Change-Id: I502e479af5d2d2be9396991b9b360aece96fa925 Signed-off-by: Nir Soffer <nsoffer@redhat.com> 2020-09-29 16:48:23 +03:00			`@pytest.mark.parametrize("size,algorithm,digest_size,checksum", [`
checksum: Optimize zero hashing Use block based hashing algorithm: H( H(block 1) + H(block 2) + ... + H(block N) ) This is basically creating a hash list[1], and using the root hash as the result. The algorithm is similar to eD2k hash algorithm[2], but we support any algorithm and block size. The default algorithm is blake2b[3] and block size is 4 MiB. blake2b is as fast as sha-1, but secure at least as sha-3. When we don't have extents information, for example when using preallocated image, or sparse image on storage that does not report sparseness information (e.g. NFS < 4.2, GlusterFS with sharding) we detect zero blocks and optimize hashing. Checksum calculation time is limited by storage read throughput. If we have extent information, we can compute the hash for zero blocks without reading anything from storage, speeding up the calculation dramatically. When hashing zero blocks, instead of hashing entire block (4 MiB) we hash a precomputed digest bytes (32 bytes). This is up to 131072 times faster. Since the checksum depends on the block size, the response includes now also the block size: $ curl -k https://localhost:54322/images/nbd/checksum \| jq { "checksum": "061bbe365935437440f7372204b85acc4bfb76fe3fc20347a20b788bf445c131", "algorithm": "blake2b", "block_size": 4194304 } To compare the checksum to a local file, you must use the same algorithm and block_size: >> from ovirt_imageio import client >> client.checksum("disk.img", block_size=4194304, algorithm="blake2b") "061bbe365935437440f7372204b85acc4bfb76fe3fc20347a20b788bf445c131" To compare to a pre-computed checksum, the caller can specify the block_size using q query parameter: $ curl -k https://localhost:54322/images/nbd/checksum?block_size=2097152 \| jq { "checksum": "777b9c2f6598d503d43c14a39b31cdd8aee9f48b475f2af0f4c668e33297016c", "algorithm": "blake2b", "block_size": 2097152 } Here are initial results: tool fedora-32[4] full-6g[5] empty-6g[6] empty-100g[7] empty-1t[8] ---------------------------------------------------------------------------- checksum 2.84 3.04 0.03 0.06 0.28 b2sum[9] 8.29 8.42 8.48 161.00 1648.64 [1] https://en.wikipedia.org/wiki/Hash_list [2] https://en.wikipedia.org/wiki/Ed2k_URI_scheme#eD2k_hash_algorithm [3] https://blake2.net/ [4] virt-builder fedora-32 -o fedora-32.raw --root-password=password:root [5] dd if=/zero/zero bs=8M count=768 of=full-6g.raw [6] truncate -s 6g empty-6g.raw [7] truncate -s 100g empty-100g.raw [8] truncate -s 1t empty-1t.raw [9] b2sum --length 256 {path} Change-Id: I0661daf6a36e3eee57ef54128782e2a9aa11943e Signed-off-by: Nir Soffer <nsoffer@redhat.com> 2020-08-12 03:07:43 +03:00			`# Files aligned to block size.`
			`(4 * 1024**2, "blake2b", 32,`
			`"f426bb2cf1e1901fe4e87423950944ecfed6d9d18a09e6e802aa4912e1c9b2d6"),`
			`(4 * 1024**2, "sha1", None,`
			`"3ed03b375b6658d99b63ced1867a95aeef080b79"),`
			`# Files not aligned to block size.`
			`(3 * 1024**2, "blake2b", 32,`
			`"42f3b76772a6d3dcffae2a24697721687975e2c60ddfd4ba7831ea9ce772ca71"),`
			`(3 * 1024**2, "sha1", None,`
			`"6cba43b908381be45a55ab9b4361f8370b928354"),`
			`(5 * 1024**2, "blake2b", 32,`
			`"0da53b583fc1fbbac7edea14454c79f84a8107613e614f2c7a47071dfdcf41a6"),`
			`(5 * 1024**2, "sha1", None,`
			`"d3936edd8e3a8ff10e8257a9f460d8da67838549"),`
			`# Empty file.`
			`(0, "blake2b", 32,`
			`"0e5751c026e543b2e8ab2eb06099daa1d1e5df47778f7787faab45cdf12fe3a8"),`
			`(0, "sha1", None,`
			`"da39a3ee5e6b4b0d3255bfef95601890afd80709"),`
			`])`
checksum: Always return a dict Computing a checksum is meaningless without the algorithm and block size. Change checksum.Operation to return a dict with all the details so we get dict both from server and client. This also make the tests easier since we can compare the checksum retuned from blkhash.checksum(), client.checksum() and server without reconstructing the dict. Example output with this change with sdk example script: $ ./checksum_image.py snap3-full.qcow2 { "algorithm": "blake2b", "block_size": 4194304, "checksum": "0814cf35f454b705af354f1a9e123d9b40a9b1291afc3f5d9c34d7a45d86c87f" } Change-Id: I502e479af5d2d2be9396991b9b360aece96fa925 Signed-off-by: Nir Soffer <nsoffer@redhat.com> 2020-09-29 16:48:23 +03:00			`def test_checksum(tmpdir, size, algorithm, digest_size, checksum):`
checksum: Optimize zero hashing Use block based hashing algorithm: H( H(block 1) + H(block 2) + ... + H(block N) ) This is basically creating a hash list[1], and using the root hash as the result. The algorithm is similar to eD2k hash algorithm[2], but we support any algorithm and block size. The default algorithm is blake2b[3] and block size is 4 MiB. blake2b is as fast as sha-1, but secure at least as sha-3. When we don't have extents information, for example when using preallocated image, or sparse image on storage that does not report sparseness information (e.g. NFS < 4.2, GlusterFS with sharding) we detect zero blocks and optimize hashing. Checksum calculation time is limited by storage read throughput. If we have extent information, we can compute the hash for zero blocks without reading anything from storage, speeding up the calculation dramatically. When hashing zero blocks, instead of hashing entire block (4 MiB) we hash a precomputed digest bytes (32 bytes). This is up to 131072 times faster. Since the checksum depends on the block size, the response includes now also the block size: $ curl -k https://localhost:54322/images/nbd/checksum \| jq { "checksum": "061bbe365935437440f7372204b85acc4bfb76fe3fc20347a20b788bf445c131", "algorithm": "blake2b", "block_size": 4194304 } To compare the checksum to a local file, you must use the same algorithm and block_size: >> from ovirt_imageio import client >> client.checksum("disk.img", block_size=4194304, algorithm="blake2b") "061bbe365935437440f7372204b85acc4bfb76fe3fc20347a20b788bf445c131" To compare to a pre-computed checksum, the caller can specify the block_size using q query parameter: $ curl -k https://localhost:54322/images/nbd/checksum?block_size=2097152 \| jq { "checksum": "777b9c2f6598d503d43c14a39b31cdd8aee9f48b475f2af0f4c668e33297016c", "algorithm": "blake2b", "block_size": 2097152 } Here are initial results: tool fedora-32[4] full-6g[5] empty-6g[6] empty-100g[7] empty-1t[8] ---------------------------------------------------------------------------- checksum 2.84 3.04 0.03 0.06 0.28 b2sum[9] 8.29 8.42 8.48 161.00 1648.64 [1] https://en.wikipedia.org/wiki/Hash_list [2] https://en.wikipedia.org/wiki/Ed2k_URI_scheme#eD2k_hash_algorithm [3] https://blake2.net/ [4] virt-builder fedora-32 -o fedora-32.raw --root-password=password:root [5] dd if=/zero/zero bs=8M count=768 of=full-6g.raw [6] truncate -s 6g empty-6g.raw [7] truncate -s 100g empty-100g.raw [8] truncate -s 1t empty-1t.raw [9] b2sum --length 256 {path} Change-Id: I0661daf6a36e3eee57ef54128782e2a9aa11943e Signed-off-by: Nir Soffer <nsoffer@redhat.com> 2020-08-12 03:07:43 +03:00			`path = str(tmpdir.join("file"))`

			`with open(path, "wb") as f:`
			`f.write(b"data")`
			`f.truncate(size)`

checksum: Always return a dict Computing a checksum is meaningless without the algorithm and block size. Change checksum.Operation to return a dict with all the details so we get dict both from server and client. This also make the tests easier since we can compare the checksum retuned from blkhash.checksum(), client.checksum() and server without reconstructing the dict. Example output with this change with sdk example script: $ ./checksum_image.py snap3-full.qcow2 { "algorithm": "blake2b", "block_size": 4194304, "checksum": "0814cf35f454b705af354f1a9e123d9b40a9b1291afc3f5d9c34d7a45d86c87f" } Change-Id: I502e479af5d2d2be9396991b9b360aece96fa925 Signed-off-by: Nir Soffer <nsoffer@redhat.com> 2020-09-29 16:48:23 +03:00			`actual = blkhash.checksum(`
checksum: Optimize zero hashing Use block based hashing algorithm: H( H(block 1) + H(block 2) + ... + H(block N) ) This is basically creating a hash list[1], and using the root hash as the result. The algorithm is similar to eD2k hash algorithm[2], but we support any algorithm and block size. The default algorithm is blake2b[3] and block size is 4 MiB. blake2b is as fast as sha-1, but secure at least as sha-3. When we don't have extents information, for example when using preallocated image, or sparse image on storage that does not report sparseness information (e.g. NFS < 4.2, GlusterFS with sharding) we detect zero blocks and optimize hashing. Checksum calculation time is limited by storage read throughput. If we have extent information, we can compute the hash for zero blocks without reading anything from storage, speeding up the calculation dramatically. When hashing zero blocks, instead of hashing entire block (4 MiB) we hash a precomputed digest bytes (32 bytes). This is up to 131072 times faster. Since the checksum depends on the block size, the response includes now also the block size: $ curl -k https://localhost:54322/images/nbd/checksum \| jq { "checksum": "061bbe365935437440f7372204b85acc4bfb76fe3fc20347a20b788bf445c131", "algorithm": "blake2b", "block_size": 4194304 } To compare the checksum to a local file, you must use the same algorithm and block_size: >> from ovirt_imageio import client >> client.checksum("disk.img", block_size=4194304, algorithm="blake2b") "061bbe365935437440f7372204b85acc4bfb76fe3fc20347a20b788bf445c131" To compare to a pre-computed checksum, the caller can specify the block_size using q query parameter: $ curl -k https://localhost:54322/images/nbd/checksum?block_size=2097152 \| jq { "checksum": "777b9c2f6598d503d43c14a39b31cdd8aee9f48b475f2af0f4c668e33297016c", "algorithm": "blake2b", "block_size": 2097152 } Here are initial results: tool fedora-32[4] full-6g[5] empty-6g[6] empty-100g[7] empty-1t[8] ---------------------------------------------------------------------------- checksum 2.84 3.04 0.03 0.06 0.28 b2sum[9] 8.29 8.42 8.48 161.00 1648.64 [1] https://en.wikipedia.org/wiki/Hash_list [2] https://en.wikipedia.org/wiki/Ed2k_URI_scheme#eD2k_hash_algorithm [3] https://blake2.net/ [4] virt-builder fedora-32 -o fedora-32.raw --root-password=password:root [5] dd if=/zero/zero bs=8M count=768 of=full-6g.raw [6] truncate -s 6g empty-6g.raw [7] truncate -s 100g empty-100g.raw [8] truncate -s 1t empty-1t.raw [9] b2sum --length 256 {path} Change-Id: I0661daf6a36e3eee57ef54128782e2a9aa11943e Signed-off-by: Nir Soffer <nsoffer@redhat.com> 2020-08-12 03:07:43 +03:00			`path,`
			`block_size=blkhash.BLOCK_SIZE,`
			`algorithm=algorithm,`
			`digest_size=digest_size)`

checksum: Always return a dict Computing a checksum is meaningless without the algorithm and block size. Change checksum.Operation to return a dict with all the details so we get dict both from server and client. This also make the tests easier since we can compare the checksum retuned from blkhash.checksum(), client.checksum() and server without reconstructing the dict. Example output with this change with sdk example script: $ ./checksum_image.py snap3-full.qcow2 { "algorithm": "blake2b", "block_size": 4194304, "checksum": "0814cf35f454b705af354f1a9e123d9b40a9b1291afc3f5d9c34d7a45d86c87f" } Change-Id: I502e479af5d2d2be9396991b9b360aece96fa925 Signed-off-by: Nir Soffer <nsoffer@redhat.com> 2020-09-29 16:48:23 +03:00			`assert actual == {`
			`"algorithm": algorithm,`
			`"block_size": blkhash.BLOCK_SIZE,`
			`"checksum": checksum,`
			`}`