3584240b9c
Rename all uds_log_* to vdo_log_*. Signed-off-by: Mike Snitzer <snitzer@kernel.org> Signed-off-by: Chung Chung <cchung@redhat.com> Signed-off-by: Matthew Sakai <msakai@redhat.com>
1971 lines
63 KiB
C
1971 lines
63 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Copyright 2023 Red Hat
|
|
*/
|
|
#include "delta-index.h"
|
|
|
|
#include <linux/bitops.h>
|
|
#include <linux/bits.h>
|
|
#include <linux/compiler.h>
|
|
#include <linux/limits.h>
|
|
#include <linux/log2.h>
|
|
|
|
#include "cpu.h"
|
|
#include "errors.h"
|
|
#include "logger.h"
|
|
#include "memory-alloc.h"
|
|
#include "numeric.h"
|
|
#include "permassert.h"
|
|
#include "string-utils.h"
|
|
#include "time-utils.h"
|
|
|
|
#include "config.h"
|
|
#include "indexer.h"
|
|
|
|
/*
|
|
* The entries in a delta index could be stored in a single delta list, but to reduce search times
|
|
* and update costs it uses multiple delta lists. These lists are stored in a single chunk of
|
|
* memory managed by the delta_zone structure. The delta_zone can move the data around within its
|
|
* memory, so the location of each delta list is recorded as a bit offset into the memory. Because
|
|
* the volume index can contain over a million delta lists, we want to be efficient with the size
|
|
* of the delta list header information. This information is encoded into 16 bytes per list. The
|
|
* volume index delta list memory can easily exceed 4 gigabits, so a 64 bit value is needed to
|
|
* address the memory. The volume index delta lists average around 6 kilobits, so 16 bits are
|
|
* sufficient to store the size of a delta list.
|
|
*
|
|
* Each delta list is stored as a bit stream. Within the delta list encoding, bits and bytes are
|
|
* numbered in little endian order. Within a byte, bit 0 is the least significant bit (0x1), and
|
|
* bit 7 is the most significant bit (0x80). Within a bit stream, bit 7 is the most significant bit
|
|
* of byte 0, and bit 8 is the least significant bit of byte 1. Within a byte array, a byte's
|
|
* number corresponds to its index in the array.
|
|
*
|
|
* A standard delta list entry is stored as a fixed length payload (the value) followed by a
|
|
* variable length key (the delta). A collision entry is used when two block names have the same
|
|
* delta list address. A collision entry always follows a standard entry for the hash with which it
|
|
* collides, and is encoded with DELTA == 0 with an additional 256 bits field at the end,
|
|
* containing the full block name. An entry with a delta of 0 at the beginning of a delta list
|
|
* indicates a normal entry.
|
|
*
|
|
* The delta in each entry is encoded with a variable-length Huffman code to minimize the memory
|
|
* used by small deltas. The Huffman code is specified by three parameters, which can be computed
|
|
* from the desired mean delta when the index is full. (See compute_coding_constants() for
|
|
* details.)
|
|
*
|
|
* The bit field utilities used to read and write delta entries assume that it is possible to read
|
|
* some bytes beyond the end of the bit field, so a delta_zone memory allocation is guarded by two
|
|
* invalid delta lists to prevent reading outside the delta_zone memory. The valid delta lists are
|
|
* numbered 1 to N, and the guard lists are numbered 0 and N+1. The function to decode the bit
|
|
* stream include a step that skips over bits set to 0 until the first 1 bit is found. A corrupted
|
|
* delta list could cause this step to run off the end of the delta_zone memory, so as extra
|
|
* protection against this happening, the tail guard list is set to all ones.
|
|
*
|
|
* The delta_index supports two different forms. The mutable form is created by
|
|
* uds_initialize_delta_index(), and is used for the volume index and for open chapter indexes. The
|
|
* immutable form is created by uds_initialize_delta_index_page(), and is used for closed (and
|
|
* cached) chapter index pages. The immutable form does not allocate delta list headers or
|
|
* temporary offsets, and thus is somewhat more memory efficient.
|
|
*/
|
|
|
|
/*
|
|
* This is the largest field size supported by get_field() and set_field(). Any field that is
|
|
* larger is not guaranteed to fit in a single byte-aligned u32.
|
|
*/
|
|
#define MAX_FIELD_BITS ((sizeof(u32) - 1) * BITS_PER_BYTE + 1)
|
|
|
|
/*
|
|
* This is the largest field size supported by get_big_field() and set_big_field(). Any field that
|
|
* is larger is not guaranteed to fit in a single byte-aligned u64.
|
|
*/
|
|
#define MAX_BIG_FIELD_BITS ((sizeof(u64) - 1) * BITS_PER_BYTE + 1)
|
|
|
|
/*
|
|
* This is the number of guard bytes needed at the end of the memory byte array when using the bit
|
|
* utilities. These utilities call get_big_field() and set_big_field(), which can access up to 7
|
|
* bytes beyond the end of the desired field. The definition is written to make it clear how this
|
|
* value is derived.
|
|
*/
|
|
#define POST_FIELD_GUARD_BYTES (sizeof(u64) - 1)
|
|
|
|
/* The number of guard bits that are needed in the tail guard list */
|
|
#define GUARD_BITS (POST_FIELD_GUARD_BYTES * BITS_PER_BYTE)
|
|
|
|
/*
|
|
* The maximum size of a single delta list in bytes. We count guard bytes in this value because a
|
|
* buffer of this size can be used with move_bits().
|
|
*/
|
|
#define DELTA_LIST_MAX_BYTE_COUNT \
|
|
((U16_MAX + BITS_PER_BYTE) / BITS_PER_BYTE + POST_FIELD_GUARD_BYTES)
|
|
|
|
/* The number of extra bytes and bits needed to store a collision entry */
|
|
#define COLLISION_BYTES UDS_RECORD_NAME_SIZE
|
|
#define COLLISION_BITS (COLLISION_BYTES * BITS_PER_BYTE)
|
|
|
|
/*
|
|
* Immutable delta lists are packed into pages containing a header that encodes the delta list
|
|
* information into 19 bits per list (64KB bit offset).
|
|
*/
|
|
#define IMMUTABLE_HEADER_SIZE 19
|
|
|
|
/*
|
|
* Constants and structures for the saved delta index. "DI" is for delta_index, and -##### is a
|
|
* number to increment when the format of the data changes.
|
|
*/
|
|
#define MAGIC_SIZE 8
|
|
|
|
static const char DELTA_INDEX_MAGIC[] = "DI-00002";
|
|
|
|
struct delta_index_header {
|
|
char magic[MAGIC_SIZE];
|
|
u32 zone_number;
|
|
u32 zone_count;
|
|
u32 first_list;
|
|
u32 list_count;
|
|
u64 record_count;
|
|
u64 collision_count;
|
|
};
|
|
|
|
/*
|
|
* Header data used for immutable delta index pages. This data is followed by the delta list offset
|
|
* table.
|
|
*/
|
|
struct delta_page_header {
|
|
/* Externally-defined nonce */
|
|
u64 nonce;
|
|
/* The virtual chapter number */
|
|
u64 virtual_chapter_number;
|
|
/* Index of the first delta list on the page */
|
|
u16 first_list;
|
|
/* Number of delta lists on the page */
|
|
u16 list_count;
|
|
} __packed;
|
|
|
|
static inline u64 get_delta_list_byte_start(const struct delta_list *delta_list)
|
|
{
|
|
return delta_list->start / BITS_PER_BYTE;
|
|
}
|
|
|
|
static inline u16 get_delta_list_byte_size(const struct delta_list *delta_list)
|
|
{
|
|
unsigned int bit_offset = delta_list->start % BITS_PER_BYTE;
|
|
|
|
return BITS_TO_BYTES(bit_offset + delta_list->size);
|
|
}
|
|
|
|
static void rebalance_delta_zone(const struct delta_zone *delta_zone, u32 first,
|
|
u32 last)
|
|
{
|
|
struct delta_list *delta_list;
|
|
u64 new_start;
|
|
|
|
if (first == last) {
|
|
/* Only one list is moving, and we know there is space. */
|
|
delta_list = &delta_zone->delta_lists[first];
|
|
new_start = delta_zone->new_offsets[first];
|
|
if (delta_list->start != new_start) {
|
|
u64 source;
|
|
u64 destination;
|
|
|
|
source = get_delta_list_byte_start(delta_list);
|
|
delta_list->start = new_start;
|
|
destination = get_delta_list_byte_start(delta_list);
|
|
memmove(delta_zone->memory + destination,
|
|
delta_zone->memory + source,
|
|
get_delta_list_byte_size(delta_list));
|
|
}
|
|
} else {
|
|
/*
|
|
* There is more than one list. Divide the problem in half, and use recursive calls
|
|
* to process each half. Note that after this computation, first <= middle, and
|
|
* middle < last.
|
|
*/
|
|
u32 middle = (first + last) / 2;
|
|
|
|
delta_list = &delta_zone->delta_lists[middle];
|
|
new_start = delta_zone->new_offsets[middle];
|
|
|
|
/*
|
|
* The direction that our middle list is moving determines which half of the
|
|
* problem must be processed first.
|
|
*/
|
|
if (new_start > delta_list->start) {
|
|
rebalance_delta_zone(delta_zone, middle + 1, last);
|
|
rebalance_delta_zone(delta_zone, first, middle);
|
|
} else {
|
|
rebalance_delta_zone(delta_zone, first, middle);
|
|
rebalance_delta_zone(delta_zone, middle + 1, last);
|
|
}
|
|
}
|
|
}
|
|
|
|
static inline size_t get_zone_memory_size(unsigned int zone_count, size_t memory_size)
|
|
{
|
|
/* Round up so that each zone is a multiple of 64K in size. */
|
|
size_t ALLOC_BOUNDARY = 64 * 1024;
|
|
|
|
return (memory_size / zone_count + ALLOC_BOUNDARY - 1) & -ALLOC_BOUNDARY;
|
|
}
|
|
|
|
void uds_reset_delta_index(const struct delta_index *delta_index)
|
|
{
|
|
unsigned int z;
|
|
|
|
/*
|
|
* Initialize all delta lists to be empty. We keep 2 extra delta list descriptors, one
|
|
* before the first real entry and one after so that we don't need to bounds check the
|
|
* array access when calculating preceding and following gap sizes.
|
|
*/
|
|
for (z = 0; z < delta_index->zone_count; z++) {
|
|
u64 list_bits;
|
|
u64 spacing;
|
|
u64 offset;
|
|
unsigned int i;
|
|
struct delta_zone *zone = &delta_index->delta_zones[z];
|
|
struct delta_list *delta_lists = zone->delta_lists;
|
|
|
|
/* Zeroing the delta list headers initializes the head guard list correctly. */
|
|
memset(delta_lists, 0,
|
|
(zone->list_count + 2) * sizeof(struct delta_list));
|
|
|
|
/* Set all the bits in the end guard list. */
|
|
list_bits = (u64) zone->size * BITS_PER_BYTE - GUARD_BITS;
|
|
delta_lists[zone->list_count + 1].start = list_bits;
|
|
delta_lists[zone->list_count + 1].size = GUARD_BITS;
|
|
memset(zone->memory + (list_bits / BITS_PER_BYTE), ~0,
|
|
POST_FIELD_GUARD_BYTES);
|
|
|
|
/* Evenly space out the real delta lists by setting regular offsets. */
|
|
spacing = list_bits / zone->list_count;
|
|
offset = spacing / 2;
|
|
for (i = 1; i <= zone->list_count; i++) {
|
|
delta_lists[i].start = offset;
|
|
offset += spacing;
|
|
}
|
|
|
|
/* Update the statistics. */
|
|
zone->discard_count += zone->record_count;
|
|
zone->record_count = 0;
|
|
zone->collision_count = 0;
|
|
}
|
|
}
|
|
|
|
/* Compute the Huffman coding parameters for the given mean delta. The Huffman code is specified by
|
|
* three parameters:
|
|
*
|
|
* MINBITS The number of bits in the smallest code
|
|
* BASE The number of values coded using a code of length MINBITS
|
|
* INCR The number of values coded by using one additional bit
|
|
*
|
|
* These parameters are related by this equation:
|
|
*
|
|
* BASE + INCR == 1 << MINBITS
|
|
*
|
|
* The math for the Huffman code of an exponential distribution says that
|
|
*
|
|
* INCR = log(2) * MEAN_DELTA
|
|
*
|
|
* Then use the smallest MINBITS value so that
|
|
*
|
|
* (1 << MINBITS) > INCR
|
|
*
|
|
* And then
|
|
*
|
|
* BASE = (1 << MINBITS) - INCR
|
|
*
|
|
* Now the index can generate a code such that
|
|
* - The first BASE values code using MINBITS bits.
|
|
* - The next INCR values code using MINBITS+1 bits.
|
|
* - The next INCR values code using MINBITS+2 bits.
|
|
* - (and so on).
|
|
*/
|
|
static void compute_coding_constants(u32 mean_delta, u16 *min_bits, u32 *min_keys, u32 *incr_keys)
|
|
{
|
|
/*
|
|
* We want to compute the rounded value of log(2) * mean_delta. Since we cannot always use
|
|
* floating point, use a really good integer approximation.
|
|
*/
|
|
*incr_keys = (836158UL * mean_delta + 603160UL) / 1206321UL;
|
|
*min_bits = bits_per(*incr_keys + 1);
|
|
*min_keys = (1 << *min_bits) - *incr_keys;
|
|
}
|
|
|
|
void uds_uninitialize_delta_index(struct delta_index *delta_index)
|
|
{
|
|
unsigned int z;
|
|
|
|
if (delta_index->delta_zones == NULL)
|
|
return;
|
|
|
|
for (z = 0; z < delta_index->zone_count; z++) {
|
|
vdo_free(vdo_forget(delta_index->delta_zones[z].new_offsets));
|
|
vdo_free(vdo_forget(delta_index->delta_zones[z].delta_lists));
|
|
vdo_free(vdo_forget(delta_index->delta_zones[z].memory));
|
|
}
|
|
|
|
vdo_free(delta_index->delta_zones);
|
|
memset(delta_index, 0, sizeof(struct delta_index));
|
|
}
|
|
|
|
static int initialize_delta_zone(struct delta_zone *delta_zone, size_t size,
|
|
u32 first_list, u32 list_count, u32 mean_delta,
|
|
u32 payload_bits, u8 tag)
|
|
{
|
|
int result;
|
|
|
|
result = vdo_allocate(size, u8, "delta list", &delta_zone->memory);
|
|
if (result != VDO_SUCCESS)
|
|
return result;
|
|
|
|
result = vdo_allocate(list_count + 2, u64, "delta list temp",
|
|
&delta_zone->new_offsets);
|
|
if (result != VDO_SUCCESS)
|
|
return result;
|
|
|
|
/* Allocate the delta lists. */
|
|
result = vdo_allocate(list_count + 2, struct delta_list, "delta lists",
|
|
&delta_zone->delta_lists);
|
|
if (result != VDO_SUCCESS)
|
|
return result;
|
|
|
|
compute_coding_constants(mean_delta, &delta_zone->min_bits,
|
|
&delta_zone->min_keys, &delta_zone->incr_keys);
|
|
delta_zone->value_bits = payload_bits;
|
|
delta_zone->buffered_writer = NULL;
|
|
delta_zone->size = size;
|
|
delta_zone->rebalance_time = 0;
|
|
delta_zone->rebalance_count = 0;
|
|
delta_zone->record_count = 0;
|
|
delta_zone->collision_count = 0;
|
|
delta_zone->discard_count = 0;
|
|
delta_zone->overflow_count = 0;
|
|
delta_zone->first_list = first_list;
|
|
delta_zone->list_count = list_count;
|
|
delta_zone->tag = tag;
|
|
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zone_count,
|
|
u32 list_count, u32 mean_delta, u32 payload_bits,
|
|
size_t memory_size, u8 tag)
|
|
{
|
|
int result;
|
|
unsigned int z;
|
|
size_t zone_memory;
|
|
|
|
result = vdo_allocate(zone_count, struct delta_zone, "Delta Index Zones",
|
|
&delta_index->delta_zones);
|
|
if (result != VDO_SUCCESS)
|
|
return result;
|
|
|
|
delta_index->zone_count = zone_count;
|
|
delta_index->list_count = list_count;
|
|
delta_index->lists_per_zone = DIV_ROUND_UP(list_count, zone_count);
|
|
delta_index->memory_size = 0;
|
|
delta_index->mutable = true;
|
|
delta_index->tag = tag;
|
|
|
|
for (z = 0; z < zone_count; z++) {
|
|
u32 lists_in_zone = delta_index->lists_per_zone;
|
|
u32 first_list_in_zone = z * lists_in_zone;
|
|
|
|
if (z == zone_count - 1) {
|
|
/*
|
|
* The last zone gets fewer lists if zone_count doesn't evenly divide
|
|
* list_count. We'll have an underflow if the assertion below doesn't hold.
|
|
*/
|
|
if (delta_index->list_count <= first_list_in_zone) {
|
|
uds_uninitialize_delta_index(delta_index);
|
|
return vdo_log_error_strerror(UDS_INVALID_ARGUMENT,
|
|
"%u delta lists not enough for %u zones",
|
|
list_count, zone_count);
|
|
}
|
|
lists_in_zone = delta_index->list_count - first_list_in_zone;
|
|
}
|
|
|
|
zone_memory = get_zone_memory_size(zone_count, memory_size);
|
|
result = initialize_delta_zone(&delta_index->delta_zones[z], zone_memory,
|
|
first_list_in_zone, lists_in_zone,
|
|
mean_delta, payload_bits, tag);
|
|
if (result != UDS_SUCCESS) {
|
|
uds_uninitialize_delta_index(delta_index);
|
|
return result;
|
|
}
|
|
|
|
delta_index->memory_size +=
|
|
(sizeof(struct delta_zone) + zone_memory +
|
|
(lists_in_zone + 2) * (sizeof(struct delta_list) + sizeof(u64)));
|
|
}
|
|
|
|
uds_reset_delta_index(delta_index);
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
/* Read a bit field from an arbitrary bit boundary. */
|
|
static inline u32 get_field(const u8 *memory, u64 offset, u8 size)
|
|
{
|
|
const void *addr = memory + offset / BITS_PER_BYTE;
|
|
|
|
return (get_unaligned_le32(addr) >> (offset % BITS_PER_BYTE)) & ((1 << size) - 1);
|
|
}
|
|
|
|
/* Write a bit field to an arbitrary bit boundary. */
|
|
static inline void set_field(u32 value, u8 *memory, u64 offset, u8 size)
|
|
{
|
|
void *addr = memory + offset / BITS_PER_BYTE;
|
|
int shift = offset % BITS_PER_BYTE;
|
|
u32 data = get_unaligned_le32(addr);
|
|
|
|
data &= ~(((1 << size) - 1) << shift);
|
|
data |= value << shift;
|
|
put_unaligned_le32(data, addr);
|
|
}
|
|
|
|
/* Get the bit offset to the immutable delta list header. */
|
|
static inline u32 get_immutable_header_offset(u32 list_number)
|
|
{
|
|
return sizeof(struct delta_page_header) * BITS_PER_BYTE +
|
|
list_number * IMMUTABLE_HEADER_SIZE;
|
|
}
|
|
|
|
/* Get the bit offset to the start of the immutable delta list bit stream. */
|
|
static inline u32 get_immutable_start(const u8 *memory, u32 list_number)
|
|
{
|
|
return get_field(memory, get_immutable_header_offset(list_number),
|
|
IMMUTABLE_HEADER_SIZE);
|
|
}
|
|
|
|
/* Set the bit offset to the start of the immutable delta list bit stream. */
|
|
static inline void set_immutable_start(u8 *memory, u32 list_number, u32 start)
|
|
{
|
|
set_field(start, memory, get_immutable_header_offset(list_number),
|
|
IMMUTABLE_HEADER_SIZE);
|
|
}
|
|
|
|
static bool verify_delta_index_page(u64 nonce, u16 list_count, u64 expected_nonce,
|
|
u8 *memory, size_t memory_size)
|
|
{
|
|
unsigned int i;
|
|
|
|
/*
|
|
* Verify the nonce. A mismatch can happen here during rebuild if we haven't written the
|
|
* entire volume at least once.
|
|
*/
|
|
if (nonce != expected_nonce)
|
|
return false;
|
|
|
|
/* Verify that the number of delta lists can fit in the page. */
|
|
if (list_count > ((memory_size - sizeof(struct delta_page_header)) *
|
|
BITS_PER_BYTE / IMMUTABLE_HEADER_SIZE))
|
|
return false;
|
|
|
|
/*
|
|
* Verify that the first delta list is immediately after the last delta
|
|
* list header.
|
|
*/
|
|
if (get_immutable_start(memory, 0) != get_immutable_header_offset(list_count + 1))
|
|
return false;
|
|
|
|
/* Verify that the lists are in the correct order. */
|
|
for (i = 0; i < list_count; i++) {
|
|
if (get_immutable_start(memory, i) > get_immutable_start(memory, i + 1))
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Verify that the last list ends on the page, and that there is room
|
|
* for the post-field guard bits.
|
|
*/
|
|
if (get_immutable_start(memory, list_count) >
|
|
(memory_size - POST_FIELD_GUARD_BYTES) * BITS_PER_BYTE)
|
|
return false;
|
|
|
|
/* Verify that the guard bytes are correctly set to all ones. */
|
|
for (i = 0; i < POST_FIELD_GUARD_BYTES; i++) {
|
|
if (memory[memory_size - POST_FIELD_GUARD_BYTES + i] != (u8) ~0)
|
|
return false;
|
|
}
|
|
|
|
/* All verifications passed. */
|
|
return true;
|
|
}
|
|
|
|
/* Initialize a delta index page to refer to a supplied page. */
|
|
int uds_initialize_delta_index_page(struct delta_index_page *delta_index_page,
|
|
u64 expected_nonce, u32 mean_delta, u32 payload_bits,
|
|
u8 *memory, size_t memory_size)
|
|
{
|
|
u64 nonce;
|
|
u64 vcn;
|
|
u64 first_list;
|
|
u64 list_count;
|
|
struct delta_page_header *header = (struct delta_page_header *) memory;
|
|
struct delta_zone *delta_zone = &delta_index_page->delta_zone;
|
|
const u8 *nonce_addr = (const u8 *) &header->nonce;
|
|
const u8 *vcn_addr = (const u8 *) &header->virtual_chapter_number;
|
|
const u8 *first_list_addr = (const u8 *) &header->first_list;
|
|
const u8 *list_count_addr = (const u8 *) &header->list_count;
|
|
|
|
/* First assume that the header is little endian. */
|
|
nonce = get_unaligned_le64(nonce_addr);
|
|
vcn = get_unaligned_le64(vcn_addr);
|
|
first_list = get_unaligned_le16(first_list_addr);
|
|
list_count = get_unaligned_le16(list_count_addr);
|
|
if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory,
|
|
memory_size)) {
|
|
/* If that fails, try big endian. */
|
|
nonce = get_unaligned_be64(nonce_addr);
|
|
vcn = get_unaligned_be64(vcn_addr);
|
|
first_list = get_unaligned_be16(first_list_addr);
|
|
list_count = get_unaligned_be16(list_count_addr);
|
|
if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory,
|
|
memory_size)) {
|
|
/*
|
|
* Both attempts failed. Do not log this as an error, because it can happen
|
|
* during a rebuild if we haven't written the entire volume at least once.
|
|
*/
|
|
return UDS_CORRUPT_DATA;
|
|
}
|
|
}
|
|
|
|
delta_index_page->delta_index.delta_zones = delta_zone;
|
|
delta_index_page->delta_index.zone_count = 1;
|
|
delta_index_page->delta_index.list_count = list_count;
|
|
delta_index_page->delta_index.lists_per_zone = list_count;
|
|
delta_index_page->delta_index.mutable = false;
|
|
delta_index_page->delta_index.tag = 'p';
|
|
delta_index_page->virtual_chapter_number = vcn;
|
|
delta_index_page->lowest_list_number = first_list;
|
|
delta_index_page->highest_list_number = first_list + list_count - 1;
|
|
|
|
compute_coding_constants(mean_delta, &delta_zone->min_bits,
|
|
&delta_zone->min_keys, &delta_zone->incr_keys);
|
|
delta_zone->value_bits = payload_bits;
|
|
delta_zone->memory = memory;
|
|
delta_zone->delta_lists = NULL;
|
|
delta_zone->new_offsets = NULL;
|
|
delta_zone->buffered_writer = NULL;
|
|
delta_zone->size = memory_size;
|
|
delta_zone->rebalance_time = 0;
|
|
delta_zone->rebalance_count = 0;
|
|
delta_zone->record_count = 0;
|
|
delta_zone->collision_count = 0;
|
|
delta_zone->discard_count = 0;
|
|
delta_zone->overflow_count = 0;
|
|
delta_zone->first_list = 0;
|
|
delta_zone->list_count = list_count;
|
|
delta_zone->tag = 'p';
|
|
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
/* Read a large bit field from an arbitrary bit boundary. */
|
|
static inline u64 get_big_field(const u8 *memory, u64 offset, u8 size)
|
|
{
|
|
const void *addr = memory + offset / BITS_PER_BYTE;
|
|
|
|
return (get_unaligned_le64(addr) >> (offset % BITS_PER_BYTE)) & ((1UL << size) - 1);
|
|
}
|
|
|
|
/* Write a large bit field to an arbitrary bit boundary. */
|
|
static inline void set_big_field(u64 value, u8 *memory, u64 offset, u8 size)
|
|
{
|
|
void *addr = memory + offset / BITS_PER_BYTE;
|
|
u8 shift = offset % BITS_PER_BYTE;
|
|
u64 data = get_unaligned_le64(addr);
|
|
|
|
data &= ~(((1UL << size) - 1) << shift);
|
|
data |= value << shift;
|
|
put_unaligned_le64(data, addr);
|
|
}
|
|
|
|
/* Set a sequence of bits to all zeros. */
|
|
static inline void set_zero(u8 *memory, u64 offset, u32 size)
|
|
{
|
|
if (size > 0) {
|
|
u8 *addr = memory + offset / BITS_PER_BYTE;
|
|
u8 shift = offset % BITS_PER_BYTE;
|
|
u32 count = size + shift > BITS_PER_BYTE ? (u32) BITS_PER_BYTE - shift : size;
|
|
|
|
*addr++ &= ~(((1 << count) - 1) << shift);
|
|
for (size -= count; size > BITS_PER_BYTE; size -= BITS_PER_BYTE)
|
|
*addr++ = 0;
|
|
|
|
if (size > 0)
|
|
*addr &= 0xFF << size;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Move several bits from a higher to a lower address, moving the lower addressed bits first. The
|
|
* size and memory offsets are measured in bits.
|
|
*/
|
|
static void move_bits_down(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
|
|
{
|
|
const u8 *source;
|
|
u8 *destination;
|
|
u8 offset;
|
|
u8 count;
|
|
u64 field;
|
|
|
|
/* Start by moving one field that ends on a to int boundary. */
|
|
count = (MAX_BIG_FIELD_BITS - ((to_offset + MAX_BIG_FIELD_BITS) % BITS_PER_TYPE(u32)));
|
|
field = get_big_field(from, from_offset, count);
|
|
set_big_field(field, to, to_offset, count);
|
|
from_offset += count;
|
|
to_offset += count;
|
|
size -= count;
|
|
|
|
/* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */
|
|
offset = from_offset % BITS_PER_TYPE(u32);
|
|
source = from + (from_offset - offset) / BITS_PER_BYTE;
|
|
destination = to + to_offset / BITS_PER_BYTE;
|
|
while (size > MAX_BIG_FIELD_BITS) {
|
|
put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
|
|
source += sizeof(u32);
|
|
destination += sizeof(u32);
|
|
from_offset += BITS_PER_TYPE(u32);
|
|
to_offset += BITS_PER_TYPE(u32);
|
|
size -= BITS_PER_TYPE(u32);
|
|
}
|
|
|
|
/* Finish up by moving any remaining bits. */
|
|
if (size > 0) {
|
|
field = get_big_field(from, from_offset, size);
|
|
set_big_field(field, to, to_offset, size);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Move several bits from a lower to a higher address, moving the higher addressed bits first. The
|
|
* size and memory offsets are measured in bits.
|
|
*/
|
|
static void move_bits_up(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
|
|
{
|
|
const u8 *source;
|
|
u8 *destination;
|
|
u8 offset;
|
|
u8 count;
|
|
u64 field;
|
|
|
|
/* Start by moving one field that begins on a destination int boundary. */
|
|
count = (to_offset + size) % BITS_PER_TYPE(u32);
|
|
if (count > 0) {
|
|
size -= count;
|
|
field = get_big_field(from, from_offset + size, count);
|
|
set_big_field(field, to, to_offset + size, count);
|
|
}
|
|
|
|
/* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */
|
|
offset = (from_offset + size) % BITS_PER_TYPE(u32);
|
|
source = from + (from_offset + size - offset) / BITS_PER_BYTE;
|
|
destination = to + (to_offset + size) / BITS_PER_BYTE;
|
|
while (size > MAX_BIG_FIELD_BITS) {
|
|
source -= sizeof(u32);
|
|
destination -= sizeof(u32);
|
|
size -= BITS_PER_TYPE(u32);
|
|
put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
|
|
}
|
|
|
|
/* Finish up by moving any remaining bits. */
|
|
if (size > 0) {
|
|
field = get_big_field(from, from_offset, size);
|
|
set_big_field(field, to, to_offset, size);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Move bits from one field to another. When the fields overlap, behave as if we first move all the
|
|
* bits from the source to a temporary value, and then move all the bits from the temporary value
|
|
* to the destination. The size and memory offsets are measured in bits.
|
|
*/
|
|
static void move_bits(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
|
|
{
|
|
u64 field;
|
|
|
|
/* A small move doesn't require special handling. */
|
|
if (size <= MAX_BIG_FIELD_BITS) {
|
|
if (size > 0) {
|
|
field = get_big_field(from, from_offset, size);
|
|
set_big_field(field, to, to_offset, size);
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
if (from_offset > to_offset)
|
|
move_bits_down(from, from_offset, to, to_offset, size);
|
|
else
|
|
move_bits_up(from, from_offset, to, to_offset, size);
|
|
}
|
|
|
|
/*
|
|
* Pack delta lists from a mutable delta index into an immutable delta index page. A range of delta
|
|
* lists (starting with a specified list index) is copied from the mutable delta index into a
|
|
* memory page used in the immutable index. The number of lists copied onto the page is returned in
|
|
* list_count.
|
|
*/
|
|
int uds_pack_delta_index_page(const struct delta_index *delta_index, u64 header_nonce,
|
|
u8 *memory, size_t memory_size, u64 virtual_chapter_number,
|
|
u32 first_list, u32 *list_count)
|
|
{
|
|
const struct delta_zone *delta_zone;
|
|
struct delta_list *delta_lists;
|
|
u32 max_lists;
|
|
u32 n_lists = 0;
|
|
u32 offset;
|
|
u32 i;
|
|
int free_bits;
|
|
int bits;
|
|
struct delta_page_header *header;
|
|
|
|
delta_zone = &delta_index->delta_zones[0];
|
|
delta_lists = &delta_zone->delta_lists[first_list + 1];
|
|
max_lists = delta_index->list_count - first_list;
|
|
|
|
/*
|
|
* Compute how many lists will fit on the page. Subtract the size of the fixed header, one
|
|
* delta list offset, and the guard bytes from the page size to determine how much space is
|
|
* available for delta lists.
|
|
*/
|
|
free_bits = memory_size * BITS_PER_BYTE;
|
|
free_bits -= get_immutable_header_offset(1);
|
|
free_bits -= GUARD_BITS;
|
|
if (free_bits < IMMUTABLE_HEADER_SIZE) {
|
|
/* This page is too small to store any delta lists. */
|
|
return vdo_log_error_strerror(UDS_OVERFLOW,
|
|
"Chapter Index Page of %zu bytes is too small",
|
|
memory_size);
|
|
}
|
|
|
|
while (n_lists < max_lists) {
|
|
/* Each list requires a delta list offset and the list data. */
|
|
bits = IMMUTABLE_HEADER_SIZE + delta_lists[n_lists].size;
|
|
if (bits > free_bits)
|
|
break;
|
|
|
|
n_lists++;
|
|
free_bits -= bits;
|
|
}
|
|
|
|
*list_count = n_lists;
|
|
|
|
header = (struct delta_page_header *) memory;
|
|
put_unaligned_le64(header_nonce, (u8 *) &header->nonce);
|
|
put_unaligned_le64(virtual_chapter_number,
|
|
(u8 *) &header->virtual_chapter_number);
|
|
put_unaligned_le16(first_list, (u8 *) &header->first_list);
|
|
put_unaligned_le16(n_lists, (u8 *) &header->list_count);
|
|
|
|
/* Construct the delta list offset table. */
|
|
offset = get_immutable_header_offset(n_lists + 1);
|
|
set_immutable_start(memory, 0, offset);
|
|
for (i = 0; i < n_lists; i++) {
|
|
offset += delta_lists[i].size;
|
|
set_immutable_start(memory, i + 1, offset);
|
|
}
|
|
|
|
/* Copy the delta list data onto the memory page. */
|
|
for (i = 0; i < n_lists; i++) {
|
|
move_bits(delta_zone->memory, delta_lists[i].start, memory,
|
|
get_immutable_start(memory, i), delta_lists[i].size);
|
|
}
|
|
|
|
/* Set all the bits in the guard bytes. */
|
|
memset(memory + memory_size - POST_FIELD_GUARD_BYTES, ~0,
|
|
POST_FIELD_GUARD_BYTES);
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
/* Compute the new offsets of the delta lists. */
|
|
static void compute_new_list_offsets(struct delta_zone *delta_zone, u32 growing_index,
|
|
size_t growing_size, size_t used_space)
|
|
{
|
|
size_t spacing;
|
|
u32 i;
|
|
struct delta_list *delta_lists = delta_zone->delta_lists;
|
|
u32 tail_guard_index = delta_zone->list_count + 1;
|
|
|
|
spacing = (delta_zone->size - used_space) / delta_zone->list_count;
|
|
delta_zone->new_offsets[0] = 0;
|
|
for (i = 0; i <= delta_zone->list_count; i++) {
|
|
delta_zone->new_offsets[i + 1] =
|
|
(delta_zone->new_offsets[i] +
|
|
get_delta_list_byte_size(&delta_lists[i]) + spacing);
|
|
delta_zone->new_offsets[i] *= BITS_PER_BYTE;
|
|
delta_zone->new_offsets[i] += delta_lists[i].start % BITS_PER_BYTE;
|
|
if (i == 0)
|
|
delta_zone->new_offsets[i + 1] -= spacing / 2;
|
|
if (i + 1 == growing_index)
|
|
delta_zone->new_offsets[i + 1] += growing_size;
|
|
}
|
|
|
|
delta_zone->new_offsets[tail_guard_index] =
|
|
(delta_zone->size * BITS_PER_BYTE - delta_lists[tail_guard_index].size);
|
|
}
|
|
|
|
static void rebalance_lists(struct delta_zone *delta_zone)
|
|
{
|
|
struct delta_list *delta_lists;
|
|
u32 i;
|
|
size_t used_space = 0;
|
|
|
|
/* Extend and balance memory to receive the delta lists */
|
|
delta_lists = delta_zone->delta_lists;
|
|
for (i = 0; i <= delta_zone->list_count + 1; i++)
|
|
used_space += get_delta_list_byte_size(&delta_lists[i]);
|
|
|
|
compute_new_list_offsets(delta_zone, 0, 0, used_space);
|
|
for (i = 1; i <= delta_zone->list_count + 1; i++)
|
|
delta_lists[i].start = delta_zone->new_offsets[i];
|
|
}
|
|
|
|
/* Start restoring a delta index from multiple input streams. */
|
|
int uds_start_restoring_delta_index(struct delta_index *delta_index,
|
|
struct buffered_reader **buffered_readers,
|
|
unsigned int reader_count)
|
|
{
|
|
int result;
|
|
unsigned int zone_count = reader_count;
|
|
u64 record_count = 0;
|
|
u64 collision_count = 0;
|
|
u32 first_list[MAX_ZONES];
|
|
u32 list_count[MAX_ZONES];
|
|
unsigned int z;
|
|
u32 list_next = 0;
|
|
const struct delta_zone *delta_zone;
|
|
|
|
/* Read and validate each header. */
|
|
for (z = 0; z < zone_count; z++) {
|
|
struct delta_index_header header;
|
|
u8 buffer[sizeof(struct delta_index_header)];
|
|
size_t offset = 0;
|
|
|
|
result = uds_read_from_buffered_reader(buffered_readers[z], buffer,
|
|
sizeof(buffer));
|
|
if (result != UDS_SUCCESS) {
|
|
return vdo_log_warning_strerror(result,
|
|
"failed to read delta index header");
|
|
}
|
|
|
|
memcpy(&header.magic, buffer, MAGIC_SIZE);
|
|
offset += MAGIC_SIZE;
|
|
decode_u32_le(buffer, &offset, &header.zone_number);
|
|
decode_u32_le(buffer, &offset, &header.zone_count);
|
|
decode_u32_le(buffer, &offset, &header.first_list);
|
|
decode_u32_le(buffer, &offset, &header.list_count);
|
|
decode_u64_le(buffer, &offset, &header.record_count);
|
|
decode_u64_le(buffer, &offset, &header.collision_count);
|
|
|
|
result = VDO_ASSERT(offset == sizeof(struct delta_index_header),
|
|
"%zu bytes decoded of %zu expected", offset,
|
|
sizeof(struct delta_index_header));
|
|
if (result != VDO_SUCCESS) {
|
|
return vdo_log_warning_strerror(result,
|
|
"failed to read delta index header");
|
|
}
|
|
|
|
if (memcmp(header.magic, DELTA_INDEX_MAGIC, MAGIC_SIZE) != 0) {
|
|
return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
|
|
"delta index file has bad magic number");
|
|
}
|
|
|
|
if (zone_count != header.zone_count) {
|
|
return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
|
|
"delta index files contain mismatched zone counts (%u,%u)",
|
|
zone_count, header.zone_count);
|
|
}
|
|
|
|
if (header.zone_number != z) {
|
|
return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
|
|
"delta index zone %u found in slot %u",
|
|
header.zone_number, z);
|
|
}
|
|
|
|
first_list[z] = header.first_list;
|
|
list_count[z] = header.list_count;
|
|
record_count += header.record_count;
|
|
collision_count += header.collision_count;
|
|
|
|
if (first_list[z] != list_next) {
|
|
return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
|
|
"delta index file for zone %u starts with list %u instead of list %u",
|
|
z, first_list[z], list_next);
|
|
}
|
|
|
|
list_next += list_count[z];
|
|
}
|
|
|
|
if (list_next != delta_index->list_count) {
|
|
return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
|
|
"delta index files contain %u delta lists instead of %u delta lists",
|
|
list_next, delta_index->list_count);
|
|
}
|
|
|
|
if (collision_count > record_count) {
|
|
return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
|
|
"delta index files contain %llu collisions and %llu records",
|
|
(unsigned long long) collision_count,
|
|
(unsigned long long) record_count);
|
|
}
|
|
|
|
uds_reset_delta_index(delta_index);
|
|
delta_index->delta_zones[0].record_count = record_count;
|
|
delta_index->delta_zones[0].collision_count = collision_count;
|
|
|
|
/* Read the delta lists and distribute them to the proper zones. */
|
|
for (z = 0; z < zone_count; z++) {
|
|
u32 i;
|
|
|
|
delta_index->load_lists[z] = 0;
|
|
for (i = 0; i < list_count[z]; i++) {
|
|
u16 delta_list_size;
|
|
u32 list_number;
|
|
unsigned int zone_number;
|
|
u8 size_data[sizeof(u16)];
|
|
|
|
result = uds_read_from_buffered_reader(buffered_readers[z],
|
|
size_data,
|
|
sizeof(size_data));
|
|
if (result != UDS_SUCCESS) {
|
|
return vdo_log_warning_strerror(result,
|
|
"failed to read delta index size");
|
|
}
|
|
|
|
delta_list_size = get_unaligned_le16(size_data);
|
|
if (delta_list_size > 0)
|
|
delta_index->load_lists[z] += 1;
|
|
|
|
list_number = first_list[z] + i;
|
|
zone_number = list_number / delta_index->lists_per_zone;
|
|
delta_zone = &delta_index->delta_zones[zone_number];
|
|
list_number -= delta_zone->first_list;
|
|
delta_zone->delta_lists[list_number + 1].size = delta_list_size;
|
|
}
|
|
}
|
|
|
|
/* Prepare each zone to start receiving the delta list data. */
|
|
for (z = 0; z < delta_index->zone_count; z++)
|
|
rebalance_lists(&delta_index->delta_zones[z]);
|
|
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
static int restore_delta_list_to_zone(struct delta_zone *delta_zone,
|
|
const struct delta_list_save_info *save_info,
|
|
const u8 *data)
|
|
{
|
|
struct delta_list *delta_list;
|
|
u16 bit_count;
|
|
u16 byte_count;
|
|
u32 list_number = save_info->index - delta_zone->first_list;
|
|
|
|
if (list_number >= delta_zone->list_count) {
|
|
return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
|
|
"invalid delta list number %u not in range [%u,%u)",
|
|
save_info->index, delta_zone->first_list,
|
|
delta_zone->first_list + delta_zone->list_count);
|
|
}
|
|
|
|
delta_list = &delta_zone->delta_lists[list_number + 1];
|
|
if (delta_list->size == 0) {
|
|
return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
|
|
"unexpected delta list number %u",
|
|
save_info->index);
|
|
}
|
|
|
|
bit_count = delta_list->size + save_info->bit_offset;
|
|
byte_count = BITS_TO_BYTES(bit_count);
|
|
if (save_info->byte_count != byte_count) {
|
|
return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
|
|
"unexpected delta list size %u != %u",
|
|
save_info->byte_count, byte_count);
|
|
}
|
|
|
|
move_bits(data, save_info->bit_offset, delta_zone->memory, delta_list->start,
|
|
delta_list->size);
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
static int restore_delta_list_data(struct delta_index *delta_index, unsigned int load_zone,
|
|
struct buffered_reader *buffered_reader, u8 *data)
|
|
{
|
|
int result;
|
|
struct delta_list_save_info save_info;
|
|
u8 buffer[sizeof(struct delta_list_save_info)];
|
|
unsigned int new_zone;
|
|
|
|
result = uds_read_from_buffered_reader(buffered_reader, buffer, sizeof(buffer));
|
|
if (result != UDS_SUCCESS) {
|
|
return vdo_log_warning_strerror(result,
|
|
"failed to read delta list data");
|
|
}
|
|
|
|
save_info = (struct delta_list_save_info) {
|
|
.tag = buffer[0],
|
|
.bit_offset = buffer[1],
|
|
.byte_count = get_unaligned_le16(&buffer[2]),
|
|
.index = get_unaligned_le32(&buffer[4]),
|
|
};
|
|
|
|
if ((save_info.bit_offset >= BITS_PER_BYTE) ||
|
|
(save_info.byte_count > DELTA_LIST_MAX_BYTE_COUNT)) {
|
|
return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
|
|
"corrupt delta list data");
|
|
}
|
|
|
|
/* Make sure the data is intended for this delta index. */
|
|
if (save_info.tag != delta_index->tag)
|
|
return UDS_CORRUPT_DATA;
|
|
|
|
if (save_info.index >= delta_index->list_count) {
|
|
return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
|
|
"invalid delta list number %u of %u",
|
|
save_info.index,
|
|
delta_index->list_count);
|
|
}
|
|
|
|
result = uds_read_from_buffered_reader(buffered_reader, data,
|
|
save_info.byte_count);
|
|
if (result != UDS_SUCCESS) {
|
|
return vdo_log_warning_strerror(result,
|
|
"failed to read delta list data");
|
|
}
|
|
|
|
delta_index->load_lists[load_zone] -= 1;
|
|
new_zone = save_info.index / delta_index->lists_per_zone;
|
|
return restore_delta_list_to_zone(&delta_index->delta_zones[new_zone],
|
|
&save_info, data);
|
|
}
|
|
|
|
/* Restore delta lists from saved data. */
|
|
int uds_finish_restoring_delta_index(struct delta_index *delta_index,
|
|
struct buffered_reader **buffered_readers,
|
|
unsigned int reader_count)
|
|
{
|
|
int result;
|
|
int saved_result = UDS_SUCCESS;
|
|
unsigned int z;
|
|
u8 *data;
|
|
|
|
result = vdo_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data);
|
|
if (result != VDO_SUCCESS)
|
|
return result;
|
|
|
|
for (z = 0; z < reader_count; z++) {
|
|
while (delta_index->load_lists[z] > 0) {
|
|
result = restore_delta_list_data(delta_index, z,
|
|
buffered_readers[z], data);
|
|
if (result != UDS_SUCCESS) {
|
|
saved_result = result;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
vdo_free(data);
|
|
return saved_result;
|
|
}
|
|
|
|
int uds_check_guard_delta_lists(struct buffered_reader **buffered_readers,
|
|
unsigned int reader_count)
|
|
{
|
|
int result;
|
|
unsigned int z;
|
|
u8 buffer[sizeof(struct delta_list_save_info)];
|
|
|
|
for (z = 0; z < reader_count; z++) {
|
|
result = uds_read_from_buffered_reader(buffered_readers[z], buffer,
|
|
sizeof(buffer));
|
|
if (result != UDS_SUCCESS)
|
|
return result;
|
|
|
|
if (buffer[0] != 'z')
|
|
return UDS_CORRUPT_DATA;
|
|
}
|
|
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
static int flush_delta_list(struct delta_zone *zone, u32 flush_index)
|
|
{
|
|
struct delta_list *delta_list;
|
|
u8 buffer[sizeof(struct delta_list_save_info)];
|
|
int result;
|
|
|
|
delta_list = &zone->delta_lists[flush_index + 1];
|
|
|
|
buffer[0] = zone->tag;
|
|
buffer[1] = delta_list->start % BITS_PER_BYTE;
|
|
put_unaligned_le16(get_delta_list_byte_size(delta_list), &buffer[2]);
|
|
put_unaligned_le32(zone->first_list + flush_index, &buffer[4]);
|
|
|
|
result = uds_write_to_buffered_writer(zone->buffered_writer, buffer,
|
|
sizeof(buffer));
|
|
if (result != UDS_SUCCESS) {
|
|
vdo_log_warning_strerror(result, "failed to write delta list memory");
|
|
return result;
|
|
}
|
|
|
|
result = uds_write_to_buffered_writer(zone->buffered_writer,
|
|
zone->memory + get_delta_list_byte_start(delta_list),
|
|
get_delta_list_byte_size(delta_list));
|
|
if (result != UDS_SUCCESS)
|
|
vdo_log_warning_strerror(result, "failed to write delta list memory");
|
|
|
|
return result;
|
|
}
|
|
|
|
/* Start saving a delta index zone to a buffered output stream. */
|
|
int uds_start_saving_delta_index(const struct delta_index *delta_index,
|
|
unsigned int zone_number,
|
|
struct buffered_writer *buffered_writer)
|
|
{
|
|
int result;
|
|
u32 i;
|
|
struct delta_zone *delta_zone;
|
|
u8 buffer[sizeof(struct delta_index_header)];
|
|
size_t offset = 0;
|
|
|
|
delta_zone = &delta_index->delta_zones[zone_number];
|
|
memcpy(buffer, DELTA_INDEX_MAGIC, MAGIC_SIZE);
|
|
offset += MAGIC_SIZE;
|
|
encode_u32_le(buffer, &offset, zone_number);
|
|
encode_u32_le(buffer, &offset, delta_index->zone_count);
|
|
encode_u32_le(buffer, &offset, delta_zone->first_list);
|
|
encode_u32_le(buffer, &offset, delta_zone->list_count);
|
|
encode_u64_le(buffer, &offset, delta_zone->record_count);
|
|
encode_u64_le(buffer, &offset, delta_zone->collision_count);
|
|
|
|
result = VDO_ASSERT(offset == sizeof(struct delta_index_header),
|
|
"%zu bytes encoded of %zu expected", offset,
|
|
sizeof(struct delta_index_header));
|
|
if (result != VDO_SUCCESS)
|
|
return result;
|
|
|
|
result = uds_write_to_buffered_writer(buffered_writer, buffer, offset);
|
|
if (result != UDS_SUCCESS)
|
|
return vdo_log_warning_strerror(result,
|
|
"failed to write delta index header");
|
|
|
|
for (i = 0; i < delta_zone->list_count; i++) {
|
|
u8 data[sizeof(u16)];
|
|
struct delta_list *delta_list;
|
|
|
|
delta_list = &delta_zone->delta_lists[i + 1];
|
|
put_unaligned_le16(delta_list->size, data);
|
|
result = uds_write_to_buffered_writer(buffered_writer, data,
|
|
sizeof(data));
|
|
if (result != UDS_SUCCESS)
|
|
return vdo_log_warning_strerror(result,
|
|
"failed to write delta list size");
|
|
}
|
|
|
|
delta_zone->buffered_writer = buffered_writer;
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
int uds_finish_saving_delta_index(const struct delta_index *delta_index,
|
|
unsigned int zone_number)
|
|
{
|
|
int result;
|
|
int first_error = UDS_SUCCESS;
|
|
u32 i;
|
|
struct delta_zone *delta_zone;
|
|
struct delta_list *delta_list;
|
|
|
|
delta_zone = &delta_index->delta_zones[zone_number];
|
|
for (i = 0; i < delta_zone->list_count; i++) {
|
|
delta_list = &delta_zone->delta_lists[i + 1];
|
|
if (delta_list->size > 0) {
|
|
result = flush_delta_list(delta_zone, i);
|
|
if ((result != UDS_SUCCESS) && (first_error == UDS_SUCCESS))
|
|
first_error = result;
|
|
}
|
|
}
|
|
|
|
delta_zone->buffered_writer = NULL;
|
|
return first_error;
|
|
}
|
|
|
|
int uds_write_guard_delta_list(struct buffered_writer *buffered_writer)
|
|
{
|
|
int result;
|
|
u8 buffer[sizeof(struct delta_list_save_info)];
|
|
|
|
memset(buffer, 0, sizeof(struct delta_list_save_info));
|
|
buffer[0] = 'z';
|
|
|
|
result = uds_write_to_buffered_writer(buffered_writer, buffer, sizeof(buffer));
|
|
if (result != UDS_SUCCESS)
|
|
vdo_log_warning_strerror(result, "failed to write guard delta list");
|
|
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
size_t uds_compute_delta_index_save_bytes(u32 list_count, size_t memory_size)
|
|
{
|
|
/* One zone will use at least as much memory as other zone counts. */
|
|
return (sizeof(struct delta_index_header) +
|
|
list_count * (sizeof(struct delta_list_save_info) + 1) +
|
|
get_zone_memory_size(1, memory_size));
|
|
}
|
|
|
|
static int assert_not_at_end(const struct delta_index_entry *delta_entry)
|
|
{
|
|
int result = VDO_ASSERT(!delta_entry->at_end,
|
|
"operation is invalid because the list entry is at the end of the delta list");
|
|
if (result != VDO_SUCCESS)
|
|
result = UDS_BAD_STATE;
|
|
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
* Prepare to search for an entry in the specified delta list.
|
|
*
|
|
* This is always the first function to be called when dealing with delta index entries. It is
|
|
* always followed by calls to uds_next_delta_index_entry() to iterate through a delta list. The
|
|
* fields of the delta_index_entry argument will be set up for iteration, but will not contain an
|
|
* entry from the list.
|
|
*/
|
|
int uds_start_delta_index_search(const struct delta_index *delta_index, u32 list_number,
|
|
u32 key, struct delta_index_entry *delta_entry)
|
|
{
|
|
int result;
|
|
unsigned int zone_number;
|
|
struct delta_zone *delta_zone;
|
|
struct delta_list *delta_list;
|
|
|
|
result = VDO_ASSERT((list_number < delta_index->list_count),
|
|
"Delta list number (%u) is out of range (%u)", list_number,
|
|
delta_index->list_count);
|
|
if (result != VDO_SUCCESS)
|
|
return UDS_CORRUPT_DATA;
|
|
|
|
zone_number = list_number / delta_index->lists_per_zone;
|
|
delta_zone = &delta_index->delta_zones[zone_number];
|
|
list_number -= delta_zone->first_list;
|
|
result = VDO_ASSERT((list_number < delta_zone->list_count),
|
|
"Delta list number (%u) is out of range (%u) for zone (%u)",
|
|
list_number, delta_zone->list_count, zone_number);
|
|
if (result != VDO_SUCCESS)
|
|
return UDS_CORRUPT_DATA;
|
|
|
|
if (delta_index->mutable) {
|
|
delta_list = &delta_zone->delta_lists[list_number + 1];
|
|
} else {
|
|
u32 end_offset;
|
|
|
|
/*
|
|
* Translate the immutable delta list header into a temporary
|
|
* full delta list header.
|
|
*/
|
|
delta_list = &delta_entry->temp_delta_list;
|
|
delta_list->start = get_immutable_start(delta_zone->memory, list_number);
|
|
end_offset = get_immutable_start(delta_zone->memory, list_number + 1);
|
|
delta_list->size = end_offset - delta_list->start;
|
|
delta_list->save_key = 0;
|
|
delta_list->save_offset = 0;
|
|
}
|
|
|
|
if (key > delta_list->save_key) {
|
|
delta_entry->key = delta_list->save_key;
|
|
delta_entry->offset = delta_list->save_offset;
|
|
} else {
|
|
delta_entry->key = 0;
|
|
delta_entry->offset = 0;
|
|
if (key == 0) {
|
|
/*
|
|
* This usually means we're about to walk the entire delta list, so get all
|
|
* of it into the CPU cache.
|
|
*/
|
|
uds_prefetch_range(&delta_zone->memory[delta_list->start / BITS_PER_BYTE],
|
|
delta_list->size / BITS_PER_BYTE, false);
|
|
}
|
|
}
|
|
|
|
delta_entry->at_end = false;
|
|
delta_entry->delta_zone = delta_zone;
|
|
delta_entry->delta_list = delta_list;
|
|
delta_entry->entry_bits = 0;
|
|
delta_entry->is_collision = false;
|
|
delta_entry->list_number = list_number;
|
|
delta_entry->list_overflow = false;
|
|
delta_entry->value_bits = delta_zone->value_bits;
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
static inline u64 get_delta_entry_offset(const struct delta_index_entry *delta_entry)
|
|
{
|
|
return delta_entry->delta_list->start + delta_entry->offset;
|
|
}
|
|
|
|
/*
|
|
* Decode a delta index entry delta value. The delta_index_entry basically describes the previous
|
|
* list entry, and has had its offset field changed to point to the subsequent entry. We decode the
|
|
* bit stream and update the delta_list_entry to describe the entry.
|
|
*/
|
|
static inline void decode_delta(struct delta_index_entry *delta_entry)
|
|
{
|
|
int key_bits;
|
|
u32 delta;
|
|
const struct delta_zone *delta_zone = delta_entry->delta_zone;
|
|
const u8 *memory = delta_zone->memory;
|
|
u64 delta_offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
|
|
const u8 *addr = memory + delta_offset / BITS_PER_BYTE;
|
|
int offset = delta_offset % BITS_PER_BYTE;
|
|
u32 data = get_unaligned_le32(addr) >> offset;
|
|
|
|
addr += sizeof(u32);
|
|
key_bits = delta_zone->min_bits;
|
|
delta = data & ((1 << key_bits) - 1);
|
|
if (delta >= delta_zone->min_keys) {
|
|
data >>= key_bits;
|
|
if (data == 0) {
|
|
key_bits = sizeof(u32) * BITS_PER_BYTE - offset;
|
|
while ((data = get_unaligned_le32(addr)) == 0) {
|
|
addr += sizeof(u32);
|
|
key_bits += sizeof(u32) * BITS_PER_BYTE;
|
|
}
|
|
}
|
|
key_bits += ffs(data);
|
|
delta += ((key_bits - delta_zone->min_bits - 1) * delta_zone->incr_keys);
|
|
}
|
|
delta_entry->delta = delta;
|
|
delta_entry->key += delta;
|
|
|
|
/* Check for a collision, a delta of zero after the start. */
|
|
if (unlikely((delta == 0) && (delta_entry->offset > 0))) {
|
|
delta_entry->is_collision = true;
|
|
delta_entry->entry_bits = delta_entry->value_bits + key_bits + COLLISION_BITS;
|
|
} else {
|
|
delta_entry->is_collision = false;
|
|
delta_entry->entry_bits = delta_entry->value_bits + key_bits;
|
|
}
|
|
}
|
|
|
|
noinline int uds_next_delta_index_entry(struct delta_index_entry *delta_entry)
|
|
{
|
|
int result;
|
|
const struct delta_list *delta_list;
|
|
u32 next_offset;
|
|
u16 size;
|
|
|
|
result = assert_not_at_end(delta_entry);
|
|
if (result != UDS_SUCCESS)
|
|
return result;
|
|
|
|
delta_list = delta_entry->delta_list;
|
|
delta_entry->offset += delta_entry->entry_bits;
|
|
size = delta_list->size;
|
|
if (unlikely(delta_entry->offset >= size)) {
|
|
delta_entry->at_end = true;
|
|
delta_entry->delta = 0;
|
|
delta_entry->is_collision = false;
|
|
result = VDO_ASSERT((delta_entry->offset == size),
|
|
"next offset past end of delta list");
|
|
if (result != VDO_SUCCESS)
|
|
result = UDS_CORRUPT_DATA;
|
|
|
|
return result;
|
|
}
|
|
|
|
decode_delta(delta_entry);
|
|
|
|
next_offset = delta_entry->offset + delta_entry->entry_bits;
|
|
if (next_offset > size) {
|
|
/*
|
|
* This is not an assertion because uds_validate_chapter_index_page() wants to
|
|
* handle this error.
|
|
*/
|
|
vdo_log_warning("Decoded past the end of the delta list");
|
|
return UDS_CORRUPT_DATA;
|
|
}
|
|
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
int uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry)
|
|
{
|
|
int result;
|
|
struct delta_list *delta_list = delta_entry->delta_list;
|
|
|
|
result = VDO_ASSERT(!delta_entry->is_collision, "entry is not a collision");
|
|
if (result != VDO_SUCCESS)
|
|
return result;
|
|
|
|
delta_list->save_key = delta_entry->key - delta_entry->delta;
|
|
delta_list->save_offset = delta_entry->offset;
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
static void set_delta(struct delta_index_entry *delta_entry, u32 delta)
|
|
{
|
|
const struct delta_zone *delta_zone = delta_entry->delta_zone;
|
|
u32 key_bits = (delta_zone->min_bits +
|
|
((delta_zone->incr_keys - delta_zone->min_keys + delta) /
|
|
delta_zone->incr_keys));
|
|
|
|
delta_entry->delta = delta;
|
|
delta_entry->entry_bits = delta_entry->value_bits + key_bits;
|
|
}
|
|
|
|
static void get_collision_name(const struct delta_index_entry *entry, u8 *name)
|
|
{
|
|
u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
|
|
const u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE;
|
|
int size = COLLISION_BYTES;
|
|
int shift = offset % BITS_PER_BYTE;
|
|
|
|
while (--size >= 0)
|
|
*name++ = get_unaligned_le16(addr++) >> shift;
|
|
}
|
|
|
|
static void set_collision_name(const struct delta_index_entry *entry, const u8 *name)
|
|
{
|
|
u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
|
|
u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE;
|
|
int size = COLLISION_BYTES;
|
|
int shift = offset % BITS_PER_BYTE;
|
|
u16 mask = ~((u16) 0xFF << shift);
|
|
u16 data;
|
|
|
|
while (--size >= 0) {
|
|
data = (get_unaligned_le16(addr) & mask) | (*name++ << shift);
|
|
put_unaligned_le16(data, addr++);
|
|
}
|
|
}
|
|
|
|
int uds_get_delta_index_entry(const struct delta_index *delta_index, u32 list_number,
|
|
u32 key, const u8 *name,
|
|
struct delta_index_entry *delta_entry)
|
|
{
|
|
int result;
|
|
|
|
result = uds_start_delta_index_search(delta_index, list_number, key,
|
|
delta_entry);
|
|
if (result != UDS_SUCCESS)
|
|
return result;
|
|
|
|
do {
|
|
result = uds_next_delta_index_entry(delta_entry);
|
|
if (result != UDS_SUCCESS)
|
|
return result;
|
|
} while (!delta_entry->at_end && (key > delta_entry->key));
|
|
|
|
result = uds_remember_delta_index_offset(delta_entry);
|
|
if (result != UDS_SUCCESS)
|
|
return result;
|
|
|
|
if (!delta_entry->at_end && (key == delta_entry->key)) {
|
|
struct delta_index_entry collision_entry = *delta_entry;
|
|
|
|
for (;;) {
|
|
u8 full_name[COLLISION_BYTES];
|
|
|
|
result = uds_next_delta_index_entry(&collision_entry);
|
|
if (result != UDS_SUCCESS)
|
|
return result;
|
|
|
|
if (collision_entry.at_end || !collision_entry.is_collision)
|
|
break;
|
|
|
|
get_collision_name(&collision_entry, full_name);
|
|
if (memcmp(full_name, name, COLLISION_BYTES) == 0) {
|
|
*delta_entry = collision_entry;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
int uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry, u8 *name)
|
|
{
|
|
int result;
|
|
|
|
result = assert_not_at_end(delta_entry);
|
|
if (result != UDS_SUCCESS)
|
|
return result;
|
|
|
|
result = VDO_ASSERT(delta_entry->is_collision,
|
|
"Cannot get full block name from a non-collision delta index entry");
|
|
if (result != VDO_SUCCESS)
|
|
return UDS_BAD_STATE;
|
|
|
|
get_collision_name(delta_entry, name);
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
u32 uds_get_delta_entry_value(const struct delta_index_entry *delta_entry)
|
|
{
|
|
return get_field(delta_entry->delta_zone->memory,
|
|
get_delta_entry_offset(delta_entry), delta_entry->value_bits);
|
|
}
|
|
|
|
static int assert_mutable_entry(const struct delta_index_entry *delta_entry)
|
|
{
|
|
int result = VDO_ASSERT((delta_entry->delta_list != &delta_entry->temp_delta_list),
|
|
"delta index is mutable");
|
|
if (result != VDO_SUCCESS)
|
|
result = UDS_BAD_STATE;
|
|
|
|
return result;
|
|
}
|
|
|
|
int uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value)
|
|
{
|
|
int result;
|
|
u32 value_mask = (1 << delta_entry->value_bits) - 1;
|
|
|
|
result = assert_mutable_entry(delta_entry);
|
|
if (result != UDS_SUCCESS)
|
|
return result;
|
|
|
|
result = assert_not_at_end(delta_entry);
|
|
if (result != UDS_SUCCESS)
|
|
return result;
|
|
|
|
result = VDO_ASSERT((value & value_mask) == value,
|
|
"Value (%u) being set in a delta index is too large (must fit in %u bits)",
|
|
value, delta_entry->value_bits);
|
|
if (result != VDO_SUCCESS)
|
|
return UDS_INVALID_ARGUMENT;
|
|
|
|
set_field(value, delta_entry->delta_zone->memory,
|
|
get_delta_entry_offset(delta_entry), delta_entry->value_bits);
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
* Extend the memory used by the delta lists by adding growing_size bytes before the list indicated
|
|
* by growing_index, then rebalancing the lists in the new chunk.
|
|
*/
|
|
static int extend_delta_zone(struct delta_zone *delta_zone, u32 growing_index,
|
|
size_t growing_size)
|
|
{
|
|
ktime_t start_time;
|
|
ktime_t end_time;
|
|
struct delta_list *delta_lists;
|
|
u32 i;
|
|
size_t used_space;
|
|
|
|
|
|
/* Calculate the amount of space that is or will be in use. */
|
|
start_time = current_time_ns(CLOCK_MONOTONIC);
|
|
delta_lists = delta_zone->delta_lists;
|
|
used_space = growing_size;
|
|
for (i = 0; i <= delta_zone->list_count + 1; i++)
|
|
used_space += get_delta_list_byte_size(&delta_lists[i]);
|
|
|
|
if (delta_zone->size < used_space)
|
|
return UDS_OVERFLOW;
|
|
|
|
/* Compute the new offsets of the delta lists. */
|
|
compute_new_list_offsets(delta_zone, growing_index, growing_size, used_space);
|
|
|
|
/*
|
|
* When we rebalance the delta list, we will include the end guard list in the rebalancing.
|
|
* It contains the end guard data, which must be copied.
|
|
*/
|
|
rebalance_delta_zone(delta_zone, 1, delta_zone->list_count + 1);
|
|
end_time = current_time_ns(CLOCK_MONOTONIC);
|
|
delta_zone->rebalance_count++;
|
|
delta_zone->rebalance_time += ktime_sub(end_time, start_time);
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
static int insert_bits(struct delta_index_entry *delta_entry, u16 size)
|
|
{
|
|
u64 free_before;
|
|
u64 free_after;
|
|
u64 source;
|
|
u64 destination;
|
|
u32 count;
|
|
bool before_flag;
|
|
u8 *memory;
|
|
struct delta_zone *delta_zone = delta_entry->delta_zone;
|
|
struct delta_list *delta_list = delta_entry->delta_list;
|
|
/* Compute bits in use before and after the inserted bits. */
|
|
u32 total_size = delta_list->size;
|
|
u32 before_size = delta_entry->offset;
|
|
u32 after_size = total_size - delta_entry->offset;
|
|
|
|
if (total_size + size > U16_MAX) {
|
|
delta_entry->list_overflow = true;
|
|
delta_zone->overflow_count++;
|
|
return UDS_OVERFLOW;
|
|
}
|
|
|
|
/* Compute bits available before and after the delta list. */
|
|
free_before = (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
|
|
free_after = (delta_list[1].start - (delta_list[0].start + delta_list[0].size));
|
|
|
|
if ((size <= free_before) && (size <= free_after)) {
|
|
/*
|
|
* We have enough space to use either before or after the list. Select the smaller
|
|
* amount of data. If it is exactly the same, try to take from the larger amount of
|
|
* free space.
|
|
*/
|
|
if (before_size < after_size)
|
|
before_flag = true;
|
|
else if (after_size < before_size)
|
|
before_flag = false;
|
|
else
|
|
before_flag = free_before > free_after;
|
|
} else if (size <= free_before) {
|
|
/* There is space before but not after. */
|
|
before_flag = true;
|
|
} else if (size <= free_after) {
|
|
/* There is space after but not before. */
|
|
before_flag = false;
|
|
} else {
|
|
/*
|
|
* Neither of the surrounding spaces is large enough for this request. Extend
|
|
* and/or rebalance the delta list memory choosing to move the least amount of
|
|
* data.
|
|
*/
|
|
int result;
|
|
u32 growing_index = delta_entry->list_number + 1;
|
|
|
|
before_flag = before_size < after_size;
|
|
if (!before_flag)
|
|
growing_index++;
|
|
result = extend_delta_zone(delta_zone, growing_index,
|
|
BITS_TO_BYTES(size));
|
|
if (result != UDS_SUCCESS)
|
|
return result;
|
|
}
|
|
|
|
delta_list->size += size;
|
|
if (before_flag) {
|
|
source = delta_list->start;
|
|
destination = source - size;
|
|
delta_list->start -= size;
|
|
count = before_size;
|
|
} else {
|
|
source = delta_list->start + delta_entry->offset;
|
|
destination = source + size;
|
|
count = after_size;
|
|
}
|
|
|
|
memory = delta_zone->memory;
|
|
move_bits(memory, source, memory, destination, count);
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
static void encode_delta(const struct delta_index_entry *delta_entry)
|
|
{
|
|
u32 temp;
|
|
u32 t1;
|
|
u32 t2;
|
|
u64 offset;
|
|
const struct delta_zone *delta_zone = delta_entry->delta_zone;
|
|
u8 *memory = delta_zone->memory;
|
|
|
|
offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
|
|
if (delta_entry->delta < delta_zone->min_keys) {
|
|
set_field(delta_entry->delta, memory, offset, delta_zone->min_bits);
|
|
return;
|
|
}
|
|
|
|
temp = delta_entry->delta - delta_zone->min_keys;
|
|
t1 = (temp % delta_zone->incr_keys) + delta_zone->min_keys;
|
|
t2 = temp / delta_zone->incr_keys;
|
|
set_field(t1, memory, offset, delta_zone->min_bits);
|
|
set_zero(memory, offset + delta_zone->min_bits, t2);
|
|
set_field(1, memory, offset + delta_zone->min_bits + t2, 1);
|
|
}
|
|
|
|
static void encode_entry(const struct delta_index_entry *delta_entry, u32 value,
|
|
const u8 *name)
|
|
{
|
|
u8 *memory = delta_entry->delta_zone->memory;
|
|
u64 offset = get_delta_entry_offset(delta_entry);
|
|
|
|
set_field(value, memory, offset, delta_entry->value_bits);
|
|
encode_delta(delta_entry);
|
|
if (name != NULL)
|
|
set_collision_name(delta_entry, name);
|
|
}
|
|
|
|
/*
|
|
* Create a new entry in the delta index. If the entry is a collision, the full 256 bit name must
|
|
* be provided.
|
|
*/
|
|
int uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key, u32 value,
|
|
const u8 *name)
|
|
{
|
|
int result;
|
|
struct delta_zone *delta_zone;
|
|
|
|
result = assert_mutable_entry(delta_entry);
|
|
if (result != UDS_SUCCESS)
|
|
return result;
|
|
|
|
if (delta_entry->is_collision) {
|
|
/*
|
|
* The caller wants us to insert a collision entry onto a collision entry. This
|
|
* happens when we find a collision and attempt to add the name again to the index.
|
|
* This is normally a fatal error unless we are replaying a closed chapter while we
|
|
* are rebuilding a volume index.
|
|
*/
|
|
return UDS_DUPLICATE_NAME;
|
|
}
|
|
|
|
if (delta_entry->offset < delta_entry->delta_list->save_offset) {
|
|
/*
|
|
* The saved entry offset is after the new entry and will no longer be valid, so
|
|
* replace it with the insertion point.
|
|
*/
|
|
result = uds_remember_delta_index_offset(delta_entry);
|
|
if (result != UDS_SUCCESS)
|
|
return result;
|
|
}
|
|
|
|
if (name != NULL) {
|
|
/* Insert a collision entry which is placed after this entry. */
|
|
result = assert_not_at_end(delta_entry);
|
|
if (result != UDS_SUCCESS)
|
|
return result;
|
|
|
|
result = VDO_ASSERT((key == delta_entry->key),
|
|
"incorrect key for collision entry");
|
|
if (result != VDO_SUCCESS)
|
|
return result;
|
|
|
|
delta_entry->offset += delta_entry->entry_bits;
|
|
set_delta(delta_entry, 0);
|
|
delta_entry->is_collision = true;
|
|
delta_entry->entry_bits += COLLISION_BITS;
|
|
result = insert_bits(delta_entry, delta_entry->entry_bits);
|
|
} else if (delta_entry->at_end) {
|
|
/* Insert a new entry at the end of the delta list. */
|
|
result = VDO_ASSERT((key >= delta_entry->key), "key past end of list");
|
|
if (result != VDO_SUCCESS)
|
|
return result;
|
|
|
|
set_delta(delta_entry, key - delta_entry->key);
|
|
delta_entry->key = key;
|
|
delta_entry->at_end = false;
|
|
result = insert_bits(delta_entry, delta_entry->entry_bits);
|
|
} else {
|
|
u16 old_entry_size;
|
|
u16 additional_size;
|
|
struct delta_index_entry next_entry;
|
|
u32 next_value;
|
|
|
|
/*
|
|
* Insert a new entry which requires the delta in the following entry to be
|
|
* updated.
|
|
*/
|
|
result = VDO_ASSERT((key < delta_entry->key),
|
|
"key precedes following entry");
|
|
if (result != VDO_SUCCESS)
|
|
return result;
|
|
|
|
result = VDO_ASSERT((key >= delta_entry->key - delta_entry->delta),
|
|
"key effects following entry's delta");
|
|
if (result != VDO_SUCCESS)
|
|
return result;
|
|
|
|
old_entry_size = delta_entry->entry_bits;
|
|
next_entry = *delta_entry;
|
|
next_value = uds_get_delta_entry_value(&next_entry);
|
|
set_delta(delta_entry, key - (delta_entry->key - delta_entry->delta));
|
|
delta_entry->key = key;
|
|
set_delta(&next_entry, next_entry.key - key);
|
|
next_entry.offset += delta_entry->entry_bits;
|
|
/* The two new entries are always bigger than the single entry being replaced. */
|
|
additional_size = (delta_entry->entry_bits +
|
|
next_entry.entry_bits - old_entry_size);
|
|
result = insert_bits(delta_entry, additional_size);
|
|
if (result != UDS_SUCCESS)
|
|
return result;
|
|
|
|
encode_entry(&next_entry, next_value, NULL);
|
|
}
|
|
|
|
if (result != UDS_SUCCESS)
|
|
return result;
|
|
|
|
encode_entry(delta_entry, value, name);
|
|
delta_zone = delta_entry->delta_zone;
|
|
delta_zone->record_count++;
|
|
delta_zone->collision_count += delta_entry->is_collision ? 1 : 0;
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
static void delete_bits(const struct delta_index_entry *delta_entry, int size)
|
|
{
|
|
u64 source;
|
|
u64 destination;
|
|
u32 count;
|
|
bool before_flag;
|
|
struct delta_list *delta_list = delta_entry->delta_list;
|
|
u8 *memory = delta_entry->delta_zone->memory;
|
|
/* Compute bits retained before and after the deleted bits. */
|
|
u32 total_size = delta_list->size;
|
|
u32 before_size = delta_entry->offset;
|
|
u32 after_size = total_size - delta_entry->offset - size;
|
|
|
|
/*
|
|
* Determine whether to add to the available space either before or after the delta list.
|
|
* We prefer to move the least amount of data. If it is exactly the same, try to add to the
|
|
* smaller amount of free space.
|
|
*/
|
|
if (before_size < after_size) {
|
|
before_flag = true;
|
|
} else if (after_size < before_size) {
|
|
before_flag = false;
|
|
} else {
|
|
u64 free_before =
|
|
(delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
|
|
u64 free_after =
|
|
(delta_list[1].start - (delta_list[0].start + delta_list[0].size));
|
|
|
|
before_flag = (free_before < free_after);
|
|
}
|
|
|
|
delta_list->size -= size;
|
|
if (before_flag) {
|
|
source = delta_list->start;
|
|
destination = source + size;
|
|
delta_list->start += size;
|
|
count = before_size;
|
|
} else {
|
|
destination = delta_list->start + delta_entry->offset;
|
|
source = destination + size;
|
|
count = after_size;
|
|
}
|
|
|
|
move_bits(memory, source, memory, destination, count);
|
|
}
|
|
|
|
int uds_remove_delta_index_entry(struct delta_index_entry *delta_entry)
|
|
{
|
|
int result;
|
|
struct delta_index_entry next_entry;
|
|
struct delta_zone *delta_zone;
|
|
struct delta_list *delta_list;
|
|
|
|
result = assert_mutable_entry(delta_entry);
|
|
if (result != UDS_SUCCESS)
|
|
return result;
|
|
|
|
next_entry = *delta_entry;
|
|
result = uds_next_delta_index_entry(&next_entry);
|
|
if (result != UDS_SUCCESS)
|
|
return result;
|
|
|
|
delta_zone = delta_entry->delta_zone;
|
|
|
|
if (delta_entry->is_collision) {
|
|
/* This is a collision entry, so just remove it. */
|
|
delete_bits(delta_entry, delta_entry->entry_bits);
|
|
next_entry.offset = delta_entry->offset;
|
|
delta_zone->collision_count -= 1;
|
|
} else if (next_entry.at_end) {
|
|
/* This entry is at the end of the list, so just remove it. */
|
|
delete_bits(delta_entry, delta_entry->entry_bits);
|
|
next_entry.key -= delta_entry->delta;
|
|
next_entry.offset = delta_entry->offset;
|
|
} else {
|
|
/* The delta in the next entry needs to be updated. */
|
|
u32 next_value = uds_get_delta_entry_value(&next_entry);
|
|
u16 old_size = delta_entry->entry_bits + next_entry.entry_bits;
|
|
|
|
if (next_entry.is_collision) {
|
|
next_entry.is_collision = false;
|
|
delta_zone->collision_count -= 1;
|
|
}
|
|
|
|
set_delta(&next_entry, delta_entry->delta + next_entry.delta);
|
|
next_entry.offset = delta_entry->offset;
|
|
/* The one new entry is always smaller than the two entries being replaced. */
|
|
delete_bits(delta_entry, old_size - next_entry.entry_bits);
|
|
encode_entry(&next_entry, next_value, NULL);
|
|
}
|
|
|
|
delta_zone->record_count--;
|
|
delta_zone->discard_count++;
|
|
*delta_entry = next_entry;
|
|
|
|
delta_list = delta_entry->delta_list;
|
|
if (delta_entry->offset < delta_list->save_offset) {
|
|
/* The saved entry offset is no longer valid. */
|
|
delta_list->save_key = 0;
|
|
delta_list->save_offset = 0;
|
|
}
|
|
|
|
return UDS_SUCCESS;
|
|
}
|
|
|
|
void uds_get_delta_index_stats(const struct delta_index *delta_index,
|
|
struct delta_index_stats *stats)
|
|
{
|
|
unsigned int z;
|
|
const struct delta_zone *delta_zone;
|
|
|
|
memset(stats, 0, sizeof(struct delta_index_stats));
|
|
for (z = 0; z < delta_index->zone_count; z++) {
|
|
delta_zone = &delta_index->delta_zones[z];
|
|
stats->rebalance_time += delta_zone->rebalance_time;
|
|
stats->rebalance_count += delta_zone->rebalance_count;
|
|
stats->record_count += delta_zone->record_count;
|
|
stats->collision_count += delta_zone->collision_count;
|
|
stats->discard_count += delta_zone->discard_count;
|
|
stats->overflow_count += delta_zone->overflow_count;
|
|
stats->list_count += delta_zone->list_count;
|
|
}
|
|
}
|
|
|
|
size_t uds_compute_delta_index_size(u32 entry_count, u32 mean_delta, u32 payload_bits)
|
|
{
|
|
u16 min_bits;
|
|
u32 incr_keys;
|
|
u32 min_keys;
|
|
|
|
compute_coding_constants(mean_delta, &min_bits, &min_keys, &incr_keys);
|
|
/* On average, each delta is encoded into about min_bits + 1.5 bits. */
|
|
return entry_count * (payload_bits + min_bits + 1) + entry_count / 2;
|
|
}
|
|
|
|
u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta,
|
|
u32 payload_bits, size_t bytes_per_page)
|
|
{
|
|
unsigned int bits_per_delta_list;
|
|
unsigned int bits_per_page;
|
|
size_t bits_per_index;
|
|
|
|
/* Compute the expected number of bits needed for all the entries. */
|
|
bits_per_index = uds_compute_delta_index_size(entry_count, mean_delta,
|
|
payload_bits);
|
|
bits_per_delta_list = bits_per_index / list_count;
|
|
|
|
/* Add in the immutable delta list headers. */
|
|
bits_per_index += list_count * IMMUTABLE_HEADER_SIZE;
|
|
/* Compute the number of usable bits on an immutable index page. */
|
|
bits_per_page = ((bytes_per_page - sizeof(struct delta_page_header)) * BITS_PER_BYTE);
|
|
/*
|
|
* Reduce the bits per page by one immutable delta list header and one delta list to
|
|
* account for internal fragmentation.
|
|
*/
|
|
bits_per_page -= IMMUTABLE_HEADER_SIZE + bits_per_delta_list;
|
|
/* Now compute the number of pages needed. */
|
|
return DIV_ROUND_UP(bits_per_index, bits_per_page);
|
|
}
|
|
|
|
void uds_log_delta_index_entry(struct delta_index_entry *delta_entry)
|
|
{
|
|
vdo_log_ratelimit(vdo_log_info,
|
|
"List 0x%X Key 0x%X Offset 0x%X%s%s List_size 0x%X%s",
|
|
delta_entry->list_number, delta_entry->key,
|
|
delta_entry->offset, delta_entry->at_end ? " end" : "",
|
|
delta_entry->is_collision ? " collision" : "",
|
|
delta_entry->delta_list->size,
|
|
delta_entry->list_overflow ? " overflow" : "");
|
|
delta_entry->list_overflow = false;
|
|
}
|