1
1
mirror of https://github.com/systemd/systemd-stable.git synced 2025-02-04 17:47:03 +03:00

journal: Use 32-bit entry array offsets in compact mode

Before:

OBJECT TYPE      ENTRIES SIZE
Unused           0       0B
Data             3610336 595.7M
Field            5310    285.2K
Entry            3498326 1.2G
Data Hash Table  29      103.1M
Field Hash Table 29      151.3K
Entry Array      605991  1011.6M
Tag              0       0B
Total            7720021 2.9G

After:

OBJECT TYPE      ENTRIES SIZE
Unused           0       0B
Data             3562667 591.0M
Field            3971    213.6K
Entry            3498566 1.2G
Data Hash Table  20      71.1M
Field Hash Table 20      104.3K
Entry Array      582647  505.0M
Tag              0       0B
Total            7647891 2.4G
This commit is contained in:
Daan De Meyer 2021-10-23 22:36:47 +01:00
parent d06727aec2
commit 99daf3ce03
6 changed files with 73 additions and 40 deletions

View File

@ -71,7 +71,7 @@ thread](https://lists.freedesktop.org/archives/systemd-devel/2012-October/007054
## Basics ## Basics
* All offsets, sizes, time values, hashes (and most other numeric values) are 64bit unsigned integers in LE format. * All offsets, sizes, time values, hashes (and most other numeric values) are 32bit/64bit unsigned integers in LE format.
* Offsets are always relative to the beginning of the file. * Offsets are always relative to the beginning of the file.
* The 64bit hash function siphash24 is used for newer journal files. For older files [Jenkins lookup3](https://en.wikipedia.org/wiki/Jenkins_hash_function) is used, more specifically `jenkins_hashlittle2()` with the first 32bit integer it returns as higher 32bit part of the 64bit value, and the second one uses as lower 32bit part. * The 64bit hash function siphash24 is used for newer journal files. For older files [Jenkins lookup3](https://en.wikipedia.org/wiki/Jenkins_hash_function) is used, more specifically `jenkins_hashlittle2()` with the first 32bit integer it returns as higher 32bit part of the 64bit value, and the second one uses as lower 32bit part.
* All structures are aligned to 64bit boundaries and padded to multiples of 64bit * All structures are aligned to 64bit boundaries and padded to multiples of 64bit
@ -552,7 +552,10 @@ creativity rather than runtime parameters.
_packed_ struct EntryArrayObject { _packed_ struct EntryArrayObject {
ObjectHeader object; ObjectHeader object;
le64_t next_entry_array_offset; le64_t next_entry_array_offset;
le64_t items[]; union {
le64_t regular[];
le32_t compact[];
} items;
}; };
``` ```
@ -560,6 +563,9 @@ Entry Arrays are used to store a sorted array of offsets to entries. Entry
arrays are strictly sorted by offsets on disk, and hence by their timestamps arrays are strictly sorted by offsets on disk, and hence by their timestamps
and sequence numbers (with some restrictions, see above). and sequence numbers (with some restrictions, see above).
If the `HEADER_INCOMPATIBLE_COMPACT` flag is set, offsets are stored as 32bit
integers instead of 64bit integers.
Entry Arrays are chained up. If one entry array is full another one is Entry Arrays are chained up. If one entry array is full another one is
allocated and the **next_entry_array_offset** field of the old one pointed to allocated and the **next_entry_array_offset** field of the old one pointed to
it. An Entry Array with **next_entry_array_offset** set to 0 is the last in the it. An Entry Array with **next_entry_array_offset** set to 0 is the last in the

View File

@ -50,7 +50,7 @@ static int managed_journal_file_entry_array_punch_hole(JournalFile *f, uint64_t
if (r < 0) if (r < 0)
return r; return r;
n_items += journal_file_entry_array_n_items(&o); n_items += journal_file_entry_array_n_items(f, &o);
p = q; p = q;
} }
@ -67,7 +67,7 @@ static int managed_journal_file_entry_array_punch_hole(JournalFile *f, uint64_t
return 0; return 0;
offset = p + offsetof(Object, entry_array.items) + offset = p + offsetof(Object, entry_array.items) +
(journal_file_entry_array_n_items(&o) - n_unused) * sizeof(le64_t); (journal_file_entry_array_n_items(f, &o) - n_unused) * journal_file_entry_array_item_size(f);
sz = p + le64toh(o.object.size) - offset; sz = p + le64toh(o.object.size) - offset;
if (sz < MINIMUM_HOLE_SIZE) if (sz < MINIMUM_HOLE_SIZE)

View File

@ -117,7 +117,10 @@ struct HashTableObject {
struct EntryArrayObject { struct EntryArrayObject {
ObjectHeader object; ObjectHeader object;
le64_t next_entry_array_offset; le64_t next_entry_array_offset;
le64_t items[]; union {
le64_t regular[0];
le32_t compact[0];
} items;
} _packed_; } _packed_;
#define TAG_LENGTH (256/8) #define TAG_LENGTH (256/8)

View File

@ -716,7 +716,7 @@ static int check_object_header(Object *o, ObjectType type, uint64_t offset) {
/* Lightweight object checks. We want this to be fast, so that we won't /* Lightweight object checks. We want this to be fast, so that we won't
* slowdown every journal_file_move_to_object() call too much. */ * slowdown every journal_file_move_to_object() call too much. */
static int check_object(Object *o, uint64_t offset) { static int check_object(JournalFile *f, Object *o, uint64_t offset) {
assert(o); assert(o);
switch (o->object.type) { switch (o->object.type) {
@ -827,8 +827,8 @@ static int check_object(Object *o, uint64_t offset) {
sz = le64toh(READ_NOW(o->object.size)); sz = le64toh(READ_NOW(o->object.size));
if (sz < offsetof(Object, entry_array.items) || if (sz < offsetof(Object, entry_array.items) ||
(sz - offsetof(Object, entry_array.items)) % sizeof(le64_t) != 0 || (sz - offsetof(Object, entry_array.items)) % journal_file_entry_array_item_size(f) != 0 ||
(sz - offsetof(Object, entry_array.items)) / sizeof(le64_t) <= 0) (sz - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f) <= 0)
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Invalid object entry array size: %" PRIu64 ": %" PRIu64, "Invalid object entry array size: %" PRIu64 ": %" PRIu64,
sz, sz,
@ -895,7 +895,7 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset
if (r < 0) if (r < 0)
return r; return r;
r = check_object(o, offset); r = check_object(f, o, offset);
if (r < 0) if (r < 0)
return r; return r;
@ -944,7 +944,7 @@ int journal_file_read_object_header(JournalFile *f, ObjectType type, uint64_t of
"Short read while reading object: %" PRIu64, "Short read while reading object: %" PRIu64,
offset); offset);
r = check_object(&o, offset); r = check_object(f, &o, offset);
if (r < 0) if (r < 0)
return r; return r;
@ -1672,7 +1672,7 @@ uint64_t journal_file_entry_n_items(Object *o) {
return (sz - offsetof(Object, entry.items)) / sizeof(EntryItem); return (sz - offsetof(Object, entry.items)) / sizeof(EntryItem);
} }
uint64_t journal_file_entry_array_n_items(Object *o) { uint64_t journal_file_entry_array_n_items(JournalFile *f, Object *o) {
uint64_t sz; uint64_t sz;
assert(o); assert(o);
@ -1684,7 +1684,7 @@ uint64_t journal_file_entry_array_n_items(Object *o) {
if (sz < offsetof(Object, entry_array.items)) if (sz < offsetof(Object, entry_array.items))
return 0; return 0;
return (sz - offsetof(Object, entry_array.items)) / sizeof(uint64_t); return (sz - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f);
} }
uint64_t journal_file_hash_table_n_items(Object *o) { uint64_t journal_file_hash_table_n_items(Object *o) {
@ -1702,6 +1702,17 @@ uint64_t journal_file_hash_table_n_items(Object *o) {
return (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem); return (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem);
} }
/* Stores offset p in slot i of the entry array object o. The item width is chosen from the file
 * header: 64-bit little-endian items normally, 32-bit little-endian items when the header is
 * flagged as compact. In compact mode p is asserted to fit into 32 bits before it is stored. */
static void write_entry_array_item(JournalFile *f, Object *o, uint64_t i, uint64_t p) {
        assert(f);
        assert(o);

        if (!JOURNAL_HEADER_COMPACT(f->header)) {
                /* Regular layout: full-width 64-bit item. */
                o->entry_array.items.regular[i] = htole64(p);
                return;
        }

        /* Compact layout: the value must be representable in the narrower 32-bit item. */
        assert(p <= UINT32_MAX);
        o->entry_array.items.compact[i] = htole32(p);
}
static int link_entry_into_array(JournalFile *f, static int link_entry_into_array(JournalFile *f,
le64_t *first, le64_t *first,
le64_t *idx, le64_t *idx,
@ -1724,9 +1735,9 @@ static int link_entry_into_array(JournalFile *f,
if (r < 0) if (r < 0)
return r; return r;
n = journal_file_entry_array_n_items(o); n = journal_file_entry_array_n_items(f, o);
if (i < n) { if (i < n) {
o->entry_array.items[i] = htole64(p); write_entry_array_item(f, o, i, p);
*idx = htole64(hidx + 1); *idx = htole64(hidx + 1);
return 0; return 0;
} }
@ -1745,7 +1756,7 @@ static int link_entry_into_array(JournalFile *f,
n = 4; n = 4;
r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY, r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
offsetof(Object, entry_array.items) + n * sizeof(uint64_t), offsetof(Object, entry_array.items) + n * journal_file_entry_array_item_size(f),
&o, &q); &o, &q);
if (r < 0) if (r < 0)
return r; return r;
@ -1756,7 +1767,7 @@ static int link_entry_into_array(JournalFile *f,
return r; return r;
#endif #endif
o->entry_array.items[i] = htole64(p); write_entry_array_item(f, o, i, p);
if (ap == 0) if (ap == 0)
*first = htole64(q); *first = htole64(q);
@ -2277,7 +2288,7 @@ static int generic_array_get(
if (r < 0) if (r < 0)
return r; return r;
k = journal_file_entry_array_n_items(o); k = journal_file_entry_array_n_items(f, o);
if (i < k) if (i < k)
break; break;
@ -2297,7 +2308,7 @@ static int generic_array_get(
if (r < 0) if (r < 0)
return r; return r;
k = journal_file_entry_array_n_items(o); k = journal_file_entry_array_n_items(f, o);
if (k == 0) if (k == 0)
break; break;
@ -2305,12 +2316,12 @@ static int generic_array_get(
} }
do { do {
p = le64toh(o->entry_array.items[i]); p = journal_file_entry_array_item(f, o, i);
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret); r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret);
if (r >= 0) { if (r >= 0) {
/* Let's cache this item for the next invocation */ /* Let's cache this item for the next invocation */
chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i); chain_cache_put(f->chain_cache, ci, first, a, journal_file_entry_array_item(f, o, 0), t, i);
if (ret_offset) if (ret_offset)
*ret_offset = p; *ret_offset = p;
@ -2438,13 +2449,13 @@ static int generic_array_bisect(
if (r < 0) if (r < 0)
return r; return r;
k = journal_file_entry_array_n_items(array); k = journal_file_entry_array_n_items(f, array);
right = MIN(k, n); right = MIN(k, n);
if (right <= 0) if (right <= 0)
return 0; return 0;
i = right - 1; i = right - 1;
lp = p = le64toh(array->entry_array.items[i]); lp = p = journal_file_entry_array_item(f, array, i);
if (p <= 0) if (p <= 0)
r = -EBADMSG; r = -EBADMSG;
else else
@ -2477,7 +2488,7 @@ static int generic_array_bisect(
if (last_index > 0) { if (last_index > 0) {
uint64_t x = last_index - 1; uint64_t x = last_index - 1;
p = le64toh(array->entry_array.items[x]); p = journal_file_entry_array_item(f, array, x);
if (p <= 0) if (p <= 0)
return -EBADMSG; return -EBADMSG;
@ -2497,7 +2508,7 @@ static int generic_array_bisect(
if (last_index < right) { if (last_index < right) {
uint64_t y = last_index + 1; uint64_t y = last_index + 1;
p = le64toh(array->entry_array.items[y]); p = journal_file_entry_array_item(f, array, y);
if (p <= 0) if (p <= 0)
return -EBADMSG; return -EBADMSG;
@ -2527,7 +2538,7 @@ static int generic_array_bisect(
assert(left < right); assert(left < right);
i = (left + right) / 2; i = (left + right) / 2;
p = le64toh(array->entry_array.items[i]); p = journal_file_entry_array_item(f, array, i);
if (p <= 0) if (p <= 0)
r = -EBADMSG; r = -EBADMSG;
else else
@ -2575,14 +2586,14 @@ found:
return 0; return 0;
/* Let's cache this item for the next invocation */ /* Let's cache this item for the next invocation */
chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : UINT64_MAX) : i); chain_cache_put(f->chain_cache, ci, first, a, journal_file_entry_array_item(f, array, 0), t, subtract_one ? (i > 0 ? i-1 : UINT64_MAX) : i);
if (subtract_one && i == 0) if (subtract_one && i == 0)
p = last_p; p = last_p;
else if (subtract_one) else if (subtract_one)
p = le64toh(array->entry_array.items[i-1]); p = journal_file_entry_array_item(f, array, i - 1);
else else
p = le64toh(array->entry_array.items[i]); p = journal_file_entry_array_item(f, array, i);
if (ret) { if (ret) {
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret); r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret);

View File

@ -194,7 +194,20 @@ int journal_file_tail_end_by_pread(JournalFile *f, uint64_t *ret_offset);
int journal_file_tail_end_by_mmap(JournalFile *f, uint64_t *ret_offset); int journal_file_tail_end_by_mmap(JournalFile *f, uint64_t *ret_offset);
uint64_t journal_file_entry_n_items(Object *o) _pure_; uint64_t journal_file_entry_n_items(Object *o) _pure_;
uint64_t journal_file_entry_array_n_items(Object *o) _pure_; uint64_t journal_file_entry_array_n_items(JournalFile *f, Object *o) _pure_;
/* Returns item i of the entry array object o converted to host byte order. When the file header
 * is flagged as compact the item is read as a 32-bit little-endian value, otherwise as a 64-bit
 * little-endian value; either way the result is widened to uint64_t. */
static inline uint64_t journal_file_entry_array_item(JournalFile *f, Object *o, size_t i) {
        assert(f);
        assert(o);

        if (JOURNAL_HEADER_COMPACT(f->header))
                return le32toh(o->entry_array.items.compact[i]);

        return le64toh(o->entry_array.items.regular[i]);
}
/* Returns the size in bytes of a single entry array item for file f: sizeof(le32_t) when the
 * file header is flagged as compact, sizeof(le64_t) otherwise. */
static inline size_t journal_file_entry_array_item_size(JournalFile *f) {
        assert(f);

        if (JOURNAL_HEADER_COMPACT(f->header))
                return sizeof(le32_t);

        return sizeof(le64_t);
}
uint64_t journal_file_hash_table_n_items(Object *o) _pure_; uint64_t journal_file_hash_table_n_items(Object *o) _pure_;
int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *ret_offset); int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *ret_offset);

View File

@ -335,8 +335,8 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o
break; break;
case OBJECT_ENTRY_ARRAY: case OBJECT_ENTRY_ARRAY:
if ((le64toh(o->object.size) - offsetof(Object, entry_array.items)) % sizeof(le64_t) != 0 || if ((le64toh(o->object.size) - offsetof(Object, entry_array.items)) % journal_file_entry_array_item_size(f) != 0 ||
(le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(le64_t) <= 0) { (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f) <= 0) {
error(offset, error(offset,
"Invalid object entry array size: %"PRIu64, "Invalid object entry array size: %"PRIu64,
le64toh(o->object.size)); le64toh(o->object.size));
@ -350,15 +350,15 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o
return -EBADMSG; return -EBADMSG;
} }
for (uint64_t i = 0; i < journal_file_entry_array_n_items(o); i++) for (uint64_t i = 0; i < journal_file_entry_array_n_items(f, o); i++) {
if (le64toh(o->entry_array.items[i]) != 0 && uint64_t q = journal_file_entry_array_item(f, o, i);
!VALID64(le64toh(o->entry_array.items[i]))) { if (q != 0 && !VALID64(q)) {
error(offset, error(offset,
"Invalid object entry array item (%"PRIu64"/%"PRIu64"): "OFSfmt, "Invalid object entry array item (%"PRIu64"/%"PRIu64"): "OFSfmt,
i, journal_file_entry_array_n_items(o), i, journal_file_entry_array_n_items(f, o), q);
le64toh(o->entry_array.items[i]));
return -EBADMSG; return -EBADMSG;
} }
}
break; break;
@ -490,10 +490,10 @@ static int verify_data(
return -EBADMSG; return -EBADMSG;
} }
m = journal_file_entry_array_n_items(o); m = journal_file_entry_array_n_items(f, o);
for (j = 0; i < n && j < m; i++, j++) { for (j = 0; i < n && j < m; i++, j++) {
q = le64toh(o->entry_array.items[j]); q = journal_file_entry_array_item(f, o, j);
if (q <= last) { if (q <= last) {
error(p, "Data object's entry array not sorted (%"PRIu64" <= %"PRIu64")", q, last); error(p, "Data object's entry array not sorted (%"PRIu64" <= %"PRIu64")", q, last);
return -EBADMSG; return -EBADMSG;
@ -737,11 +737,11 @@ static int verify_entry_array(
return -EBADMSG; return -EBADMSG;
} }
m = journal_file_entry_array_n_items(o); m = journal_file_entry_array_n_items(f, o);
for (j = 0; i < n && j < m; i++, j++) { for (j = 0; i < n && j < m; i++, j++) {
uint64_t p; uint64_t p;
p = le64toh(o->entry_array.items[j]); p = journal_file_entry_array_item(f, o, j);
if (p <= last) { if (p <= last) {
error(a, "Entry array not sorted at %"PRIu64" of %"PRIu64, i, n); error(a, "Entry array not sorted at %"PRIu64" of %"PRIu64, i, n);
return -EBADMSG; return -EBADMSG;