mirror of
https://github.com/systemd/systemd.git
synced 2025-01-11 09:18:07 +03:00
journal: Use 32-bit entry array offsets in compact mode
Before:

OBJECT TYPE       ENTRIES  SIZE
Unused            0        0B
Data              3610336  595.7M
Field             5310     285.2K
Entry             3498326  1.2G
Data Hash Table   29       103.1M
Field Hash Table  29       151.3K
Entry Array       605991   1011.6M
Tag               0        0B
Total             7720021  2.9G

After:

OBJECT TYPE       ENTRIES  SIZE
Unused            0        0B
Data              3562667  591.0M
Field             3971     213.6K
Entry             3498566  1.2G
Data Hash Table   20       71.1M
Field Hash Table  20       104.3K
Entry Array       582647   505.0M
Tag               0        0B
Total             7647891  2.4G
This commit is contained in:
parent
d06727aec2
commit
99daf3ce03
@ -71,7 +71,7 @@ thread](https://lists.freedesktop.org/archives/systemd-devel/2012-October/007054
|
||||
|
||||
## Basics
|
||||
|
||||
* All offsets, sizes, time values, hashes (and most other numeric values) are 64bit unsigned integers in LE format.
|
||||
* All offsets, sizes, time values, hashes (and most other numeric values) are 32bit/64bit unsigned integers in LE format.
|
||||
* Offsets are always relative to the beginning of the file.
|
||||
* The 64bit hash function siphash24 is used for newer journal files. For older files [Jenkins lookup3](https://en.wikipedia.org/wiki/Jenkins_hash_function) is used, more specifically `jenkins_hashlittle2()` with the first 32bit integer it returns as higher 32bit part of the 64bit value, and the second one used as lower 32bit part.
|
||||
* All structures are aligned to 64bit boundaries and padded to multiples of 64bit
|
||||
@ -552,7 +552,10 @@ creativity rather than runtime parameters.
|
||||
_packed_ struct EntryArrayObject {
|
||||
ObjectHeader object;
|
||||
le64_t next_entry_array_offset;
|
||||
le64_t items[];
|
||||
union {
|
||||
le64_t regular[];
|
||||
le32_t compact[];
|
||||
} items;
|
||||
};
|
||||
```
|
||||
|
||||
@ -560,6 +563,9 @@ Entry Arrays are used to store a sorted array of offsets to entries. Entry
|
||||
arrays are strictly sorted by offsets on disk, and hence by their timestamps
|
||||
and sequence numbers (with some restrictions, see above).
|
||||
|
||||
If the `HEADER_INCOMPATIBLE_COMPACT` flag is set, offsets are stored as 32-bit
|
||||
integers instead of 64-bit integers.
|
||||
|
||||
Entry Arrays are chained up. If one entry array is full another one is
|
||||
allocated and the **next_entry_array_offset** field of the old one pointed to
|
||||
it. An Entry Array with **next_entry_array_offset** set to 0 is the last in the
|
||||
|
@ -50,7 +50,7 @@ static int managed_journal_file_entry_array_punch_hole(JournalFile *f, uint64_t
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
n_items += journal_file_entry_array_n_items(&o);
|
||||
n_items += journal_file_entry_array_n_items(f, &o);
|
||||
p = q;
|
||||
}
|
||||
|
||||
@ -67,7 +67,7 @@ static int managed_journal_file_entry_array_punch_hole(JournalFile *f, uint64_t
|
||||
return 0;
|
||||
|
||||
offset = p + offsetof(Object, entry_array.items) +
|
||||
(journal_file_entry_array_n_items(&o) - n_unused) * sizeof(le64_t);
|
||||
(journal_file_entry_array_n_items(f, &o) - n_unused) * journal_file_entry_array_item_size(f);
|
||||
sz = p + le64toh(o.object.size) - offset;
|
||||
|
||||
if (sz < MINIMUM_HOLE_SIZE)
|
||||
|
@ -117,7 +117,10 @@ struct HashTableObject {
|
||||
struct EntryArrayObject {
|
||||
ObjectHeader object;
|
||||
le64_t next_entry_array_offset;
|
||||
le64_t items[];
|
||||
union {
|
||||
le64_t regular[0];
|
||||
le32_t compact[0];
|
||||
} items;
|
||||
} _packed_;
|
||||
|
||||
#define TAG_LENGTH (256/8)
|
||||
|
@ -716,7 +716,7 @@ static int check_object_header(Object *o, ObjectType type, uint64_t offset) {
|
||||
|
||||
/* Lightweight object checks. We want this to be fast, so that we won't
|
||||
* slowdown every journal_file_move_to_object() call too much. */
|
||||
static int check_object(Object *o, uint64_t offset) {
|
||||
static int check_object(JournalFile *f, Object *o, uint64_t offset) {
|
||||
assert(o);
|
||||
|
||||
switch (o->object.type) {
|
||||
@ -827,8 +827,8 @@ static int check_object(Object *o, uint64_t offset) {
|
||||
|
||||
sz = le64toh(READ_NOW(o->object.size));
|
||||
if (sz < offsetof(Object, entry_array.items) ||
|
||||
(sz - offsetof(Object, entry_array.items)) % sizeof(le64_t) != 0 ||
|
||||
(sz - offsetof(Object, entry_array.items)) / sizeof(le64_t) <= 0)
|
||||
(sz - offsetof(Object, entry_array.items)) % journal_file_entry_array_item_size(f) != 0 ||
|
||||
(sz - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f) <= 0)
|
||||
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
|
||||
"Invalid object entry array size: %" PRIu64 ": %" PRIu64,
|
||||
sz,
|
||||
@ -895,7 +895,7 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = check_object(o, offset);
|
||||
r = check_object(f, o, offset);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
@ -944,7 +944,7 @@ int journal_file_read_object_header(JournalFile *f, ObjectType type, uint64_t of
|
||||
"Short read while reading object: %" PRIu64,
|
||||
offset);
|
||||
|
||||
r = check_object(&o, offset);
|
||||
r = check_object(f, &o, offset);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
@ -1672,7 +1672,7 @@ uint64_t journal_file_entry_n_items(Object *o) {
|
||||
return (sz - offsetof(Object, entry.items)) / sizeof(EntryItem);
|
||||
}
|
||||
|
||||
uint64_t journal_file_entry_array_n_items(Object *o) {
|
||||
uint64_t journal_file_entry_array_n_items(JournalFile *f, Object *o) {
|
||||
uint64_t sz;
|
||||
|
||||
assert(o);
|
||||
@ -1684,7 +1684,7 @@ uint64_t journal_file_entry_array_n_items(Object *o) {
|
||||
if (sz < offsetof(Object, entry_array.items))
|
||||
return 0;
|
||||
|
||||
return (sz - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
|
||||
return (sz - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f);
|
||||
}
|
||||
|
||||
uint64_t journal_file_hash_table_n_items(Object *o) {
|
||||
@ -1702,6 +1702,17 @@ uint64_t journal_file_hash_table_n_items(Object *o) {
|
||||
return (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem);
|
||||
}
|
||||
|
||||
/* Stores the entry offset p in slot i of the entry array object o.
 *
 * The on-disk width of the slot depends on the file: when
 * JOURNAL_HEADER_COMPACT(f->header) is true the value is written as a 32-bit
 * little-endian integer and p is asserted to fit into UINT32_MAX; otherwise
 * it is written as a 64-bit little-endian integer.
 *
 * The index i is not range-checked here. */
static void write_entry_array_item(JournalFile *f, Object *o, uint64_t i, uint64_t p) {
        assert(f);
        assert(o);

        if (!JOURNAL_HEADER_COMPACT(f->header)) {
                /* Regular layout: full 64-bit offsets. */
                o->entry_array.items.regular[i] = htole64(p);
                return;
        }

        /* Compact layout: the offset has to be representable in 32 bits. */
        assert(p <= UINT32_MAX);
        o->entry_array.items.compact[i] = htole32(p);
}
|
||||
|
||||
static int link_entry_into_array(JournalFile *f,
|
||||
le64_t *first,
|
||||
le64_t *idx,
|
||||
@ -1724,9 +1735,9 @@ static int link_entry_into_array(JournalFile *f,
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
n = journal_file_entry_array_n_items(o);
|
||||
n = journal_file_entry_array_n_items(f, o);
|
||||
if (i < n) {
|
||||
o->entry_array.items[i] = htole64(p);
|
||||
write_entry_array_item(f, o, i, p);
|
||||
*idx = htole64(hidx + 1);
|
||||
return 0;
|
||||
}
|
||||
@ -1745,7 +1756,7 @@ static int link_entry_into_array(JournalFile *f,
|
||||
n = 4;
|
||||
|
||||
r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
|
||||
offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
|
||||
offsetof(Object, entry_array.items) + n * journal_file_entry_array_item_size(f),
|
||||
&o, &q);
|
||||
if (r < 0)
|
||||
return r;
|
||||
@ -1756,7 +1767,7 @@ static int link_entry_into_array(JournalFile *f,
|
||||
return r;
|
||||
#endif
|
||||
|
||||
o->entry_array.items[i] = htole64(p);
|
||||
write_entry_array_item(f, o, i, p);
|
||||
|
||||
if (ap == 0)
|
||||
*first = htole64(q);
|
||||
@ -2277,7 +2288,7 @@ static int generic_array_get(
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
k = journal_file_entry_array_n_items(o);
|
||||
k = journal_file_entry_array_n_items(f, o);
|
||||
if (i < k)
|
||||
break;
|
||||
|
||||
@ -2297,7 +2308,7 @@ static int generic_array_get(
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
k = journal_file_entry_array_n_items(o);
|
||||
k = journal_file_entry_array_n_items(f, o);
|
||||
if (k == 0)
|
||||
break;
|
||||
|
||||
@ -2305,12 +2316,12 @@ static int generic_array_get(
|
||||
}
|
||||
|
||||
do {
|
||||
p = le64toh(o->entry_array.items[i]);
|
||||
p = journal_file_entry_array_item(f, o, i);
|
||||
|
||||
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret);
|
||||
if (r >= 0) {
|
||||
/* Let's cache this item for the next invocation */
|
||||
chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
|
||||
chain_cache_put(f->chain_cache, ci, first, a, journal_file_entry_array_item(f, o, 0), t, i);
|
||||
|
||||
if (ret_offset)
|
||||
*ret_offset = p;
|
||||
@ -2438,13 +2449,13 @@ static int generic_array_bisect(
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
k = journal_file_entry_array_n_items(array);
|
||||
k = journal_file_entry_array_n_items(f, array);
|
||||
right = MIN(k, n);
|
||||
if (right <= 0)
|
||||
return 0;
|
||||
|
||||
i = right - 1;
|
||||
lp = p = le64toh(array->entry_array.items[i]);
|
||||
lp = p = journal_file_entry_array_item(f, array, i);
|
||||
if (p <= 0)
|
||||
r = -EBADMSG;
|
||||
else
|
||||
@ -2477,7 +2488,7 @@ static int generic_array_bisect(
|
||||
if (last_index > 0) {
|
||||
uint64_t x = last_index - 1;
|
||||
|
||||
p = le64toh(array->entry_array.items[x]);
|
||||
p = journal_file_entry_array_item(f, array, x);
|
||||
if (p <= 0)
|
||||
return -EBADMSG;
|
||||
|
||||
@ -2497,7 +2508,7 @@ static int generic_array_bisect(
|
||||
if (last_index < right) {
|
||||
uint64_t y = last_index + 1;
|
||||
|
||||
p = le64toh(array->entry_array.items[y]);
|
||||
p = journal_file_entry_array_item(f, array, y);
|
||||
if (p <= 0)
|
||||
return -EBADMSG;
|
||||
|
||||
@ -2527,7 +2538,7 @@ static int generic_array_bisect(
|
||||
assert(left < right);
|
||||
i = (left + right) / 2;
|
||||
|
||||
p = le64toh(array->entry_array.items[i]);
|
||||
p = journal_file_entry_array_item(f, array, i);
|
||||
if (p <= 0)
|
||||
r = -EBADMSG;
|
||||
else
|
||||
@ -2575,14 +2586,14 @@ found:
|
||||
return 0;
|
||||
|
||||
/* Let's cache this item for the next invocation */
|
||||
chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : UINT64_MAX) : i);
|
||||
chain_cache_put(f->chain_cache, ci, first, a, journal_file_entry_array_item(f, array, 0), t, subtract_one ? (i > 0 ? i-1 : UINT64_MAX) : i);
|
||||
|
||||
if (subtract_one && i == 0)
|
||||
p = last_p;
|
||||
else if (subtract_one)
|
||||
p = le64toh(array->entry_array.items[i-1]);
|
||||
p = journal_file_entry_array_item(f, array, i - 1);
|
||||
else
|
||||
p = le64toh(array->entry_array.items[i]);
|
||||
p = journal_file_entry_array_item(f, array, i);
|
||||
|
||||
if (ret) {
|
||||
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret);
|
||||
|
@ -194,7 +194,20 @@ int journal_file_tail_end_by_pread(JournalFile *f, uint64_t *ret_offset);
|
||||
int journal_file_tail_end_by_mmap(JournalFile *f, uint64_t *ret_offset);
|
||||
|
||||
uint64_t journal_file_entry_n_items(Object *o) _pure_;
|
||||
uint64_t journal_file_entry_array_n_items(Object *o) _pure_;
|
||||
uint64_t journal_file_entry_array_n_items(JournalFile *f, Object *o) _pure_;
|
||||
|
||||
/* Reads slot i of the entry array object o and returns it in host byte order.
 *
 * When JOURNAL_HEADER_COMPACT(f->header) is true the slot is read as a 32-bit
 * little-endian value (widened to uint64_t on return); otherwise it is read
 * as a 64-bit little-endian value.
 *
 * The index i is not range-checked here. */
static inline uint64_t journal_file_entry_array_item(JournalFile *f, Object *o, size_t i) {
        assert(f);
        assert(o);

        if (JOURNAL_HEADER_COMPACT(f->header))
                return le32toh(o->entry_array.items.compact[i]);

        return le64toh(o->entry_array.items.regular[i]);
}
|
||||
|
||||
/* Returns the size in bytes of a single entry array item for file f:
 * sizeof(le32_t) when JOURNAL_HEADER_COMPACT(f->header) is true,
 * sizeof(le64_t) otherwise. */
static inline size_t journal_file_entry_array_item_size(JournalFile *f) {
        assert(f);

        if (JOURNAL_HEADER_COMPACT(f->header))
                return sizeof(le32_t);

        return sizeof(le64_t);
}
|
||||
|
||||
uint64_t journal_file_hash_table_n_items(Object *o) _pure_;
|
||||
|
||||
int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *ret_offset);
|
||||
|
@ -335,8 +335,8 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o
|
||||
break;
|
||||
|
||||
case OBJECT_ENTRY_ARRAY:
|
||||
if ((le64toh(o->object.size) - offsetof(Object, entry_array.items)) % sizeof(le64_t) != 0 ||
|
||||
(le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(le64_t) <= 0) {
|
||||
if ((le64toh(o->object.size) - offsetof(Object, entry_array.items)) % journal_file_entry_array_item_size(f) != 0 ||
|
||||
(le64toh(o->object.size) - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f) <= 0) {
|
||||
error(offset,
|
||||
"Invalid object entry array size: %"PRIu64,
|
||||
le64toh(o->object.size));
|
||||
@ -350,15 +350,15 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o
|
||||
return -EBADMSG;
|
||||
}
|
||||
|
||||
for (uint64_t i = 0; i < journal_file_entry_array_n_items(o); i++)
|
||||
if (le64toh(o->entry_array.items[i]) != 0 &&
|
||||
!VALID64(le64toh(o->entry_array.items[i]))) {
|
||||
for (uint64_t i = 0; i < journal_file_entry_array_n_items(f, o); i++) {
|
||||
uint64_t q = journal_file_entry_array_item(f, o, i);
|
||||
if (q != 0 && !VALID64(q)) {
|
||||
error(offset,
|
||||
"Invalid object entry array item (%"PRIu64"/%"PRIu64"): "OFSfmt,
|
||||
i, journal_file_entry_array_n_items(o),
|
||||
le64toh(o->entry_array.items[i]));
|
||||
i, journal_file_entry_array_n_items(f, o), q);
|
||||
return -EBADMSG;
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
@ -490,10 +490,10 @@ static int verify_data(
|
||||
return -EBADMSG;
|
||||
}
|
||||
|
||||
m = journal_file_entry_array_n_items(o);
|
||||
m = journal_file_entry_array_n_items(f, o);
|
||||
for (j = 0; i < n && j < m; i++, j++) {
|
||||
|
||||
q = le64toh(o->entry_array.items[j]);
|
||||
q = journal_file_entry_array_item(f, o, j);
|
||||
if (q <= last) {
|
||||
error(p, "Data object's entry array not sorted (%"PRIu64" <= %"PRIu64")", q, last);
|
||||
return -EBADMSG;
|
||||
@ -737,11 +737,11 @@ static int verify_entry_array(
|
||||
return -EBADMSG;
|
||||
}
|
||||
|
||||
m = journal_file_entry_array_n_items(o);
|
||||
m = journal_file_entry_array_n_items(f, o);
|
||||
for (j = 0; i < n && j < m; i++, j++) {
|
||||
uint64_t p;
|
||||
|
||||
p = le64toh(o->entry_array.items[j]);
|
||||
p = journal_file_entry_array_item(f, o, j);
|
||||
if (p <= last) {
|
||||
error(a, "Entry array not sorted at %"PRIu64" of %"PRIu64, i, n);
|
||||
return -EBADMSG;
|
||||
|
Loading…
Reference in New Issue
Block a user