Merge branch 'word-at-a-time'

Merge minor word-at-a-time instruction choice improvements for x86 and
arm64.

This is the second of four branches that came out of me looking at the
code generation for path lookup on arm64.

The word-at-a-time infrastructure is used to do string operations in
chunks of one word both when copying the pathname from user space (in
strncpy_from_user()), and when parsing and hashing the individual path
components (in link_path_walk()).

In particular, the "find the first zero byte" logic uses various bit tricks
to figure out the end of the string or path component, and get the length
without having to do things one byte at a time.  Both x86-64 and arm64
had less than optimal code choices for that.

The commit message for the arm64 change in particular tries to explain
the exact code flow for the zero byte finding for people who care.  It's
made a bit more complicated by the fact that we support big-endian
hardware too, and so we have some extra abstraction layers to allow
different models for finding the zero byte, quite apart from the issue
of picking specialized instructions.

* word-at-a-time:
  arm64: word-at-a-time: improve byte count calculations for LE
  x86-64: word-at-a-time: improve byte count calculations
This commit is contained in:
Linus Torvalds 2024-07-15 08:55:10 -07:00
commit 6a31ffdfed
2 changed files with 26 additions and 42 deletions

View File

@ -27,20 +27,15 @@ static inline unsigned long has_zero(unsigned long a, unsigned long *bits,
}
#define prep_zero_mask(a, bits, c) (bits)
#define create_zero_mask(bits) (bits)
#define find_zero(bits) (__ffs(bits) >> 3)
static inline unsigned long create_zero_mask(unsigned long bits)
static inline unsigned long zero_bytemask(unsigned long bits)
{
bits = (bits - 1) & ~bits;
return bits >> 7;
}
/*
 * Turn a mask into a count: returns fls64(mask) shifted right by 3,
 * i.e. the fls64() result divided by 8, rounded down.
 *
 * NOTE(review): fls64() is not defined in this view. Presumably it yields
 * the 1-based position of the highest set bit (0 when no bit is set), which
 * would make the result the number of whole low-order bytes covered by a
 * byte mask such as 0x00ffff -> 2. Confirm against the bitops header.
 */
static inline unsigned long find_zero(unsigned long mask)
{
return fls64(mask) >> 3;
}
#define zero_bytemask(mask) (mask)
#else /* __AARCH64EB__ */
#include <asm-generic/word-at-a-time.h>
#endif

View File

@ -5,45 +5,12 @@
#include <linux/bitops.h>
#include <linux/wordpart.h>
/*
* This is largely generic for little-endian machines, but the
* optimal byte mask counting is probably going to be something
* that is architecture-specific. If you have a reliably fast
* bit count instruction, that might be better than the multiply
* and shift, for example.
*/
/*
 * Pair of word-sized constants handed to the word-at-a-time helpers.
 * The field order matters: WORD_AT_A_TIME_CONSTANTS initialises the
 * fields positionally.
 */
struct word_at_a_time {
	/* Initialised from REPEAT_BYTE(0x01) by WORD_AT_A_TIME_CONSTANTS. */
	const unsigned long one_bits;
	/* Initialised from REPEAT_BYTE(0x80) by WORD_AT_A_TIME_CONSTANTS. */
	const unsigned long high_bits;
};
/*
 * NOTE(review): REPEAT_BYTE() is defined outside this view; presumably it
 * replicates the given byte into every byte of an unsigned long - confirm.
 */
#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
#ifdef CONFIG_64BIT
/*
 * Jan Achrenius on G+: microoptimized version of
 * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
 * that works for the bytemasks without having to
 * mask them first.
 *
 * 64-bit variant (built under CONFIG_64BIT): the answer is read from the
 * top byte of the product, so a bytemask of 0x00..00ffff yields 2, and a
 * mask covering the low seven bytes yields 7.
 */
static inline long count_masked_bytes(unsigned long mask)
{
	/* Per-byte weights chosen so the running sum lands in bits 56..63. */
	const unsigned long weights = 0x0001020304050608ul;
	const unsigned long product = mask * weights;

	return product >> 56;
}
#else /* 32-bit case */
/*
 * Carl Chatfield / Jan Achrenius G+ version for 32-bit.
 *
 * Maps the bytemasks 0x000000, 0x0000ff, 0x00ffff and 0xffffff to the
 * byte counts 0, 1, 2 and 3 without a multiply.
 */
static inline long count_masked_bytes(long mask)
{
	const long bias = 0x0ff0001;
	/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
	const long raw = (mask + bias) >> 23;

	/* ANDing with the mask turns the stray 1 for an empty mask into 0. */
	return mask & raw;
}
#endif
/* Return nonzero if it has a zero */
static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
{
@ -57,6 +24,22 @@ static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits,
return bits;
}
#ifdef CONFIG_64BIT
/*
 * 64-bit: the value produced by has_zero() is passed through untouched and
 * serves as the input for both the bytemask and the size calculation.
 */
#define create_zero_mask(bits) (bits)

/*
 * Build a bytemask from 'bits': every bit position below the lowest set bit
 * is kept, then the low seven bits are shifted out. When the lowest set bit
 * is bit 7 of byte k, the result has 0xff in each of the k bytes below it.
 */
static inline unsigned long zero_bytemask(unsigned long bits)
{
	const unsigned long below_lowest = ~bits & (bits - 1);

	return below_lowest >> 7;
}

/*
 * Size calculation: __ffs(bits) divided by 8.
 * NOTE(review): __ffs() is defined outside this view; presumably it returns
 * the 0-based index of the lowest set bit - confirm.
 */
#define find_zero(bits) (__ffs(bits) >> 3)
#else
/* Create the final mask for both bytemask and size */
static inline unsigned long create_zero_mask(unsigned long bits)
{
bits = (bits - 1) & ~bits;
@ -66,11 +49,17 @@ static inline unsigned long create_zero_mask(unsigned long bits)
/* The mask we created is directly usable as a bytemask */
#define zero_bytemask(mask) (mask)
/*
 * Carl Chatfield / Jan Achrenius G+ version for 32-bit.
 *
 * Converts the bytemask built by create_zero_mask() into the number of
 * bytes it covers: 0x000000 -> 0, 0x0000ff -> 1, 0x00ffff -> 2 and
 * 0xffffff -> 3. The computation is done inline here, so no separate
 * count_masked_bytes() helper is needed.
 */
static inline unsigned long find_zero(unsigned long mask)
{
	/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
	long a = (0x0ff0001+mask) >> 23;
	/* Fix the 1 for 00 case */
	return a & mask;
}
#endif
/*
* Load an unaligned word from kernel space.
*