linux/arch/m68k/include/asm/bitops.h
Clement Courbet 0ade34c370 lib: optimize cpumask_next_and()
We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and().
It's essentially a joined iteration in search for a non-zero bit, which is
currently implemented as a lookup join (find a nonzero bit on the lhs,
lookup the rhs to see if it's set there).

Implement a direct join (find a nonzero bit on the incrementally built
join).  Also add generic bitmap benchmarks in the new `test_find_bit`
module for new function (see `find_next_and_bit` in [2] and [3] below).

For cpumask_next_and, direct benchmarking shows that it's 1.17x to 14x
faster with a geometric mean of 2.1 on 32 CPUs [1].  No impact on memory
usage.  Note that on Arm, the new pure-C implementation still outperforms
the old one that uses a mix of C and asm (`find_next_bit`) [3].

[1] Approximate benchmark code:

```
  unsigned long src1p[nr_cpumask_longs] = {pattern1};
  unsigned long src2p[nr_cpumask_longs] = {pattern2};
  for (/*a bunch of repetitions*/) {
    for (int n = -1; n <= nr_cpu_ids; ++n) {
      asm volatile("" : "+rm"(src1p)); // prevent any optimization
      asm volatile("" : "+rm"(src2p));
      unsigned long result = cpumask_next_and(n, src1p, src2p);
      asm volatile("" : "+rm"(result));
    }
  }
```

Results:
pattern1    pattern2     time_before/time_after
0x0000ffff  0x0000ffff   1.65
0x0000ffff  0x00005555   2.24
0x0000ffff  0x00001111   2.94
0x0000ffff  0x00000000   14.0
0x00005555  0x0000ffff   1.67
0x00005555  0x00005555   1.71
0x00005555  0x00001111   1.90
0x00005555  0x00000000   6.58
0x00001111  0x0000ffff   1.46
0x00001111  0x00005555   1.49
0x00001111  0x00001111   1.45
0x00001111  0x00000000   3.10
0x00000000  0x0000ffff   1.18
0x00000000  0x00005555   1.18
0x00000000  0x00001111   1.17
0x00000000  0x00000000   1.25
-----------------------------
               geo.mean  2.06

[2] test_find_next_bit, X86 (skylake)

 [ 3913.477422] Start testing find_bit() with random-filled bitmap
 [ 3913.477847] find_next_bit: 160868 cycles, 16484 iterations
 [ 3913.477933] find_next_zero_bit: 169542 cycles, 16285 iterations
 [ 3913.478036] find_last_bit: 201638 cycles, 16483 iterations
 [ 3913.480214] find_first_bit: 4353244 cycles, 16484 iterations
 [ 3913.480216] Start testing find_next_and_bit() with random-filled
 bitmap
 [ 3913.481074] find_next_and_bit: 89604 cycles, 8216 iterations
 [ 3913.481075] Start testing find_bit() with sparse bitmap
 [ 3913.481078] find_next_bit: 2536 cycles, 66 iterations
 [ 3913.481252] find_next_zero_bit: 344404 cycles, 32703 iterations
 [ 3913.481255] find_last_bit: 2006 cycles, 66 iterations
 [ 3913.481265] find_first_bit: 17488 cycles, 66 iterations
 [ 3913.481266] Start testing find_next_and_bit() with sparse bitmap
 [ 3913.481272] find_next_and_bit: 764 cycles, 1 iterations

[3] test_find_next_bit, arm (v7 odroid XU3).

[  267.206928] Start testing find_bit() with random-filled bitmap
[  267.214752] find_next_bit: 4474 cycles, 16419 iterations
[  267.221850] find_next_zero_bit: 5976 cycles, 16350 iterations
[  267.229294] find_last_bit: 4209 cycles, 16419 iterations
[  267.279131] find_first_bit: 1032991 cycles, 16420 iterations
[  267.286265] Start testing find_next_and_bit() with random-filled
bitmap
[  267.302386] find_next_and_bit: 2290 cycles, 8140 iterations
[  267.309422] Start testing find_bit() with sparse bitmap
[  267.316054] find_next_bit: 191 cycles, 66 iterations
[  267.322726] find_next_zero_bit: 8758 cycles, 32703 iterations
[  267.329803] find_last_bit: 84 cycles, 66 iterations
[  267.336169] find_first_bit: 4118 cycles, 66 iterations
[  267.342627] Start testing find_next_and_bit() with sparse bitmap
[  267.356919] find_next_and_bit: 91 cycles, 1 iterations

[courbet@google.com: v6]
  Link: http://lkml.kernel.org/r/20171129095715.23430-1-courbet@google.com
[geert@linux-m68k.org: m68k/bitops: always include <asm-generic/bitops/find.h>]
  Link: http://lkml.kernel.org/r/1512556816-28627-1-git-send-email-geert@linux-m68k.org
Link: http://lkml.kernel.org/r/20171128131334.23491-1-courbet@google.com
Signed-off-by: Clement Courbet <courbet@google.com>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Yury Norov <ynorov@caviumnetworks.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-02-06 18:32:44 -08:00

527 lines
12 KiB
C

#ifndef _M68K_BITOPS_H
#define _M68K_BITOPS_H
/*
* Copyright 1992, Linus Torvalds.
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of this archive
* for more details.
*/
#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
#endif
#include <linux/compiler.h>
#include <asm/barrier.h>
/*
* Bit access functions vary across the ColdFire and 68k families.
* So we will break them out here, and then macro in the ones we want.
*
* ColdFire - supports standard bset/bclr/bchg with register operand only
* 68000 - supports standard bset/bclr/bchg with memory operand
* >= 68020 - also supports the bfset/bfclr/bfchg instructions
*
* Although it is possible to use only the bset/bclr/bchg with register
* operands on all platforms you end up with larger generated code.
* So we use the best form possible on a given platform.
*/
static inline void bset_reg_set_bit(int nr, volatile unsigned long *vaddr)
{
char *p = (char *)vaddr + (nr ^ 31) / 8;
__asm__ __volatile__ ("bset %1,(%0)"
:
: "a" (p), "di" (nr & 7)
: "memory");
}
static inline void bset_mem_set_bit(int nr, volatile unsigned long *vaddr)
{
char *p = (char *)vaddr + (nr ^ 31) / 8;
__asm__ __volatile__ ("bset %1,%0"
: "+m" (*p)
: "di" (nr & 7));
}
static inline void bfset_mem_set_bit(int nr, volatile unsigned long *vaddr)
{
__asm__ __volatile__ ("bfset %1{%0:#1}"
:
: "d" (nr ^ 31), "o" (*vaddr)
: "memory");
}
#if defined(CONFIG_COLDFIRE)
#define set_bit(nr, vaddr) bset_reg_set_bit(nr, vaddr)
#elif defined(CONFIG_CPU_HAS_NO_BITFIELDS)
#define set_bit(nr, vaddr) bset_mem_set_bit(nr, vaddr)
#else
#define set_bit(nr, vaddr) (__builtin_constant_p(nr) ? \
bset_mem_set_bit(nr, vaddr) : \
bfset_mem_set_bit(nr, vaddr))
#endif
#define __set_bit(nr, vaddr) set_bit(nr, vaddr)
static inline void bclr_reg_clear_bit(int nr, volatile unsigned long *vaddr)
{
char *p = (char *)vaddr + (nr ^ 31) / 8;
__asm__ __volatile__ ("bclr %1,(%0)"
:
: "a" (p), "di" (nr & 7)
: "memory");
}
static inline void bclr_mem_clear_bit(int nr, volatile unsigned long *vaddr)
{
char *p = (char *)vaddr + (nr ^ 31) / 8;
__asm__ __volatile__ ("bclr %1,%0"
: "+m" (*p)
: "di" (nr & 7));
}
static inline void bfclr_mem_clear_bit(int nr, volatile unsigned long *vaddr)
{
__asm__ __volatile__ ("bfclr %1{%0:#1}"
:
: "d" (nr ^ 31), "o" (*vaddr)
: "memory");
}
#if defined(CONFIG_COLDFIRE)
#define clear_bit(nr, vaddr) bclr_reg_clear_bit(nr, vaddr)
#elif defined(CONFIG_CPU_HAS_NO_BITFIELDS)
#define clear_bit(nr, vaddr) bclr_mem_clear_bit(nr, vaddr)
#else
#define clear_bit(nr, vaddr) (__builtin_constant_p(nr) ? \
bclr_mem_clear_bit(nr, vaddr) : \
bfclr_mem_clear_bit(nr, vaddr))
#endif
#define __clear_bit(nr, vaddr) clear_bit(nr, vaddr)
static inline void bchg_reg_change_bit(int nr, volatile unsigned long *vaddr)
{
char *p = (char *)vaddr + (nr ^ 31) / 8;
__asm__ __volatile__ ("bchg %1,(%0)"
:
: "a" (p), "di" (nr & 7)
: "memory");
}
static inline void bchg_mem_change_bit(int nr, volatile unsigned long *vaddr)
{
char *p = (char *)vaddr + (nr ^ 31) / 8;
__asm__ __volatile__ ("bchg %1,%0"
: "+m" (*p)
: "di" (nr & 7));
}
static inline void bfchg_mem_change_bit(int nr, volatile unsigned long *vaddr)
{
__asm__ __volatile__ ("bfchg %1{%0:#1}"
:
: "d" (nr ^ 31), "o" (*vaddr)
: "memory");
}
#if defined(CONFIG_COLDFIRE)
#define change_bit(nr, vaddr) bchg_reg_change_bit(nr, vaddr)
#elif defined(CONFIG_CPU_HAS_NO_BITFIELDS)
#define change_bit(nr, vaddr) bchg_mem_change_bit(nr, vaddr)
#else
#define change_bit(nr, vaddr) (__builtin_constant_p(nr) ? \
bchg_mem_change_bit(nr, vaddr) : \
bfchg_mem_change_bit(nr, vaddr))
#endif
#define __change_bit(nr, vaddr) change_bit(nr, vaddr)
static inline int test_bit(int nr, const volatile unsigned long *vaddr)
{
return (vaddr[nr >> 5] & (1UL << (nr & 31))) != 0;
}
static inline int bset_reg_test_and_set_bit(int nr,
volatile unsigned long *vaddr)
{
char *p = (char *)vaddr + (nr ^ 31) / 8;
char retval;
__asm__ __volatile__ ("bset %2,(%1); sne %0"
: "=d" (retval)
: "a" (p), "di" (nr & 7)
: "memory");
return retval;
}
static inline int bset_mem_test_and_set_bit(int nr,
volatile unsigned long *vaddr)
{
char *p = (char *)vaddr + (nr ^ 31) / 8;
char retval;
__asm__ __volatile__ ("bset %2,%1; sne %0"
: "=d" (retval), "+m" (*p)
: "di" (nr & 7));
return retval;
}
static inline int bfset_mem_test_and_set_bit(int nr,
volatile unsigned long *vaddr)
{
char retval;
__asm__ __volatile__ ("bfset %2{%1:#1}; sne %0"
: "=d" (retval)
: "d" (nr ^ 31), "o" (*vaddr)
: "memory");
return retval;
}
#if defined(CONFIG_COLDFIRE)
#define test_and_set_bit(nr, vaddr) bset_reg_test_and_set_bit(nr, vaddr)
#elif defined(CONFIG_CPU_HAS_NO_BITFIELDS)
#define test_and_set_bit(nr, vaddr) bset_mem_test_and_set_bit(nr, vaddr)
#else
#define test_and_set_bit(nr, vaddr) (__builtin_constant_p(nr) ? \
bset_mem_test_and_set_bit(nr, vaddr) : \
bfset_mem_test_and_set_bit(nr, vaddr))
#endif
#define __test_and_set_bit(nr, vaddr) test_and_set_bit(nr, vaddr)
static inline int bclr_reg_test_and_clear_bit(int nr,
volatile unsigned long *vaddr)
{
char *p = (char *)vaddr + (nr ^ 31) / 8;
char retval;
__asm__ __volatile__ ("bclr %2,(%1); sne %0"
: "=d" (retval)
: "a" (p), "di" (nr & 7)
: "memory");
return retval;
}
static inline int bclr_mem_test_and_clear_bit(int nr,
volatile unsigned long *vaddr)
{
char *p = (char *)vaddr + (nr ^ 31) / 8;
char retval;
__asm__ __volatile__ ("bclr %2,%1; sne %0"
: "=d" (retval), "+m" (*p)
: "di" (nr & 7));
return retval;
}
static inline int bfclr_mem_test_and_clear_bit(int nr,
volatile unsigned long *vaddr)
{
char retval;
__asm__ __volatile__ ("bfclr %2{%1:#1}; sne %0"
: "=d" (retval)
: "d" (nr ^ 31), "o" (*vaddr)
: "memory");
return retval;
}
#if defined(CONFIG_COLDFIRE)
#define test_and_clear_bit(nr, vaddr) bclr_reg_test_and_clear_bit(nr, vaddr)
#elif defined(CONFIG_CPU_HAS_NO_BITFIELDS)
#define test_and_clear_bit(nr, vaddr) bclr_mem_test_and_clear_bit(nr, vaddr)
#else
#define test_and_clear_bit(nr, vaddr) (__builtin_constant_p(nr) ? \
bclr_mem_test_and_clear_bit(nr, vaddr) : \
bfclr_mem_test_and_clear_bit(nr, vaddr))
#endif
#define __test_and_clear_bit(nr, vaddr) test_and_clear_bit(nr, vaddr)
static inline int bchg_reg_test_and_change_bit(int nr,
volatile unsigned long *vaddr)
{
char *p = (char *)vaddr + (nr ^ 31) / 8;
char retval;
__asm__ __volatile__ ("bchg %2,(%1); sne %0"
: "=d" (retval)
: "a" (p), "di" (nr & 7)
: "memory");
return retval;
}
static inline int bchg_mem_test_and_change_bit(int nr,
volatile unsigned long *vaddr)
{
char *p = (char *)vaddr + (nr ^ 31) / 8;
char retval;
__asm__ __volatile__ ("bchg %2,%1; sne %0"
: "=d" (retval), "+m" (*p)
: "di" (nr & 7));
return retval;
}
static inline int bfchg_mem_test_and_change_bit(int nr,
volatile unsigned long *vaddr)
{
char retval;
__asm__ __volatile__ ("bfchg %2{%1:#1}; sne %0"
: "=d" (retval)
: "d" (nr ^ 31), "o" (*vaddr)
: "memory");
return retval;
}
#if defined(CONFIG_COLDFIRE)
#define test_and_change_bit(nr, vaddr) bchg_reg_test_and_change_bit(nr, vaddr)
#elif defined(CONFIG_CPU_HAS_NO_BITFIELDS)
#define test_and_change_bit(nr, vaddr) bchg_mem_test_and_change_bit(nr, vaddr)
#else
#define test_and_change_bit(nr, vaddr) (__builtin_constant_p(nr) ? \
bchg_mem_test_and_change_bit(nr, vaddr) : \
bfchg_mem_test_and_change_bit(nr, vaddr))
#endif
#define __test_and_change_bit(nr, vaddr) test_and_change_bit(nr, vaddr)
/*
* The true 68020 and more advanced processors support the "bfffo"
* instruction for finding bits. ColdFire and simple 68000 parts
* (including CPU32) do not support this. They simply use the generic
* functions.
*/
#if defined(CONFIG_CPU_HAS_NO_BITFIELDS)
#include <asm-generic/bitops/ffz.h>
#else
static inline int find_first_zero_bit(const unsigned long *vaddr,
unsigned size)
{
const unsigned long *p = vaddr;
int res = 32;
unsigned int words;
unsigned long num;
if (!size)
return 0;
words = (size + 31) >> 5;
while (!(num = ~*p++)) {
if (!--words)
goto out;
}
__asm__ __volatile__ ("bfffo %1{#0,#0},%0"
: "=d" (res) : "d" (num & -num));
res ^= 31;
out:
res += ((long)p - (long)vaddr - 4) * 8;
return res < size ? res : size;
}
#define find_first_zero_bit find_first_zero_bit
static inline int find_next_zero_bit(const unsigned long *vaddr, int size,
int offset)
{
const unsigned long *p = vaddr + (offset >> 5);
int bit = offset & 31UL, res;
if (offset >= size)
return size;
if (bit) {
unsigned long num = ~*p++ & (~0UL << bit);
offset -= bit;
/* Look for zero in first longword */
__asm__ __volatile__ ("bfffo %1{#0,#0},%0"
: "=d" (res) : "d" (num & -num));
if (res < 32) {
offset += res ^ 31;
return offset < size ? offset : size;
}
offset += 32;
if (offset >= size)
return size;
}
/* No zero yet, search remaining full bytes for a zero */
return offset + find_first_zero_bit(p, size - offset);
}
#define find_next_zero_bit find_next_zero_bit
static inline int find_first_bit(const unsigned long *vaddr, unsigned size)
{
const unsigned long *p = vaddr;
int res = 32;
unsigned int words;
unsigned long num;
if (!size)
return 0;
words = (size + 31) >> 5;
while (!(num = *p++)) {
if (!--words)
goto out;
}
__asm__ __volatile__ ("bfffo %1{#0,#0},%0"
: "=d" (res) : "d" (num & -num));
res ^= 31;
out:
res += ((long)p - (long)vaddr - 4) * 8;
return res < size ? res : size;
}
#define find_first_bit find_first_bit
static inline int find_next_bit(const unsigned long *vaddr, int size,
int offset)
{
const unsigned long *p = vaddr + (offset >> 5);
int bit = offset & 31UL, res;
if (offset >= size)
return size;
if (bit) {
unsigned long num = *p++ & (~0UL << bit);
offset -= bit;
/* Look for one in first longword */
__asm__ __volatile__ ("bfffo %1{#0,#0},%0"
: "=d" (res) : "d" (num & -num));
if (res < 32) {
offset += res ^ 31;
return offset < size ? offset : size;
}
offset += 32;
if (offset >= size)
return size;
}
/* No one yet, search remaining full bytes for a one */
return offset + find_first_bit(p, size - offset);
}
#define find_next_bit find_next_bit
/*
* ffz = Find First Zero in word. Undefined if no zero exists,
* so code should check against ~0UL first..
*/
static inline unsigned long ffz(unsigned long word)
{
int res;
__asm__ __volatile__ ("bfffo %1{#0,#0},%0"
: "=d" (res) : "d" (~word & -~word));
return res ^ 31;
}
#endif
#include <asm-generic/bitops/find.h>
#ifdef __KERNEL__
#if defined(CONFIG_CPU_HAS_NO_BITFIELDS)
/*
* The newer ColdFire family members support a "bitrev" instruction
* and we can use that to implement a fast ffs. Older Coldfire parts,
* and normal 68000 parts don't have anything special, so we use the
* generic functions for those.
*/
#if (defined(__mcfisaaplus__) || defined(__mcfisac__)) && \
!defined(CONFIG_M68000) && !defined(CONFIG_MCPU32)
static inline int __ffs(int x)
{
__asm__ __volatile__ ("bitrev %0; ff1 %0"
: "=d" (x)
: "0" (x));
return x;
}
static inline int ffs(int x)
{
if (!x)
return 0;
return __ffs(x) + 1;
}
#else
#include <asm-generic/bitops/ffs.h>
#include <asm-generic/bitops/__ffs.h>
#endif
#include <asm-generic/bitops/fls.h>
#include <asm-generic/bitops/__fls.h>
#else
/*
* ffs: find first bit set. This is defined the same way as
* the libc and compiler builtin ffs routines, therefore
* differs in spirit from the above ffz (man ffs).
*/
static inline int ffs(int x)
{
int cnt;
__asm__ ("bfffo %1{#0:#0},%0"
: "=d" (cnt)
: "dm" (x & -x));
return 32 - cnt;
}
#define __ffs(x) (ffs(x) - 1)
/*
* fls: find last bit set.
*/
static inline int fls(int x)
{
int cnt;
__asm__ ("bfffo %1{#0,#0},%0"
: "=d" (cnt)
: "dm" (x));
return 32 - cnt;
}
static inline int __fls(int x)
{
return fls(x) - 1;
}
#endif
#include <asm-generic/bitops/ext2-atomic.h>
#include <asm-generic/bitops/le.h>
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
#include <asm-generic/bitops/lock.h>
#endif /* __KERNEL__ */
#endif /* _M68K_BITOPS_H */