8941e93ca5
To optimize memset()/memcpy()/memmove() and so on, we use a jump table to dispatch cases for short data lengths; and for long data lengths, we split the destination into head part (first 8 bytes), tail part (last 8 bytes) and middle part. The head part and tail part may be at unaligned addresses, while the middle part is always aligned (the middle part is allowed to overlap the head/tail part). In this way, the first and last 8 bytes may be unaligned accesses, but we can make sure the data in the middle is processed at an aligned destination address. We have tested micro-bench[1] on a Loongson-3C5000 16-core machine (2.2GHz): 1. memset | length | src offset | dst offset | speed before | speed after | % | |--------|------------|------------|--------------|-------------|---------| | 8 | 0 | 0 | 696.191 | 1518.785 | 118.16% | | 8 | 0 | 1 | 696.325 | 1518.937 | 118.14% | | 50 | 0 | 0 | 969.976 | 8053.902 | 730.32% | | 50 | 0 | 1 | 970.034 | 8058.475 | 730.74% | | 300 | 0 | 0 | 5876.612 | 16544.703 | 181.53% | | 300 | 0 | 1 | 5030.849 | 16549.011 | 228.95% | | 1200 | 0 | 0 | 11797.077 | 16752.137 | 42.00% | | 1200 | 0 | 1 | 5687.141 | 16645.233 | 192.68% | | 4000 | 0 | 0 | 15723.27 | 16761.557 | 6.60% | | 4000 | 0 | 1 | 5906.114 | 16732.316 | 183.30% | | 8000 | 0 | 0 | 16751.403 | 16770.002 | 0.11% | | 8000 | 0 | 1 | 5995.449 | 16754.07 | 179.45% | 2. 
memcpy | length | src offset | dst offset | speed before | speed after | % | |--------|------------|------------|--------------|-------------|---------| | 8 | 0 | 0 | 696.2 | 1670.605 | 139.96% | | 8 | 0 | 1 | 696.325 | 1671.138 | 139.99% | | 50 | 0 | 0 | 969.974 | 8724.999 | 799.51% | | 50 | 0 | 1 | 970.032 | 8730.138 | 799.98% | | 300 | 0 | 0 | 5564.662 | 16272.652 | 192.43% | | 300 | 0 | 1 | 4670.436 | 14972.842 | 220.59% | | 1200 | 0 | 0 | 10740.23 | 16751.728 | 55.97% | | 1200 | 0 | 1 | 5027.741 | 14874.564 | 195.85% | | 4000 | 0 | 0 | 15122.367 | 16737.642 | 10.68% | | 4000 | 0 | 1 | 5536.918 | 14890.397 | 168.93% | | 8000 | 0 | 0 | 16505.453 | 16553.543 | 0.29% | | 8000 | 0 | 1 | 5821.619 | 14841.804 | 154.94% | 3. memmove | length | src offset | dst offset | speed before | speed after | % | |--------|------------|------------|--------------|-------------|---------| | 8 | 0 | 0 | 982.693 | 1670.568 | 70.00% | | 8 | 0 | 1 | 983.023 | 1671.174 | 70.00% | | 50 | 0 | 0 | 1230.87 | 8727.625 | 609.06% | | 50 | 0 | 1 | 1232.515 | 8730.138 | 608.32% | | 300 | 0 | 0 | 6490.375 | 16296.993 | 151.09% | | 300 | 0 | 1 | 4282.687 | 14972.842 | 249.61% | | 1200 | 0 | 0 | 11742.755 | 16752.546 | 42.66% | | 1200 | 0 | 1 | 5039.338 | 14872.951 | 195.14% | | 4000 | 0 | 0 | 15467.786 | 16737.09 | 8.21% | | 4000 | 0 | 1 | 5009.905 | 14890.542 | 197.22% | | 8000 | 0 | 0 | 16489.664 | 16553.273 | 0.39% | | 8000 | 0 | 1 | 5823.786 | 14858.646 | 155.14% | * speed: MB/s * length: byte [1] https://github.com/heiher/mem-bench Signed-off-by: WANG Rui <wangrui@loongson.cn> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
163 lines
2.3 KiB
ArmAsm
163 lines
2.3 KiB
ArmAsm
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
|
|
*/
|
|
|
|
#include <asm/alternative-asm.h>
|
|
#include <asm/asm.h>
|
|
#include <asm/asmmacro.h>
|
|
#include <asm/cpu.h>
|
|
#include <asm/export.h>
|
|
#include <asm/regdef.h>
|
|
|
|
/*
 * fill_to_64 r0
 *
 * Replicate bits [7:0] of \r0 into every byte of the 64-bit register.
 * Each step doubles the filled width (8 -> 16 -> 32 -> 64 bits) by
 * inserting the already-filled low part into the field directly above it.
 * \r0 is both source and destination; no other register is touched.
 */
.macro fill_to_64 r0
	bstrins.d	\r0, \r0, 15, 8		/* [15:8]  = [7:0]  */
	bstrins.d	\r0, \r0, 31, 16	/* [31:16] = [15:0] */
	bstrins.d	\r0, \r0, 63, 32	/* [63:32] = [31:0] */
.endm
|
|
|
|
SYM_FUNC_START(memset)
	/*
	 * Entry stub: a single branch. The default instruction goes to the
	 * byte-wise __memset_generic; the alternative instruction, keyed on
	 * CPU_FEATURE_UAL (hardware unaligned access), goes to
	 * __memset_fast instead.
	 */
	ALTERNATIVE	"b __memset_generic", "b __memset_fast", CPU_FEATURE_UAL
SYM_FUNC_END(memset)
_ASM_NOKPROBE(memset)

EXPORT_SYMBOL(memset)
|
|
|
|
/*
 * void *__memset_generic(void *s, int c, size_t n)
 *
 * Byte-at-a-time fill: the only store used is st.b, so no access is
 * wider than one byte.
 *
 * a0: s  (advanced while filling, restored from a3 before returning)
 * a1: c  (st.b stores its low byte)
 * a2: n  (counted down)
 * a3: scratch, holds the original s
 */
SYM_FUNC_START(__memset_generic)
	move	a3, a0			/* remember s for the return value */
	beqz	a2, .Lgeneric_done	/* n == 0: no stores at all */

.Lgeneric_byte:
	st.b	a1, a0, 0
	addi.d	a0, a0, 1
	addi.d	a2, a2, -1
	/*
	 * NOTE(review): bgt is a signed test although n is a size_t, so a
	 * count with the top bit set leaves the loop after one byte.
	 */
	bgt	a2, zero, .Lgeneric_byte

.Lgeneric_done:
	move	a0, a3			/* return s */
	jr	ra
SYM_FUNC_END(__memset_generic)
_ASM_NOKPROBE(__memset_generic)
|
|
|
|
/*
 * void *__memset_fast(void *s, int c, size_t n)
 *
 * a0: s
 * a1: c
 * a2: n
 *
 * n <= 8 is dispatched through a table of 16-byte code slots indexed by n.
 * For larger n the buffer is written as:
 *   - head:   one 8-byte store at s,
 *   - middle: 8-byte stores starting at the first 8-byte boundary above s,
 *   - tail:   one 8-byte store ending exactly at s + n.
 * Head and tail may be unaligned and may overlap the middle stores.
 *
 * a0 is never written, so s is returned unchanged.
 * Scratch: a1 (byte replicated in place), a2, a3, a4, t0.
 */
SYM_FUNC_START(__memset_fast)
	/* replicate the low byte of c across all 64 bits of a1 */
	fill_to_64 a1

	/* n < 9: use the per-length table */
	sltui	t0, a2, 9
	bnez	t0, .Lshort

	add.d	a2, a0, a2		/* a2 = s + n, one past the last byte */
	st.d	a1, a0, 0		/* head: first 8 bytes */

	/* a3 = (s + 8) with the low three bits cleared */
	addi.d	a3, a0, 8
	bstrins.d	a3, zero, 2, 0

	addi.d	a4, a2, -64
	bgeu	a3, a4, .Ltail64

	/* 64 bytes per iteration while a3 < end - 64 */
.Lfill64:
	.irp	off, 0, 8, 16, 24, 32, 40, 48, 56
	st.d	a1, a3, \off
	.endr
	addi.d	a3, a3, 64
	bltu	a3, a4, .Lfill64

	/* at most 64 bytes remain between a3 and the end */
.Ltail64:
	addi.d	a4, a2, -32
	bgeu	a3, a4, .Ltail32
	.irp	off, 0, 8, 16, 24
	st.d	a1, a3, \off
	.endr
	addi.d	a3, a3, 32

.Ltail32:
	addi.d	a4, a2, -16
	bgeu	a3, a4, .Ltail16
	st.d	a1, a3, 0
	st.d	a1, a3, 8
	addi.d	a3, a3, 16

.Ltail16:
	addi.d	a4, a2, -8
	bgeu	a3, a4, .Ltail8
	st.d	a1, a3, 0

.Ltail8:
	st.d	a1, a2, -8		/* tail: last 8 bytes, ending at s + n */

	/* a0 still holds s */
	jr	ra

	/*
	 * Dispatch for n = 0..8.
	 *
	 * pcaddi t0, 4 yields the address four instructions (16 bytes) past
	 * itself, which is slot 0 because .Lshort is 16-byte aligned and is
	 * exactly four instructions long. Slot n then lives at t0 + (n << 4).
	 *
	 * Every slot is introduced by .align 4 and must fit in 16 bytes
	 * (at most four instructions); keep that in sync with the shift.
	 */
	.align	4
.Lshort:
	pcaddi	t0, 4
	slli.d	a2, a2, 4
	add.d	t0, t0, a2
	jr	t0

	.align	4
0:	jr	ra			/* n == 0: nothing to store */

	.align	4
1:	st.b	a1, a0, 0		/* n == 1 */
	jr	ra

	.align	4
2:	st.h	a1, a0, 0		/* n == 2 */
	jr	ra

	.align	4
3:	st.h	a1, a0, 0		/* n == 3: 2 + 1 */
	st.b	a1, a0, 2
	jr	ra

	.align	4
4:	st.w	a1, a0, 0		/* n == 4 */
	jr	ra

	.align	4
5:	st.w	a1, a0, 0		/* n == 5: 4 + 1 */
	st.b	a1, a0, 4
	jr	ra

	.align	4
6:	st.w	a1, a0, 0		/* n == 6: 4 + 2 */
	st.h	a1, a0, 4
	jr	ra

	.align	4
7:	st.w	a1, a0, 0		/* n == 7: bytes 0-3 and 3-6 overlap */
	st.w	a1, a0, 3
	jr	ra

	.align	4
8:	st.d	a1, a0, 0		/* n == 8 */
	jr	ra
SYM_FUNC_END(__memset_fast)
_ASM_NOKPROBE(__memset_fast)
|