memopt

记录一下阅读memcpy/memcmp/memmove时碰到的一些问题

memcpy

在plain版本中，copy 大块会调用__vm_copy函数按页为单位进行拷贝。此时只是将页面设置为copy on write，待目的地址被写入数据时，才会调用copy_page函数按页面为单位进行数据copy。copy_page的实现如下。

x86

位置：linux/arch/x86/lib/copy_page_64.S

	# copy_page - copy one 4096-byte page using a single rep movsq.
	# In:    %rdi = destination, %rsi = source (the operands rep movsq uses).
	# Clobb: %rcx, %rsi, %rdi on this path (rep movsq counts %rcx down to 0
	#        and advances both pointers by 4096).
	# NOTE(review): rep movsq only copies upward when DF is clear; presumably
	# the caller guarantees that - confirm.
	ALIGN
SYM_FUNC_START(copy_page)
	# X86_FEATURE_REP_GOOD is a Linux kernel flag meaning that string
	# instructions carrying a REP prefix (e.g. REP MOVSB, REP STOSB) perform
	# well on this x86 CPU.
	# NOTE(review): ALTERNATIVE is defined outside this excerpt; as used here
	# it presumably drops the jmp on CPUs that have the flag, so only CPUs
	# without it take the copy_page_regs fallback - confirm against the macro.
	ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
	movl	$4096/8, %ecx 					# quadword count: movsq copies 8 bytes per step
	rep	movsq
	RET
SYM_FUNC_END(copy_page)
EXPORT_SYMBOL(copy_page)

# copy_page_regs - copy one 4096-byte page through general-purpose registers,
# 64 bytes (eight quadwords) per loop iteration.
# In:    %rdi = destination, %rsi = source.
# Out:   nothing; %rsi and %rdi have both been advanced by 4096 on return.
# Clobb: %rax, %rcx, %rdx, %r8-%r11, flags. %rbx and %r12 are also used but
#        are saved on entry and restored before returning.
# Stack: 16 bytes, holding the saved %rbx and %r12.
# The page is copied in two phases: 59 iterations that also prefetch the
# source five 64-byte blocks ahead, then 5 iterations without prefetch
# (59 + 5 = 64 iterations * 64 bytes = 4096 bytes).
SYM_FUNC_START_LOCAL(copy_page_regs)
	subq	$2*8,	%rsp
	movq	%rbx,	(%rsp)					# on x86-64, rbx and r12 are callee-saved
	movq	%r12,	1*8(%rsp)

	# Iteration count for phase 1. The copy is split into 1) a phase with
	# data prefetch and 2) a phase without. The 5 is the distance between the
	# prefetched data and the data being copied, in units of 64 bytes (the
	# cache line size assumed by this code).
	movl	$(4096/64)-5,	%ecx
	.p2align 4
.Loop64:									# phase 1 of the page copy (with prefetch)
	# The flags written by this dec are what the jnz at the bottom of the loop
	# tests: the movq, prefetcht0 and leaq instructions in between leave the
	# flags untouched.
	dec	%rcx
	movq	0x8*0(%rsi), %rax
	movq	0x8*1(%rsi), %rbx
	movq	0x8*2(%rsi), %rdx
	movq	0x8*3(%rsi), %r8
	movq	0x8*4(%rsi), %r9
	movq	0x8*5(%rsi), %r10
	movq	0x8*6(%rsi), %r11
	movq	0x8*7(%rsi), %r12

	# Why a distance of 5 cache lines? Author guess: chosen by benchmarking.
	# In the final phase-1 iteration this targets the last 64-byte block of
	# the source page, so the prefetch never reaches past the page.
	prefetcht0 5*64(%rsi)

	movq	%rax, 0x8*0(%rdi)
	movq	%rbx, 0x8*1(%rdi)
	movq	%rdx, 0x8*2(%rdi)
	movq	%r8,  0x8*3(%rdi)
	movq	%r9,  0x8*4(%rdi)
	movq	%r10, 0x8*5(%rdi)
	movq	%r11, 0x8*6(%rdi)
	movq	%r12, 0x8*7(%rdi)

	# leaq advances the pointers without modifying the flags set by dec above.
	leaq	64 (%rsi), %rsi
	leaq	64 (%rdi), %rdi

	jnz	.Loop64

	# Remaining 5 blocks: same copy, but no prefetch, since a prefetch five
	# blocks ahead would now point beyond the 4096 bytes being copied.
	movl	$5, %ecx
	.p2align 4
.Loop2:										# phase 2 of the page copy (no prefetch)
	decl	%ecx						# sets ZF for the jnz below; nothing in between alters flags

	movq	0x8*0(%rsi), %rax
	movq	0x8*1(%rsi), %rbx
	movq	0x8*2(%rsi), %rdx
	movq	0x8*3(%rsi), %r8
	movq	0x8*4(%rsi), %r9
	movq	0x8*5(%rsi), %r10
	movq	0x8*6(%rsi), %r11
	movq	0x8*7(%rsi), %r12

	movq	%rax, 0x8*0(%rdi)
	movq	%rbx, 0x8*1(%rdi)
	movq	%rdx, 0x8*2(%rdi)
	movq	%r8,  0x8*3(%rdi)
	movq	%r9,  0x8*4(%rdi)
	movq	%r10, 0x8*5(%rdi)
	movq	%r11, 0x8*6(%rdi)
	movq	%r12, 0x8*7(%rdi)

	leaq	64(%rdi), %rdi
	leaq	64(%rsi), %rsi
	jnz	.Loop2

	# Restore the callee-saved registers and release the 16-byte save area.
	movq	(%rsp), %rbx
	movq	1*8(%rsp), %r12
	addq	$2*8, %rsp
	RET
SYM_FUNC_END(copy_page_regs)

loongarch

ELF psABI

位置：arch/loongarch/mm/page.S

# copy_page (LoongArch64) - copy one page, 128 bytes per loop iteration.
# In:    a0 = destination, a1 = source.
# Out:   nothing; a0 and a1 have both been advanced by one page on return.
# Clobb: t0-t7 (data), t8 (end-of-destination address).
# The loop exits only when a0 equals t8 exactly, so the byte count must be a
# multiple of 128.
.align 5
SYM_FUNC_START(copy_page)
	# lu12i.w places its immediate at bit 12, so t8 = 1 << PAGE_SHIFT
	# (PAGE_SHIFT is defined outside this excerpt; presumably the page size).
	lu12i.w	t8, 1 << (PAGE_SHIFT - 12)
	add.d	t8, t8, a0					# t8 = destination + page size = loop end address
1:
	# Load the first 64 bytes of this 128-byte chunk into t0-t7.
	ld.d	t0, a1, 0					# a1 is the source address, a0 is the destination address (TODO: check the LoongArch ABI)
	ld.d	t1, a1, 8
	ld.d	t2, a1, 16
	ld.d	t3, a1, 24
	ld.d	t4, a1, 32
	ld.d	t5, a1, 40
	ld.d	t6, a1, 48
	ld.d	t7, a1, 56

	# Store the first 64 bytes two registers at a time; as soon as a pair has
	# been stored, it is reloaded with data from the second 64 bytes.
	# Open questions from the author: 1) why is the code arranged in this
	# order? 2) why is no prefetch instruction used?
	st.d	t0, a0, 0
	st.d	t1, a0, 8
	ld.d	t0, a1, 64
	ld.d	t1, a1, 72
	st.d	t2, a0, 16
	st.d	t3, a0, 24
	ld.d	t2, a1, 80
	ld.d	t3, a1, 88
	st.d	t4, a0, 32
	st.d	t5, a0, 40
	ld.d	t4, a1, 96
	ld.d	t5, a1, 104
	st.d	t6, a0, 48
	st.d	t7, a0, 56
	ld.d	t6, a1, 112
	ld.d	t7, a1, 120
	addi.d	a0, a0, 128
	addi.d	a1, a1, 128

	# a0 already points at the next chunk, so the second 64 bytes of the
	# current chunk are stored at negative offsets (-64 .. -8).
	st.d	t0, a0, -64
	st.d	t1, a0, -56
	st.d	t2, a0, -48
	st.d	t3, a0, -40
	st.d	t4, a0, -32
	st.d	t5, a0, -24
	st.d	t6, a0, -16
	st.d	t7, a0, -8

	bne	t8, a0, 1b					# repeat until the destination pointer reaches the end address
	jr	ra
SYM_FUNC_END(copy_page)
EXPORT_SYMBOL(copy_page)

其实和arm64的实现几乎一致

有以下三个问题：

为什么未使用预取指令（loongarch64的预取指令是preld和preldx）？

copy_page的访存模式是非常简单的顺序访问，现代处理器的硬件预取器已足以覆盖这种模式，因此无需再使用软件预取指令。
一次循环拷贝128字节数据，为什么不一次性从源地址中取出128字节?

这会使得寄存器压力过大
为什么边取数据边存数据，不能取完再统一存入吗？

cache读写都有对应的端口，存取数据可以并行执行

glibc

memopt

http://example.com/memopt/

作者

发布于

2024年5月15日

许可协议

动态链接相关结构上一篇

vtbl 下一篇