kernel_optimize_test/arch/arc/lib/memset-archs.S

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
 */

#include <linux/linkage.h>
#include <asm/cache.h>

/*
 * The memset implementation below is optimized to use prefetchw and prealloc
 * instruction in case of CPU with 64B L1 data cache line (L1_CACHE_SHIFT == 6)
 * If you want to implement optimized memset for other possible L1 data cache
 * line lengths (32B and 128B) you should rewrite code carefully checking
 * we don't call any prefetchw/prealloc instruction for L1 cache lines which
 * don't belongs to memset area.
 */

#if L1_CACHE_SHIFT == 6

.macro PREALLOC_INSTR	reg, off
	prealloc	[\reg, \off]
.endm

.macro PREFETCHW_INSTR	reg, off
	prefetchw	[\reg, \off]
.endm

#else

.macro PREALLOC_INSTR	reg, off
.endm

.macro PREFETCHW_INSTR	reg, off
.endm

#endif

ENTRY_CFI(memset)
	PREFETCHW_INSTR	r0, 0	; Prefetch the first write location
	mov.f	0, r2
;;; if size is zero
	jz.d	[blink]
	mov	r3, r0		; don't clobber ret val

;;; if length < 8
	brls.d.nt	r2, 8, .Lsmallchunk
	mov.f	lp_count,r2

	and.f	r4, r0, 0x03
	rsub	lp_count, r4, 4
	lpnz	@.Laligndestination
	;; LOOP BEGIN
	stb.ab	r1, [r3,1]
	sub	r2, r2, 1
.Laligndestination:

;;; Destination is aligned
	and	r1, r1, 0xFF
	asl	r4, r1, 8
	or	r4, r4, r1
	asl	r5, r4, 16
	or	r5, r5, r4
	mov	r4, r5

	sub3	lp_count, r2, 8
	cmp     r2, 64
	bmsk.hi	r2, r2, 5
	mov.ls	lp_count, 0
	add3.hi	r2, r2, 8

;;; Convert len to Dwords, unfold x8
	lsr.f	lp_count, lp_count, 6

	lpnz	@.Lset64bytes
	;; LOOP START
	PREALLOC_INSTR	r3, 64	; alloc next line w/o fetching

#ifdef CONFIG_ARC_HAS_LL64
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
#else
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
#endif
.Lset64bytes:

	lsr.f	lp_count, r2, 5 ;Last remaining  max 124 bytes
	lpnz	.Lset32bytes
	;; LOOP START
#ifdef CONFIG_ARC_HAS_LL64
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
#else
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
#endif
.Lset32bytes:

	and.f	lp_count, r2, 0x1F ;Last remaining 31 bytes
.Lsmallchunk:
	lpnz	.Lcopy3bytes
	;; LOOP START
	stb.ab	r1, [r3, 1]
.Lcopy3bytes:

	j	[blink]

END_CFI(memset)

ENTRY_CFI(memzero)
    ; adjust bzero args to memset args
    mov r2, r1
    b.d  memset    ;tail call so need to tinker with blink
    mov r1, 0
END_CFI(memzero)