mirror of
https://github.com/ARM-software/arm-trusted-firmware.git
synced 2025-04-21 20:14:29 +00:00

Trace analysis of the FVP_Base_AEMv8A 0.0/6063 model running in AArch32 mode with the build options listed below:

TRUSTED_BOARD_BOOT=1 GENERATE_COT=1 ARM_ROTPK_LOCATION=devel_ecdsa KEY_ALG=ecdsa ROT_KEY=plat/arm/board/common/rotpk/arm_rotprivk_ecdsa.pem

shows that when auth_signature() gets called, 71.99% of CPU execution time is spent in the memset() function, which is written in C using single-byte write operations; see lib\libc\memset.c.

This patch introduces a new libc_asm.mk makefile which replaces the C memset() implementation with an assembler version, giving the following results:
- for AArch32, in the auth_signature() call memset() CPU time is reduced to 20.56%.

The number of CPU instructions (Inst) executed during the TF-A boot stage before the start of BL33 in RELEASE builds for different versions is presented in the tables below, where:
- C TF-A: existing TF-A C code;
- C musl: "lightweight code" C "implementation of the standard library for Linux-based systems" https://git.musl-libc.org/cgit/musl/tree/src/string/memset.c
- Asm Opt: assembler version from the "Arm Optimized Routines" project https://github.com/ARM-software/optimized-routines/blob/master/string/arm/memset.S
- Asm Linux: assembler version from the Linux kernel https://github.com/torvalds/linux/blob/master/arch/arm/lib/memset.S
- Asm TF-A: assembler version from this patch

AArch32:
+-----------+------+------+--------------+----------+
| Variant   | Set  | Size | Inst         | Ratio    |
+-----------+------+------+--------------+----------+
| C TF-A    | T32  | 16   | 2122110003   | 1.000000 |
| C musl    | T32  | 156  | 1643917668   | 0.774662 |
| Asm Opt   | T32  | 84   | 1604810003   | 0.756233 |
| Asm Linux | A32  | 168  | 1566255018   | 0.738065 |
| Asm TF-A  | A32  | 160  | 1525865101   | 0.719032 |
+-----------+------+------+--------------+----------+

AArch64:
+-----------+------+------------+----------+
| Variant   | Size | Inst       | Ratio    |
+-----------+------+------------+----------+
| C TF-A    | 28   | 2732497518 | 1.000000 |
| C musl    | 212  | 1802999999 | 0.659836 |
| Asm TF-A  | 140  | 1680260003 | 0.614917 |
+-----------+------+------------+----------+

This patch modifies 'plat\arm\common\arm_common.mk' by overriding the libc.mk makefile with libc_asm.mk and does not affect other platforms.

Change-Id: Ie89dd0b74ba1079420733a0d76b7366ad0157c2e
Signed-off-by: Alexei Fedorov <Alexei.Fedorov@arm.com>
74 lines
1.9 KiB
ArmAsm
74 lines
1.9 KiB
ArmAsm
/*
|
|
* Copyright (c) 2020, Arm Limited. All rights reserved.
|
|
*
|
|
* SPDX-License-Identifier: BSD-3-Clause
|
|
*/
|
|
|
|
#include <asm_macros.S>
|
|
|
|
.syntax unified
|
|
.global memset
|
|
|
|
/* -----------------------------------------------------------------------
 * void *memset(void *dst, int val, size_t count)
 *
 * Fill the first 'count' bytes of the object at 'dst' with the low byte
 * of 'val' (i.e. 'val' converted to an unsigned char).
 *
 * In:	r0 = dst, r1 = val, r2 = count
 * Out:	r0 = dst (r0 is never written; r12 is the running write pointer)
 *
 * Registers written: r1, r2, r3, r12 and the flags. The bulk path
 * (16 or more bytes left once aligned) additionally uses r4 and lr as
 * pattern registers: both are pushed first, r4 is restored on return
 * and the saved lr is popped straight into pc.
 *
 * Outline:
 *   1. store single bytes until the write pointer is 4-byte aligned;
 *   2. replicate the fill byte across a whole word;
 *   3. store 32 bytes per iteration while at least 32 bytes remain;
 *   4. finish the remainder by shifting the low bits of the count into
 *      the C and N flags and storing 16/8/4/2/1 bytes accordingly.
 * -----------------------------------------------------------------------
 */
func memset
	mov	r12, r0			/* r12 = write pointer, r0 kept for return */
	tst	r0, #3
	beq	.Lword_aligned		/* 'dst' is already 4-byte aligned */

	/* Byte-wise head: loop until r12 is aligned or 'count' runs out */
.Lalign_head:
	subs	r2, r2, #1		/* C is clear only when count was 0 */
	strbhs	r1, [r12], #1		/* store one byte unless count was 0 */
	bxls	lr			/* count was 0 or 1: nothing left to do */
	tst	r12, #3
	bne	.Lalign_head		/* still not 4-byte aligned */

	/* From here on r12 is 4-byte aligned */
.Lword_aligned:
	bfi	r1, r1, #8, #8		/* r1[15:8]  = r1[7:0]  */
	bfi	r1, r1, #16, #16	/* r1[31:16] = r1[15:0] */

	mov	r3, r1			/* second pattern word */

	cmp	r2, #16
	blo	.Ltail_lt16		/* fewer than 16 bytes left */

	/* Bulk path: four pattern words live in r1, r3, r4 and lr */
	push	{r4, lr}
	mov	lr, r1
	mov	r4, r1

.Lblock_32:
	subs	r2, r2, #32
	stmhs	r12!, {r1, r3, r4, lr}	/* both stores run only if >= 32 left */
	stmhs	r12!, {r1, r3, r4, lr}
	bhi	.Lblock_32		/* more than 32 were left: go again */
	popeq	{r4, pc}		/* exactly 32 were left: all done */

	/*
	 * r2 is now (remaining - 32), so r2[4:0] is the 1..31 byte remainder.
	 * Each LSLS below exposes remainder bits through the flags:
	 * C = last bit shifted out, N = new bit 31, Z = no lower bits left.
	 */
	lsls	r2, r2, #28		/* C = bit 4, N = bit 3, Z: bits 3:0 clear */
	stmcs	r12!, {r1, r3, r4, lr}	/* bit 4 set: 16 bytes */
	popeq	{r4, pc}		/* remainder was exactly 16 */
	stmmi	r12!, {r1, r3}		/* bit 3 set: 8 bytes */
	lsls	r2, r2, #2		/* C = bit 2, N = bit 1, Z: bits 1:0 clear */
	strcs	r1, [r12], #4		/* bit 2 set: 4 bytes */
	popeq	{r4, pc}		/* no 2-byte or 1-byte piece left */
	strhmi	r1, [r12], #2		/* bit 1 set: 2 bytes */
	lsls	r2, r2, #1		/* N = bit 0 */
	strbmi	r1, [r12]		/* bit 0 set: last byte */
	pop	{r4, pc}

	/* Short path: r2 = 0..15 bytes left, nothing was pushed */
.Ltail_lt16:
	lsls	r2, r2, #29		/* C = bit 3, N = bit 2, Z: bits 2:0 clear */
	stmcs	r12!, {r1, r3}		/* bit 3 set: 8 bytes */
	bxeq	lr			/* remainder was exactly 8 or 0 */
	strmi	r1, [r12], #4		/* bit 2 set: 4 bytes */
	lsls	r2, r2, #2		/* C = bit 1, N = bit 0 */
	strhcs	r1, [r12], #2		/* bit 1 set: 2 bytes */
	strbmi	r1, [r12]		/* bit 0 set: last byte */
	bx	lr

endfunc memset
|