arm64: Support for FPU profiling with procfs

Summary:

   Reducing the number of FPU context switches improves system
performance, so the use of the FPU has to be balanced against the
number of FPU traps that it causes.
   This PR adds a basic method for viewing the FPU performance
counters through the NuttX procfs.
   Please read the "FPU Support and Performance" chapter of README.txt
for more information.

Signed-off-by: qinwei1 <qinwei1@xiaomi.com>
This commit is contained in:
qinwei1 2023-03-13 11:10:40 +08:00 committed by Xiang Xiao
parent 165e266502
commit c4f3f8801f
10 changed files with 240 additions and 30 deletions

View File

@ -129,37 +129,37 @@ config ARCH_CORTEX_A53
bool bool
default n default n
select ARCH_ARMV8A select ARCH_ARMV8A
select ARM_HAVE_NEON
select ARCH_HAVE_TRUSTZONE select ARCH_HAVE_TRUSTZONE
select ARCH_DCACHE select ARCH_DCACHE
select ARCH_ICACHE select ARCH_ICACHE
select ARCH_HAVE_MMU select ARCH_HAVE_MMU
select ARCH_HAVE_FPU select ARCH_HAVE_FPU
select ARCH_HAVE_TESTSET select ARCH_HAVE_TESTSET
select ARM_HAVE_NEON
config ARCH_CORTEX_A57 config ARCH_CORTEX_A57
bool bool
default n default n
select ARCH_ARMV8A select ARCH_ARMV8A
select ARM_HAVE_NEON
select ARCH_HAVE_TRUSTZONE select ARCH_HAVE_TRUSTZONE
select ARCH_DCACHE select ARCH_DCACHE
select ARCH_ICACHE select ARCH_ICACHE
select ARCH_HAVE_MMU select ARCH_HAVE_MMU
select ARCH_HAVE_FPU select ARCH_HAVE_FPU
select ARCH_HAVE_TESTSET select ARCH_HAVE_TESTSET
select ARM_HAVE_NEON
config ARCH_CORTEX_A72 config ARCH_CORTEX_A72
bool bool
default n default n
select ARCH_ARMV8A select ARCH_ARMV8A
select ARM_HAVE_NEON
select ARCH_HAVE_TRUSTZONE select ARCH_HAVE_TRUSTZONE
select ARCH_DCACHE select ARCH_DCACHE
select ARCH_ICACHE select ARCH_ICACHE
select ARCH_HAVE_MMU select ARCH_HAVE_MMU
select ARCH_HAVE_FPU select ARCH_HAVE_FPU
select ARCH_HAVE_TESTSET select ARCH_HAVE_TESTSET
select ARM_HAVE_NEON
config ARCH_CORTEX_R82 config ARCH_CORTEX_R82
bool bool
@ -168,7 +168,9 @@ config ARCH_CORTEX_R82
select ARCH_DCACHE select ARCH_DCACHE
select ARCH_ICACHE select ARCH_ICACHE
select ARCH_HAVE_MPU select ARCH_HAVE_MPU
select ARCH_HAVE_FPU
select ARCH_HAVE_TESTSET select ARCH_HAVE_TESTSET
select ARM_HAVE_NEON
config ARCH_FAMILY config ARCH_FAMILY
string string

View File

@ -24,14 +24,20 @@
#include <nuttx/config.h> #include <nuttx/config.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <inttypes.h> #include <inttypes.h>
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
#include <assert.h> #include <assert.h>
#include <errno.h> #include <errno.h>
#include <debug.h> #include <debug.h>
#include <fcntl.h>
#include <stdio.h>
#include <nuttx/sched.h> #include <nuttx/sched.h>
#include <nuttx/arch.h> #include <nuttx/arch.h>
#include <nuttx/fs/procfs.h>
#include <arch/irq.h> #include <arch/irq.h>
#include "sched/sched.h" #include "sched/sched.h"
@ -46,6 +52,26 @@
***************************************************************************/ ***************************************************************************/
#define FPU_CALLEE_REGS (8) #define FPU_CALLEE_REGS (8)
/* Size in bytes of the per-open text buffer used to format the procfs
 * output: 64 bytes are reserved for each of the CONFIG_SMP_NCPUS CPUs.
 */
#define FPU_PROC_LINELEN (64 * CONFIG_SMP_NCPUS)
/***************************************************************************
* Private Types
***************************************************************************/
/* This structure describes one open "file" */
#ifdef CONFIG_FS_PROCFS_REGISTER
/* Per-open state of the FPU procfs file.  One instance is allocated
 * (zero-filled) by arm64_fpu_procfs_open(), stored in filep->f_priv and
 * released by arm64_fpu_procfs_close().
 */
struct arm64_fpu_procfs_file_s
{
struct procfs_file_s base; /* Base open file structure */
unsigned int linesize; /* Number of valid characters in line[] */
/* Pre-allocated buffer for formatted lines; FPU_PROC_LINELEN bytes,
 * rewritten by every call to arm64_fpu_procfs_read().
 */
char line[FPU_PROC_LINELEN];
};
#endif
/*************************************************************************** /***************************************************************************
* Private Data * Private Data
@ -54,6 +80,44 @@
static struct fpu_reg g_idle_thread_fpu[CONFIG_SMP_NCPUS]; static struct fpu_reg g_idle_thread_fpu[CONFIG_SMP_NCPUS];
static struct arm64_cpu_fpu_context g_cpu_fpu_ctx[CONFIG_SMP_NCPUS]; static struct arm64_cpu_fpu_context g_cpu_fpu_ctx[CONFIG_SMP_NCPUS];
#ifdef CONFIG_FS_PROCFS_REGISTER
/* procfs methods
 *
 * Forward declarations of the file operations that are collected in
 * arm64_fpu_procfs_operations below.
 */
static int arm64_fpu_procfs_open(struct file *filep, const char *relpath,
int oflags, mode_t mode);
static int arm64_fpu_procfs_close(struct file *filep);
static ssize_t arm64_fpu_procfs_read(struct file *filep, char *buffer,
size_t buflen);
static int arm64_fpu_procfs_stat(const char *relpath, struct stat *buf);
/* See include/nuttx/fs/procfs.h
 * We use the old-fashioned kind of initializers so that this will compile
 * with any compiler.
 *
 * Only open, close, read and stat are provided; every other operation
 * slot is left NULL.
 */
const struct procfs_operations arm64_fpu_procfs_operations =
{
arm64_fpu_procfs_open, /* open */
arm64_fpu_procfs_close, /* close */
arm64_fpu_procfs_read, /* read */
NULL, /* write */
NULL, /* dup */
NULL, /* opendir */
NULL, /* closedir */
NULL, /* readdir */
NULL, /* rewinddir */
arm64_fpu_procfs_stat /* stat */
};
/* procfs entry handed to procfs_register() by arm64_fpu_procfs_register().
 * It binds the name "fpu" to the operations table above.
 * NOTE(review): the string is presumably the entry's path relative to the
 * procfs mount point -- confirm against struct procfs_entry_s.
 */
static const struct procfs_entry_s g_procfs_arm64_fpu =
{
"fpu",
&arm64_fpu_procfs_operations
};
#endif
/*************************************************************************** /***************************************************************************
* Private Functions * Private Functions
***************************************************************************/ ***************************************************************************/
@ -84,6 +148,120 @@ static void arm64_fpu_access_trap_disable(void)
ARM64_ISB(); ARM64_ISB();
} }
#ifdef CONFIG_FS_PROCFS_REGISTER
/***************************************************************************
 * Name: arm64_fpu_procfs_open
 *
 * Description:
 *   Open the FPU procfs file.  A zero-filled per-open state structure is
 *   allocated and attached to filep->f_priv.
 *
 * Returned Value:
 *   OK on success; -EACCES if the open flags ask for anything other than
 *   read access; -ENOMEM if the per-open state cannot be allocated.
 *
 ***************************************************************************/

static int arm64_fpu_procfs_open(struct file *filep, const char *relpath,
                                 int oflags, mode_t mode)
{
  struct arm64_fpu_procfs_file_s *fpufile;

  uinfo("Open '%s'\n", relpath);

  /* The file can only be read: the open is accepted only when the write
   * flag is clear and the read flag is set.
   *
   * REVISIT: Write-able proc files could be quite useful.
   */

  if (!((oflags & O_WRONLY) == 0 && (oflags & O_RDONLY) != 0))
    {
      uerr("ERROR: Only O_RDONLY supported\n");
      return -EACCES;
    }

  /* Get a zeroed container for the per-open state */

  fpufile = (struct arm64_fpu_procfs_file_s *)
            kmm_zalloc(sizeof(*fpufile));
  if (!fpufile)
    {
      uerr("ERROR: Failed to allocate file attributes\n");
      return -ENOMEM;
    }

  /* Keep the state with the open file so read/close can find it again */

  filep->f_priv = (void *)fpufile;
  return OK;
}
/***************************************************************************
 * Name: arm64_fpu_procfs_close
 *
 * Description:
 *   Close the FPU procfs file: free the per-open state held in
 *   filep->f_priv and clear the pointer.
 *
 * Returned Value:
 *   Always OK.
 *
 ***************************************************************************/

static int arm64_fpu_procfs_close(struct file *filep)
{
  /* Fetch the per-open state attached to this file instance */

  struct arm64_fpu_procfs_file_s *fpufile =
    (struct arm64_fpu_procfs_file_s *)filep->f_priv;

  DEBUGASSERT(fpufile);

  /* Give the memory back and drop the now-stale reference */

  kmm_free(fpufile);
  filep->f_priv = NULL;
  return OK;
}
/***************************************************************************
 * Name: arm64_fpu_procfs_read
 *
 * Description:
 *   Format one text line per CPU with the save/restore/switch/exception
 *   depth counters taken from g_cpu_fpu_ctx[] and copy the part of that
 *   text selected by the current file position into the caller's buffer.
 *
 *   Each snprintf() is bounded by the space that is really left in
 *   line[], and the accumulated length is clamped to what was actually
 *   stored.  If the text does not fit it is truncated rather than being
 *   written, or later read, beyond the end of line[].
 *
 * Input Parameters:
 *   filep  - Open file instance; f_priv holds the per-open state
 *   buffer - Destination buffer
 *   buflen - Size of the destination buffer in bytes
 *
 * Returned Value:
 *   The value returned by procfs_memcpy(); when it is positive the file
 *   position is advanced by the same amount.
 *
 ***************************************************************************/

static ssize_t arm64_fpu_procfs_read(struct file *filep, char *buffer,
                                     size_t buflen)
{
  struct arm64_fpu_procfs_file_s *attr;
  struct arm64_cpu_fpu_context *ctx;
  size_t remaining;
  size_t linesize;
  off_t offset;
  int len;
  int ret;
  int i;

  uinfo("buffer=%p buflen=%zu\n", buffer, buflen);

  /* Recover our private data from the struct file instance */

  attr = (struct arm64_fpu_procfs_file_s *)filep->f_priv;
  DEBUGASSERT(attr);

  /* Traverse all FPU context */

  linesize = 0;
  for (i = 0; i < CONFIG_SMP_NCPUS; i++)
    {
      /* linesize never exceeds sizeof(attr->line) - 1, so there is always
       * room for at least the terminating NUL.
       */

      remaining = sizeof(attr->line) - linesize;
      ctx       = &g_cpu_fpu_ctx[i];

      len = snprintf(attr->line + linesize, remaining,
                     "CPU%d: save: %d restore: %d "
                     "switch: %d exedepth: %d\n",
                     i, ctx->save_count, ctx->restore_count,
                     ctx->switch_count, ctx->exe_depth_count);
      if (len < 0)
        {
          /* Formatting failed: keep what has been produced so far */

          break;
        }

      if ((size_t)len >= remaining)
        {
          /* Output was truncated: only remaining - 1 characters (plus the
           * NUL) were stored and the buffer is now full.
           */

          linesize += remaining - 1;
          break;
        }

      linesize += (size_t)len;
    }

  attr->linesize = (unsigned int)linesize;

  /* Transfer the formatted FPU statistics to the user receive buffer */

  offset = filep->f_pos;
  ret    = procfs_memcpy(attr->line, attr->linesize,
                         buffer, buflen, &offset);

  /* Update the file offset */

  if (ret > 0)
    {
      filep->f_pos += ret;
    }

  return ret;
}
/***************************************************************************
 * Name: arm64_fpu_procfs_stat
 *
 * Description:
 *   Fill in the status of the FPU procfs file: a regular file that is
 *   readable by user, group and others, with size and block information
 *   all reported as zero.  relpath is not examined.
 *
 * Returned Value:
 *   Always OK.
 *
 ***************************************************************************/

static int arm64_fpu_procfs_stat(const char *relpath, struct stat *buf)
{
  /* Size and block information are all reported as zero */

  buf->st_blocks  = 0;
  buf->st_blksize = 0;
  buf->st_size    = 0;

  /* Regular, read-only file */

  buf->st_mode    = S_IFREG | S_IRUSR | S_IRGRP | S_IROTH;
  return OK;
}
#endif
/*************************************************************************** /***************************************************************************
* Public Functions * Public Functions
***************************************************************************/ ***************************************************************************/
@ -258,3 +436,18 @@ bool up_fpucmp(const void *saveregs1, const void *saveregs2)
return memcmp(&regs1[FPU_REG_Q4], &regs2[FPU_REG_Q4], return memcmp(&regs1[FPU_REG_Q4], &regs2[FPU_REG_Q4],
8 * FPU_CALLEE_REGS) == 0; 8 * FPU_CALLEE_REGS) == 0;
} }
/***************************************************************************
* Name: arm64_fpu_procfs_register
*
* Description:
* Register the arm64 fpu procfs file system entry
*
***************************************************************************/
#ifdef CONFIG_FS_PROCFS_REGISTER
int arm64_fpu_procfs_register(void)
{
  int ret;

  /* Hand the "fpu" entry to procfs and report its verdict to the caller */

  ret = procfs_register(&g_procfs_arm64_fpu);
  return ret;
}
#endif

View File

@ -63,6 +63,10 @@ struct arm64_cpu_fpu_context
void arm64_init_fpu(struct tcb_s *tcb); void arm64_init_fpu(struct tcb_s *tcb);
void arm64_destory_fpu(struct tcb_s *tcb); void arm64_destory_fpu(struct tcb_s *tcb);
#ifdef CONFIG_FS_PROCFS_REGISTER
/* Register the arm64 FPU procfs entry.  Returns the result of
 * procfs_register().  Only available when CONFIG_FS_PROCFS_REGISTER is
 * defined.
 */
int arm64_fpu_procfs_register(void);
#endif
void arm64_fpu_disable(void); void arm64_fpu_disable(void);
void arm64_fpu_enable(void); void arm64_fpu_enable(void);

View File

@ -218,5 +218,10 @@ void up_initialize(void)
g_fpu_panic_block.notifier_call = arm64_panic_disable_fpu; g_fpu_panic_block.notifier_call = arm64_panic_disable_fpu;
g_fpu_panic_block.priority = INT_MAX; g_fpu_panic_block.priority = INT_MAX;
panic_notifier_chain_register(&g_fpu_panic_block); panic_notifier_chain_register(&g_fpu_panic_block);
#ifdef CONFIG_FS_PROCFS_REGISTER
arm64_fpu_procfs_register();
#endif
#endif #endif
} }

View File

@ -294,7 +294,7 @@ void up_schedule_sigaction(struct tcb_s *tcb, sig_deliver_t sigdeliver)
tcb->xcp.saved_reg = tcb->xcp.regs; tcb->xcp.saved_reg = tcb->xcp.regs;
#ifdef CONFIG_ARCH_FPU #ifdef CONFIG_ARCH_FPU
tcb->xcp.sig_save_fpu_regs = tcb->xcp.fpu_regs; tcb->xcp.saved_fpu_regs = tcb->xcp.fpu_regs;
#endif #endif
arm64_init_signal_process(tcb); arm64_init_signal_process(tcb);
@ -341,7 +341,7 @@ void up_schedule_sigaction(struct tcb_s *tcb, sig_deliver_t sigdeliver)
tcb->xcp.sigdeliver = sigdeliver; tcb->xcp.sigdeliver = sigdeliver;
#ifdef CONFIG_ARCH_FPU #ifdef CONFIG_ARCH_FPU
tcb->xcp.sig_save_fpu_regs = tcb->xcp.fpu_regs; tcb->xcp.saved_fpu_regs = tcb->xcp.fpu_regs;
#endif #endif
tcb->xcp.saved_reg = tcb->xcp.regs; tcb->xcp.saved_reg = tcb->xcp.regs;

View File

@ -30,6 +30,7 @@ CONFIG_DEV_ZERO=y
CONFIG_EXAMPLES_HELLO=y CONFIG_EXAMPLES_HELLO=y
CONFIG_EXPERIMENTAL=y CONFIG_EXPERIMENTAL=y
CONFIG_FS_PROCFS=y CONFIG_FS_PROCFS=y
CONFIG_FS_PROCFS_REGISTER=y
CONFIG_FS_ROMFS=y CONFIG_FS_ROMFS=y
CONFIG_FVP_UART_PL011=y CONFIG_FVP_UART_PL011=y
CONFIG_IDLETHREAD_STACKSIZE=8192 CONFIG_IDLETHREAD_STACKSIZE=8192

View File

@ -30,6 +30,7 @@ CONFIG_DEV_ZERO=y
CONFIG_EXAMPLES_HELLO=y CONFIG_EXAMPLES_HELLO=y
CONFIG_EXPERIMENTAL=y CONFIG_EXPERIMENTAL=y
CONFIG_FS_PROCFS=y CONFIG_FS_PROCFS=y
CONFIG_FS_PROCFS_REGISTER=y
CONFIG_FS_ROMFS=y CONFIG_FS_ROMFS=y
CONFIG_FVP_UART_PL011=y CONFIG_FVP_UART_PL011=y
CONFIG_IDLETHREAD_STACKSIZE=8192 CONFIG_IDLETHREAD_STACKSIZE=8192

View File

@ -239,17 +239,7 @@ need to be considered:
In many cases, the FPU trap is triggered by va_start() that copies In many cases, the FPU trap is triggered by va_start() that copies
the content of FP registers used for floating point argument passing the content of FP registers used for floating point argument passing
into the va_list object in case there were actual float arguments from into the va_list object in case there were actual float arguments from
the caller. But In practice this is almost never the case. the caller.
Seeing the save_count/restore_count at the g_cpu_fpu_ctx, which will
be increase when saving/restoring FPU context. After running ostest,
we can see the count with GDB:
(gdb) p g_cpu_fpu_ctx
$1 = {{fpu_owner = 0x0, idle_thread = 0x402b3110 <g_idletcb>,
save_count = 1293, restore_count = 2226, switch_count = 4713,
exe_depth_count = 0}}
(gdb)
adding -mgeneral-regs-only option will make compiler not use the FPU adding -mgeneral-regs-only option will make compiler not use the FPU
register, we can use the following patch to syslog: register, we can use the following patch to syslog:
@ -262,24 +252,33 @@ index c58fb45512..acac6febaa
DEPPATH += --dep-path syslog DEPPATH += --dep-path syslog
VPATH += :syslog VPATH += :syslog
+syslog/lib_syslog.c_CFLAGS += -mgeneral-regs-only +syslog/lib_syslog.c_CFLAGS += -mgeneral-regs-only
I cannot commit the patch for NuttX mainline because it's very special case
With the option to make NuttX and booting. After running ostest, see
the count with GDB again:
(gdb) p g_cpu_fpu_ctx
$1 = {{fpu_owner = 0x0, idle_thread = 0x402b3110 <g_idletcb>, save_count = 141,
restore_count = 170, switch_count = 4715, exe_depth_count = 0}}
(gdb)
it's only 141/170 for saving/restoring FPU context, which is 1293/2226 before
add this compile option. Almost all of FPU accessing switch is argument passing
at the syslog.
I cannot commit the patch for NuttX mainline because it's very special case
since ostest is using syslog for lots of information printing. but this is since ostest is using syslog for lots of information printing. but this is
a clue for FPU performance analysis. va_list object is using for many C code to a clue for FPU performance analysis. va_list object is using for many C code to
handle argument passing, but if it's not passing floating point argument indeed. handle argument passing, but if it's not passing floating point argument indeed.
Add the option to your code maybe increase FPU performance Add the option to your code maybe increase FPU performance
2. memset/memcpy issue
To improve performance, the libc implementations of memset/memcpy
use NEON/FPU instructions and registers, so the FPU trap is also
triggered in this case.
This issue can be traced with procfs:
nsh> cat /proc/fpu
CPU0: save: 7 restore: 8 switch: 62 exedepth: 0
nsh>
After running ostest:
nsh> cat /proc/fpu
CPU0: save: 1329 restore: 2262 switch: 4613 exedepth: 0
nsh>
Note:
save: the number of times a task's FPU context has been saved
restore: the number of times a task's FPU context has been restored
switch: the number of task switches
2. FPU trap at IRQ handler 2. FPU trap at IRQ handler
it's probably need to handle FPU trap at IRQ routine. Exception_depth is it's probably need to handle FPU trap at IRQ routine. Exception_depth is
handling for this case, it will inc/dec at enter/leave exception. If the handling for this case, it will inc/dec at enter/leave exception. If the
@ -295,6 +294,10 @@ save/restore FPU context directly maybe become a solution. Linux kernel introduc
kernel_neon_begin/kernel_neon_end function for this case. Similar function will kernel_neon_begin/kernel_neon_end function for this case. Similar function will
be add to NuttX if this issue need to be handle. be add to NuttX if this issue need to be handle.
3. More reading
For the approach taken by the Linux kernel, please refer to:
- https://www.kernel.org/doc/html/latest/arm/kernel_mode_neon.html
SMP Support SMP Support
=========== ===========
1. Booting 1. Booting

View File

@ -31,6 +31,7 @@ CONFIG_DEV_ZERO=y
CONFIG_EXAMPLES_HELLO=y CONFIG_EXAMPLES_HELLO=y
CONFIG_EXPERIMENTAL=y CONFIG_EXPERIMENTAL=y
CONFIG_FS_PROCFS=y CONFIG_FS_PROCFS=y
CONFIG_FS_PROCFS_REGISTER=y
CONFIG_FS_ROMFS=y CONFIG_FS_ROMFS=y
CONFIG_HAVE_CXX=y CONFIG_HAVE_CXX=y
CONFIG_HAVE_CXXINITIALIZE=y CONFIG_HAVE_CXXINITIALIZE=y

View File

@ -5,7 +5,6 @@
# You can then do "make savedefconfig" to generate a new defconfig file that includes your # You can then do "make savedefconfig" to generate a new defconfig file that includes your
# modifications. # modifications.
# #
# CONFIG_ARCH_FPU is not set
CONFIG_ARCH="arm64" CONFIG_ARCH="arm64"
CONFIG_ARCH_ARM64=y CONFIG_ARCH_ARM64=y
CONFIG_ARCH_BOARD="qemu-armv8a" CONFIG_ARCH_BOARD="qemu-armv8a"
@ -32,6 +31,7 @@ CONFIG_DEV_ZERO=y
CONFIG_EXAMPLES_HELLO=y CONFIG_EXAMPLES_HELLO=y
CONFIG_EXPERIMENTAL=y CONFIG_EXPERIMENTAL=y
CONFIG_FS_PROCFS=y CONFIG_FS_PROCFS=y
CONFIG_FS_PROCFS_REGISTER=y
CONFIG_FS_ROMFS=y CONFIG_FS_ROMFS=y
CONFIG_IDLETHREAD_STACKSIZE=16384 CONFIG_IDLETHREAD_STACKSIZE=16384
CONFIG_INIT_ENTRYPOINT="nsh_main" CONFIG_INIT_ENTRYPOINT="nsh_main"