arm64: Support for FPU profiling with procfs

Summary:

   Reducing the number of FPU context switches improves system
performance. A balance is needed between the use of the FPU
and the number of FPU traps.
   This PR adds a basic method for viewing FPU performance counters
through NuttX procfs.
   Please read the "FPU Support and Performance" chapter of README.txt
for more information.

Signed-off-by: qinwei1 <qinwei1@xiaomi.com>
This commit is contained in:
qinwei1 2023-03-13 11:10:40 +08:00 committed by Xiang Xiao
parent 165e266502
commit c4f3f8801f
10 changed files with 240 additions and 30 deletions

View File

@ -129,37 +129,37 @@ config ARCH_CORTEX_A53
bool
default n
select ARCH_ARMV8A
select ARM_HAVE_NEON
select ARCH_HAVE_TRUSTZONE
select ARCH_DCACHE
select ARCH_ICACHE
select ARCH_HAVE_MMU
select ARCH_HAVE_FPU
select ARCH_HAVE_TESTSET
select ARM_HAVE_NEON
config ARCH_CORTEX_A57
bool
default n
select ARCH_ARMV8A
select ARM_HAVE_NEON
select ARCH_HAVE_TRUSTZONE
select ARCH_DCACHE
select ARCH_ICACHE
select ARCH_HAVE_MMU
select ARCH_HAVE_FPU
select ARCH_HAVE_TESTSET
select ARM_HAVE_NEON
config ARCH_CORTEX_A72
bool
default n
select ARCH_ARMV8A
select ARM_HAVE_NEON
select ARCH_HAVE_TRUSTZONE
select ARCH_DCACHE
select ARCH_ICACHE
select ARCH_HAVE_MMU
select ARCH_HAVE_FPU
select ARCH_HAVE_TESTSET
select ARM_HAVE_NEON
config ARCH_CORTEX_R82
bool
@ -168,7 +168,9 @@ config ARCH_CORTEX_R82
select ARCH_DCACHE
select ARCH_ICACHE
select ARCH_HAVE_MPU
select ARCH_HAVE_FPU
select ARCH_HAVE_TESTSET
select ARM_HAVE_NEON
config ARCH_FAMILY
string

View File

@ -24,14 +24,20 @@
#include <nuttx/config.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <inttypes.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <errno.h>
#include <debug.h>
#include <fcntl.h>
#include <stdio.h>
#include <nuttx/sched.h>
#include <nuttx/arch.h>
#include <nuttx/fs/procfs.h>
#include <arch/irq.h>
#include "sched/sched.h"
@ -46,6 +52,26 @@
***************************************************************************/
#define FPU_CALLEE_REGS (8)
#define FPU_PROC_LINELEN (64 * CONFIG_SMP_NCPUS)
/***************************************************************************
* Private Types
***************************************************************************/
/* This structure describes one open "file" */
#ifdef CONFIG_FS_PROCFS_REGISTER
/* Per-open state of the FPU statistics procfs file.  An instance is
 * allocated (zeroed) by arm64_fpu_procfs_open(), stored in filep->f_priv
 * and freed by arm64_fpu_procfs_close().
 */
struct arm64_fpu_procfs_file_s
{
struct procfs_file_s base; /* Base open file structure */
unsigned int linesize; /* Number of valid characters in line[] */
/* Pre-allocated buffer for the formatted report.  FPU_PROC_LINELEN
 * reserves 64 bytes for each of the CONFIG_SMP_NCPUS CPUs.
 */
char line[FPU_PROC_LINELEN];
};
#endif
/***************************************************************************
* Private Data
@ -54,6 +80,44 @@
static struct fpu_reg g_idle_thread_fpu[CONFIG_SMP_NCPUS];
static struct arm64_cpu_fpu_context g_cpu_fpu_ctx[CONFIG_SMP_NCPUS];
#ifdef CONFIG_FS_PROCFS_REGISTER
/* procfs methods */
static int arm64_fpu_procfs_open(struct file *filep, const char *relpath,
int oflags, mode_t mode);
static int arm64_fpu_procfs_close(struct file *filep);
static ssize_t arm64_fpu_procfs_read(struct file *filep, char *buffer,
size_t buflen);
static int arm64_fpu_procfs_stat(const char *relpath, struct stat *buf);
/* See include/nuttx/fs/procfs.h
 * We use the old-fashioned kind of initializers so that this will compile
 * with any compiler.  The initializers are positional: each entry below
 * must stay in the slot named by its trailing comment.
 */
const struct procfs_operations arm64_fpu_procfs_operations =
{
arm64_fpu_procfs_open, /* open */
arm64_fpu_procfs_close, /* close */
arm64_fpu_procfs_read, /* read */
NULL, /* write */
NULL, /* dup */
NULL, /* opendir */
NULL, /* closedir */
NULL, /* readdir */
NULL, /* rewinddir */
arm64_fpu_procfs_stat /* stat */
};
/* procfs entry passed to procfs_register() by arm64_fpu_procfs_register().
 * It associates the "fpu" path with arm64_fpu_procfs_operations.
 */
static const struct procfs_entry_s g_procfs_arm64_fpu =
{
"fpu",
&arm64_fpu_procfs_operations
};
#endif
/***************************************************************************
* Private Functions
***************************************************************************/
@ -84,6 +148,120 @@ static void arm64_fpu_access_trap_disable(void)
ARM64_ISB();
}
#ifdef CONFIG_FS_PROCFS_REGISTER
/***************************************************************************
 * Name: arm64_fpu_procfs_open
 *
 * Description:
 *   Open the FPU statistics file.  A zeroed arm64_fpu_procfs_file_s is
 *   allocated and attached to filep->f_priv.
 *
 * Returned Value:
 *   OK on success; -EACCES if the file is not opened read-only; -ENOMEM
 *   if the per-open structure cannot be allocated.
 *
 ***************************************************************************/

static int arm64_fpu_procfs_open(struct file *filep, const char *relpath,
                                 int oflags, mode_t mode)
{
  struct arm64_fpu_procfs_file_s *fileattr;

  uinfo("Open '%s'\n", relpath);

  /* This procfs file can only be read: reject the open unless read access
   * was requested and write access was not.
   *
   * REVISIT: Write-able proc files could be quite useful.
   */

  if ((oflags & O_RDONLY) == 0 || (oflags & O_WRONLY) != 0)
    {
      uerr("ERROR: Only O_RDONLY supported\n");
      return -EACCES;
    }

  /* Get a zero-initialized container for the per-open state */

  fileattr = (struct arm64_fpu_procfs_file_s *)
             kmm_zalloc(sizeof(*fileattr));
  if (fileattr == NULL)
    {
      uerr("ERROR: Failed to allocate file attributes\n");
      return -ENOMEM;
    }

  /* Remember it in the file instance so read/close can find it again */

  filep->f_priv = (void *)fileattr;
  return OK;
}
/***************************************************************************
 * Name: arm64_fpu_procfs_close
 *
 * Description:
 *   Close the FPU statistics file and free the per-open structure held in
 *   filep->f_priv.
 *
 * Returned Value:
 *   Always OK.
 *
 ***************************************************************************/

static int arm64_fpu_procfs_close(struct file *filep)
{
  /* The per-open state was attached to the file instance at open time */

  struct arm64_fpu_procfs_file_s *fileattr =
    (struct arm64_fpu_procfs_file_s *)filep->f_priv;

  DEBUGASSERT(fileattr);

  /* Give the memory back and drop the now-dangling reference */

  kmm_free(fileattr);
  filep->f_priv = NULL;
  return OK;
}
/***************************************************************************
 * Name: arm64_fpu_procfs_read
 *
 * Description:
 *   Format one line of FPU counters (save, restore, switch, exedepth) for
 *   every entry of g_cpu_fpu_ctx[] into the per-open buffer and copy the
 *   part selected by the current file position into the caller's buffer.
 *
 * Input Parameters:
 *   filep  - Open file instance; f_priv holds the per-open structure
 *   buffer - Destination buffer supplied by the caller
 *   buflen - Size of the destination buffer in bytes
 *
 * Returned Value:
 *   The value returned by procfs_memcpy(); the file position is advanced
 *   by that amount when it is positive.
 *
 ***************************************************************************/

static ssize_t arm64_fpu_procfs_read(struct file *filep, char *buffer,
                                     size_t buflen)
{
  struct arm64_fpu_procfs_file_s *attr;
  struct arm64_cpu_fpu_context *ctx;
  size_t linesize;
  size_t remaining;
  off_t offset;
  ssize_t ret;
  int len;
  int i;

  uinfo("buffer=%p buflen=%zu\n", buffer, buflen);

  /* Recover our private data from the struct file instance */

  attr = (struct arm64_fpu_procfs_file_s *)filep->f_priv;
  DEBUGASSERT(attr);

  /* Traverse all FPU contexts, appending one line per CPU */

  linesize = 0;
  for (i = 0; i < CONFIG_SMP_NCPUS; i++)
    {
      ctx = &g_cpu_fpu_ctx[i];

      /* Only the space that is still free behind the text already
       * written may be handed to snprintf(), otherwise it could write
       * past the end of attr->line[].
       */

      remaining = FPU_PROC_LINELEN - linesize;
      len = snprintf(attr->line + linesize, remaining,
                     "CPU%d: save: %d restore: %d "
                     "switch: %d exedepth: %d\n",
                     i, ctx->save_count, ctx->restore_count,
                     ctx->switch_count, ctx->exe_depth_count);
      if (len < 0)
        {
          /* Formatting error: report what has been collected so far */

          break;
        }

      if ((size_t)len >= remaining)
        {
          /* Output was truncated.  snprintf() returns the length that
           * would have been needed, but only remaining - 1 characters
           * (plus the terminator) were actually stored.
           */

          linesize += remaining - 1;
          break;
        }

      linesize += (size_t)len;
    }

  attr->linesize = (unsigned int)linesize;

  /* Transfer the formatted FPU statistics to the user receive buffer */

  offset = filep->f_pos;
  ret = procfs_memcpy(attr->line, attr->linesize,
                      buffer, buflen, &offset);

  /* Update the file offset */

  if (ret > 0)
    {
      filep->f_pos += ret;
    }

  return ret;
}
/***************************************************************************
 * Name: arm64_fpu_procfs_stat
 *
 * Description:
 *   Fill in the status of the FPU statistics file: a regular file that is
 *   readable by user, group and others, with size and block fields set
 *   to zero.  relpath is not examined.
 *
 * Returned Value:
 *   Always OK.
 *
 ***************************************************************************/

static int arm64_fpu_procfs_stat(const char *relpath, struct stat *buf)
{
  /* The content is generated on every read, so no size is reported */

  buf->st_size    = 0;
  buf->st_blksize = 0;
  buf->st_blocks  = 0;

  /* Regular file, read permission for everybody, no write/execute */

  buf->st_mode    = S_IFREG | S_IRUSR | S_IRGRP | S_IROTH;

  return OK;
}
#endif
/***************************************************************************
* Public Functions
***************************************************************************/
@ -258,3 +436,18 @@ bool up_fpucmp(const void *saveregs1, const void *saveregs2)
return memcmp(&regs1[FPU_REG_Q4], &regs2[FPU_REG_Q4],
8 * FPU_CALLEE_REGS) == 0;
}
/***************************************************************************
 * Name: arm64_fpu_procfs_register
 *
 * Description:
 *   Hand the arm64 FPU procfs entry (g_procfs_arm64_fpu) to
 *   procfs_register() so that it becomes part of the procfs file system.
 *   Only built when CONFIG_FS_PROCFS_REGISTER is enabled.
 *
 * Returned Value:
 *   The value returned by procfs_register().
 *
 ***************************************************************************/

#ifdef CONFIG_FS_PROCFS_REGISTER
int arm64_fpu_procfs_register(void)
{
  int ret;

  ret = procfs_register(&g_procfs_arm64_fpu);
  return ret;
}
#endif

View File

@ -63,6 +63,10 @@ struct arm64_cpu_fpu_context
void arm64_init_fpu(struct tcb_s *tcb);
void arm64_destory_fpu(struct tcb_s *tcb);
#ifdef CONFIG_FS_PROCFS_REGISTER
int arm64_fpu_procfs_register(void);
#endif
void arm64_fpu_disable(void);
void arm64_fpu_enable(void);

View File

@ -218,5 +218,10 @@ void up_initialize(void)
g_fpu_panic_block.notifier_call = arm64_panic_disable_fpu;
g_fpu_panic_block.priority = INT_MAX;
panic_notifier_chain_register(&g_fpu_panic_block);
#ifdef CONFIG_FS_PROCFS_REGISTER
arm64_fpu_procfs_register();
#endif
#endif
}

View File

@ -294,7 +294,7 @@ void up_schedule_sigaction(struct tcb_s *tcb, sig_deliver_t sigdeliver)
tcb->xcp.saved_reg = tcb->xcp.regs;
#ifdef CONFIG_ARCH_FPU
tcb->xcp.sig_save_fpu_regs = tcb->xcp.fpu_regs;
tcb->xcp.saved_fpu_regs = tcb->xcp.fpu_regs;
#endif
arm64_init_signal_process(tcb);
@ -341,7 +341,7 @@ void up_schedule_sigaction(struct tcb_s *tcb, sig_deliver_t sigdeliver)
tcb->xcp.sigdeliver = sigdeliver;
#ifdef CONFIG_ARCH_FPU
tcb->xcp.sig_save_fpu_regs = tcb->xcp.fpu_regs;
tcb->xcp.saved_fpu_regs = tcb->xcp.fpu_regs;
#endif
tcb->xcp.saved_reg = tcb->xcp.regs;

View File

@ -30,6 +30,7 @@ CONFIG_DEV_ZERO=y
CONFIG_EXAMPLES_HELLO=y
CONFIG_EXPERIMENTAL=y
CONFIG_FS_PROCFS=y
CONFIG_FS_PROCFS_REGISTER=y
CONFIG_FS_ROMFS=y
CONFIG_FVP_UART_PL011=y
CONFIG_IDLETHREAD_STACKSIZE=8192

View File

@ -30,6 +30,7 @@ CONFIG_DEV_ZERO=y
CONFIG_EXAMPLES_HELLO=y
CONFIG_EXPERIMENTAL=y
CONFIG_FS_PROCFS=y
CONFIG_FS_PROCFS_REGISTER=y
CONFIG_FS_ROMFS=y
CONFIG_FVP_UART_PL011=y
CONFIG_IDLETHREAD_STACKSIZE=8192

View File

@ -239,17 +239,7 @@ need to be considered:
In many cases, the FPU trap is triggered by va_start() that copies
the content of FP registers used for floating point argument passing
into the va_list object in case there were actual float arguments from
the caller. But In practice this is almost never the case.
Seeing the save_count/restore_count at the g_cpu_fpu_ctx, which will
be increase when saving/restoring FPU context. After running ostest,
we can see the count with GDB:
(gdb) p g_cpu_fpu_ctx
$1 = {{fpu_owner = 0x0, idle_thread = 0x402b3110 <g_idletcb>,
save_count = 1293, restore_count = 2226, switch_count = 4713,
exe_depth_count = 0}}
(gdb)
the caller.
adding -mgeneral-regs-only option will make compiler not use the FPU
register, we can use the following patch to syslog:
@ -262,24 +252,33 @@ index c58fb45512..acac6febaa
DEPPATH += --dep-path syslog
VPATH += :syslog
+syslog/lib_syslog.c_CFLAGS += -mgeneral-regs-only
With the option to make NuttX and booting. After running ostest, see
the count with GDB again:
(gdb) p g_cpu_fpu_ctx
$1 = {{fpu_owner = 0x0, idle_thread = 0x402b3110 <g_idletcb>, save_count = 141,
restore_count = 170, switch_count = 4715, exe_depth_count = 0}}
(gdb)
it's only 141/170 for saving/restoring FPU context, which is 1293/2226 before
add this compile option. Almost all of FPU accessing switch is argument passing
at the syslog.
I cannot commit the patch for NuttX mainline because it's very special case
since ostest is using syslog for lots of information printing. but this is
a clue for FPU performance analysis. va_list object is using for many C code to
handle argument passing, but if it's not passing floating point argument indeed.
Add the option to your code maybe increase FPU performance
2. memset/memcpy issue
To improve performance, the libc implementations of memset/memcpy
use NEON/FPU instructions and registers. The FPU trap is also triggered
in this case.
We can trace this issue with procfs:
nsh> cat /proc/fpu
CPU0: save: 7 restore: 8 switch: 62 exedepth: 0
nsh>
After running ostest:
nsh> cat /proc/fpu
CPU0: save: 1329 restore: 2262 switch: 4613 exedepth: 0
nsh>
Note:
save: the number of times a task FPU context was saved
restore: the number of times a task FPU context was restored
switch: the number of task switches
3. FPU trap at IRQ handler
It is probably necessary to handle FPU traps in IRQ routines. Exception_depth
handles this case: it is incremented/decremented on exception entry/exit. If the
@ -295,6 +294,10 @@ save/restore FPU context directly maybe become a solution. Linux kernel introduc
kernel_neon_begin/kernel_neon_end functions for this case. Similar functions will
be added to NuttX if this issue needs to be handled.
4. More reading
For the Linux kernel, please refer to:
- https://www.kernel.org/doc/html/latest/arm/kernel_mode_neon.html
SMP Support
===========
1. Booting

View File

@ -31,6 +31,7 @@ CONFIG_DEV_ZERO=y
CONFIG_EXAMPLES_HELLO=y
CONFIG_EXPERIMENTAL=y
CONFIG_FS_PROCFS=y
CONFIG_FS_PROCFS_REGISTER=y
CONFIG_FS_ROMFS=y
CONFIG_HAVE_CXX=y
CONFIG_HAVE_CXXINITIALIZE=y

View File

@ -5,7 +5,6 @@
# You can then do "make savedefconfig" to generate a new defconfig file that includes your
# modifications.
#
# CONFIG_ARCH_FPU is not set
CONFIG_ARCH="arm64"
CONFIG_ARCH_ARM64=y
CONFIG_ARCH_BOARD="qemu-armv8a"
@ -32,6 +31,7 @@ CONFIG_DEV_ZERO=y
CONFIG_EXAMPLES_HELLO=y
CONFIG_EXPERIMENTAL=y
CONFIG_FS_PROCFS=y
CONFIG_FS_PROCFS_REGISTER=y
CONFIG_FS_ROMFS=y
CONFIG_IDLETHREAD_STACKSIZE=16384
CONFIG_INIT_ENTRYPOINT="nsh_main"