diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index eb97c1b2011..05d41177199 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -129,37 +129,37 @@ config ARCH_CORTEX_A53 bool default n select ARCH_ARMV8A - select ARM_HAVE_NEON select ARCH_HAVE_TRUSTZONE select ARCH_DCACHE select ARCH_ICACHE select ARCH_HAVE_MMU select ARCH_HAVE_FPU select ARCH_HAVE_TESTSET + select ARM_HAVE_NEON config ARCH_CORTEX_A57 bool default n select ARCH_ARMV8A - select ARM_HAVE_NEON select ARCH_HAVE_TRUSTZONE select ARCH_DCACHE select ARCH_ICACHE select ARCH_HAVE_MMU select ARCH_HAVE_FPU select ARCH_HAVE_TESTSET + select ARM_HAVE_NEON config ARCH_CORTEX_A72 bool default n select ARCH_ARMV8A - select ARM_HAVE_NEON select ARCH_HAVE_TRUSTZONE select ARCH_DCACHE select ARCH_ICACHE select ARCH_HAVE_MMU select ARCH_HAVE_FPU select ARCH_HAVE_TESTSET + select ARM_HAVE_NEON config ARCH_CORTEX_R82 bool @@ -168,7 +168,9 @@ config ARCH_CORTEX_R82 select ARCH_DCACHE select ARCH_ICACHE select ARCH_HAVE_MPU + select ARCH_HAVE_FPU select ARCH_HAVE_TESTSET + select ARM_HAVE_NEON config ARCH_FAMILY string diff --git a/arch/arm64/src/common/arm64_fpu.c b/arch/arm64/src/common/arm64_fpu.c index 88c28ff8316..6cfd75741f9 100644 --- a/arch/arm64/src/common/arm64_fpu.c +++ b/arch/arm64/src/common/arm64_fpu.c @@ -24,14 +24,20 @@ #include +#include +#include + #include #include #include #include #include #include +#include +#include #include #include +#include #include #include "sched/sched.h" @@ -46,6 +52,26 @@ ***************************************************************************/ #define FPU_CALLEE_REGS (8) +#define FPU_PROC_LINELEN (64 * CONFIG_SMP_NCPUS) + +/*************************************************************************** + * Private Types + ***************************************************************************/ + +/* This structure describes one open "file" */ + +#ifdef CONFIG_FS_PROCFS_REGISTER + +struct arm64_fpu_procfs_file_s +{ + struct procfs_file_s base; /* Base open 
file structure */
+  unsigned int linesize;        /* Number of valid characters in line[] */
+
+  /* Pre-allocated buffer for formatted lines */
+
+  char line[FPU_PROC_LINELEN];
+};
+#endif
 
 /***************************************************************************
  * Private Data
@@ -54,6 +80,44 @@
 static struct fpu_reg g_idle_thread_fpu[CONFIG_SMP_NCPUS];
 static struct arm64_cpu_fpu_context g_cpu_fpu_ctx[CONFIG_SMP_NCPUS];
 
+#ifdef CONFIG_FS_PROCFS_REGISTER
+
+/* procfs methods */
+
+static int arm64_fpu_procfs_open(struct file *filep, const char *relpath,
+                                 int oflags, mode_t mode);
+static int arm64_fpu_procfs_close(struct file *filep);
+static ssize_t arm64_fpu_procfs_read(struct file *filep, char *buffer,
+                                     size_t buflen);
+static int arm64_fpu_procfs_stat(const char *relpath, struct stat *buf);
+
+/* See include/nuttx/fs/procfs.h
+ * We use the old-fashioned kind of initializers so that this will compile
+ * with any compiler.
+ */
+
+const struct procfs_operations arm64_fpu_procfs_operations =
+{
+  arm64_fpu_procfs_open,   /* open */
+  arm64_fpu_procfs_close,  /* close */
+  arm64_fpu_procfs_read,   /* read */
+  NULL,                    /* write */
+  NULL,                    /* dup */
+  NULL,                    /* opendir */
+  NULL,                    /* closedir */
+  NULL,                    /* readdir */
+  NULL,                    /* rewinddir */
+  arm64_fpu_procfs_stat    /* stat */
+};
+
+static const struct procfs_entry_s g_procfs_arm64_fpu =
+{
+  "fpu",
+  &arm64_fpu_procfs_operations
+};
+
+#endif
+
 /***************************************************************************
  * Private Functions
  ***************************************************************************/
@@ -84,6 +148,184 @@ static void arm64_fpu_access_trap_disable(void)
   ARM64_ISB();
 }
 
+#ifdef CONFIG_FS_PROCFS_REGISTER
+
+/***************************************************************************
+ * Name: arm64_fpu_procfs_open
+ *
+ * Description:
+ *   Open the FPU statistics procfs entry.  Only read-only opens are
+ *   accepted.  The per-open state is allocated here and saved in
+ *   filep->f_priv.
+ *
+ * Returned Value:
+ *   OK on success; -EACCES if write access is requested; -ENOMEM if the
+ *   per-open state cannot be allocated.
+ *
+ ***************************************************************************/
+
+static int arm64_fpu_procfs_open(struct file *filep, const char *relpath,
+                                 int oflags, mode_t mode)
+{
+  struct arm64_fpu_procfs_file_s *priv;
+
+  uinfo("Open '%s'\n", relpath);
+
+  /* PROCFS is read-only.  Any attempt to open with any kind of write
+   * access is not permitted.
+   *
+   * REVISIT:  Write-able proc files could be quite useful.
+   */
+
+  if (((oflags & O_WRONLY) != 0 || (oflags & O_RDONLY) == 0))
+    {
+      uerr("ERROR: Only O_RDONLY supported\n");
+      return -EACCES;
+    }
+
+  /* Allocate the open file structure */
+
+  priv = (struct arm64_fpu_procfs_file_s *)kmm_zalloc(
+           sizeof(struct arm64_fpu_procfs_file_s));
+  if (priv == NULL)
+    {
+      uerr("ERROR: Failed to allocate file attributes\n");
+      return -ENOMEM;
+    }
+
+  /* Save the open file structure as the open-specific state in
+   * filep->f_priv.
+   */
+
+  filep->f_priv = (void *)priv;
+  return OK;
+}
+
+/***************************************************************************
+ * Name: arm64_fpu_procfs_close
+ *
+ * Description:
+ *   Free the per-open state that arm64_fpu_procfs_open() stored in
+ *   filep->f_priv.
+ *
+ ***************************************************************************/
+
+static int arm64_fpu_procfs_close(struct file *filep)
+{
+  struct arm64_fpu_procfs_file_s *priv;
+
+  /* Recover our private data from the struct file instance */
+
+  priv = (struct arm64_fpu_procfs_file_s *)filep->f_priv;
+  DEBUGASSERT(priv);
+
+  /* Release the file attributes structure */
+
+  kmm_free(priv);
+  filep->f_priv = NULL;
+  return OK;
+}
+
+/***************************************************************************
+ * Name: arm64_fpu_procfs_read
+ *
+ * Description:
+ *   Format one line of FPU counters per CPU from g_cpu_fpu_ctx[] into the
+ *   per-open line buffer, then copy the portion selected by the current
+ *   file position into the caller's buffer.
+ *
+ * Returned Value:
+ *   The value returned by procfs_memcpy().  The file position is advanced
+ *   by that value when it is positive.
+ *
+ ***************************************************************************/
+
+static ssize_t arm64_fpu_procfs_read(struct file *filep, char *buffer,
+                                     size_t buflen)
+{
+  struct arm64_fpu_procfs_file_s *attr;
+  struct arm64_cpu_fpu_context *ctx;
+  off_t offset;
+  int linesize;
+  int len;
+  int ret;
+  int i;
+
+  uinfo("buffer=%p buflen=%zu\n", buffer, buflen);
+
+  /* Recover our private data from the struct file instance */
+
+  attr = (struct arm64_fpu_procfs_file_s *)filep->f_priv;
+  DEBUGASSERT(attr);
+
+  /* Format one line per CPU.  Each snprintf() is bounded by the space
+   * that is still free in line[], not by the size of the whole buffer.
+   */
+
+  linesize = 0;
+  for (i = 0; i < CONFIG_SMP_NCPUS; i++)
+    {
+      ctx = &g_cpu_fpu_ctx[i];
+      len = snprintf(attr->line + linesize,
+                     FPU_PROC_LINELEN - linesize,
+                     "CPU%d: save: %d restore: %d "
+                     "switch: %d exedepth: %d\n",
+                     i, ctx->save_count, ctx->restore_count,
+                     ctx->switch_count, ctx->exe_depth_count);
+      if (len < 0)
+        {
+          break;
+        }
+
+      linesize += len;
+      if (linesize >= FPU_PROC_LINELEN)
+        {
+          /* snprintf() returns the untruncated length.  Only
+           * FPU_PROC_LINELEN - 1 characters were actually stored.
+           */
+
+          linesize = FPU_PROC_LINELEN - 1;
+          break;
+        }
+    }
+
+  attr->linesize = linesize;
+
+  /* Transfer the formatted FPU statistics to the user receive buffer */
+
+  offset = filep->f_pos;
+  ret = procfs_memcpy(attr->line, attr->linesize,
+                      buffer, buflen, &offset);
+
+  /* Update the file offset */
+
+  if (ret > 0)
+    {
+      filep->f_pos += ret;
+    }
+
+  return ret;
+}
+
+/***************************************************************************
+ * Name: arm64_fpu_procfs_stat
+ *
+ * Description:
+ *   Report the entry as a regular file that is readable by user, group
+ *   and others.  The size fields are reported as zero.
+ *
+ ***************************************************************************/
+
+static int arm64_fpu_procfs_stat(const char *relpath, struct stat *buf)
+{
+  buf->st_mode = S_IFREG | S_IROTH | S_IRGRP | S_IRUSR;
+  buf->st_size = 0;
+  buf->st_blksize = 0;
+  buf->st_blocks = 0;
+  return OK;
+}
+#endif
+
 /***************************************************************************
  * Public Functions
  ***************************************************************************/
@@ -258,3 +500,18 @@ bool up_fpucmp(const void *saveregs1, const void *saveregs2)
   return memcmp(&regs1[FPU_REG_Q4], &regs2[FPU_REG_Q4],
                 8 * FPU_CALLEE_REGS) == 0;
 }
+
+/***************************************************************************
+ * Name: arm64_fpu_procfs_register
+ *
+ * Description:
+ *   Register the arm64 fpu procfs file system entry
+ *
+ ***************************************************************************/
+
+#ifdef CONFIG_FS_PROCFS_REGISTER
+int arm64_fpu_procfs_register(void)
+{
+  return procfs_register(&g_procfs_arm64_fpu);
+}
+#endif
diff --git a/arch/arm64/src/common/arm64_fpu.h b/arch/arm64/src/common/arm64_fpu.h
index ba2ec64d85b..66e2787272a 100644
--- a/arch/arm64/src/common/arm64_fpu.h
+++ b/arch/arm64/src/common/arm64_fpu.h
@@ -63,6 +63,10 @@ struct arm64_cpu_fpu_context
 void arm64_init_fpu(struct tcb_s *tcb);
 void arm64_destory_fpu(struct tcb_s *tcb);
 
+#ifdef CONFIG_FS_PROCFS_REGISTER
+int arm64_fpu_procfs_register(void);
+#endif
+
 void arm64_fpu_disable(void);
 void arm64_fpu_enable(void);
 
diff --git a/arch/arm64/src/common/arm64_initialize.c b/arch/arm64/src/common/arm64_initialize.c
index 25179a8ce38..16b6b2b8c94 100644
--- a/arch/arm64/src/common/arm64_initialize.c
+++ b/arch/arm64/src/common/arm64_initialize.c
@@ -218,5 +218,10 @@ void up_initialize(void)
   g_fpu_panic_block.notifier_call = arm64_panic_disable_fpu;
   g_fpu_panic_block.priority = INT_MAX;
   panic_notifier_chain_register(&g_fpu_panic_block);
+
+#ifdef CONFIG_FS_PROCFS_REGISTER
+  arm64_fpu_procfs_register();
+#endif
+
 #endif
 }
diff --git
a/arch/arm64/src/common/arm64_schedulesigaction.c b/arch/arm64/src/common/arm64_schedulesigaction.c index 3ddbffe0a65..c9fe543d2a4 100644 --- a/arch/arm64/src/common/arm64_schedulesigaction.c +++ b/arch/arm64/src/common/arm64_schedulesigaction.c @@ -294,7 +294,7 @@ void up_schedule_sigaction(struct tcb_s *tcb, sig_deliver_t sigdeliver) tcb->xcp.saved_reg = tcb->xcp.regs; #ifdef CONFIG_ARCH_FPU - tcb->xcp.sig_save_fpu_regs = tcb->xcp.fpu_regs; + tcb->xcp.saved_fpu_regs = tcb->xcp.fpu_regs; #endif arm64_init_signal_process(tcb); @@ -341,7 +341,7 @@ void up_schedule_sigaction(struct tcb_s *tcb, sig_deliver_t sigdeliver) tcb->xcp.sigdeliver = sigdeliver; #ifdef CONFIG_ARCH_FPU - tcb->xcp.sig_save_fpu_regs = tcb->xcp.fpu_regs; + tcb->xcp.saved_fpu_regs = tcb->xcp.fpu_regs; #endif tcb->xcp.saved_reg = tcb->xcp.regs; diff --git a/boards/arm64/fvp-v8r/fvp-armv8r/configs/nsh/defconfig b/boards/arm64/fvp-v8r/fvp-armv8r/configs/nsh/defconfig index 545d02c7ead..aa34ab9e380 100644 --- a/boards/arm64/fvp-v8r/fvp-armv8r/configs/nsh/defconfig +++ b/boards/arm64/fvp-v8r/fvp-armv8r/configs/nsh/defconfig @@ -30,6 +30,7 @@ CONFIG_DEV_ZERO=y CONFIG_EXAMPLES_HELLO=y CONFIG_EXPERIMENTAL=y CONFIG_FS_PROCFS=y +CONFIG_FS_PROCFS_REGISTER=y CONFIG_FS_ROMFS=y CONFIG_FVP_UART_PL011=y CONFIG_IDLETHREAD_STACKSIZE=8192 diff --git a/boards/arm64/fvp-v8r/fvp-armv8r/configs/nsh_smp/defconfig b/boards/arm64/fvp-v8r/fvp-armv8r/configs/nsh_smp/defconfig index 76651deb514..204c7b0899a 100644 --- a/boards/arm64/fvp-v8r/fvp-armv8r/configs/nsh_smp/defconfig +++ b/boards/arm64/fvp-v8r/fvp-armv8r/configs/nsh_smp/defconfig @@ -30,6 +30,7 @@ CONFIG_DEV_ZERO=y CONFIG_EXAMPLES_HELLO=y CONFIG_EXPERIMENTAL=y CONFIG_FS_PROCFS=y +CONFIG_FS_PROCFS_REGISTER=y CONFIG_FS_ROMFS=y CONFIG_FVP_UART_PL011=y CONFIG_IDLETHREAD_STACKSIZE=8192 diff --git a/boards/arm64/qemu/qemu-armv8a/README.txt b/boards/arm64/qemu/qemu-armv8a/README.txt index 7628ec3d548..97b6cb58851 100644 --- a/boards/arm64/qemu/qemu-armv8a/README.txt +++ 
b/boards/arm64/qemu/qemu-armv8a/README.txt @@ -239,17 +239,7 @@ need to be considered: In many cases, the FPU trap is triggered by va_start() that copies the content of FP registers used for floating point argument passing into the va_list object in case there were actual float arguments from -the caller. But In practice this is almost never the case. -Seeing the save_count/restore_count at the g_cpu_fpu_ctx, which will -be increase when saving/restoring FPU context. After running ostest, -we can see the count with GDB: - -(gdb) p g_cpu_fpu_ctx - $1 = {{fpu_owner = 0x0, idle_thread = 0x402b3110 , - save_count = 1293, restore_count = 2226, switch_count = 4713, - exe_depth_count = 0}} -(gdb) - +the caller. adding -mgeneral-regs-only option will make compiler not use the FPU register, we can use the following patch to syslog: @@ -262,24 +252,33 @@ index c58fb45512..acac6febaa DEPPATH += --dep-path syslog VPATH += :syslog +syslog/lib_syslog.c_CFLAGS += -mgeneral-regs-only - - With the option to make NuttX and booting. After running ostest, see -the count with GDB again: - -(gdb) p g_cpu_fpu_ctx -$1 = {{fpu_owner = 0x0, idle_thread = 0x402b3110 , save_count = 141, - restore_count = 170, switch_count = 4715, exe_depth_count = 0}} -(gdb) - - it's only 141/170 for saving/restoring FPU context, which is 1293/2226 before -add this compile option. Almost all of FPU accessing switch is argument passing -at the syslog. - I cannot commit the patch for NuttX mainline because it's very special case + I cannot commit the patch for NuttX mainline because it's very special case since ostest is using syslog for lots of information printing. but this is a clue for FPU performance analysis. va_list object is using for many C code to handle argument passing, but if it's not passing floating point argument indeed. Add the option to your code maybe increase FPU performance +2. 
memset/memcpy issue + To improve performance, the libc memset/memcpy implementation will +use NEON/FPU instructions and registers. The FPU trap is also triggered +in this case. + +We can trace this issue with procfs: + +nsh> cat /proc/fpu +CPU0: save: 7 restore: 8 switch: 62 exedepth: 0 +nsh> + +After running ostest: +nsh> cat /proc/fpu +CPU0: save: 1329 restore: 2262 switch: 4613 exedepth: 0 +nsh> + +Note: +save: the number of times a task FPU context was saved +restore: the number of times a task FPU context was restored +switch: the number of task switches + 2. FPU trap at IRQ handler it's probably need to handle FPU trap at IRQ routine. Exception_depth is handling for this case, it will inc/dec at enter/leave exception. If the @@ -295,6 +294,10 @@ save/restore FPU context directly maybe become a solution. Linux kernel introduc kernel_neon_begin/kernel_neon_end function for this case. Similar function will be add to NuttX if this issue need to be handle. +3. More reading +For the Linux kernel, please refer to: +- https://www.kernel.org/doc/html/latest/arm/kernel_mode_neon.html + SMP Support =========== 1. Booting diff --git a/boards/arm64/qemu/qemu-armv8a/configs/nsh/defconfig b/boards/arm64/qemu/qemu-armv8a/configs/nsh/defconfig index 5b8ef369e22..c1d0d712c71 100644 --- a/boards/arm64/qemu/qemu-armv8a/configs/nsh/defconfig +++ b/boards/arm64/qemu/qemu-armv8a/configs/nsh/defconfig @@ -31,6 +31,7 @@ CONFIG_DEV_ZERO=y CONFIG_EXAMPLES_HELLO=y CONFIG_EXPERIMENTAL=y CONFIG_FS_PROCFS=y +CONFIG_FS_PROCFS_REGISTER=y CONFIG_FS_ROMFS=y CONFIG_HAVE_CXX=y CONFIG_HAVE_CXXINITIALIZE=y diff --git a/boards/arm64/qemu/qemu-armv8a/configs/nsh_smp/defconfig b/boards/arm64/qemu/qemu-armv8a/configs/nsh_smp/defconfig index bcd98306689..858b59a6d36 100644 --- a/boards/arm64/qemu/qemu-armv8a/configs/nsh_smp/defconfig +++ b/boards/arm64/qemu/qemu-armv8a/configs/nsh_smp/defconfig @@ -5,7 +5,6 @@ # You can then do "make savedefconfig" to generate a new defconfig file that includes your # modifications.
# -# CONFIG_ARCH_FPU is not set CONFIG_ARCH="arm64" CONFIG_ARCH_ARM64=y CONFIG_ARCH_BOARD="qemu-armv8a" @@ -32,6 +31,7 @@ CONFIG_DEV_ZERO=y CONFIG_EXAMPLES_HELLO=y CONFIG_EXPERIMENTAL=y CONFIG_FS_PROCFS=y +CONFIG_FS_PROCFS_REGISTER=y CONFIG_FS_ROMFS=y CONFIG_IDLETHREAD_STACKSIZE=16384 CONFIG_INIT_ENTRYPOINT="nsh_main"