Skip to content

Commit

Permalink
arm64: FPU context switching support
Browse files Browse the repository at this point in the history
This adds FPU sharing support with a lazy context switching algorithm.

Every thread is allowed to use FPU/SIMD registers. In fact, the compiler
may insert FPU reg accesses in any context to optimize even non-FP code
unless the -mgeneral-regs-only compiler flag is used, but Zephyr
currently doesn't support such a build.

It is therefore possible to do FP access in ISRs as well with this patch
although IRQs are then disabled to prevent nested IRQs in such cases.

Because the thread object grows in size, some tests have to be adjusted.

Signed-off-by: Nicolas Pitre <[email protected]>
  • Loading branch information
Nicolas Pitre authored and carlescufi committed May 3, 2021
1 parent a82fff0 commit f1f63dd
Show file tree
Hide file tree
Showing 20 changed files with 588 additions and 5 deletions.
1 change: 1 addition & 0 deletions arch/arm64/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ zephyr_library_sources(
vector_table.S
)

zephyr_library_sources_ifdef(CONFIG_FPU_SHARING fpu.c fpu.S)
zephyr_library_sources_ifdef(CONFIG_ARM_MMU mmu.c mmu.S)
zephyr_library_sources_ifdef(CONFIG_USERSPACE userspace.S)
zephyr_library_sources_ifdef(CONFIG_GEN_SW_ISR_TABLE isr_wrapper.S)
Expand Down
3 changes: 3 additions & 0 deletions arch/arm64/core/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ config CPU_CORTEX_A
select CPU_CORTEX
select HAS_FLASH_LOAD_OFFSET
select SCHED_IPI_SUPPORTED if SMP
select CPU_HAS_FPU
imply FPU
imply FPU_SHARING
help
This option signifies the use of a CPU of the Cortex-A family.

Expand Down
65 changes: 65 additions & 0 deletions arch/arm64/core/fpu.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
* Copyright (c) 2021 BayLibre SAS
* Written by: Nicolas Pitre
*
* SPDX-License-Identifier: Apache-2.0
*/

#include <toolchain.h>
#include <linker/sections.h>

_ASM_FILE_PROLOGUE

GTEXT(z_arm64_fpu_save)
SECTION_FUNC(TEXT, z_arm64_fpu_save)

/*
 * void z_arm64_fpu_save(struct z_arm64_fp_context *ctx)
 *
 * Store the entire FPU/SIMD register file into the save area pointed
 * at by x0: the 32 Q registers (q0-q31, 16 bytes each) at offsets
 * 0..511, then the 32-bit fpsr at offset 512 and fpcr at offset 516.
 * The caller must have enabled FPU access beforehand or these
 * instructions would trap.
 */

stp q0, q1, [x0, #(16 * 0)]
stp q2, q3, [x0, #(16 * 2)]
stp q4, q5, [x0, #(16 * 4)]
stp q6, q7, [x0, #(16 * 6)]
stp q8, q9, [x0, #(16 * 8)]
stp q10, q11, [x0, #(16 * 10)]
stp q12, q13, [x0, #(16 * 12)]
stp q14, q15, [x0, #(16 * 14)]
stp q16, q17, [x0, #(16 * 16)]
stp q18, q19, [x0, #(16 * 18)]
stp q20, q21, [x0, #(16 * 20)]
stp q22, q23, [x0, #(16 * 22)]
stp q24, q25, [x0, #(16 * 24)]
stp q26, q27, [x0, #(16 * 26)]
stp q28, q29, [x0, #(16 * 28)]
stp q30, q31, [x0, #(16 * 30)]

/* fpsr/fpcr are architecturally 64-bit but only 32 bits are defined */
mrs x1, fpsr
mrs x2, fpcr
str w1, [x0, #(16 * 32 + 0)]
str w2, [x0, #(16 * 32 + 4)]

ret

GTEXT(z_arm64_fpu_restore)
SECTION_FUNC(TEXT, z_arm64_fpu_restore)

/*
 * void z_arm64_fpu_restore(struct z_arm64_fp_context *ctx)
 *
 * Reload the entire FPU/SIMD register file from the save area pointed
 * at by x0 (exact mirror of z_arm64_fpu_save): q0-q31 from offsets
 * 0..511, then fpsr from offset 512 and fpcr from offset 516.
 * The caller must have enabled FPU access beforehand.
 */

ldp q0, q1, [x0, #(16 * 0)]
ldp q2, q3, [x0, #(16 * 2)]
ldp q4, q5, [x0, #(16 * 4)]
ldp q6, q7, [x0, #(16 * 6)]
ldp q8, q9, [x0, #(16 * 8)]
ldp q10, q11, [x0, #(16 * 10)]
ldp q12, q13, [x0, #(16 * 12)]
ldp q14, q15, [x0, #(16 * 14)]
ldp q16, q17, [x0, #(16 * 16)]
ldp q18, q19, [x0, #(16 * 18)]
ldp q20, q21, [x0, #(16 * 20)]
ldp q22, q23, [x0, #(16 * 22)]
ldp q24, q25, [x0, #(16 * 24)]
ldp q26, q27, [x0, #(16 * 26)]
ldp q28, q29, [x0, #(16 * 28)]
ldp q30, q31, [x0, #(16 * 30)]

/* upper 32 bits of fpsr/fpcr are written as zero */
ldr w1, [x0, #(16 * 32 + 0)]
ldr w2, [x0, #(16 * 32 + 4)]
msr fpsr, x1
msr fpcr, x2

ret
260 changes: 260 additions & 0 deletions arch/arm64/core/fpu.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
/*
* Copyright (c) 2021 BayLibre SAS
* Written by: Nicolas Pitre
*
* SPDX-License-Identifier: Apache-2.0
*/

#include <kernel.h>
#include <kernel_structs.h>
#include <kernel_arch_interface.h>
#include <arch/cpu.h>

/* to be found in fpu.S */
extern void z_arm64_fpu_save(struct z_arm64_fp_context *saved_fp_context);
extern void z_arm64_fpu_restore(struct z_arm64_fp_context *saved_fp_context);

/* Set to 1 to get a console trace of every lazy FPU ownership change */
#define FPU_DEBUG 0

#if FPU_DEBUG

/*
 * Debug traces have to be produced without printk() or any other functions
 * using a va_list as va_start() always copies the FPU registers that could be
 * used to pass float arguments, and that triggers an FPU access trap.
 * Hence the manual string building below.
 */

#include <string.h>

/*
 * Emit "CPU<n> exc<d> <current>: <msg> <thread> <xx>\n" where <xx> is
 * the first byte of @th's FPU save area in hex -- a cheap way to watch
 * register content move between threads.
 *
 * NOTE(review): buf is 80 bytes and thread names are strcat'ed
 * unchecked; assumes short thread names (debug-only code).
 */
static void DBG(char *msg, struct k_thread *th)
{
char buf[80], *p;
unsigned int v;

/* buf[3] and buf[8] are the '#' placeholders patched below */
strcpy(buf, "CPU# exc# ");
buf[3] = '0' + _current_cpu->id;
buf[8] = '0' + arch_exception_depth();
strcat(buf, _current->name);
strcat(buf, ": ");
strcat(buf, msg);
strcat(buf, " ");
strcat(buf, th->name);


/* append first byte of the save area as two hex digits */
v = *(unsigned char *)&th->arch.saved_fp_context;
p = buf + strlen(buf);
*p++ = ' ';
*p++ = ((v >> 4) < 10) ? ((v >> 4) + '0') : ((v >> 4) - 10 + 'a');
*p++ = ((v & 15) < 10) ? ((v & 15) + '0') : ((v & 15) - 10 + 'a');
*p++ = '\n';
*p = 0;

k_str_out(buf, p - buf);
}

#else

/* Tracing disabled: compiles to nothing */
static inline void DBG(char *msg, struct k_thread *t) { }

#endif /* FPU_DEBUG */

/*
 * Flush this CPU's live FPU content (if any) to the owning thread's
 * save area, release ownership, and disable further FPU access.
 *
 * This is called locally and also from flush_fpu_ipi_handler() when
 * another CPU wants this context flushed. Must be called with IRQs
 * disabled, as asserted below.
 */
void z_arm64_flush_local_fpu(void)
{
__ASSERT(read_daif() & DAIF_IRQ_BIT, "must be called with IRQs disabled");

struct k_thread *owner = _current_cpu->arch.fpu_owner;

if (owner != NULL) {
uint64_t cpacr = read_cpacr_el1();

/* turn on FPU access so the save below doesn't trap */
write_cpacr_el1(cpacr | CPACR_EL1_FPEN_NOTRAP);
isb();

/* save current owner's content */
z_arm64_fpu_save(&owner->arch.saved_fp_context);
/* make sure content made it to memory before releasing */
dsb();
/* release ownership (other CPUs poll this in flush_owned_fpu()) */
_current_cpu->arch.fpu_owner = NULL;
DBG("disable", owner);

/* disable FPU access so the next use traps for lazy switching */
write_cpacr_el1(cpacr & ~CPACR_EL1_FPEN_NOTRAP);
}
}

#ifdef CONFIG_SMP
/*
 * Make sure @thread's live FPU context, if currently held by any CPU,
 * is flushed to its save area in memory. Remote CPUs are asked to do
 * so via IPI. Must be called with IRQs disabled.
 */
static void flush_owned_fpu(struct k_thread *thread)
{
__ASSERT(read_daif() & DAIF_IRQ_BIT, "must be called with IRQs disabled");

int i;

/* search all CPUs for the owner we want */
for (i = 0; i < CONFIG_MP_NUM_CPUS; i++) {
if (_kernel.cpus[i].arch.fpu_owner != thread) {
continue;
}
/* we found it live on CPU i */
if (i == _current_cpu->id) {
z_arm64_flush_local_fpu();
} else {
/* the FPU context is live on another CPU */
z_arm64_flush_fpu_ipi(i);

/*
 * Wait for it only if this is about the thread
 * currently running on this CPU. Otherwise the
 * other CPU running some other thread could regain
 * ownership the moment it is removed from it and
 * we would be stuck here.
 *
 * Also, if this is for the thread running on this
 * CPU, then we preemptively flush any live context
 * on this CPU as well since we're likely to
 * replace it, and this avoids a deadlock where
 * two CPUs want to pull each other's FPU context.
 */
if (thread == _current) {
z_arm64_flush_local_fpu();
/* spin until the IPI handler clears ownership */
while (_kernel.cpus[i].arch.fpu_owner == thread) {
dsb();
}
}
}
/* a thread's context can live on at most one CPU */
break;
}
}
#endif

/*
 * Called on every exception entry: deny EL1 FPU/SIMD access so any FP
 * register use within the exception traps to z_arm64_fpu_trap(), which
 * will then preserve the interrupted context before granting access.
 * Must be called with IRQs disabled.
 */
void z_arm64_fpu_enter_exc(void)
{
__ASSERT(read_daif() & DAIF_IRQ_BIT, "must be called with IRQs disabled");

/* always deny FPU access whenever an exception is entered */
write_cpacr_el1(read_cpacr_el1() & ~CPACR_EL1_FPEN_NOTRAP);
isb();
}

/*
 * Process the FPU trap.
 *
 * This usually means that FP regs belong to another thread. Save them
 * to that thread's save area and restore the current thread's content.
 *
 * We also get here when FP regs are used while in exception as FP access
 * is always disabled by default in that case. If so we save the FPU content
 * to the owning thread and simply enable FPU access. Exceptions should be
 * short and don't have persistent register contexts when they're done so
 * there is nothing to save/restore for that context... as long as we
 * don't get interrupted that is. To ensure that we mask interrupts to
 * the triggering exception context.
 */
void z_arm64_fpu_trap(z_arch_esf_t *esf)
{
__ASSERT(read_daif() & DAIF_IRQ_BIT, "must be called with IRQs disabled");

/* turn on FPU access so we can save/restore registers without trapping */
write_cpacr_el1(read_cpacr_el1() | CPACR_EL1_FPEN_NOTRAP);
isb();

/* save current owner's content if any */
struct k_thread *owner = _current_cpu->arch.fpu_owner;

if (owner) {
z_arm64_fpu_save(&owner->arch.saved_fp_context);
/* make sure content made it to memory before releasing ownership */
dsb();
_current_cpu->arch.fpu_owner = NULL;
DBG("save", owner);
}

if (arch_exception_depth() > 1) {
/*
 * We were already in exception when the FPU access trapped.
 * We give it access and prevent any further IRQ recursion
 * by disabling IRQs as we wouldn't be able to preserve the
 * interrupted exception's FPU context.
 */
esf->spsr |= DAIF_IRQ_BIT;
return;
}

#ifdef CONFIG_SMP
/*
 * Make sure the FPU context we need isn't live on another CPU.
 * The current CPU's FPU context is NULL at this point.
 */
flush_owned_fpu(_current);
#endif

/* become new owner */
_current_cpu->arch.fpu_owner = _current;

/* restore our content */
z_arm64_fpu_restore(&_current->arch.saved_fp_context);
DBG("restore", _current);
}

/*
 * Perform lazy FPU context switching by simply granting or denying
 * access to FP regs based on FPU ownership before leaving the last
 * exception level. If current thread doesn't own the FP regs then
 * it will trap on its first access and then the actual FPU context
 * switching will occur.
 *
 * This is called on every exception exit except for z_arm64_fpu_trap().
 * Must be called with IRQs disabled.
 */
void z_arm64_fpu_exit_exc(void)
{
__ASSERT(read_daif() & DAIF_IRQ_BIT, "must be called with IRQs disabled");

uint64_t cpacr = read_cpacr_el1();

if (arch_exception_depth() == 1) {
/* We're about to leave exception mode */
if (_current_cpu->arch.fpu_owner == _current) {
/* turn on FPU access: the returning thread owns the regs */
write_cpacr_el1(cpacr | CPACR_EL1_FPEN_NOTRAP);
} else {
/* deny FPU access: first use will trap into z_arm64_fpu_trap() */
write_cpacr_el1(cpacr & ~CPACR_EL1_FPEN_NOTRAP);
}
} else {
/*
 * Shallower exception levels should always trap on FPU
 * access as we want to make sure IRQs are disabled before
 * granting them access.
 */
write_cpacr_el1(cpacr & ~CPACR_EL1_FPEN_NOTRAP);
}
}

/*
 * Make sure @thread's FPU context is no longer live in any CPU's
 * registers. Any live content is flushed to the thread's save area;
 * future FP use by the thread will fault it back in lazily.
 *
 * @return 0 always (this operation cannot fail here).
 */
int arch_float_disable(struct k_thread *thread)
{
	unsigned int key;

	if (thread == NULL) {
		return 0;
	}

	key = arch_irq_lock();

#ifdef CONFIG_SMP
	/* the context may be live on any CPU: hunt it down */
	flush_owned_fpu(thread);
#else
	/* single CPU: only flush if this thread is the current owner */
	if (thread == _current_cpu->arch.fpu_owner) {
		z_arm64_flush_local_fpu();
	}
#endif

	arch_irq_unlock(key);

	return 0;
}

/*
 * Enable FPU use for @thread.
 *
 * Nothing to do: FPU access is granted lazily on first use, so every
 * thread is implicitly float-enabled at the moment.
 *
 * @return 0 always.
 */
int arch_float_enable(struct k_thread *thread, unsigned int options)
{
	(void)thread;
	(void)options;
	return 0;
}
2 changes: 1 addition & 1 deletion arch/arm64/core/reset.c
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ void z_arm64_el1_init(void)
isb();

reg = 0U; /* RES0 */
reg |= CPACR_EL1_FPEN_NOTRAP; /* Do not trap NEON/SIMD/FP */
reg |= CPACR_EL1_FPEN_NOTRAP; /* Do not trap NEON/SIMD/FP initially */
/* TODO: CONFIG_FLOAT_*_FORBIDDEN */
write_cpacr_el1(reg);

Expand Down
Loading

0 comments on commit f1f63dd

Please sign in to comment.