diff --git a/TODO b/TODO index 366533c5348..6234b44e6d1 100644 --- a/TODO +++ b/TODO @@ -308,33 +308,33 @@ o Task/Scheduler (sched/) o SMP ^^^ - Title: SPINLOCKS AND DATA CACHES - Description: If spinlocks are used in a system with a data cache, then there - may be a problem with cache coherency in some CPU architectures: - When one CPU modifies the spinlock, the changes may not be - visible to another CPU if it does not share the data cache. - That would cause failure in the spinlock logic. + Title: SMP AND DATA CACHES + Description: When spinlocks, semaphores, etc. are used in an SMP system with + a data cache, then there may be problems with cache coherency + in some CPU architectures: When one CPU modifies the shared + object, the changes may not be visible to another CPU if it + does not share the data cache. That would cause failure in + the IPC logic. Flushing the D-cache on writes and invalidating before a read is - not really an option. spinlocks are normally 8-bits in size and - cache lines are typically 32-bytes so that would have side effects - unless the spinlocks were made to be the same size as one cache - line. + not really an option. That would essentially effect every memory + access and there may be side-effects due to cache line sizes + and alignment. - This might be doable if a write-through cache is used. Then you - could always safely invalidate the cache line before reading the - spinlock because there should never be any dirty cache lines in - this case. + For the same reason a separate, non-cacheable memory region is + not an option. Essentially all data would have to go in the + non-cached region and you would have no benefit from the data + cache. - The better option is to add compiler independent "ornamentation" - to the spinlock so that the spinlocks are all linked together - into a separate, non-cacheable memory regions. Because of - region alignment and minimum region mapping sizes this could - still be wasteful of memory. This would work in systems that - have both data cache and either an MPU or an MMU. - Status: Open - Priority: High. spinlocks, and hence SMP, will not work on such systems - without this change. + On ARM Cortex-A, each CPU has a separate data cache. However, + the MPCore's Snoop Controller Unit supports coherency among + the different caches. The SCU is enabled by the SCU control + register and each CPU participates in the SMP coherency by + setting the ACTLR_SMP bit in the auxiliary control register + (ACTLR). + + Status: Closed + Priority: High on platforms that may have the issue. o Memory Management (mm/) ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/arch/arm/src/armv7-a/arm_cpuhead.S b/arch/arm/src/armv7-a/arm_cpuhead.S index 487fee0a46b..2f8387bdcdc 100644 --- a/arch/arm/src/armv7-a/arm_cpuhead.S +++ b/arch/arm/src/armv7-a/arm_cpuhead.S @@ -308,7 +308,11 @@ __cpu3_start: orr r0, r0, #(SCTLR_RR) #endif -#ifndef CPU_DCACHE_DISABLE + /* In SMP configurations, the data cache will not be enabled until later + * after SMP cache coherency has been setup. + */ + +#if !defined(CPU_DCACHE_DISABLE) && !defined(CONFIG_SMP) /* Dcache enable * * SCTLR_C Bit 2: DCache enable diff --git a/arch/arm/src/armv7-a/arm_cpustart.c b/arch/arm/src/armv7-a/arm_cpustart.c index 88906a717de..38928e7aec5 100644 --- a/arch/arm/src/armv7-a/arm_cpustart.c +++ b/arch/arm/src/armv7-a/arm_cpustart.c @@ -45,8 +45,9 @@ #include #include "up_internal.h" -#include "gic.h" #include "cp15_cacheops.h" +#include "gic.h" +#include "scu.h" #include "sched/sched.h" #ifdef CONFIG_SMP @@ -105,8 +106,13 @@ static inline void arm_registerdump(FAR struct tcb_s *tcb) int arm_start_handler(int irq, FAR void *context) { FAR struct tcb_s *tcb; + int cpu = up_cpu_index(); - sinfo("CPU%d Started\n", up_cpu_index()); + sinfo("CPU%d Started\n", cpu); + + /* Enable SMP cache coherency for the CPU */ + + arm_enable_smp(cpu); /* Reset scheduler parameters */ diff --git a/arch/arm/src/armv7-a/arm_head.S b/arch/arm/src/armv7-a/arm_head.S index c98ab30719d..27c2a5b4dcf 100644 --- a/arch/arm/src/armv7-a/arm_head.S +++ b/arch/arm/src/armv7-a/arm_head.S @@ -450,7 +450,11 @@ __start: orr r0, r0, #(SCTLR_RR) #endif -#ifndef CPU_DCACHE_DISABLE + /* In SMP configurations, the data cache will not be enabled until later + * after SMP cache coherency has been setup. + */ + +#if !defined(CPU_DCACHE_DISABLE) && !defined(CONFIG_SMP) /* Dcache enable * * SCTLR_C Bit 2: DCache enable @@ -638,7 +642,7 @@ __start: #endif /* Perform early C-level, platform-specific initialization. Logic - * within arm_boot() must configure SDRAM and call arm_ram_initailize. + * within arm_boot() must configure SDRAM and call arm_data_initialize(). */ bl arm_boot diff --git a/arch/arm/src/armv7-a/arm_pghead.S b/arch/arm/src/armv7-a/arm_pghead.S index 1a546c813d4..1dda0acdd92 100644 --- a/arch/arm/src/armv7-a/arm_pghead.S +++ b/arch/arm/src/armv7-a/arm_pghead.S @@ -434,7 +434,11 @@ __start: orr r0, r0, #(SCTLR_RR) #endif -#ifndef CPU_DCACHE_DISABLE + /* In SMP configurations, the data cache will not be enabled until later + * after SMP cache coherency has been setup. + */ + +#if !defined(CPU_DCACHE_DISABLE) && !defined(CONFIG_SMP) /* Dcache enable * * SCTLR_C Bit 2: DCache enable @@ -670,7 +674,7 @@ __start: #endif /* Perform early C-level, platform-specific initialization. Logic - * within arm_boot() must configure SDRAM and call arm_ram_initailize. + * within arm_boot() must configure SDRAM and call arm_data_initialize(). */ bl arm_boot diff --git a/arch/arm/src/armv7-a/arm_scu.c b/arch/arm/src/armv7-a/arm_scu.c index 8779b046065..e1fdd52925e 100644 --- a/arch/arm/src/armv7-a/arm_scu.c +++ b/arch/arm/src/armv7-a/arm_scu.c @@ -41,18 +41,59 @@ #include -#include - #include "up_arch.h" +#include "cp15_cacheops.h" #include "sctlr.h" #include "scu.h" #ifdef CONFIG_SMP /**************************************************************************** - * Public Functions + * Private Functions ****************************************************************************/ +/**************************************************************************** + * Name: arm_get_sctlr + * + * Description: + * Get the contents of the SCTLR register + * + ****************************************************************************/ + +static inline uint32_t arm_get_sctlr(void) +{ + uint32_t sctlr; + + __asm__ __volatile__ + ( + "\tmrc p15, 0, %0, c1, c0, 0\n" /* Read SCTLR */ + : "=r"(sctlr) + : + : + ); + + return sctlr; +} + +/**************************************************************************** + * Name: arm_set_sctlr + * + * Description: + * Set the contents of the SCTLR register + * + ****************************************************************************/ + +static inline void arm_set_sctlr(uint32_t sctlr) +{ + __asm__ __volatile__ + ( + "\tmcr p15, 0, %0, c1, c0, 0\n" /* Write SCTLR */ + : + : "r"(sctlr) + : + ); +} + /**************************************************************************** * Name: arm_get_actlr * @@ -95,22 +136,6 @@ static inline void arm_set_actlr(uint32_t actlr) ); } -/**************************************************************************** - * Name: arm_modify_actlr - * - * Description: - * Set the bits in the ACTLR register - * - ****************************************************************************/ - -static inline void arm_modify_actlr(uint32_t setbits) -{ - irqstate_t flags = enter_critical_section(); - uint32_t actlr = arm_get_actlr(); - arm_set_actlr(actlr | setbits); - leave_critical_section(flags); -} - /**************************************************************************** * Public Functions ****************************************************************************/ @@ -122,13 +147,17 @@ static inline void arm_modify_actlr(uint32_t setbits) * Enable the SCU and make certain that current CPU is participating in * the SMP cache coherency. * + * Assumption: + * Called early in the CPU start-up. No special critical sections are + * needed if only CPU-private registers are modified. + * ****************************************************************************/ void arm_enable_smp(int cpu) { - /* Invalidate the data cache -- Missing logic. */ + uint32_t regval; - /* Handle actions unique to CPU0 */ + /* Handle actions unique to CPU0 which comes up first */ if (cpu == 0) { @@ -140,18 +169,45 @@ void arm_enable_smp(int cpu) (SCU_INVALIDATE_ALL_WAYS << SCU_INVALIDATE_CPU3_SHIFT), SCU_INVALIDATE); + /* Invalidate CPUn L1 data cache so that is will we be reloaded from + * coherent L2. + */ + + cp15_invalidate_dcache_all(); + /* Invalidate the L2C-310 -- Missing logic. */ /* Enable the SCU */ - modifyreg32(SCU_CTRL, 0, SCU_CTRL_ENABLE); /* CPU0 only */ + regval = getreg32(SCU_CTRL); + regval |= SCU_CTRL_ENABLE; + putreg32(regval, SCU_CTRL); } - /* Enable the data cache -- Missing logic. */ + /* Actions for other CPUs */ - /* This CPU now participates the SMP cache coherency */ + else + { + /* Invalidate CPUn L1 data cache so that is will we be reloaded from + * coherent L2. + */ - arm_modify_actlr(ACTLR_SMP); + cp15_invalidate_dcache_all(); + + /* Wait for the SCU to be enabled by the primary processor -- should + * not be necessary. + */ + } + + /* Enable the data cache, set the SMP mode with ACTLR.SMP=1. */ + + regval = arm_get_actlr(); + regval |= ACTLR_SMP; + arm_set_actlr(regval); + + regval = arm_get_sctlr(); + regval |= SCTLR_C; + arm_set_sctlr(regval); } #endif diff --git a/arch/arm/src/imx6/imx_boot.c b/arch/arm/src/imx6/imx_boot.c index eb731a33153..aaa249e5c09 100644 --- a/arch/arm/src/imx6/imx_boot.c +++ b/arch/arm/src/imx6/imx_boot.c @@ -52,6 +52,7 @@ #include "chip.h" #include "arm.h" #include "mmu.h" +#include "scu.h" #include "cache.h" #include "fpu.h" #include "up_internal.h" @@ -450,13 +451,21 @@ void arm_boot(void) */ imx_cpu_disable(); + imx_lowputc('B'); + +#ifdef CONFIG_SMP + /* Enable SMP cache coherency for CPU0 */ + + arm_enable_smp(0); + imx_lowputc('C'); +#endif /* Provide a special mapping for the OCRAM interrupt vector positioned in * high memory. */ imx_vectormapping(); - imx_lowputc('B'); + imx_lowputc('D'); #if defined(CONFIG_SMP) && defined(SMP_INTERCPU_NONCACHED) /* Provide a special mapping for the OCRAM interrupt vector positioned in @@ -464,7 +473,7 @@ void arm_boot(void) */ imx_intercpu_mapping(); - imx_lowputc('C'); + imx_lowputc('E'); #endif #ifdef CONFIG_ARCH_RAMFUNCS @@ -479,14 +488,14 @@ void arm_boot(void) *dest++ = *src++; } - imx_lowputc('D'); + imx_lowputc('F'); /* Flush the copied RAM functions into physical RAM so that will * be available when fetched into the I-Cache. */ arch_clean_dcache((uintptr_t)&_sramfuncs, (uintptr_t)&_eramfuncs) - imx_lowputc('E'); + imx_lowputc('G'); #endif /* Setup up vector block. _vector_start and _vector_end are exported from @@ -494,37 +503,35 @@ void arm_boot(void) */ imx_copyvectorblock(); - imx_lowputc('F'); + imx_lowputc('H'); /* Disable the watchdog timer */ imx_wdtdisable(); - imx_lowputc('G'); + imx_lowputc('I'); /* Initialize clocking to settings provided by board-specific logic */ imx_clockconfig(); - imx_lowputc('H'); + imx_lowputc('J'); #ifdef CONFIG_ARCH_FPU /* Initialize the FPU */ arm_fpuconfig(); - imx_lowputc('I'); + imx_lowputc('K'); #endif - /* Perform board-specific initialization, This must include: - * - * - Initialization of board-specific memory resources (e.g., SDRAM) - * - Configuration of board specific resources (PIOs, LEDs, etc). + /* Perform board-specific memroy initialization, This must include + * initialization of board-specific memory resources (e.g., SDRAM) * * NOTE: We must use caution prior to this point to make sure that * the logic does not access any global variables that might lie * in SDRAM. */ - imx_board_initialize(); - imx_lowputc('J'); + imx_memory_initialize(); + imx_lowputc('L'); #ifdef NEED_SDRAM_REMAPPING /* SDRAM was configured in a temporary state to support low-level @@ -533,7 +540,7 @@ void arm_boot(void) */ imx_remap(); - imx_lowputc('K'); + imx_lowputc('M'); #endif #ifdef CONFIG_BOOT_SDRAM_DATA @@ -542,9 +549,16 @@ void arm_boot(void) */ arm_data_initialize(); - imx_lowputc('L'); + imx_lowputc('N'); #endif + /* Perform board-specific device initialization. This would include + * configuration of board specific resources such as GPIOs, LEDs, etc. + */ + + imx_board_initialize(); + imx_lowputc('O'); + #if defined(CONFIG_SMP) && defined(SMP_INTERCPU_NONCACHED) /* Initialize the uncached, inter-CPU communications area */ @@ -553,13 +567,13 @@ void arm_boot(void) *dest++ = 0; } - imx_lowputc('M'); + imx_lowputc('P'); #endif /* Perform common, low-level chip initialization (might do nothing) */ imx_lowsetup(); - imx_lowputc('N'); + imx_lowputc('Q'); #ifdef USE_EARLYSERIALINIT /* Perform early serial initialization if we are going to use the serial @@ -567,7 +581,7 @@ void arm_boot(void) */ imx_earlyserialinit(); - imx_lowputc('O'); + imx_lowputc('R'); #endif /* Now we can enable all other CPUs. The enabled CPUs will start execution @@ -576,6 +590,6 @@ void arm_boot(void) */ imx_cpu_enable(); - imx_lowputc('P'); + imx_lowputc('S'); imx_lowputc('\n'); } diff --git a/arch/arm/src/imx6/imx_boot.h b/arch/arm/src/imx6/imx_boot.h index 11c1ef56fe3..7f98d347acd 100644 --- a/arch/arm/src/imx6/imx_boot.h +++ b/arch/arm/src/imx6/imx_boot.h @@ -110,14 +110,37 @@ void imx_cpu_enable(void); # define imx_cpu_enable() #endif +/**************************************************************************** + * Name: imx_memory_initialize + * + * Description: + * All i.MX6 architectures must provide the following entry point. This + * entry point is called early in the initialization before memory has + * been configured. This board-specific function is responsible for + * configuring any on-board memories. + * + * Logic in imx_memory_initialize must be careful to avoid using any + * global variables because those will be uninitialized at the time this + * function is called. + * + * Input Parameters: + * None + * + * Returned Value: + * None + * + ****************************************************************************/ + +void imx_memory_initialize(void); + /**************************************************************************** * Name: imx_board_initialize * * Description: * All i.MX6 architectures must provide the following entry point. This - * entry point is called early in the initialization -- after all memory - * has been configured and mapped but before any devices have been - * initialized. + * entry point is called in the initialization phase -- after + * imx_memory_initialize and after all memory has been configured and + * mapped but before any devices have been initialized. * * Input Parameters: * None diff --git a/arch/arm/src/imx6/imx_cpuboot.c b/arch/arm/src/imx6/imx_cpuboot.c index db087575797..4b288b17d1c 100644 --- a/arch/arm/src/imx6/imx_cpuboot.c +++ b/arch/arm/src/imx6/imx_cpuboot.c @@ -52,9 +52,7 @@ #include "sctlr.h" #include "smp.h" #include "fpu.h" -#include "scu.h" #include "gic.h" -#include "cp15_cacheops.h" #ifdef CONFIG_SMP @@ -267,12 +265,6 @@ void arm_cpu_boot(int cpu) arm_fpuconfig(); #endif -#ifdef CONFIG_SMP - /* Enable SMP cache coherency for CPU0 */ - - arm_enable_smp(cpu); -#endif - /* Initialize the Generic Interrupt Controller (GIC) for CPUn (n != 0) */ arm_gic_initialize(); @@ -304,10 +296,6 @@ void arm_cpu_boot(int cpu) (void)up_irq_enable(); #endif - /* Invalidate CPUn L1 so that is will be reloaded from coherent L2. */ - - cp15_invalidate_dcache_all(); - /* The next thing that we expect to happen is for logic running on CPU0 * to call up_cpu_start() which generate an SGI and a context switch to * the configured NuttX IDLE task. diff --git a/arch/arm/src/imx6/imx_irq.c b/arch/arm/src/imx6/imx_irq.c index ecf2f197ac8..e00a8e9527d 100644 --- a/arch/arm/src/imx6/imx_irq.c +++ b/arch/arm/src/imx6/imx_irq.c @@ -45,7 +45,6 @@ #include "up_internal.h" #include "sctlr.h" -#include "scu.h" #include "gic.h" /**************************************************************************** @@ -109,12 +108,6 @@ void up_irqinitialize(void) arm_gic0_initialize(); /* Initialization unique to CPU0 */ arm_gic_initialize(); /* Initialization common to all CPUs */ -#ifdef CONFIG_SMP - /* Enable SMP cache coherency for CPU0 */ - - arm_enable_smp(0); -#endif - #ifdef CONFIG_ARCH_LOWVECTORS /* If CONFIG_ARCH_LOWVECTORS is defined, then the vectors located at the * beginning of the .text region must appear at address at the address diff --git a/configs/sabre-6quad/README.txt b/configs/sabre-6quad/README.txt index 3a7988d0296..13584158a57 100644 --- a/configs/sabre-6quad/README.txt +++ b/configs/sabre-6quad/README.txt @@ -104,13 +104,13 @@ Status 2016-11-26: With regard to SMP, the major issue is cache coherency. I added some special build logic to move spinlock data into the separate, non- cached section. That gives an improvement in performance but there are - still hangs. These, I have determined are to other kinds of cache + still hangs. These, I have determined, are to other kinds of cache coherency problems. Semaphores, message queues, etc. basically all - shared data must be made coherent. I am not sure how to do that. See - the SMP section below for more information. + shared data must be made coherent. I also added some SCU controls that should enable cache consistency for SMP - CPUs, but I don't think I have that working right yet. + CPUs, but I don't think I have that working right yet. See the SMP section + below for more information. Platform Features ================= @@ -511,22 +511,13 @@ Open Issues: This will cause the interrupt handlers on other CPUs to spin until leave_critical_section() is called. More verification is needed. -2. Cache Concurency. Cache coherency in SMP configurations is managed by the the - CPU. I don't think I have the set up correctly yet. +2. Cache Concurency. Cache coherency in SMP configurations is managed by the + MPCore snoop control unit (SCU). But I don't think I have the set up + correctly yet. Currently cache inconsistencies appear to be the root cause of all current SMP issues. - 2016-11-26: With regard to SMP, the major issue is cache coherency. I added - some special build logic to move spinlock data into the separate, non- - cached section. That gives an improvement in performance but there are - still hangs. These, I have determined are to other kinds of cache - coherency problems. Semaphores, message queues, etc. basically all - shared data must be made coherent. - - I also added some SCU controls that should enable cache consistency for SMP - CPUs, but I don't think I have that working right yet. - Configurations ============== diff --git a/configs/sabre-6quad/src/imx_boardinit.c b/configs/sabre-6quad/src/imx_boardinit.c index f67592caf87..80dbc589a1b 100644 --- a/configs/sabre-6quad/src/imx_boardinit.c +++ b/configs/sabre-6quad/src/imx_boardinit.c @@ -62,14 +62,40 @@ * Public Functions ****************************************************************************/ +/**************************************************************************** + * Name: imx_memory_initialize + * + * Description: + * All i.MX6 architectures must provide the following entry point. This + * entry point is called early in the initialization before memory has + * been configured. This board-specific function is responsible for + * configuring any on-board memories. + * + * Logic in imx_memory_initialize must be careful to avoid using any + * global variables because those will be uninitialized at the time this + * function is called. + * + * Input Parameters: + * None + * + * Returned Value: + * None + * + ****************************************************************************/ + +void imx_memory_initialize(void) +{ + /* SDRAM was initialized by a bootloader in the supported configurations. */ +} + /**************************************************************************** * Name: imx_board_initialize * * Description: * All i.MX6 architectures must provide the following entry point. This - * entry point is called early in the initialization -- after all memory - * has been configured and mapped but before any devices have been - * initialized. + * entry point is called in the initialization phase -- after + * imx_memory_initialize and after all memory has been configured and + * mapped but before any devices have been initialized. * * Input Parameters: * None