arm-trusted-firmware/fdts/tc2.dts
Boyan Karatotev 83ec7e452c perf(amu): greatly simplify AMU context management
The current code is incredibly resilient to updates to the spec and
has worked quite well so far. However, recent implementations expose a
weakness in that this is rather slow. A large part of it is written in
assembly, making it opaque to the compiler for optimisations. The
future proofness requires reading registers that are effectively
`volatile`, making it even harder for the compiler, as well as adding
lots of implicit barriers, making it hard for the microarchitecutre to
optimise as well.

We can make a few assumptions, checked by a few well placed asserts, and
remove a lot of this burden. For a start, at the moment there are 4
group 0 counters with static assignments. Contexting them is a trivial
affair that doesn't need a loop. Similarly, there can only be up to 16
group 1 counters. Contexting them is a bit harder, but we can do with a
single branch with a falling through switch. If/when both of these
change, we have a pair of asserts and the feature detection mechanism to
guard us against pretending that we support something we don't.

We can drop contexting of the offset registers. They are fully
accessible by EL2 and as such are its responsibility to preserve on
powerdown.

Another small thing we can do, is pass the core_pos into the hook.
The caller already knows which core we're running on, we don't need to
call this non-trivial function again.

Finally, knowing this, we don't really need the auxiliary AMUs to be
described by the device tree. Linux doesn't care at the moment, and any
information we need for EL3 can be neatly placed in a simple array.

All of this, combined with lifting the actual saving out of assembly,
reduces the instructions to save the context from 180 to 40, including a
lot fewer branches. The code is also much shorter and easier to read.

Also propagate to aarch32 so that the two don't diverge too much.

Change-Id: Ib62e6e9ba5be7fb9fb8965c8eee148d5598a5361
Signed-off-by: Boyan Karatotev <boyan.karatotev@arm.com>
2025-02-25 08:50:46 +00:00

287 lines
6.1 KiB
Text

/*
* Copyright (c) 2020-2024, Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
/dts-v1/;
#include <dt-bindings/interrupt-controller/arm-gic.h>
#include <dt-bindings/interrupt-controller/irq.h>
#include <platform_def.h>
#if TARGET_FLAVOUR_FVP
#define LIT_CAPACITY 406
#define MID_CAPACITY 912
#else /* TARGET_FLAVOUR_FPGA */
#define LIT_CAPACITY 280
#define MID_CAPACITY 775
/* this is an area optimized configuration of the big core */
#define BIG2_CAPACITY 930
#endif /* TARGET_FLAVOUR_FPGA */
#define BIG_CAPACITY 1024
#define MHU_TX_ADDR 45000000 /* hex */
#define MHU_TX_COMPAT "arm,mhuv2-tx","arm,primecell"
#define MHU_TX_INT_NAME "mhu_tx"
#define MHU_RX_ADDR 45010000 /* hex */
#define MHU_RX_COMPAT "arm,mhuv2-rx","arm,primecell"
#define MHU_OFFSET 0x1000
#define MHU_MBOX_CELLS 2
#define MHU_RX_INT_NUM 317
#define MHU_RX_INT_NAME "mhu_rx"
#define LIT_CPU_PMU_COMPATIBLE "arm,cortex-a520-pmu"
#define MID_CPU_PMU_COMPATIBLE "arm,cortex-a720-pmu"
#define BIG_CPU_PMU_COMPATIBLE "arm,cortex-x4-pmu"
#define DSU_MPAM_ADDR 0x1 0x00010000 /* 0x1_0001_0000 */
#define DPU_ADDR 2cc00000
#define DPU_IRQ 69
#define ETHERNET_ADDR 18000000
#define ETHERNET_INT 109
#define SYS_REGS_ADDR 1c010000
#define MMC_ADDR 1c050000
#define MMC_INT_0 107
#define MMC_INT_1 108
#define RTC_ADDR 1c170000
#define RTC_INT 100
#define KMI_0_ADDR 1c060000
#define KMI_0_INT 197
#define KMI_1_ADDR 1c070000
#define KMI_1_INT 103
#define VIRTIO_BLOCK_ADDR 1c130000
#define VIRTIO_BLOCK_INT 204
#include "tc-common.dtsi"
#if TARGET_FLAVOUR_FVP
#include "tc-fvp.dtsi"
#else
#include "tc-fpga.dtsi"
#endif /* TARGET_FLAVOUR_FVP */
#include "tc-base.dtsi"
/ {
cpus {
#if TARGET_FLAVOUR_FPGA
cpu-map {
cluster0 {
core8 {
cpu = <&CPU8>;
};
core9 {
cpu = <&CPU9>;
};
core10 {
cpu = <&CPU10>;
};
core11 {
cpu = <&CPU11>;
};
core12 {
cpu = <&CPU12>;
};
core13 {
cpu = <&CPU13>;
};
};
};
#endif
CPU2:cpu@200 {
clocks = <&scmi_dvfs 0>;
capacity-dmips-mhz = <LIT_CAPACITY>;
};
CPU3:cpu@300 {
clocks = <&scmi_dvfs 0>;
capacity-dmips-mhz = <LIT_CAPACITY>;
};
CPU6:cpu@600 {
clocks = <&scmi_dvfs 1>;
capacity-dmips-mhz = <MID_CAPACITY>;
};
CPU7:cpu@700 {
clocks = <&scmi_dvfs 1>;
capacity-dmips-mhz = <MID_CAPACITY>;
};
#if TARGET_FLAVOUR_FPGA
CPU8:cpu@800 {
device_type = "cpu";
compatible = "arm,armv8";
reg = <0x800>;
enable-method = "psci";
clocks = <&scmi_dvfs 1>;
capacity-dmips-mhz = <MID_CAPACITY>;
};
CPU9:cpu@900 {
device_type = "cpu";
compatible = "arm,armv8";
reg = <0x900>;
enable-method = "psci";
clocks = <&scmi_dvfs 2>;
capacity-dmips-mhz = <BIG2_CAPACITY>;
};
CPU10:cpu@A00 {
device_type = "cpu";
compatible = "arm,armv8";
reg = <0xA00>;
enable-method = "psci";
clocks = <&scmi_dvfs 2>;
capacity-dmips-mhz = <BIG2_CAPACITY>;
};
CPU11:cpu@B00 {
device_type = "cpu";
compatible = "arm,armv8";
reg = <0xB00>;
enable-method = "psci";
clocks = <&scmi_dvfs 2>;
capacity-dmips-mhz = <BIG2_CAPACITY>;
};
CPU12:cpu@C00 {
device_type = "cpu";
compatible = "arm,armv8";
reg = <0xC00>;
enable-method = "psci";
clocks = <&scmi_dvfs 3>;
capacity-dmips-mhz = <BIG_CAPACITY>;
};
CPU13:cpu@D00 {
device_type = "cpu";
compatible = "arm,armv8";
reg = <0xD00>;
enable-method = "psci";
clocks = <&scmi_dvfs 3>;
capacity-dmips-mhz = <BIG_CAPACITY>;
};
#endif
};
#if TARGET_FLAVOUR_FPGA
ete8 {
compatible = "arm,embedded-trace-extension";
cpu = <&CPU8>;
};
ete9 {
compatible = "arm,embedded-trace-extension";
cpu = <&CPU9>;
};
ete10 {
compatible = "arm,embedded-trace-extension";
cpu = <&CPU10>;
};
ete11 {
compatible = "arm,embedded-trace-extension";
cpu = <&CPU11>;
};
ete12 {
compatible = "arm,embedded-trace-extension";
cpu = <&CPU12>;
};
ete13 {
compatible = "arm,embedded-trace-extension";
cpu = <&CPU13>;
};
#endif /* TARGET_FLAVOUR_FPGA */
cmn-pmu {
compatible = "arm,ci-700";
reg = <0x0 0x50000000 0x0 0x10000000>;
interrupts = <GIC_SPI 460 IRQ_TYPE_LEVEL_HIGH 0>;
};
mbox_db_rx: mhu@MHU_RX_ADDR {
arm,mhuv2-protocols = <0 1>;
};
mbox_db_tx: mhu@MHU_TX_ADDR {
arm,mhuv2-protocols = <0 1>;
};
firmware {
/*
* TC2 does not have a P2A channel, but wiring one was needed to make Linux work
* (by chance). At the time the SCMI driver did not support bidirectional
* mailboxes so as a workaround, the A2P channel was wired for TX communication
* and the synchronous replies would be read asyncrhonously as if coming from
* the P2A channel, while being the actual A2P channel.
*
* This will not work with kernels > 5.15, but keep it around to keep TC2
* working with its target kernel. Newer kernels will still work, but SCMI
* won't as they check that the two regions are distinct.
*/
scmi {
mboxes = <&mbox_db_tx 0 0 &mbox_db_rx 0 0>;
shmem = <&cpu_scp_scmi_a2p &cpu_scp_scmi_a2p>;
};
};
gic: interrupt-controller@GIC_CTRL_ADDR {
ppi-partitions {
ppi_partition_little: interrupt-partition-0 {
affinity = <&CPU0>, <&CPU1>, <&CPU2>, <&CPU3>;
};
#if TARGET_FLAVOUR_FVP
ppi_partition_mid: interrupt-partition-1 {
affinity = <&CPU4>, <&CPU5>, <&CPU6>;
};
ppi_partition_big: interrupt-partition-2 {
affinity = <&CPU7>;
};
#elif TARGET_FLAVOUR_FPGA
ppi_partition_mid: interrupt-partition-1 {
affinity = <&CPU4>, <&CPU5>, <&CPU6>, <&CPU7>, <&CPU8>;
};
ppi_partition_big: interrupt-partition-2 {
affinity = <&CPU9>, <&CPU10>, <&CPU11>, <&CPU12>, <&CPU13>;
};
#endif
};
};
spe-pmu-big {
status = "okay";
};
smmu_700: iommu@3f000000 {
status = "okay";
};
dp0: display@DPU_ADDR {
#if TC_SCMI_PD_CTRL_EN
power-domains = <&scmi_devpd (PLAT_MAX_CPUS_PER_CLUSTER + 2)>;
#endif
iommus = <&smmu_700 0x100>;
};
gpu: gpu@2d000000 {
interrupts = <GIC_SPI 66 IRQ_TYPE_LEVEL_HIGH 0>,
<GIC_SPI 67 IRQ_TYPE_LEVEL_HIGH 0>,
<GIC_SPI 65 IRQ_TYPE_LEVEL_HIGH 0>;
interrupt-names = "JOB", "MMU", "GPU";
iommus = <&smmu_700 0x200>;
};
};