summaryrefslogtreecommitdiff
path: root/tools/src/kboot_gpu.c
diff options
context:
space:
mode:
Diffstat (limited to 'tools/src/kboot_gpu.c')
-rw-r--r--tools/src/kboot_gpu.c452
1 files changed, 452 insertions, 0 deletions
diff --git a/tools/src/kboot_gpu.c b/tools/src/kboot_gpu.c
new file mode 100644
index 0000000..54e6d03
--- /dev/null
+++ b/tools/src/kboot_gpu.c
@@ -0,0 +1,452 @@
+/* SPDX-License-Identifier: MIT */
+
+#include "kboot.h"
+#include "adt.h"
+#include "assert.h"
+#include "firmware.h"
+#include "math.h"
+#include "pmgr.h"
+#include "soc.h"
+#include "utils.h"
+
+#include "libfdt/libfdt.h"
+
+#define bail(...) \
+ do { \
+ printf(__VA_ARGS__); \
+ return -1; \
+ } while (0)
+
+#define MAX_PSTATES 16
+#define MAX_CLUSTERS 8
+
+struct perf_state {
+ u32 freq;
+ u32 volt;
+};
+
+static int get_core_counts(u32 *count, u32 nclusters, u32 ncores)
+{
+ u64 base;
+ pmgr_adt_power_enable("/arm-io/sgx");
+
+ int adt_sgx_path[8];
+ if (adt_path_offset_trace(adt, "/arm-io/sgx", adt_sgx_path) < 0)
+ bail("ADT: GPU: Failed to get sgx\n");
+
+ if (adt_get_reg(adt, adt_sgx_path, "reg", 0, &base, NULL) < 0)
+ bail("ADT: GPU: Failed to get sgx reg 0\n");
+
+ u32 cores_lo = read32(base + 0xd01500);
+ u32 cores_hi = read32(base + 0xd01514);
+
+ u64 cores = (((u64)cores_hi) << 32) | cores_lo;
+
+ for (u32 i = 0; i < nclusters; i++) {
+ count[i] = __builtin_popcount(cores & MASK(ncores));
+ cores >>= ncores;
+ }
+
+ return 0;
+}
+
+static void adjust_leakage(float *val, u32 clusters, u32 *cores, u32 max, float uncore_fraction)
+{
+ for (u32 i = 0; i < clusters; i++) {
+ float uncore = val[i] * uncore_fraction;
+ float core = val[i] - uncore;
+
+ val[i] = uncore + (cores[i] / (float)max) * core;
+ }
+}
+
+static void load_fuses(float *out, u32 count, u64 base, u32 start, u32 width, float scale,
+ float offset, bool flip)
+{
+ for (u32 i = 0; i < count; i++) {
+ base += (start / 32) * 4;
+ start &= 31;
+
+ u32 low = read32(base);
+ u32 high = read32(base + 4);
+ u32 val = (((((u64)high) << 32) | low) >> start) & MASK(width);
+
+ float fval = (float)val * scale + offset;
+
+ if (flip)
+ out[count - i - 1] = fval;
+ else
+ out[i] = fval;
+
+ start += width;
+ }
+}
+
+static u32 t8103_pwr_scale[] = {0, 63, 80, 108, 150, 198, 210};
+
+static int calc_power_t8103(u32 count, u32 table_count, const struct perf_state *core,
+ const struct perf_state *sram, u32 *max_pwr, float *core_leak,
+ float *sram_leak)
+{
+ UNUSED(sram);
+ UNUSED(core_leak);
+ UNUSED(sram_leak);
+ u32 *pwr_scale;
+ u32 pwr_scale_count;
+ u32 core_count;
+ u32 max_cores;
+
+ switch (chip_id) {
+ case T8103:
+ pwr_scale = t8103_pwr_scale;
+ pwr_scale_count = ARRAY_SIZE(t8103_pwr_scale);
+ max_cores = 8;
+ break;
+ default:
+ bail("ADT: GPU: Unsupported chip\n");
+ }
+
+ if (get_core_counts(&core_count, 1, max_cores))
+ return -1;
+
+ if (table_count != 1)
+ bail("ADT: GPU: expected 1 perf state table but got %d\n", table_count);
+
+ if (count != pwr_scale_count)
+ bail("ADT: GPU: expected %d perf states but got %d\n", pwr_scale_count, count);
+
+ for (u32 i = 0; i < pwr_scale_count; i++)
+ max_pwr[i] = (u32)core[i].volt * (u32)pwr_scale[i] * 100;
+
+ core_leak[0] = 1000.0;
+ sram_leak[0] = 45.0;
+
+ adjust_leakage(core_leak, 1, &core_count, max_cores, 0.12);
+ adjust_leakage(sram_leak, 1, &core_count, max_cores, 0.2);
+
+ return 0;
+}
+
+static int calc_power_t600x(u32 count, u32 table_count, const struct perf_state *core,
+ const struct perf_state *sram, u32 *max_pwr, float *core_leak,
+ float *sram_leak)
+{
+ float s_sram, k_sram, s_core, k_core;
+ float dk_core, dk_sram;
+ float imax = 1000;
+
+ u32 nclusters = 0;
+ u32 ncores = 0;
+ u32 core_count[MAX_CLUSTERS];
+
+ bool simple_exps = false;
+ bool adjust_leakages = true;
+
+ switch (chip_id) {
+ case T6002:
+ nclusters += 4;
+ load_fuses(core_leak + 4, 4, 0x22922bc1b8, 25, 13, 2, 2, true);
+ load_fuses(sram_leak + 4, 4, 0x22922bc1cc, 4, 9, 1, 1, true);
+ // fallthrough
+ case T6001:
+ nclusters += 2;
+ case T6000:
+ nclusters += 2;
+ load_fuses(core_leak + 0, min(4, nclusters), 0x2922bc1b8, 25, 13, 2, 2, false);
+ load_fuses(sram_leak + 0, min(4, nclusters), 0x2922bc1cc, 4, 9, 1, 1, false);
+
+ s_sram = 4.3547606;
+ k_sram = 0.024927923;
+ // macOS difference: macOS uses a misbehaved piecewise function here
+ // Since it's obviously wrong, let's just use only the first component
+ s_core = 1.48461742;
+ k_core = 0.39013552;
+ dk_core = 8.558;
+ dk_sram = 0.05;
+
+ ncores = 8;
+ adjust_leakages = true;
+ imax = 26.0;
+ break;
+ case T8112:
+ nclusters = 1;
+ load_fuses(core_leak, 1, 0x23d2c84dc, 30, 13, 2, 2, false);
+ load_fuses(sram_leak, 1, 0x23d2c84b0, 15, 9, 1, 1, false);
+
+ s_sram = 3.61619841;
+ k_sram = 0.0529281;
+ // macOS difference: macOS uses a misbehaved piecewise function here
+ // Since it's obviously wrong, let's just use only the first component
+ s_core = 1.21356187;
+ k_core = 0.43328839;
+ dk_core = 9.83196;
+ dk_sram = 0.07828;
+
+ simple_exps = true;
+ ncores = 10;
+ adjust_leakages = false; // pre-adjusted?
+ imax = 24.0;
+ break;
+ }
+
+ if (get_core_counts(core_count, nclusters, ncores))
+ return -1;
+
+ printf("FDT: GPU: Core counts: ");
+ for (u32 i = 0; i < nclusters; i++) {
+ printf("%d ", core_count[i]);
+ }
+ printf("\n");
+
+ if (adjust_leakages) {
+ adjust_leakage(core_leak, nclusters, core_count, ncores, 0.0825);
+ adjust_leakage(sram_leak, nclusters, core_count, ncores, 0.2247);
+ }
+
+ if (table_count != nclusters)
+ bail("ADT: GPU: expected %d perf state tables but got %d\n", nclusters, table_count);
+
+ max_pwr[0] = 0;
+
+ for (u32 i = 1; i < count; i++) {
+ u32 total_mw = 0;
+
+ for (u32 j = 0; j < nclusters; j++) {
+ // macOS difference: macOS truncates Hz to integer MHz before doing this math.
+ // That's probably wrong, so let's not do that.
+
+ float mw = 0;
+ size_t idx = j * count + i;
+
+ mw += sram[idx].volt / 1000.f * sram_leak[j] * k_sram *
+ expf(sram[idx].volt / 1000.f * s_sram);
+ mw += core[idx].volt / 1000.f * core_leak[j] * k_core *
+ expf(core[idx].volt / 1000.f * s_core);
+
+ float sbase = sram[idx].volt / 750.f;
+ float sram_v_p;
+ if (simple_exps)
+ sram_v_p = sbase * sbase; // v ^ 2
+ else
+ sram_v_p = sbase * sbase * sbase; // v ^ 3
+ mw += dk_sram * (sram[idx].freq / 1000000.f) * sram_v_p;
+
+ float cbase = core[idx].volt / 750.f;
+ float core_v_p;
+ if (simple_exps || core[idx].volt < 750)
+ core_v_p = cbase * cbase; // v ^ 2
+ else
+ core_v_p = cbase * cbase * cbase; // v ^ 3
+ mw += dk_core * (core[idx].freq / 1000000.f) * core_v_p;
+
+ if (mw > imax * core[idx].volt)
+ mw = imax * core[idx].volt;
+
+ total_mw += mw;
+ }
+
+ max_pwr[i] = total_mw * 1000;
+ }
+
+ return 0;
+}
+
+static int dt_set_region(void *dt, int sgx, const char *name, const char *path)
+{
+ u64 base, size;
+ char prop[64];
+
+ snprintf(prop, sizeof(prop), "%s-base", name);
+ if (ADT_GETPROP(adt, sgx, prop, &base) < 0 || !base)
+ bail("ADT: GPU: failed to find %s property\n", prop);
+
+ snprintf(prop, sizeof(prop), "%s-size", name);
+ if (ADT_GETPROP(adt, sgx, prop, &size) < 0 || !base)
+ bail("ADT: GPU: failed to find %s property\n", prop);
+
+ int node = fdt_path_offset(dt, path);
+ if (node < 0)
+ bail("FDT: GPU: failed to find %s node\n", path);
+
+ fdt64_t reg[2];
+
+ fdt64_st(&reg[0], base);
+ fdt64_st(&reg[1], size);
+
+ if (fdt_setprop_inplace(dt, node, "reg", reg, sizeof(reg)))
+ bail("FDT: GPU: failed to set reg prop for %s\n", path);
+
+ return 0;
+}
+
+int fdt_set_float_array(void *dt, int node, const char *name, float *val, int count)
+{
+ fdt32_t data[MAX_CLUSTERS];
+
+ if (count > MAX_CLUSTERS)
+ bail("FDT: GPU: fdt_set_float_array() with too many values\n");
+
+ memcpy(data, val, sizeof(float) * count);
+ for (int i = 0; i < count; i++) {
+ data[i] = cpu_to_fdt32(data[i]);
+ }
+
+ if (fdt_setprop_inplace(dt, node, name, data, sizeof(u32) * count))
+ bail("FDT: GPU: Failed to set %s\n", name);
+
+ return 0;
+}
+
+int dt_set_gpu(void *dt)
+{
+ int (*calc_power)(u32 count, u32 table_count, const struct perf_state *perf,
+ const struct perf_state *sram, u32 *max_pwr, float *core_leak,
+ float *sram_leak);
+
+ printf("FDT: GPU: Initializing GPU info\n");
+
+ switch (chip_id) {
+ case T8103:
+ calc_power = calc_power_t8103;
+ break;
+ case T6000:
+ case T6001:
+ case T6002:
+ case T8112:
+ calc_power = calc_power_t600x;
+ break;
+ default:
+ printf("ADT: GPU: unsupported chip!\n");
+ return 0;
+ }
+
+ int gpu = fdt_path_offset(dt, "gpu");
+ if (gpu < 0) {
+ printf("FDT: GPU: gpu alias not found in device tree\n");
+ return 0;
+ }
+
+ int len;
+ const fdt32_t *opps_ph = fdt_getprop(dt, gpu, "operating-points-v2", &len);
+ if (!opps_ph || len != 4)
+ bail("FDT: GPU: operating-points-v2 not found\n");
+
+ int opps = fdt_node_offset_by_phandle(dt, fdt32_ld(opps_ph));
+ if (opps < 0)
+ bail("FDT: GPU: node for phandle %u not found\n", fdt32_ld(opps_ph));
+
+ int sgx = adt_path_offset(adt, "/arm-io/sgx");
+ if (sgx < 0)
+ bail("ADT: GPU: /arm-io/sgx node not found\n");
+
+ u32 perf_state_count;
+ if (ADT_GETPROP(adt, sgx, "perf-state-count", &perf_state_count) < 0 || !perf_state_count)
+ bail("ADT: GPU: missing perf-state-count\n");
+
+ u32 perf_state_table_count;
+ if (ADT_GETPROP(adt, sgx, "perf-state-table-count", &perf_state_table_count) < 0 ||
+ !perf_state_table_count)
+ bail("ADT: GPU: missing perf-state-table-count\n");
+
+ if (perf_state_count > MAX_PSTATES)
+ bail("ADT: GPU: perf-state-count too large\n");
+
+ if (perf_state_table_count > MAX_CLUSTERS)
+ bail("ADT: GPU: perf-state-table-count too large\n");
+
+ u32 perf_states_len;
+ const struct perf_state *perf_states, *perf_states_sram;
+
+ perf_states = adt_getprop(adt, sgx, "perf-states", &perf_states_len);
+ if (!perf_states ||
+ perf_states_len != sizeof(*perf_states) * perf_state_count * perf_state_table_count)
+ bail("ADT: GPU: invalid perf-states length\n");
+
+ perf_states_sram = adt_getprop(adt, sgx, "perf-states-sram", &perf_states_len);
+ if (perf_states_sram &&
+ perf_states_len != sizeof(*perf_states) * perf_state_count * perf_state_table_count)
+ bail("ADT: GPU: invalid perf-states-sram length\n");
+
+ u32 max_pwr[MAX_PSTATES];
+ float core_leak[MAX_CLUSTERS];
+ float sram_leak[MAX_CLUSTERS];
+
+ if (calc_power(perf_state_count, perf_state_table_count, perf_states, perf_states_sram, max_pwr,
+ core_leak, sram_leak))
+ return -1;
+
+ printf("FDT: GPU: Max power table: ");
+ for (u32 i = 0; i < perf_state_count; i++) {
+ printf("%d ", max_pwr[i]);
+ }
+ printf("\nFDT: GPU: Core leakage table: ");
+ for (u32 i = 0; i < perf_state_table_count; i++) {
+ printf("%d.%03d ", (int)core_leak[i], ((int)(core_leak[i] * 1000) % 1000));
+ }
+ printf("\nFDT: GPU: SRAM leakage table: ");
+ for (u32 i = 0; i < perf_state_table_count; i++) {
+ printf("%d.%03d ", (int)sram_leak[i], ((int)(sram_leak[i] * 1000) % 1000));
+ }
+ printf("\n");
+
+ if (fdt_set_float_array(dt, gpu, "apple,core-leak-coef", core_leak, perf_state_table_count))
+ return -1;
+
+ if (fdt_set_float_array(dt, gpu, "apple,sram-leak-coef", sram_leak, perf_state_table_count))
+ return -1;
+
+ if (firmware_set_fdt(dt, gpu, "apple,firmware-version", &os_firmware))
+ return -1;
+
+ const struct fw_version_info *compat;
+
+ switch (os_firmware.version) {
+ case V12_3_1:
+ compat = &fw_versions[V12_3];
+ break;
+ default:
+ compat = &os_firmware;
+ break;
+ }
+
+ if (firmware_set_fdt(dt, gpu, "apple,firmware-compat", compat))
+ return -1;
+
+ u32 i = 0;
+ int opp;
+ fdt_for_each_subnode(opp, dt, opps)
+ {
+ fdt32_t volts[MAX_CLUSTERS];
+
+ for (u32 j = 0; j < perf_state_table_count; j++) {
+ volts[j] = cpu_to_fdt32(perf_states[i + j * perf_state_count].volt * 1000);
+ }
+
+ if (i >= perf_state_count)
+ bail("FDT: GPU: Expected %d operating points, but found more\n", perf_state_count);
+
+ if (fdt_setprop_inplace(dt, opp, "opp-microvolt", &volts,
+ sizeof(u32) * perf_state_table_count))
+ bail("FDT: GPU: Failed to set opp-microvolt for PS %d\n", i);
+
+ if (fdt_setprop_inplace_u64(dt, opp, "opp-hz", perf_states[i].freq))
+ bail("FDT: GPU: Failed to set opp-hz for PS %d\n", i);
+
+ if (fdt_setprop_inplace_u32(dt, opp, "opp-microwatt", max_pwr[i]))
+ bail("FDT: GPU: Failed to set opp-microwatt for PS %d\n", i);
+
+ i++;
+ }
+
+ if (i != perf_state_count)
+ bail("FDT: GPU: Expected %d operating points, but found %d\n", perf_state_count, i);
+
+ if (dt_set_region(dt, sgx, "gfx-handoff", "/reserved-memory/uat-handoff"))
+ return -1;
+ if (dt_set_region(dt, sgx, "gfx-shared-region", "/reserved-memory/uat-pagetables"))
+ return -1;
+ if (dt_set_region(dt, sgx, "gpu-region", "/reserved-memory/uat-ttbs"))
+ return -1;
+
+ return 0;
+}