summaryrefslogtreecommitdiff
path: root/tools/proxyclient/experiments/cpu_pstate_latencies.py
diff options
context:
space:
mode:
Diffstat (limited to 'tools/proxyclient/experiments/cpu_pstate_latencies.py')
-rwxr-xr-xtools/proxyclient/experiments/cpu_pstate_latencies.py239
1 files changed, 239 insertions, 0 deletions
diff --git a/tools/proxyclient/experiments/cpu_pstate_latencies.py b/tools/proxyclient/experiments/cpu_pstate_latencies.py
new file mode 100755
index 0000000..0acfc28
--- /dev/null
+++ b/tools/proxyclient/experiments/cpu_pstate_latencies.py
@@ -0,0 +1,239 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+import sys, pathlib, time
+sys.path.append(str(pathlib.Path(__file__).resolve().parents[1]))
+
+from m1n1.setup import *
+from m1n1 import asm
+
+p.smp_start_secondaries()
+
+tfreq = u.mrs(CNTFRQ_EL0)
+
+TEST_CPUS = [1, 4]
+
+CLUSTER_PSTATE = 0x20020
+CLUSTER_STATUS = 0x20050
+
+if u.adt["/chosen"].chip_id == 0x8103:
+ CREG = [
+ 0x210e00000,
+ 0x211e00000,
+ ]
+
+ MAX_PSTATE = [5, 15]
+
+elif u.adt["/chosen"].chip_id == 0x8112:
+ CREG = [
+ 0x210e00000,
+ 0x211e00000,
+ ]
+
+ MAX_PSTATE = [7, 17]
+
+code = u.malloc(0x1000)
+
+util = asm.ARMAsm(f"""
+bench:
+ mrs x1, CNTPCT_EL0
+1:
+ sub x0, x0, #1
+ cbnz x0, 1b
+
+ mrs x2, CNTPCT_EL0
+ sub x0, x2, x1
+ ret
+
+signal_and_write:
+ sev
+ mrs x2, CNTPCT_EL0
+ add x2, x2, #0x800
+1:
+ mrs x3, CNTPCT_EL0
+ sub x4, x3, x2
+ cbnz x4, 1b
+ str x1, [x0]
+ mov x0, x3
+ ret
+
+timelog:
+ mrs x2, s3_1_c15_c0_0 /* SYS_IMP_APL_PMCR0 */
+ orr x2, x2, #1
+ msr s3_1_c15_c0_0, x2
+ mov x2, #0xffffffffffffffff
+ msr s3_1_c15_c1_0, x2
+ isb
+ wfe
+1:
+ mrs x2, CNTPCT_EL0
+ mrs x3, s3_2_c15_c0_0
+ isb
+ stp x2, x3, [x0], #16
+ mov x4, #0x40
+2:
+ sub x4, x4, #1
+ cbnz x4, 2b
+ sub x1, x1, #1
+ cbnz x1, 1b
+
+ ret
+""", code)
+iface.writemem(code, util.data)
+p.dc_cvau(code, len(util.data))
+p.ic_ivau(code, len(util.data))
+
+def bench_cpu(idx, loops=10000000):
+ if idx == 0:
+ elapsed = p.call(util.bench, loops) / tfreq
+ else:
+ elapsed = p.smp_call_sync(idx, util.bench, loops) / tfreq
+ if elapsed == 0:
+ return 0
+ mhz = (loops / elapsed) / 1000000
+ return mhz
+
+def set_pstate(cluster, pstate):
+ p.mask64(CREG[cluster] + CLUSTER_PSTATE, 0x1f01f, (1<<25) | pstate | (pstate << 12))
+
+print()
+
+LOG_ITERS = 10000
+logbuf = u.malloc(LOG_ITERS * 16)
+
+def bench_latency(cluster, cpu, from_pstate, to_pstate, verbose=False):
+ set_pstate(cluster, from_pstate)
+ bench_cpu(cpu)
+
+ p.smp_call(cpu, util.timelog, logbuf, LOG_ITERS)
+ psreg = (p.read64(CREG[cluster] + CLUSTER_PSTATE) & ~0x1f001f) | (1<<25) | to_pstate | (to_pstate << 12)
+ tval = p.call(util.signal_and_write, CREG[cluster] + CLUSTER_PSTATE, psreg)
+ p.smp_wait(cpu)
+
+ logdata = iface.readmem(logbuf, LOG_ITERS * 16)
+ lts, lcyc = None, None
+
+ log = []
+ for i in range(LOG_ITERS):
+ ts, cyc = struct.unpack("<QQ", logdata [i*16:i*16+16])
+ log.append((ts, cyc))
+
+ off = 256
+
+ ts_0, cyc_0 = log[off]
+ ts_e, cyc_e = log[-1]
+ f_init = None
+ f_end = None
+ lts, lcyc = ts_0, cyc_0
+
+ inc = to_pstate > from_pstate
+
+ blip = 0
+ cnt = dts_sum = 0
+ for i in range(off, len(log)):
+ ts, cyc = log[i]
+ dts = ts - lts
+ dcyc = cyc - lcyc
+
+ cnt += 1
+ dts_sum += dts
+
+ blip = max(blip, dts)
+
+ if f_init is None and ts > tval:
+ tidx = i
+ f_init = (lcyc - cyc_0) / (lts - ts_0) * tfreq / 1000000
+ dts_init = dts_sum / cnt
+ if f_end is None and ts > (tval + ts_e) / 2:
+ f_end = (cyc_e - cyc) / (ts_e - ts) * tfreq / 1000000
+ cnt = dts_sum = 0
+
+ #if lts is not None:
+ #print(f"{i}: {ts}: {cyc} ({ts-lts}: {cyc-lcyc})")
+ #else:
+ #print(f"{i}: {ts}: {cyc}")
+ lts, lcyc = ts, cyc
+
+ dts_end = dts_sum / cnt
+
+ window = 32
+
+ if verbose:
+ print(f"Triggered at {tval}")
+
+ thresh = 2/ (1/f_init + 1/f_end)
+
+ for i in range(tidx, LOG_ITERS - window - 1):
+ ts0, cyc0 = log[i - window]
+ ts1, cyc1 = log[i + window]
+ f = (cyc1 - cyc0) / (ts1 - ts0) * tfreq / 1000000
+ if inc and (f > thresh) or ((not inc) and f < thresh):
+ tts = log[i][0]
+ tidx = i
+ if verbose:
+ print(f"Frequency transition at #{i} {tts}")
+ break
+
+ if verbose:
+ print(f"Initial frequency: {f_init:.2f}")
+ print(f"Final frequency: {f_end:.2f}")
+ print(f"Threshold: {thresh:.2f}")
+
+ for i in range(max(window, tidx - 10 * window), tidx + 10 * window):
+ ts0, cyc0 = log[i - window]
+ ts1, cyc1 = log[i + window]
+ lts, lcyc = log[i - 1]
+ ts, cyc = log[i]
+ f = (cyc1 - cyc0) / (ts1 - ts0) * tfreq / 1000000
+ print(f"{i}: {ts}: {cyc} ({ts-lts}: {cyc-lcyc}): {f:.2f}")
+
+ blip -= min(dts_init, dts_end)
+
+ return (tts - tval) / tfreq * 1000000000, blip / tfreq * 1000000000
+
+for cluster, creg in enumerate(CREG):
+ cpu = TEST_CPUS[cluster]
+
+ freqs = []
+
+ print(f"#### Cluster {cluster} ####")
+ print(" P-States:")
+ print(" ", end="")
+ for pstate in range(MAX_PSTATE[cluster] + 1):
+ set_pstate(cluster, pstate)
+ freq = int(round(bench_cpu(cpu)))
+ freqs.append(freq)
+ print(f"{pstate}:{freq}MHz", end=" ")
+ print()
+ print()
+
+ print(" To-> |", end="")
+ for to_pstate in range(1, MAX_PSTATE[cluster] + 1):
+ print(f" {freqs[to_pstate]:7d} |", end="")
+ print()
+ print(" From |", end="")
+ for to_pstate in range(1, MAX_PSTATE[cluster] + 1):
+ print(f"---------+", end="")
+ print()
+
+ maxblip = 0
+
+ for from_pstate in range(1, MAX_PSTATE[cluster] + 1):
+ print(f" {freqs[from_pstate]:4d} |", end="")
+ for to_pstate in range(1, MAX_PSTATE[cluster] + 1):
+ if from_pstate == to_pstate:
+ print(f" ******* |", end="")
+ continue
+ lat, blip = bench_latency(cluster, cpu, from_pstate, to_pstate)
+ print(f" {lat:7.0f} |", end="")
+ maxblip = max(maxblip, blip)
+ print()
+
+ print()
+ print(f"Maximum execution latency spike: {maxblip:.0f} ns")
+ print()
+
+print()
+
+#bench_latency(1, TEST_CPUS[1], 15, 14, True)
+
+