Diffstat (limited to 'tools/proxyclient/m1n1/agx')
-rw-r--r--  tools/proxyclient/m1n1/agx/__init__.py  |  343
-rw-r--r--  tools/proxyclient/m1n1/agx/channels.py  |  178
-rw-r--r--  tools/proxyclient/m1n1/agx/context.py   |  247
-rw-r--r--  tools/proxyclient/m1n1/agx/event.py     |   58
-rw-r--r--  tools/proxyclient/m1n1/agx/initdata.py  |  387
-rw-r--r--  tools/proxyclient/m1n1/agx/object.py    |  263
-rw-r--r--  tools/proxyclient/m1n1/agx/render.py    | 1075
-rw-r--r--  tools/proxyclient/m1n1/agx/shim.py      |  244
-rw-r--r--  tools/proxyclient/m1n1/agx/uapi.py      |  116
9 files changed, 2911 insertions, 0 deletions
diff --git a/tools/proxyclient/m1n1/agx/__init__.py b/tools/proxyclient/m1n1/agx/__init__.py new file mode 100644 index 0000000..26368ce --- /dev/null +++ b/tools/proxyclient/m1n1/agx/__init__.py @@ -0,0 +1,343 @@ +# SPDX-License-Identifier: MIT +import bisect, time + +from .object import GPUObject, GPUAllocator +from .initdata import build_initdata +from .channels import * +from .event import GPUEventManager +from ..proxy import IODEV +from ..malloc import Heap +from ..hw.uat import UAT, MemoryAttr +from ..hw.agx import * +from ..fw.agx import AGXASC +from ..fw.agx.channels import ChannelInfoSet, ChannelInfo + +class AGXChannels: + pass + +class AGXQueue: + pass + +class AGX: + PAGE_SIZE = 0x4000 + MAX_EVENTS = 128 + + def __init__(self, u): + self.start_time = time.time() + self.u = u + self.p = u.proxy + + self.iface = u.iface + self.show_stats = False + + self.asc_dev = u.adt["/arm-io/gfx-asc"] + self.sgx_dev = u.adt["/arm-io/sgx"] + self.sgx = SGXRegs(u, self.sgx_dev.get_reg(0)[0]) + + self.log("Initializing allocations") + + self.aic_base = u.adt["/arm-io/aic"].get_reg(0)[0] + + self.all_objects = {} + self.tracked_objects = {} + + # Memory areas + self.fw_va_base = self.sgx_dev.rtkit_private_vm_region_base + self.fw_va_size = self.sgx_dev.rtkit_private_vm_region_size + self.kern_va_base = self.fw_va_base + self.fw_va_size + + # Set up UAT + self.uat = UAT(self.u.iface, self.u) + + # Allocator for RTKit/ASC objects + self.uat.allocator = Heap(self.kern_va_base + 0x80000000, + self.kern_va_base + 0x81000000, + self.PAGE_SIZE) + + self.asc = AGXASC(self.u, self.asc_dev.get_reg(0)[0], self, self.uat) + self.asc.verbose = 0 + self.asc.mgmt.verbose = 0 + + self.kobj = GPUAllocator(self, "kernel", + self.kern_va_base, 0x10000000, + AttrIndex=MemoryAttr.Shared, AP=1, guard_pages=4) + self.cmdbuf = GPUAllocator(self, "cmdbuf", + self.kern_va_base + 0x10000000, 0x10000000, + AttrIndex=MemoryAttr.Shared, AP=0, guard_pages=4) + self.kshared = GPUAllocator(self, "kshared", + self.kern_va_base + 0x20000000, 0x10000000, + AttrIndex=MemoryAttr.Shared, AP=1, guard_pages=4) + self.kshared2 = GPUAllocator(self, "kshared2", + self.kern_va_base + 0x30000000, 0x100000, + AttrIndex=MemoryAttr.Shared, AP=0, PXN=1, guard_pages=4) + + self.io_allocator = Heap(self.kern_va_base + 0x38000000, + self.kern_va_base + 0x40000000, + block=self.PAGE_SIZE) + + self.mon = None + self.event_mgr = GPUEventManager(self) + + self.p.iodev_set_usage(IODEV.FB, 0) + + self.initdata_hook = None + + # Early init, needed? 
+ self.poke_sgx() + + def poke_sgx(self): + self.sgx_base = self.sgx_dev.get_reg(0)[0] + self.p.read32(self.sgx_base + 0xd14000) + self.p.write32(self.sgx_base + 0xd14000, 0x70001) + + def find_object(self, addr, ctx=0): + all_objects = list(self.all_objects.items()) + all_objects.sort() + + idx = bisect.bisect_left(all_objects, ((ctx, addr + 1), "")) - 1 + if idx < 0 or idx >= len(all_objects): + return None, None + + (ctx, base), obj = all_objects[idx] + return base, obj + + def reg_object(self, obj, track=True): + self.all_objects[(obj._ctx, obj._addr)] = obj + if track: + if self.mon is not None: + obj.add_to_mon(self.mon) + self.tracked_objects[(obj._ctx, obj._addr)] = obj + + def unreg_object(self, obj): + del self.all_objects[(obj._ctx, obj._addr)] + if obj._addr in self.tracked_objects: + del self.tracked_objects[(obj._ctx, obj._addr)] + + def poll_objects(self): + for obj in self.tracked_objects.values(): + diff = obj.poll() + if diff is not None: + self.log(diff) + + def alloc_channels(self, cls, name, channel_id, count=1, ring_size=0x100, rx=False): + + # All channels have 0x100 items + item_count = ring_size + item_size = cls.item_size + ring_size = item_count * item_size + + self.log(f"Allocating {count} channel(s) for {name} ({item_count} * {item_size:#x} bytes each)") + + state_obj = self.kshared.new_buf(0x30 * count, f"Channel.{name}.state", track=False) + if rx: + ring_buf = self.kshared.new_buf(ring_size * count, f"Channel.{name}.ring", track=False) + else: + ring_buf = self.kobj.new_buf(ring_size * count, f"Channel.{name}.ring", track=False) + + info = ChannelInfo() + info.state_addr = state_obj._addr + info.ringbuffer_addr = ring_buf._addr + if name == "FWCtl": + self.fwctl_chinfo = info + else: + setattr(self.ch_info, name, info) + + return [cls(self, name + ("" if count == 1 else f"[{i}]"), channel_id, + state_obj._paddr + 0x30 * i, + ring_buf._paddr + ring_size * i, item_count) + for i in range(count)] + + def init_channels(self): + self.log("Initializing channels...") + self.ch_info = ChannelInfoSet() + self.ch = AGXChannels() + self.ch.queue = [] + + # Command queue submission channels + for index in range(4): + queue = AGXQueue() + self.ch.queue.append(queue) + for typeid, chtype in enumerate(("TA", "3D", "CL")): + name = f"{chtype}_{index}" + chan = self.alloc_channels(GPUCmdQueueChannel, name, + (index << 2) | typeid)[0] + setattr(queue, "q_" + chtype, chan) + + # Device control channel + self.ch.devctrl = self.alloc_channels(GPUDeviceControlChannel, "DevCtrl", 0x11)[0] + + # GPU -> CPU channels + self.ch.event = self.alloc_channels(GPUEventChannel, "Event", None, rx=True)[0] + self.ch.log = self.alloc_channels(GPULogChannel, "FWLog", None, 6, rx=True) + self.ch.ktrace = self.alloc_channels(GPUKTraceChannel, "KTrace", None, ring_size=0x200, rx=True)[0] + self.ch.stats = self.alloc_channels(GPUStatsChannel, "Stats", None, rx=True)[0] + + self.ch.fwctl = self.alloc_channels(GPUFWCtlChannel, "FWCtl", None, rx=False)[0] + + # For some reason, the FWLog channels have their rings in a different place... 
+ self.fwlog_ring = self.ch_info.FWLog.ringbuffer_addr + self.ch_info.FWLog.ringbuffer_addr = self.kshared.buf(0x150000, "FWLog_Dummy") + + def poll_channels(self): + for chan in self.ch.log: + chan.poll() + self.ch.ktrace.poll() + if self.show_stats: + self.ch.stats.poll() + self.ch.event.poll() + + def kick_firmware(self): + self.asc.db.doorbell(0x10) + + def show_irqs(self): + hw_state = self.aic_base + 0x4200 + irqs = [] + for irq in self.sgx_dev.interrupts: + v = int(bool((self.p.read32(hw_state + (irq // 32) * 4) & (1 << (irq % 32))))) + irqs.append(v) + self.log(f' SGX IRQ state: {irqs}') + + def timeout(self, msg): + if self.mon: + self.mon.poll() + self.poll_objects() + self.log(msg) + self.log(r' (\________/) ') + self.log(r' | | ') + self.log(r"'.| \ , / |.'") + self.log(r'--| / (( \ |--') + self.log(r".'| _-_- |'.") + self.log(r' |________| ') + self.log(r'') + self.log(r' Timeout nya~!!!!!') + self.log(r'') + self.log(f' Stamp index: {int(msg.stamp_index)}') + self.show_pending_stamps() + self.log(f' Fault info:') + self.log(self.initdata.regionC.fault_info) + + self.show_irqs() + self.check_fault() + self.recover() + + def faulted(self, msg): + if self.mon: + self.mon.poll() + self.poll_objects() + self.log(msg) + self.log(r' (\________/) ') + self.log(r' | | ') + self.log(r"'.| \ , / |.'") + self.log(r'--| / (( \ |--') + self.log(r".'| _-_- |'.") + self.log(r' |________| ') + self.log(r'') + self.log(r' Fault nya~!!!!!') + self.log(r'') + self.show_pending_stamps() + self.log(f' Fault info:') + self.log(self.initdata.regionC.fault_info) + + self.show_irqs() + self.check_fault() + self.recover() + + def show_pending_stamps(self): + self.initdata.regionC.pull() + self.log(f' Pending stamps:') + for i in self.initdata.regionC.pending_stamps: + if i.info or i.wait_value: + self.log(f" - #{i.info >> 3:3d}: {i.info & 0x7}/{i.wait_value:#x}") + i.info = 0 + i.wait_value = 0 + tmp = i.regmap() + tmp.info.val = 0 + tmp.wait_value.val = 0 + + #self.initdata.regionC.push() + + def check_fault(self): + fault_info = self.sgx.FAULT_INFO.reg + if fault_info.value == 0xacce5515abad1dea: + raise Exception("Got fault notification, but fault address is unreadable") + + self.log(f" Fault info: {fault_info}") + + if not fault_info.FAULTED: + return + + fault_addr = fault_info.ADDR + if fault_addr & 0x8000000000: + fault_addr |= 0xffffff8000000000 + base, obj = self.find_object(fault_addr) + info = "" + if obj is not None: + info = f" ({obj!s} + {fault_addr - base:#x})" + self.log(f" GPU fault at {fault_addr:#x}{info}") + self.log(f" Faulting unit: {agx_decode_unit(fault_info.UNIT)}") + + def recover(self): + status = self.fw_status + self.log(f" Halt count: {status.halt_count.val}") + halted = bool(status.halted.val) + self.log(f" Halted: {halted}") + if halted: + self.log(f" Attempting recovery...") + status.halted.val = 0 + status.resume.val = 1 + else: + raise Exception("Cannot recover") + self.show_irqs() + + def resume(self): + self.log("Starting ASC") + self.asc.start() + + self.log("Starting endpoints") + self.asc.start_ep(0x20) + self.asc.start_ep(0x21) + + def start(self): + self.resume() + + self.init_channels() + + self.log("Building initdata") + self.initdata = build_initdata(self) + if self.initdata_hook: + self.initdata_hook(self) + + self.fw_status = self.initdata.fw_status.regmap() + self.uat.flush_dirty() + + self.log("Sending initdata") + self.asc.fw.send_initdata(self.initdata._addr & 0xfff_ffffffff) + self.asc.work() + + self.log("Sending DC_Init") + 
self.ch.devctrl.send_init() + self.asc.work() + + self.log("Sending DC_UpdateIdleTS") + self.ch.devctrl.update_idle_ts() + self.asc.work() + + def stop(self): + self.asc.stop() + + def work(self): + self.asc.work() + + def wait_for_events(self, timeout=1.0): + now = time.time() + deadline = now + timeout + cnt = self.event_mgr.event_count + while now < deadline and self.event_mgr.event_count == cnt: + self.asc.work() + now = time.time() + if self.event_mgr.event_count == cnt: + raise Exception("Timed out waiting for events") + + def log(self, msg): + t = time.time() - self.start_time + print(f"[AGX][{t:10.03f}] " + str(msg)) diff --git a/tools/proxyclient/m1n1/agx/channels.py b/tools/proxyclient/m1n1/agx/channels.py new file mode 100644 index 0000000..c91f347 --- /dev/null +++ b/tools/proxyclient/m1n1/agx/channels.py @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: MIT + +from construct import * +from ..fw.agx.channels import * +from ..fw.agx.cmdqueue import * + +class GPUChannel: + STATE_FIELDS = ChannelStateFields + + def __init__(self, agx, name, channel_id, state_addr, ring_addr, ring_size): + self.agx = agx + self.u = agx.u + self.name = name + self.channel_id = channel_id + self.iface = agx.u.iface + self.state_addr = state_addr + self.ring_addr = ring_addr + self.ring_size = ring_size + self.state = self.STATE_FIELDS(self.u, self.state_addr) + self.state.READ_PTR.val = 0 + self.state.WRITE_PTR.val = 0 + + @classmethod + @property + def item_size(cls): + return cls.MSG_CLASS.sizeof() + + def log(self, msg): + self.agx.log(f"[{self.name}] {msg}") + +class GPUTXChannel(GPUChannel): + def doorbell(self): + self.agx.asc.db.doorbell(self.channel_id) + + def send_message(self, msg): + wptr = self.state.WRITE_PTR.val + self.iface.writemem(self.ring_addr + self.item_size * wptr, + msg.build()) + self.state.WRITE_PTR.val = (wptr + 1) % self.ring_size + self.doorbell() + +class GPURXChannel(GPUChannel): + def poll(self): + wptr = self.state.WRITE_PTR.val + rptr = self.state.READ_PTR.val + + if wptr >= self.ring_size: + raise Exception(f"wptr = {wptr:#x} > {self.ring_size:#x}") + + while rptr != wptr: + msg = self.iface.readmem(self.ring_addr + self.item_size * rptr, + self.item_size) + self.handle_message(self.MSG_CLASS.parse(msg)) + rptr = (rptr + 1) % self.ring_size + self.state.READ_PTR.val = rptr + + def handle_message(self, msg): + self.log(f"Message: {msg}") + +class GPUCmdQueueChannel(GPUTXChannel): + MSG_CLASS = RunCmdQueueMsg + + def run(self, queue, event): + msg = RunCmdQueueMsg() + msg.queue_type = queue.TYPE + msg.cmdqueue = queue.info + msg.cmdqueue_addr = queue.info._addr + msg.head = queue.wptr + msg.event_number = event + msg.new_queue = 1 if queue.first_time else 0 + queue.first_time = False + #print(msg) + self.send_message(msg) + +class GPUDeviceControlChannel(GPUTXChannel): + MSG_CLASS = DeviceControlMsg + + def send_init(self): + self.send_message(DC_Init()) + + def dc_09(self, a, ptr, b): + # Writes to InitData.RegionB + msg = DC_09() + msg.unk_4 = a + msg.unkptr_c = ptr + msg.unk_14 = b + self.send_message(msg) + + def send_foo(self, t, d=None): + msg = DC_Any() + msg.msg_type = t + if d is not None: + msg.data = d + self.send_message(msg) + + def update_idle_ts(self): + self.send_message(DC_UpdateIdleTS()) + + def destroy_context(self, ctx): + msg = DC_DestroyContext() + msg.unk_4 = 0 + msg.unk_8 = 2 + msg.unk_c = 0 + msg.unk_10 = 0 + msg.unk_14 = 0xffff + msg.unk_18 = 0 + msg.context_addr = ctx.gpu_context._addr + print(msg) + self.send_message(msg) + + # Maybe related 
to stamps? + def write32(self, addr, val): + msg = DC_Write32() + msg.addr = addr + msg.data = val + msg.unk_10 = 0 + msg.unk_14 = 0 + msg.unk_18 = 0 + msg.unk_1c = 0 + print(msg) + self.send_message(msg) + + def dc_1e(self, a, b): + msg = DC_1e() + msg.unk_4 = a + msg.unk_c = b + print(msg) + self.send_message(msg) + +class GPUFWCtlChannel(GPUTXChannel): + STATE_FIELDS = FWControlStateFields + MSG_CLASS = FWCtlMsg + + def doorbell(self): + self.agx.asc.db.fwctl_doorbell() + + def send_inval(self, ctx, addr=0): + msg = FWCtlMsg() + msg.addr = addr + msg.unk_8 = 0 + msg.context_id = ctx + msg.unk_10 = 1 + msg.unk_12 = 2 + print(msg) + self.send_message(msg) + +class GPUEventChannel(GPURXChannel): + MSG_CLASS = EventMsg + + def handle_message(self, msg): + if isinstance(msg, FlagMsg): + self.agx.event_mgr.fired(msg.firing) + elif isinstance(msg, FaultMsg): + self.agx.faulted(msg) + elif isinstance(msg, TimeoutMsg): + self.agx.timeout(msg) + else: + self.log(f"Unknown event: {msg}") + +class GPULogChannel(GPURXChannel): + MSG_CLASS = FWLogMsg + + def handle_message(self, msg): + ts = msg.timestamp / 24000000 + self.log(f"[{msg.seq_no:<4d}{ts:14.7f}] {msg.msg}") + +class GPUKTraceChannel(GPURXChannel): + MSG_CLASS = KTraceMsg + + def handle_message(self, msg): + self.log(f"{msg}") + +class GPUStatsChannel(GPURXChannel): + MSG_CLASS = HexDump(Bytes(0x60)) + + def handle_message(self, msg): + if self.agx.show_stats: + self.log(f"stat {msg}") diff --git a/tools/proxyclient/m1n1/agx/context.py b/tools/proxyclient/m1n1/agx/context.py new file mode 100644 index 0000000..41ebed5 --- /dev/null +++ b/tools/proxyclient/m1n1/agx/context.py @@ -0,0 +1,247 @@ +# SPDX-License-Identifier: MIT +from ..utils import chexdump +from ..malloc import Heap +from construct.core import * +from ..fw.agx.channels import * +from ..fw.agx.cmdqueue import * +from ..fw.agx.microsequence import * +from ..hw.uat import MemoryAttr +from .object import * +import textwrap + +class GPUContext: + def __init__(self, agx): + self.agx = agx + self.uat = self.agx.uat + self.u = self.agx.u + self.p = self.agx.p + self.verbose = False + + #self.job_list = agx.kshared.new(JobList) + #self.job_list.first_job = 0 + #self.job_list.last_head = self.job_list._addr # Empty list has self as last_head + #self.job_list.unkptr_10 = 0 + #self.job_list.push() + + self.gpu_context = agx.kobj.new(GPUContextData).push() + + self.ttbr0_base = self.u.memalign(self.agx.PAGE_SIZE, self.agx.PAGE_SIZE) + self.p.memset32(self.ttbr0_base, 0, self.agx.PAGE_SIZE) + + self.objects = {} + + # 32K VA pages since buffer manager needs that + self.uobj = GPUAllocator(agx, "Userspace", 0x1600000000, 0x100000000, ctx=None, + guard_pages=16, + va_block=32768, nG=1, AP=0, PXN=1, UXN=1) + + self.gobj = GPUAllocator(agx, "GEM", 0x1500000000, 0x100000000, ctx=None, + guard_pages=16, nG=1, AP=0, PXN=1, UXN=1) + + self.pipeline_base = 0x1100000000 + self.pipeline_size = 1 << 32 + self.pobj = GPUAllocator(agx, "Pipelines", self.pipeline_base + 0x10000, self.pipeline_size, + ctx=None, guard_pages=1, nG=1, AP=0, PXN=1, UXN=1) + + def bind(self, ctx_id): + self.ctx = ctx_id + self.uobj.ctx = ctx_id + self.gobj.ctx = ctx_id + self.pobj.ctx = ctx_id + self.uat.bind_context(ctx_id, self.ttbr0_base) + self.thing = self.buf_at(0x6fffff8000, 0, 0x4000, "thing") + + def make_stream(self, base): + return self.uat.iostream(self.ctx, base, recurse=False) + + def new_at(self, addr, objtype, name=None, track=True, **flags): + obj = GPUObject(self, objtype) + obj._stream = self.make_stream + 
if name is not None: + obj._name = name + + size_align = align_up(obj._size, self.agx.PAGE_SIZE) + obj._addr = addr + + obj._paddr = self.agx.u.memalign(self.agx.PAGE_SIZE, size_align) + #if isinstance(obj.val, ConstructClassBase): + #obj.val._addr = obj._addr + + self.agx.log(f"[Context@{self.gpu_context._addr:#x}] Map {obj._name} size {obj._size:#x} @ {obj._addr:#x} ({obj._paddr:#x})") + + flags2 = {"AttrIndex": MemoryAttr.Shared} + flags2.update(flags) + obj._map_flags = flags2 + + obj._size_align = size_align + self.agx.uat.iomap_at(self.ctx, obj._addr, obj._paddr, size_align, **flags2) + self.objects[obj._addr] = obj + self.agx.reg_object(obj, track=track) + + return obj + + def buf_at(self, addr, is_pipeline, size, name=None, track=True): + return self.new_at(addr, Bytes(size), name, track=track, + AttrIndex=MemoryAttr.Shared, PXN=1, + nG=1, AP=(1 if is_pipeline else 0)) + + def load_blob(self, addr, is_pipeline, filename, track=True): + data = open(filename, "rb").read() + obj = self.new_at(addr, Bytes(len(data)), filename, track=track, + AttrIndex=MemoryAttr.Shared, PXN=1, + nG=1, AP=(1 if is_pipeline else 0)) + obj.val = data + obj.push() + + return obj + + def free(self, obj): + obj._dead = True + self.agx.uat.iomap_at(self.ctx, obj._addr, 0, obj._size_align, VALID=0) + del self.objects[obj._addr] + self.agx.unreg_object(obj) + + def free_at(self, addr): + self.free(self.objects[obj._addr]) + +class GPUWorkQueue: + def __init__(self, agx, context, job_list): + self.agx = agx + self.u = agx.u + self.p = agx.p + self.context = context + + self.info = agx.kobj.new(CommandQueueInfo) + + self.pointers = agx.kshared.new(CommandQueuePointers).push() + self.pmap = CommandQueuePointerMap(self.u, self.pointers._paddr) + + self.rb_size = self.pointers.rb_size + self.ring = agx.kobj.new_buf(8 * self.rb_size, "GPUWorkQueue.RB") + + self.info.pointers = self.pointers + self.info.rb_addr = self.ring._addr + self.info.job_list = job_list + self.info.gpu_buf_addr = agx.kobj.buf(0x2c18, "GPUWorkQueue.gpu_buf") + self.info.gpu_context = context.gpu_context + self.info.push() + + self.wptr = 0 + self.first_time = True + + self.agx.uat.flush_dirty() + + def submit(self, work): + work.push() + + self.p.write64(self.ring._paddr + 8 * self.wptr, work._addr) + self.wptr = (self.wptr + 1) % self.rb_size + self.agx.uat.flush_dirty() + self.pmap.CPU_WPTR.val = self.wptr + + def wait_empty(self): + while self.wptr != self.pmap.GPU_DONEPTR.val: + self.agx.work() + +class GPU3DWorkQueue(GPUWorkQueue): + TYPE = 1 + +class GPUTAWorkQueue(GPUWorkQueue): + TYPE = 0 + +class GPUMicroSequence: + def __init__(self, agx): + self.agx = agx + self.off = 0 + self.ops = [] + self.obj = None + + def append(self, op): + off = self.off + self.ops.append(op) + self.off += op.sizeof() + return off + + def finalize(self): + self.ops.append(EndCmd()) + self.size = sum(i.sizeof() for i in self.ops) + self.obj = self.agx.kobj.new_buf(self.size, "GPUMicroSequence", track=False) + self.obj.val = b"".join(i.build() for i in self.ops) + self.obj.push() + return self.obj + + def dump(self): + chexdump(self.agx.iface.readmem(self.obj._paddr, self.size)) + print(MicroSequence.parse_stream(self.agx.uat.iostream(0, self.obj._addr))) + + def __str__(self): + s = f"GPUMicroSequence: {len(self.ops)} ops\n" + for i, op in enumerate(self.ops): + op_s = textwrap.indent(str(op), ' ' * 4) + s += f"[{i:2}:{op.sizeof():#x}] = {op!s}\n" + return s + +class GPUBufferManager: + def __init__(self, agx, context, blocks=8): + self.agx = agx + self.ctx = 
context + + self.block_ctl_obj = agx.kshared.new(BufferManagerBlockControl) + self.block_ctl_obj.total = blocks + self.block_ctl_obj.wptr = 0 + self.block_ctl_obj.unk = 0 + self.block_ctl = self.block_ctl_obj.push().regmap() + + self.counter_obj = agx.kshared.new(BufferManagerCounter) + self.counter_obj.count = 0 + self.counter = self.counter_obj.push().regmap() + + self.misc_obj = agx.kshared.new(BufferManagerMisc) + self.misc_obj.cpu_flag = 1 + self.misc = self.misc_obj.push().regmap() + + self.page_size = 0x8000 + self.pages_per_block = 4 + self.block_size = self.pages_per_block * self.page_size + + self.page_list = context.uobj.new(Array(0x10000 // 4, Int32ul), "BM PageList", track=False) + self.block_list = context.uobj.new(Array(0x8000 // 4, Int32ul), "BM BlockList", track=False) + + self.info = info = agx.kobj.new(BufferManagerInfo) + info.page_list_addr = self.page_list._addr + info.page_list_size = self.page_list._size + info.page_count = self.block_ctl_obj.total * 4 + info.block_count = self.block_ctl_obj.total + + info.block_list_addr = self.block_list._addr + info.block_ctl = self.block_ctl_obj + info.last_page = info.page_count - 1 + info.block_size = self.block_size + + info.counter = self.counter_obj + + self.populate() + self.block_ctl_obj.pull() + self.block_list.push() + self.page_list.push() + + info.push() + + def increment(self): + self.counter_obj.count += 1 + self.counter_obj.push() + + def populate(self): + idx = self.block_ctl.wptr.val + total = self.block_ctl.total.val + while idx < total: + block = self.ctx.uobj.new_buf(self.block_size, "BM Block", track=False) + self.block_list[idx * 2] = block._addr // self.page_size + + page_idx = idx * self.pages_per_block + for i in range(self.pages_per_block): + self.page_list[page_idx + i] = block._addr // self.page_size + i + + idx += 1 + self.block_ctl.wptr.val = idx + diff --git a/tools/proxyclient/m1n1/agx/event.py b/tools/proxyclient/m1n1/agx/event.py new file mode 100644 index 0000000..693f3a5 --- /dev/null +++ b/tools/proxyclient/m1n1/agx/event.py @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: MIT +from ..utils import chexdump +from ..malloc import Heap +from construct.core import * +from ..fw.agx.channels import * +from ..fw.agx.cmdqueue import * +from ..fw.agx.microsequence import * +from ..hw.uat import MemoryAttr +from .object import * +import textwrap + +class GPUEventManager: + MAX_EVENTS = 128 + + def __init__(self, agx): + self.agx = agx + + self.event_count = 0 + self.free_events = set(range(self.MAX_EVENTS)) + self.events = [None] * self.MAX_EVENTS + + def allocate_event(self): + if not self.free_events: + raise Exception("No free events") + ev_id = self.free_events.pop() + + ev = GPUEvent(ev_id) + self.events[ev_id] = ev + + return ev + + def free_event(self, ev): + self.events[ev.id] = None + self.free_events.add(ev.id) + + def fired(self, flags): + self.agx.log("= Events fired =") + for i, v in enumerate(flags): + for j in range(64): + if v & (1 << j): + ev_id = i * 64 + j + ev = self.events[ev_id] + self.agx.log(f"Event fired: {ev_id}") + if ev is None: + raise Exception("Received spurious notification for event ID {ev}") + ev.fire() + self.event_count += 1 + +class GPUEvent: + def __init__(self, ev_id): + self.id = ev_id + self.fired = False + + def fire(self): + self.fired = True + + def rearm(self): + self.fired = False diff --git a/tools/proxyclient/m1n1/agx/initdata.py b/tools/proxyclient/m1n1/agx/initdata.py new file mode 100644 index 0000000..d6fa76a --- /dev/null +++ 
b/tools/proxyclient/m1n1/agx/initdata.py @@ -0,0 +1,387 @@ +# SPDX-License-Identifier: MIT +from ..fw.agx.initdata import * +from ..fw.agx.channels import ChannelInfo +from ..hw.uat import MemoryAttr + +from construct import Container + +def build_iomappings(agx, chip_id): + def iomap(phys, size, range_size, rw): + off = phys & 0x3fff + virt = agx.io_allocator.malloc(size + 0x4000 + off) + agx.uat.iomap_at(0, virt, phys - off, size + off, AttrIndex=MemoryAttr.Device) + return IOMapping(phys, virt + off, size, range_size, rw) + + # for t8103 + if chip_id == 0x8103: + return [ + iomap(0x204d00000, 0x1c000, 0x1c000, 1), # Fender + iomap(0x20e100000, 0x4000, 0x4000, 0), # AICTimer + iomap(0x23b104000, 0x4000, 0x4000, 1), # AICSWInt + iomap(0x204000000, 0x20000, 0x20000, 1), # RGX + IOMapping(), # UVD + IOMapping(), # unused + IOMapping(), # DisplayUnderrunWA + iomap(0x23b2e8000, 0x1000, 0x1000, 0), # AnalogTempSensorControllerRegs + iomap(0x23bc00000, 0x1000, 0x1000, 1), # PMPDoorbell + iomap(0x204d80000, 0x5000, 0x5000, 1), # MetrologySensorRegs + iomap(0x204d61000, 0x1000, 0x1000, 1), # GMGIFAFRegs + iomap(0x200000000, 0xd6400, 0xd6400, 1), # MCache registers + IOMapping(), # AICBankedRegisters + iomap(0x23b738000, 0x1000, 0x1000, 1), # PMGRScratch + IOMapping(), # NIA Special agent idle register die 0 + IOMapping(), # NIA Special agent idle register die 1 + IOMapping(), # CRE registers + IOMapping(), # Streaming codec registers + IOMapping(), # + IOMapping(), # + ] + elif chip_id == 0x8112: + return [ + iomap(0x204d00000, 0x14000, 0x14000, 1), # Fender + iomap(0x20e100000, 0x4000, 0x4000, 0), # AICTimer + iomap(0x23b0c4000, 0x4000, 0x4000, 1), # AICSWInt + iomap(0x204000000, 0x20000, 0x20000, 1), # RGX + IOMapping(), # UVD + IOMapping(), # unused + IOMapping(), # DisplayUnderrunWA + iomap(0x23b2c0000, 0x1000, 0x1000, 0), # AnalogTempSensorControllerRegs + IOMapping(), # PMPDoorbell + iomap(0x204d80000, 0x8000, 0x8000, 1), # MetrologySensorRegs + iomap(0x204d61000, 0x1000, 0x1000, 1), # GMGIFAFRegs + iomap(0x200000000, 0xd6400, 0xd6400, 1), # MCache registers + IOMapping(), # AICBankedRegisters + IOMapping(), # PMGRScratch + IOMapping(), # NIA Special agent idle register die 0 + IOMapping(), # NIA Special agent idle register die 1 + iomap(0x204e00000, 0x10000, 0x10000, 0), # CRE registers + iomap(0x27d050000, 0x4000, 0x4000, 0), # Streaming codec registers + iomap(0x23b3d0000, 0x1000, 0x1000, 0), # + iomap(0x23b3c0000, 0x1000, 0x1000, 0), # + ] + elif chip_id in (0x6000, 0x6001, 0x6002): + mcc_cnt = {0x6002: 16, 0x6001: 8, 0x6000: 4} + return [ + iomap(0x404d00000, 0x1c000, 0x1c000, 1), # Fender + iomap(0x20e100000, 0x4000, 0x4000, 0), # AICTimer + iomap(0x28e104000, 0x4000, 0x4000, 1), # AICSWInt + iomap(0x404000000, 0x20000, 0x20000, 1), # RGX + IOMapping(), # UVD + IOMapping(), # unused + IOMapping(), # DisplayUnderrunWA + iomap(0x28e494000, 0x1000, 0x1000, 0), # AnalogTempSensorControllerRegs + IOMapping(), # PMPDoorbell + iomap(0x404d80000, 0x8000, 0x8000, 1), # MetrologySensorRegs + iomap(0x204d61000, 0x1000, 0x1000, 1), # GMGIFAFRegs + iomap(0x200000000, mcc_cnt[chip_id] * 0xd8000, 0xd8000, 1), # MCache registers + IOMapping(), # AICBankedRegisters + IOMapping(), # PMPDoorbell + iomap(0x2643c4000, 0x1000, 0x1000, 1), # NIA Special agent idle register die 0 + iomap(0x22643c4000, 0x1000, 0x1000, 1) if chip_id == 0x6002 else IOMapping(), # NIA Special agent idle register die 1 + IOMapping(), # CRE registers + IOMapping(), # Streaming codec registers + iomap(0x28e3d0000, 0x1000, 0x1000, 
1), + iomap(0x28e3c0000, 0x2000, 0x2000, 0), + ] + + +CHIP_INFO = { + 0x8103: Container( + chip_id = 0x8103, + min_sram_volt = 850, + max_power = 19551, + max_freq_mhz = 1278, + unk_87c = -220, + unk_8cc = 9880, + unk_924 = [[0] * 8] * 8, + unk_e48 = [[0] * 8] * 8, + unk_e24 = 112, + gpu_fast_die0_sensor_mask64 = 0x12, + gpu_fast_die0_sensor_mask64_alt = 0x12, + gpu_fast_die0_sensor_present = 0x01, + shared1_tab = [ + -1, 0x7282, 0x50ea, 0x370a, 0x25be, 0x1c1f, 0x16fb + ] + ([-1] * 10), + shared1_a4 = 0xffff, + shared2_tab = [0x800, 0x1555, -1, -1, -1, -1, -1, -1, 0, 0], + shared2_unk_508 = 0xc0007, + unk_3cf4 = [1000.0, 0, 0, 0, 0, 0, 0, 0], + unk_3d14 = [45.0, 0, 0, 0, 0, 0, 0, 0], + unk_118ec = None, + hwdb_4e0 = 0, + hwdb_534 = 0, + num_cores = 8, + gpu_core = 11, + gpu_rev = 4, + hwdb_ab8 = 0x48, + hwdb_abc = 0x8, + hwdb_b30 = 0, + rel_max_powers = [0, 19, 26, 38, 60, 87, 100], + ), + 0x6001: Container( + chip_id = 0x6001, + min_sram_volt = 790, + max_power = 81415, + max_freq_mhz = 1296, + unk_87c = 900, + unk_8cc = 11000, + unk_924 = [[i, *([0] * 7)] for i in [ + 9.838, 9.819, 9.826, 9.799, + 0, 0, 0, 0, + ]], + unk_e48 = [[i, *([0] * 7)] for i in [ + 13, 13, 13, 13, 0, 0, 0, 0, + ]], + unk_e24 = 125, + gpu_fast_die0_sensor_mask64 = 0x80808080, + gpu_fast_die0_sensor_mask64_alt = 0x90909090, + gpu_fast_die0_sensor_present = 0x0f, + shared1_tab = [0] + ([0xffff] * 16), + shared1_a4 = 0xffff, + shared2_tab = [-1, -1, -1, -1, 0x2aa, 0xaaa, -1, -1, 0, 0], + shared2_unk_508 = 0xcc00001, + unk_3cf4 = [1314.0, 1330.0, 1314.0, 1288.0, 0, 0, 0, 0], + unk_3d14 = [21.0, 21.0, 22.0, 21.0, 0, 0, 0, 0], + unk_118ec = [ + 0, 1, 2, + 1, 1, 90, 75, 1, 1, + 1, 2, 90, 75, 1, 1, + 1, 1, 90, 75, 1, 1 + ], + hwdb_4e0 = 4, + hwdb_534 = 1, + num_cores = 32, + gpu_core = 13, + gpu_rev = 5, + hwdb_ab8 = 0x2084, + hwdb_abc = 0x80, + hwdb_b30 = 0, + rel_max_powers = [0, 15, 20, 27, 36, 52, 100], + ), + 0x6002: Container( + chip_id = 0x6002, + min_sram_volt = 790, + max_power = 166743, + max_freq_mhz = 1296, + unk_87c = 900, + unk_8cc = 11000, + unk_924 = [[i, *([0] * 7)] for i in [ + 9.838, 9.819, 9.826, 9.799, + 9.799, 9.826, 9.819, 9.838, + ]], + unk_c30 = 0, + unk_e48 = [[i, *([0] * 7)] for i in [ + 13, 13, 13, 13, 13, 13, 13, 13, + ]], + unk_e24 = 125, + gpu_fast_die0_sensor_mask64 = 0x8080808080808080, + gpu_fast_die0_sensor_mask64_alt = 0x9090909090909090, + gpu_fast_die0_sensor_present = 0xff, + shared1_tab = [0] + ([0xffff] * 16), + shared1_a4 = 0xffff, + shared2_tab = [-1, -1, -1, -1, 0x2aa, 0xaaa, -1, -1, 0, 0], + shared2_unk_508 = 0xcc00001, + unk_3cf4 = [1244.0, 1260.0, 1242.0, 1214.0, + 1072.0, 1066.0, 1044.0, 1042.0], + unk_3d14 = [18.0, 18.0, 18.0, 17.0, 15.0, 15.0, 15.0, 14.0], + unk_8924 = 0, + unk_118ec = [ + 0, 1, 2, + 1, 1, 90, 75, 1, 1, + 1, 2, 90, 75, 1, 1, + 1, 1, 90, 75, 1, 1 + ], + hwdb_4e0 = 4, + hwdb_534 = 1, + num_cores = 64, + gpu_core = 13, + gpu_rev = 5, + hwdb_ab8 = 0x2084, + hwdb_abc = 0x80, + hwdb_b30 = 0, + rel_max_powers = [0, 15, 19, 25, 34, 50, 100], + ), + 0x8112: Container( + chip_id = 0x8112, + min_sram_volt = 780, + max_power = 22800, + max_freq_mhz = 1398, + unk_87c = 900, + unk_8cc = 11000, + unk_924 = [[ + 0.0, 0.0, 0.0, 0.0, + 5.3, 0.0, 5.3, 6.6, + ]] + ([[0] * 8] * 7), + unk_e48 = [[ + 0.0, 0.0, 0.0, 0.0, + 5.3, 0.0, 5.3, 6.6, + ]] + ([[0] * 8] * 7), + unk_e24 = 125, + gpu_fast_die0_sensor_mask64 = 0x6800, + gpu_fast_die0_sensor_mask64_alt = 0x6800, + gpu_fast_die0_sensor_present = 0x02, + shared1_tab = [0] + ([0xffff] * 16), + shared1_a4 = 0, + shared2_tab = [-1, 
-1, -1, -1, -1, -1, -1, -1, 0xaa5aa, 0], + shared2_unk_508 = 0xc00000, + unk_3cf4 = [1920.0, 0, 0, 0, 0, 0, 0, 0], + unk_3d14 = [74.0, 0, 0, 0, 0, 0, 0, 0], + unk_118ec = None, + hwdb_4e0 = 4, + hwdb_534 = 0, + num_cores = 10, + gpu_core = 15, + gpu_rev = 3, + hwdb_ab8 = 0x2048, + hwdb_abc = 0x4000, + hwdb_b30 = 1, + rel_max_powers = [0, 18, 27, 37, 52, 66, 82, 96, 100], + ), +} +def build_initdata(agx): + sgx = agx.u.adt["/arm-io/sgx"] + chosen = agx.u.adt["/chosen"] + chip_info = CHIP_INFO[chosen.chip_id] + + initdata = agx.kshared.new(InitData) + + initdata.ver_info = (1, 1, 16, 1) + + initdata.regionA = agx.kshared.new_buf(0x4000, "InitData_RegionA").push() + + regionB = agx.kobj.new(InitData_RegionB) + + regionB.channels = agx.ch_info + + regionB.stats_ta = agx.kobj.new(InitData_GPUGlobalStatsTA).push() + regionB.stats_3d = agx.kobj.new(InitData_GPUGlobalStats3D).push() + + # size: 0x180, Empty + # 13.0: grew + #regionB.stats_cp = agx.kobj.new_buf(0x180, "RegionB.unkptr_180").push() + regionB.stats_cp = agx.kobj.new_buf(0x980, "RegionB.unkptr_180").push() + + # size: 0x3b80, few floats, few ints, needed for init + regionB.hwdata_a = agx.kobj.new(AGXHWDataA(sgx, chip_info), track=False) + + # size: 0x80, empty + regionB.unk_190 = agx.kobj.new_buf(0x80, "RegionB.unkptr_190").push() + + # size: 0xc0, fw writes timestamps into this + regionB.unk_198 = agx.kobj.new_buf(0xc0, "RegionB.unkptr_198").push() + + # size: 0xb80, io stuff + hwdata = agx.kobj.new(AGXHWDataB(sgx, chip_info), track=False) + hwdata.io_mappings = build_iomappings(agx, chosen.chip_id) + + k = 1.02 #? + count = sgx.perf_state_count + table_count = sgx.perf_state_table_count + base_pstate = sgx.getprop("gpu-perf-base-pstate", 3) + base_freq = sgx.perf_states[base_pstate].freq + max_freq = sgx.perf_states[count - 1].freq + for i in range(count): + ps = sgx.perf_states[i] + hwdata.frequencies[i] = ps.freq // 1000000 + + volt = [ps.volt] * 8 + for j in range(1, table_count): + volt[j] = sgx.perf_states[count * j + i].volt + sram_volt = [max(chip_info.min_sram_volt, i) for i in volt] + + hwdata.voltages[i] = volt + hwdata.voltages_sram[i] = sram_volt + + regionB.hwdata_a.unk_74[i] = k + hwdata.unk_9b4[i] = k + hwdata.rel_max_powers[i] = chip_info.rel_max_powers[i] + hwdata.rel_boost_freqs[i] = max(0, int((ps.freq - base_freq) / (max_freq - base_freq) * 100)) + + regionB.hwdata_a.push() + + regionB.hwdata_b = hwdata.push() + regionB.hwdata_b_addr2 = hwdata._addr + + regionB.fwlog_ring2 = agx.fwlog_ring + + # Unallocated, Size 0x1000 + regionB.unk_1b8 = agx.kobj.new_buf(0x1000, "RegionB.unkptr_1b8").push() + + # Unallocated, size 0x300 + regionB.unk_1c0 = agx.kobj.new_buf(0x300, "RegionB.unkptr_1c0").push() + + # Unallocated, unknown size + regionB.unk_1c8 = agx.kobj.new_buf(0x1000, "RegionB.unkptr_1c8").push() + + # Size: 0x4000 + regionB.buffer_mgr_ctl = agx.kshared2.new(InitData_BufferMgrCtl).push() + regionB.buffer_mgr_ctl_addr2 = regionB.buffer_mgr_ctl._addr + + regionB.unk_6a80 = 0 + regionB.gpu_idle = 0 + regionB.unk_6a9c = 0 + regionB.unk_ctr0 = 0 + regionB.unk_ctr1 = 0 + regionB.unk_6aa8 = 0 + regionB.unk_6aac = 0 + regionB.unk_ctr2 = 0 + regionB.unk_6ab4 = 0 + regionB.unk_6ab8 = 0 + regionB.unk_6abc = 0 + regionB.unk_6ac0 = 0 + regionB.unk_6ac4 = 0 + regionB.unk_ctr3 = 0 + regionB.unk_6acc = 0 + regionB.unk_6ad0 = 0 + regionB.unk_6ad4 = 0 + regionB.unk_6ad8 = 0 + regionB.unk_6adc = 0 + regionB.unk_6ae0 = 0 + regionB.unk_6ae4 = 0 + regionB.unk_6ae8 = 0 + regionB.unk_6aec = 0 + regionB.unk_6af0 = 0 + regionB.unk_ctr4 = 
0 + regionB.unk_ctr5 = 0 + regionB.unk_6afc = 0 + + initdata.regionB = regionB.push() + + initdata.regionC = agx.kshared.new(InitData_RegionC(sgx, chip_info), track=False).push() + + #self.regionC_addr = agx.ksharedshared_heap.malloc(0x88000) + + initdata.fw_status = agx.kobj.new(InitData_FWStatus) + initdata.fw_status.fwctl_channel = agx.fwctl_chinfo + initdata.fw_status.push() + + ## This section seems to be data that would be used by firmware side page allocation + ## But the current firmware doesn't have this functionality enabled, so it's not used? + initdata.uat_num_levels = 3 + initdata.uat_page_bits = 14 + initdata.uat_page_size = 0x4000 + + if chip_info.chip_id in (0x8103, 0x8112): + phys_mask = 0xffffffc000 + else: + phys_mask = 0x3ffffffc000 + + initdata.uat_level_info = [ + UatLevelInfo(36, 8, phys_mask), + UatLevelInfo(25, 2048, phys_mask), + UatLevelInfo(14, 2048, phys_mask), + ] + + # Host handles FW allocations for existing firmware versions + initdata.host_mapped_fw_allocations = 1 + + + #initdata.regionC.idle_ts = agx.u.mrs("CNTPCT_EL0") + 24000000 + #initdata.regionC.idle_unk = 0x5b2e8 + #initdata.regionC.idle_to_off_timeout_ms = 20000 + + initdata.regionC.push() + initdata.push() + + #print(InitData.parse_stream(agx.uat.iostream(0, initdata._addr))) + return initdata diff --git a/tools/proxyclient/m1n1/agx/object.py b/tools/proxyclient/m1n1/agx/object.py new file mode 100644 index 0000000..8f382f9 --- /dev/null +++ b/tools/proxyclient/m1n1/agx/object.py @@ -0,0 +1,263 @@ +# SPDX-License-Identifier: MIT +import io, time + +from ..malloc import Heap +from ..utils import * +from ..constructutils import ConstructClassBase, str_value +from construct import Bytes, Container, HexDump +from ..hw.uat import MemoryAttr + +class GPUObject: + def __init__(self, allocator, objtype): + self._raw = False + if isinstance(objtype, int): + self.val = bytes(objtype) + self._size = objtype + self._name = b"Bytes({objtype})" + self._raw = True + elif isinstance(objtype, ConstructClassBase): + self.val = objtype + objtype = type(objtype) + self._size = objtype.sizeof() + self._name = objtype.__name__ + elif isinstance(objtype, type) and issubclass(objtype, ConstructClassBase): + self._size = objtype.sizeof() + self.val = objtype() + self._name = objtype.__name__ + else: + self._size = objtype.sizeof() + self.val = objtype.parse(bytes(self._size)) + self._name = type(objtype).__name__ + + self._alloc = allocator + self._type = objtype + self._addr = None + self._data = None + self._dead = False + self._map_flags = {} + self._mon_val = None + self._skipped_pushes = 0 + self._compress_threshold = 65536 + self._strm = None + self._read_phys = False + + def push(self, if_needed=False): + self._mon_val = self.val + assert self._addr is not None + + if self._raw: + data = self.val + else: + context = Container() + context._parsing = False + context._building = True + context._sizing = False + context._params = context + # build locally and push as a block for efficiency + ios = io.BytesIO() + self._type._build(self.val, ios, context, "(pushing)") + data = ios.getvalue() + + #if self._alloc.verbose: + #t = time.time() + #self._alloc.agx.log(f"[{self._name} @{self._addr:#x}] chk {self._size} bytes") + if if_needed and data[:] == self._data: + self._skipped_pushes += 1 + #if self._alloc.verbose: + #t2 = time.time() + #mbs = self._size / (t2 - t) / 1000000 + #self._alloc.agx.log(f"[{self._name} @{self._addr:#x}] chk done ({mbs:.02f} MB/s)") + return self + + self._skipped_pushes = 0 + + t = time.time() 
+ if data == bytes(self._size): + if self._alloc.verbose: + self._alloc.agx.log(f"[{self._name} @{self._addr:#x}] zeroing {self._size} bytes") + self._alloc.agx.p.memset8(self._paddr, 0, self._size) + elif self._size > self._compress_threshold: + if self._alloc.verbose: + self._alloc.agx.log(f"[{self._name} @{self._addr:#x}] pushing {self._size} bytes (compressed)") + self._alloc.agx.u.compressed_writemem(self._paddr, data) + else: + if self._alloc.verbose: + self._alloc.agx.log(f"[{self._name} @{self._addr:#x}] pushing {self._size} bytes") + self._alloc.agx.iface.writemem(self._paddr, data) + if self._alloc.verbose: + t2 = time.time() + mbs = self._size / (t2 - t) / 1000000 + self._alloc.agx.log(f"[{self._name} @{self._addr:#x}] push done ({mbs:.02f} MB/s)") + #stream.write(data) + if isinstance(self._type, type) and issubclass(self._type, ConstructClassBase): + if self._strm is None: + self._strm = self._alloc.make_stream(self._addr) + self.val.set_addr(self._addr, self._strm) + + self._data = bytes(data) + return self + + def _pull(self): + if self._raw: + assert self._paddr is not None + return self._alloc.agx.iface.readmem(self._paddr, self._size) + + assert self._addr is not None + context = Container() + context._parsing = True + context._building = False + context._sizing = False + context._params = context + if self._alloc.verbose: + self._alloc.agx.log(f"[{self._name} @{self._addr:#x}] pulling {self._size} bytes") + if self._read_phys: + stream = io.BytesIO() + stream.write(self._alloc.agx.iface.readmem(self._paddr, self._size)) + stream.seek(0) + else: + stream = self._alloc.make_stream(self._addr) + return self._type._parse(stream, context, f"(pulling {self._name})") + + def pull(self): + self._mon_val = self.val = self._pull() + return self + + def poll(self): + prev_val = self._mon_val + self._mon_val = cur_val = self._pull() + if not hasattr(cur_val, "diff"): + return None + if cur_val != prev_val: + diff = cur_val.diff(prev_val) + assert diff is not None + return f"GPUObject {self._name} ({self._size:#x} @ {self._addr:#x}): " + diff + else: + return None + + @property + def _ctx(self): + return self._alloc.ctx + + def add_to_mon(self, mon): + mon.add(self._addr, self._size, self._name, offset=0, + readfn=lambda a, s: self._alloc.agx.iface.readmem(a - self._addr + self._paddr, s)) + + def _set_addr(self, addr, paddr=None): + self._addr = addr + self._paddr = paddr + if isinstance(self.val, ConstructClassBase): + self.val.set_addr(addr) + + def __getitem__(self, item): + return self.val[item] + def __setitem__(self, item, value): + self.val[item] = value + + def __getattr__(self, attr): + return getattr(self.val, attr) + + def __setattr__(self, attr, val): + if attr.startswith("_") or attr == "val": + self.__dict__[attr] = val + return + + setattr(self.val, attr, val) + + def __str__(self): + if isinstance(self.val, bytes) and len(self.val) > 128: + s_val = f"<{len(self.val)} bytes>" + else: + s_val = str_value(self.val) + return f"GPUObject {self._name} ({self._size:#x} @ {self._addr:#x}): " + s_val + + def free(self): + if self._dead: + return + self._dead = True + self._alloc.free(self) + +class GPUAllocator: + def __init__(self, agx, name, start, size, + ctx=0, page_size=16384, va_block=None, guard_pages=1, **kwargs): + self.page_size = page_size + if va_block is None: + va_block = page_size + self.agx = agx + self.ctx = ctx + self.name = name + self.va = Heap(start, start + size, block=va_block) + self.verbose = 0 + self.guard_pages = guard_pages + self.objects = {} + 
self.flags = kwargs + self.align_to_end = True + + def make_stream(self, base): + return self.agx.uat.iostream(self.ctx, base, recurse=False) + + def new(self, objtype, name=None, track=True, **kwargs): + obj = GPUObject(self, objtype) + obj._stream = self.make_stream + if name is not None: + obj._name = name + + guard_size = self.page_size * self.guard_pages + + size_align = align_up(obj._size, self.page_size) + addr = self.va.malloc(size_align + guard_size) + paddr = self.agx.u.memalign(self.page_size, size_align) + off = 0 + if self.align_to_end: + off = size_align - obj._size + + flags = dict(self.flags) + flags.update(kwargs) + + obj._addr_align = addr + obj._paddr_align = paddr + obj._size_align = size_align + self.agx.uat.iomap_at(self.ctx, addr, paddr, size_align, **flags) + obj._set_addr(addr + off, paddr + off) + obj._map_flags = flags + + self.objects[obj._addr] = obj + + if self.verbose: + self.agx.log(f"[{self.name}] Alloc {obj._name} size {obj._size:#x} @ {obj._addr:#x} ({obj._paddr:#x})") + + self.agx.reg_object(obj, track=track) + return obj + + def new_buf(self, size, name, track=True): + return self.new(HexDump(Bytes(size)), name=name, track=track) + + def buf(self, size, name, track=True): + return self.new_buf(size, name, track).push()._addr + + def free(self, obj): + obj._dead = True + is_private = obj._map_flags.get("AttrIndex", MemoryAttr.Normal) != MemoryAttr.Shared + if is_private and obj._addr_align > 0xf8000000000: + flags2 = dict(obj._map_flags) + flags2["AttrIndex"] = MemoryAttr.Shared + self.agx.uat.iomap_at(self.ctx, obj._addr_align, obj._paddr_align, + obj._size_align, **flags2) + self.agx.uat.flush_dirty() + self.agx.uat.handoff.prepare_cacheflush(obj._addr_align, obj._size_align) + self.agx.ch.fwctl.send_inval(0x40, obj._addr_align) + self.agx.uat.handoff.wait_cacheflush() + + self.agx.uat.iomap_at(self.ctx, obj._addr_align, 0, + obj._size_align, VALID=0) + + if is_private and obj._addr_align > 0xf8000000000: + self.agx.uat.flush_dirty() + self.agx.uat.handoff.complete_cacheflush() + + self.agx.u.free(obj._paddr_align) + self.va.free(obj._addr_align) + del self.objects[obj._addr] + self.agx.unreg_object(obj) + + if self.verbose: + self.agx.log(f"[{self.name}] Free {obj._name} size {obj._size:#x} @ {obj._addr:#x} ({obj._paddr:#x})") diff --git a/tools/proxyclient/m1n1/agx/render.py b/tools/proxyclient/m1n1/agx/render.py new file mode 100644 index 0000000..b29683b --- /dev/null +++ b/tools/proxyclient/m1n1/agx/render.py @@ -0,0 +1,1075 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT +import sys, json, zipfile + +json.c_make_encoder = None + +from m1n1.proxy import * +from .context import * +from .event import GPUEventManager +from .uapi import * +from m1n1.constructutils import ConstructClass, Ver + +def unswizzle(agx, addr, w, h, psize, dump=None, grid=False): + iface = agx.u.iface + + tw = 64 + th = 64 + ntx = (w + tw - 1) // 64 + nty = (h + th - 1) // 64 + data = iface.readmem(addr, ntx * nty * psize * tw * th) + new_data = [] + for y in range(h): + ty = y // th + for x in range(w): + tx = x // tw + toff = tw * th * psize * (ty * ntx + tx) + j = x & (tw - 1) + i = y & (th - 1) + off = ( + ((j & 1) << 0) | ((i & 1) << 1) | + ((j & 2) << 1) | ((i & 2) << 2) | + ((j & 4) << 2) | ((i & 4) << 3) | + ((j & 8) << 3) | ((i & 8) << 4) | + ((j & 16) << 4) | ((i & 16) << 5) | + ((j & 32) << 5) | ((i & 32) << 6)) + r,g,b,a = data[toff + psize*off: toff + psize*(off+1)] + if grid: + if x % 64 == 0 or y % 64 == 0: + r,g,b,a = 255,255,255,255 + elif x % 32 == 
0 or y % 32 == 0: + r,g,b,a = 128,128,128,255 + new_data.append(bytes([b, g, r, a])) + data = b"".join(new_data) + if dump: + open(dump, "wb").write(data[:w*h*psize]) + #iface.writemem(addr, data) + +class GPUFrame: + def __init__(self, context, filename=None, track=False): + self.ctx = context + self.agx = context.agx + self.objects = [] + self.cmdbuf = None + self.track = track + if filename is not None: + self.load(filename) + + def add_object(self, obj): + self.objects.append(obj) + + def save(self, filename): + cmdbuf = self.cmdbuf + with zipfile.ZipFile(filename, "w") as zf: + cmdbuf_data = json.dumps(cmdbuf, indent=4).encode("utf-8") + zf.writestr("cmdbuf.json", cmdbuf_data) + + obj_info = [] + for obj in self.objects: + if obj._data == bytes(obj._size): + filename = None + else: + filename = f"obj_{obj._addr:x}.bin" + zf.writestr(filename, obj._data) + obj_info.append({ + "file": filename, + "name": obj._name, + "addr": obj._addr, + "size": obj._size, + "map_flags": obj._map_flags, + }) + + obj_info_data = json.dumps(obj_info, indent=4).encode("utf-8") + zf.writestr("objects.json", obj_info_data) + + def load(self, filename): + with zipfile.ZipFile(filename, "r") as zf: + with zf.open("cmdbuf.json", "r") as fd: + self.cmdbuf = drm_asahi_cmdbuf_t.from_json(fd) + with zf.open("objects.json", "r") as fd: + obj_info = json.load(fd) + + self.objects = [] + for i in obj_info: + filename = i["file"] + obj = self.ctx.new_at(i["addr"], Bytes(i["size"]), name=i["name"], track=self.track, + **i["map_flags"]) + if filename is not None: + with zf.open(i["file"], "r") as fd: + data = fd.read() + obj.val = data + obj.push() + else: + obj.val = bytes(i["size"]) + obj.push() + self.objects.append(obj) + +class GPUWork: + def __init__(self, renderer): + self.objects = [] + self.renderer = renderer + + def add(self, obj): + self.objects.append(obj) + + def free(self): + for obj in self.objects: + obj.free() + self.objects = [] + +class GPURenderer: + def __init__(self, ctx, buffers=16, bm_slot=0, queue=0): + self.agx = agx = ctx.agx + self.queue = queue + + # 0..63 + self.ctx = ctx + self.ctx_id = ctx.ctx + + # 0..255 + self.buffers = buffers + self.buffer_mgr_slot = bm_slot + + ## These MUST go together + self.buffer_mgr = GPUBufferManager(agx, ctx, buffers) + self.buffer_mgr_initialized = False + self.unk_emptybuf = agx.kobj.new_buf(0x40, "unk_emptybuf") + self.tpc_size = 0 + + ##### Job group + + self.job_list = agx.kshared.new(JobList) + self.job_list.first_job = 0 + self.job_list.last_head = self.job_list._addr # Empty list has self as last_head + self.job_list.unkptr_10 = 0 + self.job_list.push() + + ##### Work Queues + + self.ts3d_1 = agx.kshared.new(Int64ul, name="3D timestamp 1") + self.ts3d_2 = agx.kshared.new(Int64ul, name="3D timestamp 2") + self.tsta_1 = agx.kshared.new(Int64ul, name="TA timestamp 1") + self.tsta_2 = agx.kshared.new(Int64ul, name="TA timestamp 2") + + self.wq_3d = GPU3DWorkQueue(agx, ctx, self.job_list) + self.wq_ta = GPUTAWorkQueue(agx, ctx, self.job_list) + + self.wq_3d.info.uuid = 0x3D0000 | bm_slot + self.wq_3d.info.push() + self.wq_ta.info.uuid = 0x7A0000 | bm_slot + self.wq_ta.info.push() + + self.stamp_value_3d = 0x3D000000 | (bm_slot << 16) + self.stamp_value_ta = 0x7A000000 | (bm_slot << 16) + + ##### TA stamps + + # start? + self.stamp_ta1 = agx.kshared.new(StampCounter, name="TA stamp 1") + self.stamp_ta1.value = self.stamp_value_ta + self.stamp_ta1.push() + + # complete? 
+ self.stamp_ta2 = agx.kobj.new(StampCounter, name="TA stamp 2") + self.stamp_ta2.value = self.stamp_value_ta + self.stamp_ta2.push() + + ##### 3D stamps + + # start? + self.stamp_3d1 = agx.kshared.new(StampCounter, name="3D stamp 1") + self.stamp_3d1.value = self.stamp_value_3d + self.stamp_3d1.push() + + # complete? + self.stamp_3d2 = agx.kobj.new(StampCounter, name="3D stamp 2") + self.stamp_3d2.value = self.stamp_value_3d + self.stamp_3d2.push() + + + ##### Things userspace deals with for macOS + + #self.aux_fb = ctx.uobj.new_buf(0x8000, "Aux FB thing") + ##self.deflake_1 = ctx.uobj.new_buf(0x20, "Deflake 1") + ##self.deflake_2 = ctx.uobj.new_buf(0x280, "Deflake 2") + ##self.deflake_3 = ctx.uobj.new_buf(0x540, "Deflake 3") + #self.deflake = ctx.uobj.new_buf(0x7e0, "Deflake") + #self.unk_buf = ctx.uobj.new(Array(0x800, Int64ul), "Unknown Buffer") + #self.unk_buf.val = [0, *range(1, 0x400), *(0x400 * [0])] + #self.unk_buf.push() + + ##### Some kind of feedback/status buffer, GPU managed? + + self.event_control = agx.kobj.new(EventControl) + self.event_control.event_count = agx.kobj.new(Int32ul, "event_count") + self.event_control.event_count.val = 0 + self.event_control.event_count.push() + + self.event_control.generation = 0 + self.event_control.cur_count = 0 + self.event_control.unk_10 = 0x50 + self.event_control.push() + + self.frames = 0 + + self.ev_ta = ev_ta = self.agx.event_mgr.allocate_event() + self.ev_3d = ev_3d = self.agx.event_mgr.allocate_event() + + self.work = [] + + def submit(self, cmdbuf, wait_for=None): + nclusters = 8 + + work = GPUWork(self) + self.work.append(work) + + self.buffer_mgr.increment() + + aux_fb = self.ctx.uobj.new_buf(0x20000, "Aux FB thing", track=False) + work.add(aux_fb) + + # t8103 + deflake_1_size = 0x540 + deflake_2_size = 0x280 + deflake_3_size = 0x20 + + # t6002 - 9 times larger instead of 8? works with 8... 
+ deflake_1_size *= nclusters + deflake_2_size *= nclusters + deflake_3_size *= nclusters + + deflake_1 = self.ctx.uobj.new_buf(deflake_1_size, "Deflake 1", track=True) + deflake_2 = self.ctx.uobj.new_buf(deflake_2_size, "Deflake 2", track=True) + deflake_3 = self.ctx.uobj.new_buf(deflake_3_size, "Deflake 3", track=True) + work.add(deflake_1) + work.add(deflake_2) + work.add(deflake_3) + + unk_buf = self.ctx.uobj.new(Array(0x800, Int64ul), "Unknown Buffer", track=False) + work.add(unk_buf) + + unk_buf.val = [0, *range(2, 0x401), *(0x400 * [0])] + unk_buf.push() + + work.cmdbuf = cmdbuf + + self.frames += 1 + + work.ev_ta = ev_ta = self.ev_ta + work.ev_3d = ev_3d = self.ev_3d + + self.ev_ta.rearm() + self.ev_3d.rearm() + + self.agx.log(f"ev_ta: {ev_ta.id}") + self.agx.log(f"ev_3d: {ev_3d.id}") + + #self.event_control.base_stamp = self.stamp_value >> 8 + #self.event_control.push() + + self.prev_stamp_value_3d = self.stamp_value_3d + self.prev_stamp_value_ta = self.stamp_value_ta + self.stamp_value_3d += 0x100 + self.stamp_value_ta += 0x100 + self.event_control.event_count.val += 2 + self.event_control.event_count.push() + + work.stamp_value_3d = self.stamp_value_3d + work.stamp_value_ta = self.stamp_value_ta + + agx = self.agx + ctx = self.ctx + + work.width = width = cmdbuf.fb_width + work.height = height = cmdbuf.fb_height + + ##### TVB allocations / Tiler config + + tile_width = 32 + tile_height = 32 + tiles_x = ((width + tile_width - 1) // tile_width) + tiles_y = ((height + tile_height - 1) // tile_height) + tiles = tiles_x * tiles_y + + mtiles_x = 4 + mtiles_y = 4 + + mtile_x1 = align(((tiles_x + mtiles_x - 1) // mtiles_x), 4) + mtile_x2 = 2 * mtile_x1 + mtile_x3 = 3 * mtile_x1 + mtile_y1 = align(((tiles_y + mtiles_y - 1) // mtiles_y), 4) + mtile_y2 = 2 * mtile_y1 + mtile_y3 = 3 * mtile_y1 + + mtile_stride = mtile_x1 * mtile_y1 + + ## TODO: *samples + tiles_per_mtile_x = mtile_x1 + tiles_per_mtile_y = mtile_y1 + + tile_blocks_x = (tiles_x + 15) // 16 + tile_blocks_y = (tiles_y + 15) // 16 + tile_blocks = tile_blocks_x * tile_blocks_y + + tiling_params = TilingParameters() + # rgn_header_size + rgn_entry_size = 5 + tiling_params.size1 = (rgn_entry_size * tiles_per_mtile_x * tiles_per_mtile_y + 3) // 4 + # PPP_MULTISAMPLECTL + tiling_params.unk_4 = 0x88 + # PPP_CTRL + tiling_params.unk_8 = 0x203 # bit 0: GL clip mode + # PPP_SCREEN + tiling_params.x_max = width - 1 + tiling_params.y_max = height - 1 + # TE_SCREEN + tiling_params.tile_count = ((tiles_y-1) << 12) | (tiles_x-1) + # TE_MTILE1 + tiling_params.x_blocks = mtile_x3 | (mtile_x2 << 9) | (mtile_x1 << 18) + # TE_MTILE2 + tiling_params.y_blocks = mtile_y3 | (mtile_y2 << 9) | (mtile_y1 << 18) + tiling_params.size2 = mtile_stride + tiling_params.size3 = 2 * mtile_stride + tiling_params.unk_24 = 0x100 + tiling_params.unk_28 = 0x8000 + + tilemap_size = (4 * tiling_params.size1 * mtiles_x * mtiles_y) + + tmtiles_x = tiles_per_mtile_x * mtiles_x + tmtiles_y = tiles_per_mtile_y * mtiles_y + + tpc_entry_size = 8 + tpc_size = tpc_entry_size * tmtiles_x * tmtiles_y * nclusters + + if self.tpc_size < tpc_size: + self.tpc = ctx.uobj.new_buf(tpc_size, "TPC", track=True).push() + self.tpc_size = tpc_size + + depth_aux_buffer_addr = 0 + if cmdbuf.depth_buffer: + size = align_pot(max(width, tile_width)) * align_pot(max(height, tile_width)) // 32 + depth_aux_buffer = self.ctx.uobj.new_buf(size, "Depth Aux", track=True) + work.add(depth_aux_buffer) + depth_aux_buffer_addr = depth_aux_buffer._addr + + stencil_aux_buffer_addr = 0 + if 
cmdbuf.stencil_buffer: + size = align_pot(max(width, tile_width)) * align_pot(max(height, tile_width)) // 32 + stencil_aux_buffer = self.ctx.uobj.new_buf(size, "Stencil Aux", track=False) + work.add(stencil_aux_buffer) + stencil_aux_buffer_addr = stencil_aux_buffer._addr + + #tvb_tilemap_size = 0x80 * mtile_stride + tvb_tilemap_size = tilemap_size + tvb_tilemap = ctx.uobj.new_buf(tvb_tilemap_size, "TVB Tilemap", track=True).push() + work.tvb_tilemap_size = tvb_tilemap_size + work.tvb_tilemap = tvb_tilemap + work.add(tvb_tilemap) + + # rogue: 0x180 * 4? + tvb_heapmeta_size = 0x200 + #tvb_heapmeta_size = 0x600 + tvb_heapmeta = ctx.uobj.new_buf(tvb_heapmeta_size, "TVB Heap Meta", track=False).push() + work.add(tvb_heapmeta) + + unk_tile_buf1 = self.ctx.uobj.new_buf(tvb_tilemap_size * nclusters, "Unk tile buf 1", track=True) + print("tvb_tilemap_size", hex(tvb_tilemap_size)) + unk_tile_buf2 = self.ctx.uobj.new_buf(0x4 * nclusters, "Unk tile buf 2", track=True) + #size = 0xc0 * nclusters + size = 0xc80 + unk_tile_buf3 = self.ctx.uobj.new_buf(size, "Unk tile buf 3", track=True) + unk_tile_buf4 = self.ctx.uobj.new_buf(0x280 * nclusters, "Unk tile buf 4", track=True) + unk_tile_buf5 = self.ctx.uobj.new_buf(0x30 * nclusters, "Unk tile buf 5", track=True) + work.add(unk_tile_buf1) + work.add(unk_tile_buf2) + work.add(unk_tile_buf3) + work.add(unk_tile_buf4) + work.add(unk_tile_buf5) + + ##### Buffer stuff? + + # buffer related? + bufferthing_buf = ctx.uobj.new_buf(0x80, "BufferThing.unkptr_18", track=True) + work.add(bufferthing_buf) + + work.buf_desc = buf_desc = agx.kobj.new(BufferThing, track=False) + work.add(buf_desc) + buf_desc.unk_0 = 0x0 + buf_desc.unk_8 = 0x0 + buf_desc.unk_10 = 0x0 + buf_desc.unkptr_18 = bufferthing_buf._addr + buf_desc.unk_20 = 0x0 + buf_desc.bm_misc_addr = self.buffer_mgr.misc_obj._addr + buf_desc.unk_2c = 0x0 + buf_desc.unk_30 = 0x0 + buf_desc.unk_38 = 0x0 + buf_desc.push() + + uuid_3d = cmdbuf.cmd_3d_id + uuid_ta = cmdbuf.cmd_ta_id + encoder_id = cmdbuf.encoder_id + + #print(barrier_cmd) + + #self.wq_ta.submit(ta_barrier_cmd) + + ##### 3D barrier command + + barrier_cmd = agx.kobj.new(WorkCommandBarrier, track=False) + work.add(barrier_cmd) + barrier_cmd.stamp = self.stamp_ta2 + barrier_cmd.wait_value = self.stamp_value_ta + barrier_cmd.stamp_self = self.stamp_value_3d + barrier_cmd.event = ev_ta.id + barrier_cmd.uuid = uuid_3d + + #print(barrier_cmd) + + self.wq_3d.submit(barrier_cmd) + + ##### 3D execution + + work.wc_3d = wc_3d = agx.kobj.new(WorkCommand3D, track=False) + work.add(work.wc_3d) + wc_3d.counter = 0 + wc_3d.context_id = self.ctx_id + wc_3d.unk_8 = 0 + wc_3d.event_control = self.event_control + wc_3d.buffer_mgr = self.buffer_mgr.info + wc_3d.buf_thing = buf_desc + wc_3d.unk_emptybuf_addr = self.unk_emptybuf._addr + wc_3d.tvb_tilemap = tvb_tilemap._addr + wc_3d.unk_40 = 0x88 + wc_3d.unk_48 = 0x1 + wc_3d.tile_blocks_y = mtile_y1 + wc_3d.tile_blocks_x = mtile_x1 + wc_3d.unk_50 = 0x0 + wc_3d.unk_58 = 0x0 + + TAN_60 = 1.732051 + wc_3d.merge_upper_x = TAN_60 / width + wc_3d.merge_upper_y = TAN_60 / height + wc_3d.unk_68 = 0x0 + wc_3d.tile_count = tiles + + wc_3d.unk_758 = Flag() + wc_3d.unk_75c = Flag() + wc_3d.unk_buf = WorkCommand1_UnkBuf() + wc_3d.busy_flag = Flag() + wc_3d.unk_buf2 = WorkCommand1_UnkBuf2() + wc_3d.unk_buf2.unk_0 = 0 + wc_3d.unk_buf2.unk_8 = 0 + wc_3d.unk_buf2.unk_10 = 1 + wc_3d.ts1 = TimeStamp(0) + wc_3d.ts2 = TimeStamp(self.ts3d_1._addr) + wc_3d.ts3 = TimeStamp(self.ts3d_2._addr) + wc_3d.unk_914 = 0 + wc_3d.unk_918 = 0 + wc_3d.unk_920 = 0 
+ wc_3d.unk_924 = 1 + # Ventura + wc_3d.unk_928_0 = 0 + wc_3d.unk_928_4 = 0 + wc_3d.ts_flag = TsFlag() + + # cmdbuf.ds_flags + # 0 - no depth + # 0x80000 - depth store enable + # 0x08000 - depth load enable + + # 0x00044 - compressed depth + + # 0x40000 - stencil store enable + # 0x04000 - stencil load enable + # 0x00110 - compressed stencil + + # Z store format + # 0x4000000 - Depth16Unorm + + # For Depth16Unorm: 0x40000 here also + # AFBI.[ 0. 4] unk1 = 0x4c000 + + # ASAHI_CMDBUF_SET_WHEN_RELOADING_Z_OR_S + # Actually set when loading *and* storing Z, OR loading *and* storing S + + # Structures embedded in WorkCommand3D + if True: + wc_3d.struct_1 = Start3DStruct1() + wc_3d.struct_1.store_pipeline_bind = cmdbuf.store_pipeline_bind + wc_3d.struct_1.store_pipeline_addr = cmdbuf.store_pipeline | 4 + wc_3d.struct_1.unk_8 = 0x0 + wc_3d.struct_1.unk_c = 0x0 + + TAN_60 = 1.732051 + wc_3d.struct_1.merge_upper_x = TAN_60 / width + wc_3d.struct_1.merge_upper_y = TAN_60 / height + + wc_3d.struct_1.unk_18 = 0x0 + # ISP_MTILE_SIZE + wc_3d.struct_1.tile_blocks_y = mtile_y1 + wc_3d.struct_1.tile_blocks_x = mtile_x1 + wc_3d.struct_1.unk_24 = 0x0 + wc_3d.struct_1.tile_counts = ((tiles_y-1) << 12) | (tiles_x-1) + wc_3d.struct_1.unk_2c = 0x8 + wc_3d.struct_1.depth_clear_val1 = cmdbuf.depth_clear_value + wc_3d.struct_1.stencil_clear_val1 = cmdbuf.stencil_clear_value + wc_3d.struct_1.unk_35 = 0x7 # clear flags? 2 = depth 4 = stencil? + wc_3d.struct_1.unk_36 = 0x0 + wc_3d.struct_1.unk_38 = 0x0 + wc_3d.struct_1.unk_3c = 0x1 + wc_3d.struct_1.unk_40 = 0 + wc_3d.struct_1.unk_44_padding = bytes(0xac) + wc_3d.struct_1.depth_bias_array = Start3DArrayAddr(cmdbuf.depth_bias_array) + wc_3d.struct_1.scissor_array = Start3DArrayAddr(cmdbuf.scissor_array) + wc_3d.struct_1.visibility_result_buffer = 0x0 + wc_3d.struct_1.unk_118 = 0x0 + wc_3d.struct_1.unk_120 = [0] * 37 + wc_3d.struct_1.unk_reload_pipeline = Start3DClearPipelineBinding( + cmdbuf.partial_reload_pipeline_bind, cmdbuf.partial_reload_pipeline | 4) + wc_3d.struct_1.unk_258 = 0 + wc_3d.struct_1.unk_260 = 0 + wc_3d.struct_1.unk_268 = 0 + wc_3d.struct_1.unk_270 = 0 + wc_3d.struct_1.reload_pipeline = Start3DClearPipelineBinding( + cmdbuf.partial_reload_pipeline_bind, cmdbuf.partial_reload_pipeline | 4) + wc_3d.struct_1.depth_flags = cmdbuf.ds_flags | 0x44 + wc_3d.struct_1.unk_290 = 0x0 + wc_3d.struct_1.depth_buffer_ptr1 = cmdbuf.depth_buffer + wc_3d.struct_1.unk_2a0 = 0x0 + wc_3d.struct_1.unk_2a8 = 0x0 + wc_3d.struct_1.depth_buffer_ptr2 = cmdbuf.depth_buffer + wc_3d.struct_1.depth_buffer_ptr3 = cmdbuf.depth_buffer + wc_3d.struct_1.depth_aux_buffer_ptr = depth_aux_buffer_addr + wc_3d.struct_1.stencil_buffer_ptr1 = cmdbuf.stencil_buffer + wc_3d.struct_1.unk_2d0 = 0x0 + wc_3d.struct_1.unk_2d8 = 0x0 + wc_3d.struct_1.stencil_buffer_ptr2 = cmdbuf.stencil_buffer + wc_3d.struct_1.stencil_buffer_ptr3 = cmdbuf.stencil_buffer + wc_3d.struct_1.stencil_aux_buffer_ptr = stencil_aux_buffer_addr + wc_3d.struct_1.unk_2f8 = [0x0, 0x0] + wc_3d.struct_1.aux_fb_unk0 = 4 #0x8 # sometimes 4 + wc_3d.struct_1.unk_30c = 0x0 + wc_3d.struct_1.aux_fb = AuxFBInfo(0xc000, 0, width, height) + wc_3d.struct_1.unk_320_padding = bytes(0x10) + wc_3d.struct_1.unk_partial_store_pipeline = Start3DStorePipelineBinding( + cmdbuf.partial_store_pipeline_bind, cmdbuf.partial_store_pipeline | 4) + wc_3d.struct_1.partial_store_pipeline = Start3DStorePipelineBinding( + cmdbuf.partial_store_pipeline_bind, cmdbuf.partial_store_pipeline | 4) + wc_3d.struct_1.depth_clear_val2 = cmdbuf.depth_clear_value + 
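# The ds_flags bit assignments documented in the comments above are reverse
# engineered and likely incomplete; a helper like this (hypothetical, not part
# of the driver) just restates those bits so the combinations are readable.

DEPTH_LOAD, DEPTH_STORE = 0x08000, 0x80000
STENCIL_LOAD, STENCIL_STORE = 0x04000, 0x40000
DEPTH_COMPRESSED, STENCIL_COMPRESSED = 0x00044, 0x00110

def make_ds_flags(depth_load=False, depth_store=False,
                  stencil_load=False, stencil_store=False):
    flags = 0
    if depth_load:    flags |= DEPTH_LOAD
    if depth_store:   flags |= DEPTH_STORE
    if stencil_load:  flags |= STENCIL_LOAD
    if stencil_store: flags |= STENCIL_STORE
    return flags

assert make_ds_flags(depth_load=True, depth_store=True) == 0x88000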
wc_3d.struct_1.stencil_clear_val2 = cmdbuf.stencil_clear_value + wc_3d.struct_1.unk_375 = 3 + wc_3d.struct_1.unk_376 = 0x0 + wc_3d.struct_1.unk_378 = 0x10 + wc_3d.struct_1.unk_37c = 0x0 + wc_3d.struct_1.unk_380 = 0x0 + wc_3d.struct_1.unk_388 = 0x0 + wc_3d.struct_1.unk_390_0 = 0x0 # Ventura + wc_3d.struct_1.depth_dimensions = (width - 1) | ((height - 1) << 15) + + if True: + wc_3d.struct_2 = Start3DStruct2() + wc_3d.struct_2.unk_0 = 0xa000 + wc_3d.struct_2.clear_pipeline = Start3DClearPipelineBinding( + cmdbuf.load_pipeline_bind, cmdbuf.load_pipeline | 4) + wc_3d.struct_2.unk_18 = 0x88 + wc_3d.struct_2.scissor_array = cmdbuf.scissor_array + wc_3d.struct_2.depth_bias_array = cmdbuf.depth_bias_array + wc_3d.struct_2.aux_fb = wc_3d.struct_1.aux_fb + # ISP_ZLS_PIXELS + wc_3d.struct_2.depth_dimensions = wc_3d.struct_1.depth_dimensions + wc_3d.struct_2.visibility_result_buffer = 0x0 + # ISP_ZLSCTL + wc_3d.struct_2.depth_flags = cmdbuf.ds_flags + wc_3d.struct_2.unk_58_g14_0 = 0x4040404 + wc_3d.struct_2.unk_58_g14_8 = 0 + wc_3d.struct_2.depth_buffer_ptr1 = cmdbuf.depth_buffer + wc_3d.struct_2.depth_buffer_ptr2 = cmdbuf.depth_buffer + wc_3d.struct_2.unk_68_g14_0 = 0 + wc_3d.struct_2.stencil_buffer_ptr1 = cmdbuf.stencil_buffer + wc_3d.struct_2.stencil_buffer_ptr2 = cmdbuf.stencil_buffer + wc_3d.struct_2.unk_78 = [0] * 4 + wc_3d.struct_2.depth_aux_buffer_ptr1 = depth_aux_buffer_addr + wc_3d.struct_2.unk_a0 = 0 + wc_3d.struct_2.depth_aux_buffer_ptr2 = depth_aux_buffer_addr + wc_3d.struct_2.unk_b0 = 0 + wc_3d.struct_2.stencil_aux_buffer_ptr1 = stencil_aux_buffer_addr + wc_3d.struct_2.unk_c0 = 0 + wc_3d.struct_2.stencil_aux_buffer_ptr2 = stencil_aux_buffer_addr + wc_3d.struct_2.unk_d0 = 0 + wc_3d.struct_2.tvb_tilemap = tvb_tilemap._addr + wc_3d.struct_2.tvb_heapmeta_addr = tvb_heapmeta._addr + wc_3d.struct_2.unk_e8 = tiling_params.size1 << 24 + wc_3d.struct_2.tvb_heapmeta_addr2 = tvb_heapmeta._addr + # 0x10000 - clear empty tiles + # ISP_CTL (but bits seem to have moved) + wc_3d.struct_2.unk_f8 = 0x10280 #0x10280 # TODO: varies 0, 0x280, 0x10000, 0x10280 + wc_3d.struct_2.aux_fb_ptr = aux_fb._addr + wc_3d.struct_2.unk_108 = [0x0, 0x0, 0x0, 0x0, 0x0, 0x0] + wc_3d.struct_2.pipeline_base = self.ctx.pipeline_base + wc_3d.struct_2.unk_140 = 0x8c60 + wc_3d.struct_2.unk_148 = 0x0 + wc_3d.struct_2.unk_150 = 0x0 + wc_3d.struct_2.unk_158 = 0x1c + wc_3d.struct_2.unk_160 = 0 + wc_3d.struct_2.unk_168_padding = bytes(0x1d8) + wc_3d.struct_2.unk_198_padding = bytes(0x1a8) + + if True: + wc_3d.struct_6 = Start3DStruct6() + wc_3d.struct_6.tvb_overflow_count = 0x0 + wc_3d.struct_6.unk_8 = 0x0 # 1? + wc_3d.struct_6.unk_c = 0x0 # 1? 
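# struct_1.tile_counts and the depth_dimensions field (ISP_ZLS_PIXELS per the
# comments above) each pack two biased values into one word. Pack/unpack
# helpers matching the exact expressions used above:

def pack_tile_counts(tiles_x, tiles_y):
    return ((tiles_y - 1) << 12) | (tiles_x - 1)

def unpack_tile_counts(v):
    return (v & 0xfff) + 1, (v >> 12) + 1   # (tiles_x, tiles_y)

def pack_depth_dimensions(width, height):
    return (width - 1) | ((height - 1) << 15)

def unpack_depth_dimensions(v):
    return (v & 0x7fff) + 1, (v >> 15) + 1  # (width, height)

assert unpack_tile_counts(pack_tile_counts(60, 34)) == (60, 34)
assert unpack_depth_dimensions(pack_depth_dimensions(1920, 1080)) == (1920, 1080)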
+ wc_3d.struct_6.unk_10 = 0x0 + wc_3d.struct_6.encoder_id = cmdbuf.encoder_id + wc_3d.struct_6.unk_1c = 0xffffffff + wc_3d.struct_6.unknown_buffer = unk_buf._addr + wc_3d.struct_6.unk_28 = 0x0 + wc_3d.struct_6.unk_30 = 0x0 + wc_3d.struct_6.unk_34 = 0x0 + + if True: + wc_3d.struct_7 = Start3DStruct7() + wc_3d.struct_7.unk_0 = 0x0 + wc_3d.struct_7.stamp1 = self.stamp_3d1 + wc_3d.struct_7.stamp2 = self.stamp_3d2 + wc_3d.struct_7.stamp_value = self.stamp_value_3d + wc_3d.struct_7.ev_3d = ev_3d.id + wc_3d.struct_7.evctl_index = 0x0 + wc_3d.struct_7.unk_24 = 1 + wc_3d.struct_7.uuid = uuid_3d + wc_3d.struct_7.prev_stamp_value = self.prev_stamp_value_3d >> 8 + wc_3d.struct_7.unk_30 = 0x0 + + wc_3d.set_addr() # Update inner structure addresses + #print("WC3D", hex(wc_3d._addr)) + #print(" s1", hex(wc_3d.struct_1._addr)) + #print(" s2", hex(wc_3d.struct_2._addr)) + #print(" s6", hex(wc_3d.struct_6._addr)) + #print(" s7", hex(wc_3d.struct_7._addr)) + + ms = GPUMicroSequence(agx) + + start_3d = Start3DCmd() + start_3d.struct1 = wc_3d.struct_1 # 0x44 bytes! + start_3d.struct2 = wc_3d.struct_2 # 0x168 bytes! + start_3d.buf_thing = buf_desc + start_3d.stats_ptr = agx.initdata.regionB.stats_3d.stats._addr + start_3d.busy_flag_ptr = wc_3d.busy_flag._addr + start_3d.struct6 = wc_3d.struct_6 # 4 bytes! + start_3d.struct7 = wc_3d.struct_7 # 4 bytes! + start_3d.cmdqueue_ptr = self.wq_3d.info._addr + start_3d.workitem_ptr = wc_3d._addr + start_3d.context_id = self.ctx_id + start_3d.unk_50 = 0x1 + start_3d.event_generation = self.event_control.generation + start_3d.buffer_mgr_slot = self.buffer_mgr_slot + start_3d.unk_5c = 0x0 + start_3d.prev_stamp_value = self.prev_stamp_value_3d >> 8 + start_3d.unk_68 = 0x0 + start_3d.unk_buf_ptr = wc_3d.unk_758._addr + start_3d.unk_buf2_ptr = wc_3d.unk_buf2._addr + start_3d.unk_7c = 0x0 + start_3d.unk_80 = 0x0 + start_3d.unk_84 = 0x0 + start_3d.uuid = uuid_3d + start_3d.attachments = [] + start_3d.unk_194 = 0 + start_3d.unkptr_19c = self.event_control.unk_buf._addr + + work.fb = None + work.depth = None + + for i in cmdbuf.attachments[:cmdbuf.attachment_count]: + cache_lines = align_up(i.size, 128) // 128 + order = 1 # FIXME + start_3d.attachments.append(Attachment(i.pointer, cache_lines, 0x17, order)) # FIXME check + if work.fb is None and i.type == ASAHI_ATTACHMENT_C: + work.fb = i.pointer + if work.depth is None and i.type == ASAHI_ATTACHMENT_Z: + work.depth = i.pointer + start_3d.attachments += [Attachment(0, 0, 0, 0)] * (16 - len(start_3d.attachments)) + start_3d.num_attachments = cmdbuf.attachment_count + start_3d.unk_190 = 0x0 + + start_3d_offset = ms.append(start_3d) + + ts1 = TimestampCmd() + ts1.unk_1 = 0x0 + ts1.unk_2 = 0x0 + ts1.unk_3 = 0x80 + ts1.ts0_addr = wc_3d.ts1._addr + ts1.ts1_addr = wc_3d.ts2._addr + ts1.ts2_addr = wc_3d.ts2._addr + ts1.cmdqueue_ptr = self.wq_3d.info._addr + ts1.unk_24 = 0x0 + if Ver.check("V >= V13_0B4"): + ts1.unkptr_2c_0 = wc_3d.ts_flag._addr + ts1.uuid = uuid_3d + ts1.unk_30_padding = 0x0 + ms.append(ts1) + + ms.append(WaitForInterruptCmd(0, 1, 0)) + + ts2 = TimestampCmd() + ts2.unk_1 = 0x0 + ts2.unk_2 = 0x0 + ts2.unk_3 = 0x0 + ts2.ts0_addr = wc_3d.ts1._addr + ts2.ts1_addr = wc_3d.ts2._addr + ts2.ts2_addr = wc_3d.ts3._addr + ts2.cmdqueue_ptr = self.wq_3d.info._addr + ts2.unk_24 = 0x0 + if Ver.check("V >= V13_0B4"): + ts2.unkptr_2c_0 = wc_3d.ts_flag._addr + ts2.uuid = uuid_3d + ts2.unk_30_padding = 0x0 + ms.append(ts2) + + finish_3d = Finalize3DCmd() + finish_3d.uuid = uuid_3d + finish_3d.unk_8 = 0 + finish_3d.stamp = self.stamp_3d2 + 
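# The microsequence assembled above runs Start3D -> Timestamp ->
# WaitForInterrupt -> Timestamp -> Finalize3D. A toy model of the offset
# bookkeeping (command sizes invented for illustration): append() returns the
# byte offset of each command, and the finalize command's restart_branch_offset
# (set just below) is the negative distance back to the start command,
# apparently letting the firmware rerun the pass, e.g. after a TVB overflow.

class ToyMicroSequence:
    def __init__(self):
        self.off = 0
        self.cmds = []

    def append(self, name, size):
        off = self.off
        self.cmds.append((off, name))
        self.off += size
        return off

ms_model = ToyMicroSequence()
start_off = ms_model.append("Start3D", 0x1a0)
ms_model.append("Timestamp", 0x34)
ms_model.append("WaitForInterrupt", 0x4)
ms_model.append("Timestamp", 0x34)
restart_branch_offset = start_off - ms_model.off
assert restart_branch_offset == -0x20c     # branch backwards to Start3D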
finish_3d.stamp_value = self.stamp_value_3d + finish_3d.unk_18 = 0 + finish_3d.buf_thing = buf_desc + finish_3d.buffer_mgr = self.buffer_mgr.info + finish_3d.unk_2c = 1 + finish_3d.stats_ptr = agx.initdata.regionB.stats_3d.stats._addr + finish_3d.struct7 = wc_3d.struct_7 + finish_3d.busy_flag_ptr = wc_3d.busy_flag._addr + finish_3d.cmdqueue_ptr = self.wq_3d.info._addr + finish_3d.workitem_ptr = wc_3d._addr + finish_3d.unk_5c = self.ctx_id + finish_3d.unk_buf_ptr = wc_3d.unk_758._addr + finish_3d.unk_6c = 0 + finish_3d.unk_74 = 0 + finish_3d.unk_7c = 0 + finish_3d.unk_84 = 0 + finish_3d.unk_8c = 0 + finish_3d.unk_8c_g14 = 0 + finish_3d.restart_branch_offset = start_3d_offset - ms.off + finish_3d.unk_98 = 0 + finish_3d.unk_9c = bytes(0x10) + ms.append(finish_3d) + ms.finalize() + + work.add(ms.obj) + + wc_3d.microsequence_ptr = ms.obj._addr + wc_3d.microsequence_size = ms.size + + print(wc_3d) + self.wq_3d.submit(wc_3d) + + ##### TA init + + #print(ctx_info) + if wait_for is not None: + barrier_cmd = agx.kobj.new(WorkCommandBarrier, track=False) + work.add(barrier_cmd) + if not isinstance(wait_for, tuple): + barrier_cmd.stamp = wait_for.renderer.stamp_3d2 + barrier_cmd.wait_value = wait_for.stamp_value_3d + barrier_cmd.event = wait_for.ev_3d.id + else: + barrier_cmd.stamp_addr = wait_for[0] + barrier_cmd.wait_value = wait_for[1] + barrier_cmd.event = wait_for[2] + + barrier_cmd.stamp_self = self.stamp_value_ta + barrier_cmd.uuid = uuid_ta + + self.wq_ta.submit(barrier_cmd) + + if not self.buffer_mgr_initialized: + wc_initbm = agx.kobj.new(WorkCommandInitBM, track=False) + work.add(wc_initbm) + wc_initbm.context_id = self.ctx_id + wc_initbm.buffer_mgr_slot = self.buffer_mgr_slot + wc_initbm.unk_c = 0 + wc_initbm.unk_10 = self.buffer_mgr.info.block_count + wc_initbm.buffer_mgr = self.buffer_mgr.info + wc_initbm.stamp_value = self.stamp_value_ta + + self.wq_ta.submit(wc_initbm) + + self.buffer_mgr_initialized = True + + ##### TA execution + + work.wc_ta = wc_ta = agx.kobj.new(WorkCommandTA, track=False) + work.add(work.wc_ta) + wc_ta.context_id = self.ctx_id + wc_ta.counter = 1 + wc_ta.unk_8 = 0 + wc_ta.event_control = self.event_control + wc_ta.buffer_mgr_slot = self.buffer_mgr_slot + wc_ta.buffer_mgr = self.buffer_mgr.info + wc_ta.buf_thing = buf_desc + wc_ta.unk_emptybuf_addr = wc_3d.unk_emptybuf_addr + wc_ta.unk_34 = 0x0 + + wc_ta.unk_154 = bytes(0x268) + wc_ta.unk_3e8 = bytes(0x74) + wc_ta.unk_594 = WorkCommand0_UnkBuf() + + wc_ta.ts1 = TimeStamp(0) + wc_ta.ts2 = TimeStamp(self.tsta_1._addr) + wc_ta.ts3 = TimeStamp(self.tsta_2._addr) + wc_ta.unk_5c4 = 0 + wc_ta.unk_5c8 = 0 + wc_ta.unk_5cc = 0 + wc_ta.unk_5d0 = 0 + wc_ta.unk_5d4 = 1 #0x27 #1 + # Ventura + wc_ta.unk_5e0 = 0 + wc_ta.unk_5e4 = 0 + wc_ta.ts_flag = TsFlag() + + # Structures embedded in WorkCommandTA + if True: + wc_ta.tiling_params = tiling_params + + if True: + wc_ta.struct_2 = StartTACmdStruct2() + wc_ta.struct_2.unk_0 = 0x200 + wc_ta.struct_2.unk_8 = 0x1e3ce508 # fixed + wc_ta.struct_2.unk_c = 0x1e3ce508 # fixed + wc_ta.struct_2.tvb_tilemap = tvb_tilemap._addr + wc_ta.struct_2.tvb_cluster_tilemaps = unk_tile_buf1._addr + wc_ta.struct_2.tpc = self.tpc._addr + wc_ta.struct_2.tvb_heapmeta_addr = tvb_heapmeta._addr | 0x8000_0000_0000_0000 + wc_ta.struct_2.iogpu_unk_54 = 0x6b0003 # fixed + wc_ta.struct_2.iogpu_unk_55 = 0x3a0012 # fixed + wc_ta.struct_2.iogpu_unk_56 = 0x1 # fixed + wc_ta.struct_2.tvb_cluster_meta1 = unk_tile_buf2._addr | 0x4_0000_0000_0000 + wc_ta.struct_2.unk_48 = 0xa000 + wc_ta.struct_2.unk_50 = 0x88 # fixed + 
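# The wait_for argument handled earlier in this submit path is accepted in two
# forms; a small helper (hypothetical, mirroring that branch) makes the
# convention explicit: either a previous render whose 3D completion we wait
# on, or a raw (stamp or stamp address, wait_value, event_id) triple.

def barrier_source(wait_for):
    if isinstance(wait_for, tuple):
        stamp_addr, wait_value, event_id = wait_for
        return stamp_addr, wait_value, event_id
    # Renderer-like object: wait for its 3D pass via its second 3D stamp.
    return (wait_for.renderer.stamp_3d2, wait_for.stamp_value_3d,
            wait_for.ev_3d.id)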
wc_ta.struct_2.tvb_heapmeta_addr2 = tvb_heapmeta._addr + wc_ta.struct_2.unk_60 = 0x0 # fixed + wc_ta.struct_2.core_mask = 0xffffffffffffffff + #wc_ta.struct_2.unk_68 = 0xff << (8 * (self.buffer_mgr_slot % 8)) + wc_ta.struct_2.iogpu_deflake_1 = deflake_1._addr + wc_ta.struct_2.iogpu_deflake_2 = deflake_2._addr + wc_ta.struct_2.unk_80 = 0x1 # fixed + wc_ta.struct_2.iogpu_deflake_3 = deflake_3._addr | 0x4_0000_0000_0000 # check + wc_ta.struct_2.encoder_addr = cmdbuf.encoder_ptr + wc_ta.struct_2.tvb_cluster_meta2 = unk_tile_buf3._addr + wc_ta.struct_2.tvb_cluster_meta3 = unk_tile_buf4._addr + wc_ta.struct_2.tiling_control = 0xa040 #0xa041 # fixed + wc_ta.struct_2.unk_b0 = [0x0, 0x0, 0x0, 0x0, 0x0, 0x0] # fixed + wc_ta.struct_2.pipeline_base = self.ctx.pipeline_base + wc_ta.struct_2.tvb_cluster_meta4 = unk_tile_buf5._addr | 0x3000_0000_0000_0000 + wc_ta.struct_2.unk_f0 = 0x20 # fixed + wc_ta.struct_2.unk_f8 = 0x8c60 # fixed + wc_ta.struct_2.unk_100 = [0x0, 0x0, 0x0] # fixed + wc_ta.struct_2.unk_118 = 0x1c # fixed + + if True: + wc_ta.struct_3 = StartTACmdStruct3() + wc_ta.struct_3.unk_480 = [0x0, 0x0, 0x0, 0x0, 0x0, 0x0] # fixed + wc_ta.struct_3.unk_498 = 0x0 # fixed + wc_ta.struct_3.unk_4a0 = 0x0 # fixed + wc_ta.struct_3.iogpu_deflake_1 = deflake_1._addr + wc_ta.struct_3.unk_4ac = 0x0 # fixed + wc_ta.struct_3.unk_4b0 = 0x0 # fixed + wc_ta.struct_3.unk_4b8 = 0x0 # fixed + wc_ta.struct_3.unk_4bc = 0x0 # fixed + wc_ta.struct_3.unk_4c4_padding = bytes(0x48) + wc_ta.struct_3.unk_50c = 0x0 # fixed + wc_ta.struct_3.unk_510 = 0x0 # fixed + wc_ta.struct_3.unk_518 = 0x0 # fixed + wc_ta.struct_3.unk_520 = 0x0 # fixed + wc_ta.struct_3.unk_528 = 0x0 # fixed + wc_ta.struct_3.unk_52c = 0x0 # fixed + wc_ta.struct_3.unk_530 = 0x0 # fixed + wc_ta.struct_3.encoder_id = cmdbuf.encoder_id + wc_ta.struct_3.unk_538 = 0x0 # fixed + wc_ta.struct_3.unk_53c = 0xffffffff + wc_ta.struct_3.unknown_buffer = wc_3d.struct_6.unknown_buffer + wc_ta.struct_3.unk_548 = 0x0 # fixed + wc_ta.struct_3.unk_550 = [ + 0x0, 0x0, # fixed + 0x0, # 1 for boot stuff? + 0x0, 0x0, 0x0] # fixed + wc_ta.struct_3.stamp1 = self.stamp_ta1 + wc_ta.struct_3.stamp2 = self.stamp_ta2 + wc_ta.struct_3.stamp_value = self.stamp_value_ta + wc_ta.struct_3.ev_ta = ev_ta.id + wc_ta.struct_3.evctl_index = 0 + wc_ta.struct_3.unk_584 = 0x0 # 1 for boot stuff? + wc_ta.struct_3.uuid2 = uuid_ta + wc_ta.struct_3.prev_stamp_value = self.prev_stamp_value_ta >> 8 + wc_ta.struct_3.unk_590 = 0 # sometimes 1? + + wc_ta.set_addr() # Update inner structure addresses + #print("wc_ta", wc_ta) + + ms = GPUMicroSequence(agx) + + start_ta = StartTACmd() + start_ta.tiling_params = wc_ta.tiling_params + start_ta.struct2 = wc_ta.struct_2 # len 0x120 + start_ta.buffer_mgr = self.buffer_mgr.info + start_ta.buf_thing = buf_desc + start_ta.stats_ptr = agx.initdata.regionB.stats_ta.stats._addr + start_ta.cmdqueue_ptr = self.wq_ta.info._addr + start_ta.context_id = self.ctx_id + start_ta.unk_38 = 1 + start_ta.event_generation = self.event_control.generation + start_ta.buffer_mgr_slot = self.buffer_mgr_slot + start_ta.unk_48 = 0#1 #0 + start_ta.unk_50 = 0 + start_ta.struct3 = wc_ta.struct_3 + + start_ta.unkptr_5c = wc_ta.unk_594._addr + start_ta.unk_64 = 0x0 # fixed + start_ta.unk_68 = 0x0 # sometimes 1? 
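# Several StartTACmdStruct2 addresses above carry flag bits OR'd into the
# otherwise unused top bits of the 64-bit GPU VA (1 << 63, 0x4_0000_0000_0000,
# 0x3000_0000_0000_0000); their meaning is unknown. Helpers that make the
# tagging explicit, assuming (true for the user allocations tagged here, but
# an assumption, not a spec) that real VAs fit in the low 44 bits:

VA_MASK = (1 << 44) - 1

def tag_addr(addr, flags):
    assert addr == addr & VA_MASK, "address overlaps flag bits"
    return addr | flags

def untag_addr(tagged):
    return tagged & VA_MASK, tagged & ~VA_MASK

addr, flags = untag_addr(tag_addr(0x15_0000_4000, 1 << 63))
assert addr == 0x15_0000_4000 and flags == 1 << 63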
+ start_ta.uuid = uuid_ta + start_ta.unk_70 = 0x0 # fixed + start_ta.unk_74 = [ # fixed + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + ] + start_ta.unk_15c = 0x0 # fixed + start_ta.unk_160 = 0x0 # fixed + start_ta.unk_168 = 0x0 # fixed + start_ta.unk_16c = 0x0 # fixed + start_ta.unk_170 = 0x0 # fixed + start_ta.unk_178 = 0x0 # fixed? + start_ta.unk_17c = 0x0 + start_ta.unkptr_180 = self.event_control.unk_buf._addr + start_ta.unk_188 = 0x0 + + start_ta_offset = ms.append(start_ta) + + ts1 = TimestampCmd() + ts1.unk_1 = 0x0 + ts1.unk_2 = 0x0 + ts1.unk_3 = 0x80 + ts1.ts0_addr = wc_ta.ts1._addr + ts1.ts1_addr = wc_ta.ts2._addr + ts1.ts2_addr = wc_ta.ts2._addr + ts1.cmdqueue_ptr = self.wq_ta.info._addr + ts1.unk_24 = 0x0 + if Ver.check("V >= V13_0B4"): + ts1.unkptr_2c_0 = wc_ta.ts_flag._addr + ts1.uuid = uuid_ta + ts1.unk_30_padding = 0x0 + ms.append(ts1) + + ms.append(WaitForInterruptCmd(1, 0, 0)) + + ts2 = TimestampCmd() + ts2.unk_1 = 0x0 + ts2.unk_2 = 0x0 + ts2.unk_3 = 0x0 + ts2.ts0_addr = wc_ta.ts1._addr + ts2.ts1_addr = wc_ta.ts2._addr + ts2.ts2_addr = wc_ta.ts3._addr + ts2.cmdqueue_ptr = self.wq_ta.info._addr + ts2.unk_24 = 0x0 + if Ver.check("V >= V13_0B4"): + ts2.unkptr_2c_0 = wc_ta.ts_flag._addr + ts2.uuid = uuid_ta + ts2.unk_30_padding = 0x0 + ms.append(ts2) + + finish_ta = FinalizeTACmd() + finish_ta.buf_thing = buf_desc + finish_ta.buffer_mgr = self.buffer_mgr.info + finish_ta.stats_ptr = agx.initdata.regionB.stats_ta.stats._addr + finish_ta.cmdqueue_ptr = self.wq_ta.info._addr + finish_ta.context_id = self.ctx_id + finish_ta.unk_28 = 0x0 # fixed + finish_ta.struct3 = wc_ta.struct_3 + finish_ta.unk_34 = 0x0 # fixed + finish_ta.uuid = uuid_ta + finish_ta.stamp = self.stamp_ta2 + finish_ta.stamp_value = self.stamp_value_ta + finish_ta.unk_48 = 0x0 # fixed + finish_ta.unk_50 = 0x0 # fixed + finish_ta.unk_54 = 0x0 # fixed + finish_ta.unk_58 = 0x0 # fixed + finish_ta.unk_60 = 0x0 # fixed + finish_ta.unk_64 = 0x0 # fixed + finish_ta.unk_68 = 0x0 # fixed + finish_ta.unk_6c_g14 = 0 # fixed + finish_ta.restart_branch_offset = start_ta_offset - ms.off + finish_ta.unk_70 = 0x0 # fixed + finish_ta.unk_74 = bytes(0x10) # Ventura + ms.append(finish_ta) + + ms.finalize() + + work.add(ms.obj) + + wc_ta.unkptr_45c = self.tpc._addr + wc_ta.tvb_size = tpc_size + wc_ta.microsequence_ptr = ms.obj._addr + wc_ta.microsequence_size = ms.size + wc_ta.ev_3d = ev_3d.id + wc_ta.stamp_value = self.stamp_value_ta + + print(wc_ta) + self.wq_ta.submit(wc_ta) + + self.agx.log("Submit done") + return work + + def run(self): + ##### Run queues + self.agx.log("Run queues") + self.agx.ch.queue[self.queue].q_3D.run(self.wq_3d, self.ev_3d.id) + self.agx.ch.queue[self.queue].q_TA.run(self.wq_ta, self.ev_ta.id) + self.agx.log("Run done") + + def wait(self): + self.agx.log("Waiting...") + work = self.work[-1] + + ##### Wait for work completion + while not self.ev_3d.fired: + self.agx.wait_for_events(timeout=2.0) + + if not self.ev_3d.fired: + self.agx.log("3D event didn't fire") + + self.agx.log(f"Event {self.ev_3d.id} fired") + #print("Stamps:") + #print(self.stamp_ta1.pull()) + #print(self.stamp_ta2.pull()) + #print(self.stamp_3d1.pull()) + #print(self.stamp_3d2.pull()) + + #print("WCs:") + #print(work.wc_3d.pull()) + #print(work.wc_ta.pull()) + + #if work.fb is not None and work.width and work.height: + if work.fb is not None and work.width and work.height and work.width == 1920: + agx = self.agx + 
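# Shape of the completion path used by wait() above: wait_for_events() pumps
# the GPU->CPU channels until the event manager flips ev.fired. A schematic
# equivalent (names hypothetical) of that polling loop:

import time

def wait_for_event(ev, pump_channels, timeout=2.0):
    deadline = time.time() + timeout
    while not ev.fired and time.time() < deadline:
        pump_channels()     # drain Event/FWLog/Stats channel messages
    return ev.fired

# Note that the caller above logs "3D event didn't fire" and carries on
# rather than raising, so a hung GPU still leaves the tooling responsive.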
self.agx.log(f"Render {work.width}x{work.height} @ {work.fb:#x}") + base, obj = self.agx.find_object(work.fb, self.ctx_id) + + #unswizzle(agx, obj._paddr, work.width, work.height, 4, "fb.bin", grid=False) + #open("fb.bin", "wb").write(self.agx.u.iface.readmem(obj._paddr, work.width*work.height*4)) + #os.system(f"convert -size {work.width}x{work.height} -depth 8 rgba:fb.bin -alpha off frame{self.frames}.png") + self.agx.p.fb_blit(0, 0, work.width, work.height, obj._paddr, work.width, PIX_FMT.XBGR) + + if False: #work.depth is not None: + base, obj = self.agx.find_object(work.depth, self.ctx_id) + + width = align_up(work.width, 64) + height = align_up(work.height, 64) + + obj.pull() + chexdump(obj.val) + + unswizzle(self.agx, obj._paddr, work.width, work.height, 4, "depth.bin", grid=False) + os.system(f"convert -size {work.width}x{work.height} -depth 8 rgba:depth.bin -alpha off depth.png") + + for i in self.work: + i.free() + + self.work = [] diff --git a/tools/proxyclient/m1n1/agx/shim.py b/tools/proxyclient/m1n1/agx/shim.py new file mode 100644 index 0000000..253812a --- /dev/null +++ b/tools/proxyclient/m1n1/agx/shim.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT + +import errno, ctypes, sys, atexit, os, os.path, mmap +from construct import * + +from m1n1 import malloc +from m1n1.utils import Register32 +from m1n1.agx import AGX +from m1n1.agx.render import * +from m1n1.agx.uapi import * +from m1n1.proxyutils import * +from m1n1.utils import * + +PAGE_SIZE = 32768 +SHIM_MEM_SIZE = 4 * 1024 * 1024 * 1024 + +class IOCTL(Register32): + NR = 7, 0 + TYPE = 15, 8 + SIZE = 29, 16 + DIR = 31, 30 + +_IOC_NONE = 0 +_IOC_WRITE = 1 +_IOC_READ = 2 + +_IO = lambda type, nr: IOCTL(TYPE=type, NR=nr, SIZE=0, DIR=_IOC_NONE) +_IOR = lambda type, nr, size: IOCTL(TYPE=type, NR=nr, SIZE=size, DIR=_IOC_READ) +_IOW = lambda type, nr, size: IOCTL(TYPE=type, NR=nr, SIZE=size, DIR=_IOC_WRITE) +_IOWR = lambda type, nr, size: IOCTL(TYPE=type, NR=nr, SIZE=size, DIR=_IOC_READ|_IOC_WRITE) + +DRM_IOCTL_BASE = ord('d') + +def IO(nr): + def dec(f): + f._ioctl = _IO(DRM_IOCTL_BASE, nr) + return f + return dec + +def IOR(nr, cls): + def dec(f): + f._ioctl = _IOR(DRM_IOCTL_BASE, nr, cls.sizeof()) + f._arg_cls = cls + return f + return dec + +def IOW(nr, cls): + def dec(f): + f._ioctl = _IOW(DRM_IOCTL_BASE, nr, cls.sizeof()) + f._arg_cls = cls + return f + return dec + +def IOWR(nr, cls): + def dec(f): + f._ioctl = _IOWR(DRM_IOCTL_BASE, nr, cls.sizeof()) + f._arg_cls = cls + return f + return dec + +class DRMAsahiShim: + def __init__(self, memfd): + self.memfd = memfd + self.initialized = False + self.ioctl_map = {} + for key in dir(self): + f = getattr(self, key) + ioctl = getattr(f, "_ioctl", None) + if ioctl is not None: + self.ioctl_map[ioctl.value] = ioctl, f + self.bos = {} + self.pull_buffers = bool(os.getenv("ASAHI_SHIM_PULL")) + self.dump_frames = bool(os.getenv("ASAHI_SHIM_DUMP")) + self.frame = 0 + self.agx = None + + def read_buf(self, ptr, size): + return ctypes.cast(ptr, ctypes.POINTER(ctypes.c_ubyte * size))[0] + + def init_agx(self): + from m1n1.setup import p, u, iface + + p.pmgr_adt_clocks_enable("/arm-io/gfx-asc") + p.pmgr_adt_clocks_enable("/arm-io/sgx") + + self.agx = agx = AGX(u) + + mon = RegMonitor(u, ascii=True, bufsize=0x8000000) + agx.mon = mon + + sgx = agx.sgx_dev + #mon.add(sgx.gpu_region_base, sgx.gpu_region_size, "contexts") + #mon.add(sgx.gfx_shared_region_base, sgx.gfx_shared_region_size, "gfx-shared") + #mon.add(sgx.gfx_handoff_base, sgx.gfx_handoff_size, 
"gfx-handoff") + + #mon.add(agx.initdasgx.gfx_handoff_base, sgx.gfx_handoff_size, "gfx-handoff") + + atexit.register(p.reboot) + agx.start() + + def init(self): + if self.initialized: + return + + self.init_agx() + self.ctx = GPUContext(self.agx) + self.ctx.bind(0x17) + self.renderer = GPURenderer(self.ctx, 0x40, bm_slot=10, queue=1) + + self.initialized = True + + @IOW(DRM_COMMAND_BASE + 0x00, drm_asahi_submit_t) + def submit(self, fd, args): + sys.stdout.write(".") + sys.stdout.flush() + + size = drm_asahi_cmdbuf_t.sizeof() + cmdbuf = drm_asahi_cmdbuf_t.parse(self.read_buf(args.cmdbuf, size)) + + self.log("Pushing objects...") + for obj in self.bos.values(): + #if obj._skipped_pushes > 64:# and obj._addr > 0x1200000000 and obj._size > 131072: + #continue + obj.push(True) + self.log("Push done") + + attachment_objs = [] + for i in cmdbuf.attachments: + for obj in self.bos.values(): + if obj._addr == i.pointer: + attachment_objs.append(obj) + + if self.dump_frames: + name = f"shim_frame{self.frame:03d}.agx" + f = GPUFrame(self.renderer.ctx) + f.cmdbuf = cmdbuf + for obj in self.bos.values(): + f.add_object(obj) + f.save(name) + + self.renderer.submit(cmdbuf) + self.renderer.run() + self.renderer.wait() + + if self.pull_buffers: + self.log("Pulling buffers...") + for obj in attachment_objs: + obj.pull() + obj._map[:] = obj.val + obj.val = obj._map + self.log("Pull done") + + #print("HEAP STATS") + #self.ctx.uobj.va.check() + #self.ctx.gobj.va.check() + #self.ctx.pobj.va.check() + #self.agx.kobj.va.check() + #self.agx.cmdbuf.va.check() + #self.agx.kshared.va.check() + #self.agx.kshared2.va.check() + + self.frame += 1 + return 0 + + @IOW(DRM_COMMAND_BASE + 0x01, drm_asahi_wait_bo_t) + def wait_bo(self, fd, args): + self.log("Wait BO!", args) + return 0 + + @IOWR(DRM_COMMAND_BASE + 0x02, drm_asahi_create_bo_t) + def create_bo(self, fd, args): + memfd_offset = args.offset + + if args.flags & ASAHI_BO_PIPELINE: + alloc = self.renderer.ctx.pobj + else: + alloc = self.renderer.ctx.gobj + + obj = alloc.new(args.size, name=f"GBM offset {memfd_offset:#x}", track=False) + obj._memfd_offset = memfd_offset + obj._pushed = False + obj.val = obj._map = mmap.mmap(self.memfd, args.size, offset=memfd_offset) + self.bos[memfd_offset] = obj + args.offset = obj._addr + + if args.flags & ASAHI_BO_PIPELINE: + args.offset -= self.renderer.ctx.pipeline_base + + self.log(f"Create BO @ {memfd_offset:#x}") + return 0 + + @IOWR(DRM_COMMAND_BASE + 0x04, drm_asahi_get_param_t) + def get_param(self, fd, args): + self.log("Get Param!", args) + return 0 + + @IOWR(DRM_COMMAND_BASE + 0x05, drm_asahi_get_bo_offset_t) + def get_bo_offset(self, fd, args): + self.log("Get BO Offset!", args) + return 0 + + def bo_free(self, memfd_offset): + self.log(f"Free BO @ {memfd_offset:#x}") + self.bos[memfd_offset].free() + del self.bos[memfd_offset] + sys.stdout.flush() + + def ioctl(self, fd, request, p_arg): + self.init() + + p_arg = ctypes.c_void_p(p_arg) + + if request not in self.ioctl_map: + self.log(f"Unknown ioctl: fd={fd} request={IOCTL(request)} arg={p_arg:#x}") + return -errno.ENOSYS + + ioctl, f = self.ioctl_map[request] + + size = ioctl.SIZE + if ioctl.DIR & _IOC_WRITE: + args = f._arg_cls.parse(self.read_buf(p_arg, size)) + ret = f(fd, args) + elif ioctl.DIR & _IOC_READ: + args = f._arg_cls.parse(bytes(size)) + ret = f(fd, args) + else: + ret = f(fd) + + if ioctl.DIR & _IOC_READ: + data = args.build() + assert len(data) == size + ctypes.memmove(p_arg, data, size) + + sys.stdout.flush() + return ret + + def log(self, s): + if 
+        if self.agx is None:
+            print("[Shim] " + s)
+        else:
+            self.agx.log("[Shim] " + s)
+
+Shim = DRMAsahiShim
diff --git a/tools/proxyclient/m1n1/agx/uapi.py b/tools/proxyclient/m1n1/agx/uapi.py
new file mode 100644
index 0000000..75850cb
--- /dev/null
+++ b/tools/proxyclient/m1n1/agx/uapi.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+from construct import *
+from m1n1.constructutils import ConstructClass
+
+__all__ = []
+
+DRM_COMMAND_BASE = 0x40
+
+ASAHI_BO_PIPELINE = 1
+
+class drm_asahi_submit_t(ConstructClass):
+    subcon = Struct(
+        "cmdbuf" / Int64ul,
+        "in_syncs" / Int64ul,
+        "in_sync_count" / Int32ul,
+        "out_sync" / Int32ul,
+    )
+
+class drm_asahi_wait_bo_t(ConstructClass):
+    subcon = Struct(
+        "handle" / Int32ul,
+        Padding(4),
+        "timeout_ns" / Int64sl,
+    )
+
+class drm_asahi_create_bo_t(ConstructClass):
+    subcon = Struct(
+        "size" / Int32ul,
+        "flags" / Int32ul,
+        "handle" / Int32ul,
+        Padding(4),
+        "offset" / Int64ul,
+    )
+
+#class drm_asahi_mmap_bo_t(ConstructClass):
+    #subcon = Struct(
+        #"handle" / Int32ul,
+        #"flags" / Int32ul,
+        #"offset" / Int64ul,
+    #)
+
+class drm_asahi_get_param_t(ConstructClass):
+    subcon = Struct(
+        "param" / Int32ul,
+        Padding(4),
+        "value" / Int64ul,
+    )
+
+class drm_asahi_get_bo_offset_t(ConstructClass):
+    subcon = Struct(
+        "handle" / Int32ul,
+        Padding(4),
+        "offset" / Int64ul,
+    )
+
+ASAHI_MAX_ATTACHMENTS = 16
+
+ASAHI_ATTACHMENT_C = 0
+ASAHI_ATTACHMENT_Z = 1
+ASAHI_ATTACHMENT_S = 2
+
+class drm_asahi_attachment_t(ConstructClass):
+    subcon = Struct(
+        "type" / Int32ul,
+        "size" / Int32ul,
+        "pointer" / Int64ul,
+    )
+
+ASAHI_CMDBUF_LOAD_C = (1 << 0)
+ASAHI_CMDBUF_LOAD_Z = (1 << 1)
+ASAHI_CMDBUF_LOAD_S = (1 << 2)
+
+class drm_asahi_cmdbuf_t(ConstructClass):
+    subcon = Struct(
+        "flags" / Int64ul,
+
+        "encoder_ptr" / Int64ul,
+        "encoder_id" / Int32ul,
+
+        "cmd_ta_id" / Int32ul,
+        "cmd_3d_id" / Int32ul,
+
+        "ds_flags" / Int32ul,
+        "depth_buffer" / Int64ul,
+        "stencil_buffer" / Int64ul,
+
+        "scissor_array" / Int64ul,
+        "depth_bias_array" / Int64ul,
+
+        "fb_width" / Int32ul,
+        "fb_height" / Int32ul,
+
+        "load_pipeline" / Int32ul,
+        "load_pipeline_bind" / Int32ul,
+
+        "store_pipeline" / Int32ul,
+        "store_pipeline_bind" / Int32ul,
+
+        "partial_reload_pipeline" / Int32ul,
+        "partial_reload_pipeline_bind" / Int32ul,
+
+        "partial_store_pipeline" / Int32ul,
+        "partial_store_pipeline_bind" / Int32ul,
+
+        "depth_clear_value" / Float32l,
+        "stencil_clear_value" / Int8ul,
+        Padding(3),
+
+        "attachments" / Array(ASAHI_MAX_ATTACHMENTS, drm_asahi_attachment_t),
+        "attachment_count" / Int32ul,
+    )
+
+__all__.extend(k for k, v in globals().items()
+               if ((callable(v) or isinstance(v, type)) and v.__module__ == __name__) or isinstance(v, int))
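# How the pieces above fit together in the shim's dispatch path: the 32-bit
# ioctl request packs DIR/SIZE/TYPE/NR (the standard Linux _IOC layout, same
# fields as the IOCTL Register32 in shim.py), and the argument buffer
# round-trips through construct. A self-contained sketch:

from construct import Struct, Int32ul, Int64ul, Padding

_IOC_WRITE, _IOC_READ = 1, 2

def ioc(dir_, type_, nr, size):
    return (dir_ << 30) | (size << 16) | (type_ << 8) | nr

def ioc_fields(request):
    return dict(NR=request & 0xff, TYPE=(request >> 8) & 0xff,
                SIZE=(request >> 16) & 0x3fff, DIR=(request >> 30) & 0x3)

create_bo = Struct(
    "size" / Int32ul, "flags" / Int32ul, "handle" / Int32ul,
    Padding(4), "offset" / Int64ul,
)

# DRM_ASAHI_CREATE_BO is command 0x02 on top of DRM_COMMAND_BASE (0x40).
req = ioc(_IOC_READ | _IOC_WRITE, ord('d'), 0x40 + 0x02, create_bo.sizeof())
assert ioc_fields(req)["SIZE"] == create_bo.sizeof() == 24

buf = create_bo.build(dict(size=0x4000, flags=0, handle=3, offset=0))
assert create_bo.parse(buf).handle == 3 and len(buf) == 24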
