Diffstat (limited to 'tools/proxyclient/m1n1/agx')
-rw-r--r--  tools/proxyclient/m1n1/agx/__init__.py  |  343
-rw-r--r--  tools/proxyclient/m1n1/agx/channels.py  |  178
-rw-r--r--  tools/proxyclient/m1n1/agx/context.py   |  247
-rw-r--r--  tools/proxyclient/m1n1/agx/event.py     |   58
-rw-r--r--  tools/proxyclient/m1n1/agx/initdata.py  |  387
-rw-r--r--  tools/proxyclient/m1n1/agx/object.py    |  263
-rw-r--r--  tools/proxyclient/m1n1/agx/render.py    | 1075
-rw-r--r--  tools/proxyclient/m1n1/agx/shim.py      |  244
-rw-r--r--  tools/proxyclient/m1n1/agx/uapi.py      |  116
9 files changed, 2911 insertions, 0 deletions
diff --git a/tools/proxyclient/m1n1/agx/__init__.py b/tools/proxyclient/m1n1/agx/__init__.py new file mode 100644 index 0000000..26368ce --- /dev/null +++ b/tools/proxyclient/m1n1/agx/__init__.py @@ -0,0 +1,343 @@ +# SPDX-License-Identifier: MIT +import bisect, time + +from .object import GPUObject, GPUAllocator +from .initdata import build_initdata +from .channels import * +from .event import GPUEventManager +from ..proxy import IODEV +from ..malloc import Heap +from ..hw.uat import UAT, MemoryAttr +from ..hw.agx import * +from ..fw.agx import AGXASC +from ..fw.agx.channels import ChannelInfoSet, ChannelInfo + +class AGXChannels: + pass + +class AGXQueue: + pass + +class AGX: + PAGE_SIZE = 0x4000 + MAX_EVENTS = 128 + + def __init__(self, u): + self.start_time = time.time() + self.u = u + self.p = u.proxy + + self.iface = u.iface + self.show_stats = False + + self.asc_dev = u.adt["/arm-io/gfx-asc"] + self.sgx_dev = u.adt["/arm-io/sgx"] + self.sgx = SGXRegs(u, self.sgx_dev.get_reg(0)[0]) + + self.log("Initializing allocations") + + self.aic_base = u.adt["/arm-io/aic"].get_reg(0)[0] + + self.all_objects = {} + self.tracked_objects = {} + + # Memory areas + self.fw_va_base = self.sgx_dev.rtkit_private_vm_region_base + self.fw_va_size = self.sgx_dev.rtkit_private_vm_region_size + self.kern_va_base = self.fw_va_base + self.fw_va_size + + # Set up UAT + self.uat = UAT(self.u.iface, self.u) + + # Allocator for RTKit/ASC objects + self.uat.allocator = Heap(self.kern_va_base + 0x80000000, + self.kern_va_base + 0x81000000, + self.PAGE_SIZE) + + self.asc = AGXASC(self.u, self.asc_dev.get_reg(0)[0], self, self.uat) + self.asc.verbose = 0 + self.asc.mgmt.verbose = 0 + + self.kobj = GPUAllocator(self, "kernel", + self.kern_va_base, 0x10000000, + AttrIndex=MemoryAttr.Shared, AP=1, guard_pages=4) + self.cmdbuf = GPUAllocator(self, "cmdbuf", + self.kern_va_base + 0x10000000, 0x10000000, + AttrIndex=MemoryAttr.Shared, AP=0, guard_pages=4) + self.kshared = GPUAllocator(self, "kshared", + self.kern_va_base + 0x20000000, 0x10000000, + AttrIndex=MemoryAttr.Shared, AP=1, guard_pages=4) + self.kshared2 = GPUAllocator(self, "kshared2", + self.kern_va_base + 0x30000000, 0x100000, + AttrIndex=MemoryAttr.Shared, AP=0, PXN=1, guard_pages=4) + + self.io_allocator = Heap(self.kern_va_base + 0x38000000, + self.kern_va_base + 0x40000000, + block=self.PAGE_SIZE) + + self.mon = None + self.event_mgr = GPUEventManager(self) + + self.p.iodev_set_usage(IODEV.FB, 0) + + self.initdata_hook = None + + # Early init, needed? 
+ self.poke_sgx() + + def poke_sgx(self): + self.sgx_base = self.sgx_dev.get_reg(0)[0] + self.p.read32(self.sgx_base + 0xd14000) + self.p.write32(self.sgx_base + 0xd14000, 0x70001) + + def find_object(self, addr, ctx=0): + all_objects = list(self.all_objects.items()) + all_objects.sort() + + idx = bisect.bisect_left(all_objects, ((ctx, addr + 1), "")) - 1 + if idx < 0 or idx >= len(all_objects): + return None, None + + (ctx, base), obj = all_objects[idx] + return base, obj + + def reg_object(self, obj, track=True): + self.all_objects[(obj._ctx, obj._addr)] = obj + if track: + if self.mon is not None: + obj.add_to_mon(self.mon) + self.tracked_objects[(obj._ctx, obj._addr)] = obj + + def unreg_object(self, obj): + del self.all_objects[(obj._ctx, obj._addr)] + if obj._addr in self.tracked_objects: + del self.tracked_objects[(obj._ctx, obj._addr)] + + def poll_objects(self): + for obj in self.tracked_objects.values(): + diff = obj.poll() + if diff is not None: + self.log(diff) + + def alloc_channels(self, cls, name, channel_id, count=1, ring_size=0x100, rx=False): + + # All channels have 0x100 items + item_count = ring_size + item_size = cls.item_size + ring_size = item_count * item_size + + self.log(f"Allocating {count} channel(s) for {name} ({item_count} * {item_size:#x} bytes each)") + + state_obj = self.kshared.new_buf(0x30 * count, f"Channel.{name}.state", track=False) + if rx: + ring_buf = self.kshared.new_buf(ring_size * count, f"Channel.{name}.ring", track=False) + else: + ring_buf = self.kobj.new_buf(ring_size * count, f"Channel.{name}.ring", track=False) + + info = ChannelInfo() + info.state_addr = state_obj._addr + info.ringbuffer_addr = ring_buf._addr + if name == "FWCtl": + self.fwctl_chinfo = info + else: + setattr(self.ch_info, name, info) + + return [cls(self, name + ("" if count == 1 else f"[{i}]"), channel_id, + state_obj._paddr + 0x30 * i, + ring_buf._paddr + ring_size * i, item_count) + for i in range(count)] + + def init_channels(self): + self.log("Initializing channels...") + self.ch_info = ChannelInfoSet() + self.ch = AGXChannels() + self.ch.queue = [] + + # Command queue submission channels + for index in range(4): + queue = AGXQueue() + self.ch.queue.append(queue) + for typeid, chtype in enumerate(("TA", "3D", "CL")): + name = f"{chtype}_{index}" + chan = self.alloc_channels(GPUCmdQueueChannel, name, + (index << 2) | typeid)[0] + setattr(queue, "q_" + chtype, chan) + + # Device control channel + self.ch.devctrl = self.alloc_channels(GPUDeviceControlChannel, "DevCtrl", 0x11)[0] + + # GPU -> CPU channels + self.ch.event = self.alloc_channels(GPUEventChannel, "Event", None, rx=True)[0] + self.ch.log = self.alloc_channels(GPULogChannel, "FWLog", None, 6, rx=True) + self.ch.ktrace = self.alloc_channels(GPUKTraceChannel, "KTrace", None, ring_size=0x200, rx=True)[0] + self.ch.stats = self.alloc_channels(GPUStatsChannel, "Stats", None, rx=True)[0] + + self.ch.fwctl = self.alloc_channels(GPUFWCtlChannel, "FWCtl", None, rx=False)[0] + + # For some reason, the FWLog channels have their rings in a different place... 
+ self.fwlog_ring = self.ch_info.FWLog.ringbuffer_addr + self.ch_info.FWLog.ringbuffer_addr = self.kshared.buf(0x150000, "FWLog_Dummy") + + def poll_channels(self): + for chan in self.ch.log: + chan.poll() + self.ch.ktrace.poll() + if self.show_stats: + self.ch.stats.poll() + self.ch.event.poll() + + def kick_firmware(self): + self.asc.db.doorbell(0x10) + + def show_irqs(self): + hw_state = self.aic_base + 0x4200 + irqs = [] + for irq in self.sgx_dev.interrupts: + v = int(bool((self.p.read32(hw_state + (irq // 32) * 4) & (1 << (irq % 32))))) + irqs.append(v) + self.log(f' SGX IRQ state: {irqs}') + + def timeout(self, msg): + if self.mon: + self.mon.poll() + self.poll_objects() + self.log(msg) + self.log(r' (\________/) ') + self.log(r' | | ') + self.log(r"'.| \ , / |.'") + self.log(r'--| / (( \ |--') + self.log(r".'| _-_- |'.") + self.log(r' |________| ') + self.log(r'') + self.log(r' Timeout nya~!!!!!') + self.log(r'') + self.log(f' Stamp index: {int(msg.stamp_index)}') + self.show_pending_stamps() + self.log(f' Fault info:') + self.log(self.initdata.regionC.fault_info) + + self.show_irqs() + self.check_fault() + self.recover() + + def faulted(self, msg): + if self.mon: + self.mon.poll() + self.poll_objects() + self.log(msg) + self.log(r' (\________/) ') + self.log(r' | | ') + self.log(r"'.| \ , / |.'") + self.log(r'--| / (( \ |--') + self.log(r".'| _-_- |'.") + self.log(r' |________| ') + self.log(r'') + self.log(r' Fault nya~!!!!!') + self.log(r'') + self.show_pending_stamps() + self.log(f' Fault info:') + self.log(self.initdata.regionC.fault_info) + + self.show_irqs() + self.check_fault() + self.recover() + + def show_pending_stamps(self): + self.initdata.regionC.pull() + self.log(f' Pending stamps:') + for i in self.initdata.regionC.pending_stamps: + if i.info or i.wait_value: + self.log(f" - #{i.info >> 3:3d}: {i.info & 0x7}/{i.wait_value:#x}") + i.info = 0 + i.wait_value = 0 + tmp = i.regmap() + tmp.info.val = 0 + tmp.wait_value.val = 0 + + #self.initdata.regionC.push() + + def check_fault(self): + fault_info = self.sgx.FAULT_INFO.reg + if fault_info.value == 0xacce5515abad1dea: + raise Exception("Got fault notification, but fault address is unreadable") + + self.log(f" Fault info: {fault_info}") + + if not fault_info.FAULTED: + return + + fault_addr = fault_info.ADDR + if fault_addr & 0x8000000000: + fault_addr |= 0xffffff8000000000 + base, obj = self.find_object(fault_addr) + info = "" + if obj is not None: + info = f" ({obj!s} + {fault_addr - base:#x})" + self.log(f" GPU fault at {fault_addr:#x}{info}") + self.log(f" Faulting unit: {agx_decode_unit(fault_info.UNIT)}") + + def recover(self): + status = self.fw_status + self.log(f" Halt count: {status.halt_count.val}") + halted = bool(status.halted.val) + self.log(f" Halted: {halted}") + if halted: + self.log(f" Attempting recovery...") + status.halted.val = 0 + status.resume.val = 1 + else: + raise Exception("Cannot recover") + self.show_irqs() + + def resume(self): + self.log("Starting ASC") + self.asc.start() + + self.log("Starting endpoints") + self.asc.start_ep(0x20) + self.asc.start_ep(0x21) + + def start(self): + self.resume() + + self.init_channels() + + self.log("Building initdata") + self.initdata = build_initdata(self) + if self.initdata_hook: + self.initdata_hook(self) + + self.fw_status = self.initdata.fw_status.regmap() + self.uat.flush_dirty() + + self.log("Sending initdata") + self.asc.fw.send_initdata(self.initdata._addr & 0xfff_ffffffff) + self.asc.work() + + self.log("Sending DC_Init") + 
self.ch.devctrl.send_init() + self.asc.work() + + self.log("Sending DC_UpdateIdleTS") + self.ch.devctrl.update_idle_ts() + self.asc.work() + + def stop(self): + self.asc.stop() + + def work(self): + self.asc.work() + + def wait_for_events(self, timeout=1.0): + now = time.time() + deadline = now + timeout + cnt = self.event_mgr.event_count + while now < deadline and self.event_mgr.event_count == cnt: + self.asc.work() + now = time.time() + if self.event_mgr.event_count == cnt: + raise Exception("Timed out waiting for events") + + def log(self, msg): + t = time.time() - self.start_time + print(f"[AGX][{t:10.03f}] " + str(msg)) diff --git a/tools/proxyclient/m1n1/agx/channels.py b/tools/proxyclient/m1n1/agx/channels.py new file mode 100644 index 0000000..c91f347 --- /dev/null +++ b/tools/proxyclient/m1n1/agx/channels.py @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: MIT + +from construct import * +from ..fw.agx.channels import * +from ..fw.agx.cmdqueue import * + +class GPUChannel: + STATE_FIELDS = ChannelStateFields + + def __init__(self, agx, name, channel_id, state_addr, ring_addr, ring_size): + self.agx = agx + self.u = agx.u + self.name = name + self.channel_id = channel_id + self.iface = agx.u.iface + self.state_addr = state_addr + self.ring_addr = ring_addr + self.ring_size = ring_size + self.state = self.STATE_FIELDS(self.u, self.state_addr) + self.state.READ_PTR.val = 0 + self.state.WRITE_PTR.val = 0 + + @classmethod + @property + def item_size(cls): + return cls.MSG_CLASS.sizeof() + + def log(self, msg): + self.agx.log(f"[{self.name}] {msg}") + +class GPUTXChannel(GPUChannel): + def doorbell(self): + self.agx.asc.db.doorbell(self.channel_id) + + def send_message(self, msg): + wptr = self.state.WRITE_PTR.val + self.iface.writemem(self.ring_addr + self.item_size * wptr, + msg.build()) + self.state.WRITE_PTR.val = (wptr + 1) % self.ring_size + self.doorbell() + +class GPURXChannel(GPUChannel): + def poll(self): + wptr = self.state.WRITE_PTR.val + rptr = self.state.READ_PTR.val + + if wptr >= self.ring_size: + raise Exception(f"wptr = {wptr:#x} > {self.ring_size:#x}") + + while rptr != wptr: + msg = self.iface.readmem(self.ring_addr + self.item_size * rptr, + self.item_size) + self.handle_message(self.MSG_CLASS.parse(msg)) + rptr = (rptr + 1) % self.ring_size + self.state.READ_PTR.val = rptr + + def handle_message(self, msg): + self.log(f"Message: {msg}") + +class GPUCmdQueueChannel(GPUTXChannel): + MSG_CLASS = RunCmdQueueMsg + + def run(self, queue, event): + msg = RunCmdQueueMsg() + msg.queue_type = queue.TYPE + msg.cmdqueue = queue.info + msg.cmdqueue_addr = queue.info._addr + msg.head = queue.wptr + msg.event_number = event + msg.new_queue = 1 if queue.first_time else 0 + queue.first_time = False + #print(msg) + self.send_message(msg) + +class GPUDeviceControlChannel(GPUTXChannel): + MSG_CLASS = DeviceControlMsg + + def send_init(self): + self.send_message(DC_Init()) + + def dc_09(self, a, ptr, b): + # Writes to InitData.RegionB + msg = DC_09() + msg.unk_4 = a + msg.unkptr_c = ptr + msg.unk_14 = b + self.send_message(msg) + + def send_foo(self, t, d=None): + msg = DC_Any() + msg.msg_type = t + if d is not None: + msg.data = d + self.send_message(msg) + + def update_idle_ts(self): + self.send_message(DC_UpdateIdleTS()) + + def destroy_context(self, ctx): + msg = DC_DestroyContext() + msg.unk_4 = 0 + msg.unk_8 = 2 + msg.unk_c = 0 + msg.unk_10 = 0 + msg.unk_14 = 0xffff + msg.unk_18 = 0 + msg.context_addr = ctx.gpu_context._addr + print(msg) + self.send_message(msg) + + # Maybe related 
to stamps? + def write32(self, addr, val): + msg = DC_Write32() + msg.addr = addr + msg.data = val + msg.unk_10 = 0 + msg.unk_14 = 0 + msg.unk_18 = 0 + msg.unk_1c = 0 + print(msg) + self.send_message(msg) + + def dc_1e(self, a, b): + msg = DC_1e() + msg.unk_4 = a + msg.unk_c = b + print(msg) + self.send_message(msg) + +class GPUFWCtlChannel(GPUTXChannel): + STATE_FIELDS = FWControlStateFields + MSG_CLASS = FWCtlMsg + + def doorbell(self): + self.agx.asc.db.fwctl_doorbell() + + def send_inval(self, ctx, addr=0): + msg = FWCtlMsg() + msg.addr = addr + msg.unk_8 = 0 + msg.context_id = ctx + msg.unk_10 = 1 + msg.unk_12 = 2 + print(msg) + self.send_message(msg) + +class GPUEventChannel(GPURXChannel): + MSG_CLASS = EventMsg + + def handle_message(self, msg): + if isinstance(msg, FlagMsg): + self.agx.event_mgr.fired(msg.firing) + elif isinstance(msg, FaultMsg): + self.agx.faulted(msg) + elif isinstance(msg, TimeoutMsg): + self.agx.timeout(msg) + else: + self.log(f"Unknown event: {msg}") + +class GPULogChannel(GPURXChannel): + MSG_CLASS = FWLogMsg + + def handle_message(self, msg): + ts = msg.timestamp / 24000000 + self.log(f"[{msg.seq_no:<4d}{ts:14.7f}] {msg.msg}") + +class GPUKTraceChannel(GPURXChannel): + MSG_CLASS = KTraceMsg + + def handle_message(self, msg): + self.log(f"{msg}") + +class GPUStatsChannel(GPURXChannel): + MSG_CLASS = HexDump(Bytes(0x60)) + + def handle_message(self, msg): + if self.agx.show_stats: + self.log(f"stat {msg}") diff --git a/tools/proxyclient/m1n1/agx/context.py b/tools/proxyclient/m1n1/agx/context.py new file mode 100644 index 0000000..41ebed5 --- /dev/null +++ b/tools/proxyclient/m1n1/agx/context.py @@ -0,0 +1,247 @@ +# SPDX-License-Identifier: MIT +from ..utils import chexdump +from ..malloc import Heap +from construct.core import * +from ..fw.agx.channels import * +from ..fw.agx.cmdqueue import * +from ..fw.agx.microsequence import * +from ..hw.uat import MemoryAttr +from .object import * +import textwrap + +class GPUContext: + def __init__(self, agx): + self.agx = agx + self.uat = self.agx.uat + self.u = self.agx.u + self.p = self.agx.p + self.verbose = False + + #self.job_list = agx.kshared.new(JobList) + #self.job_list.first_job = 0 + #self.job_list.last_head = self.job_list._addr # Empty list has self as last_head + #self.job_list.unkptr_10 = 0 + #self.job_list.push() + + self.gpu_context = agx.kobj.new(GPUContextData).push() + + self.ttbr0_base = self.u.memalign(self.agx.PAGE_SIZE, self.agx.PAGE_SIZE) + self.p.memset32(self.ttbr0_base, 0, self.agx.PAGE_SIZE) + + self.objects = {} + + # 32K VA pages since buffer manager needs that + self.uobj = GPUAllocator(agx, "Userspace", 0x1600000000, 0x100000000, ctx=None, + guard_pages=16, + va_block=32768, nG=1, AP=0, PXN=1, UXN=1) + + self.gobj = GPUAllocator(agx, "GEM", 0x1500000000, 0x100000000, ctx=None, + guard_pages=16, nG=1, AP=0, PXN=1, UXN=1) + + self.pipeline_base = 0x1100000000 + self.pipeline_size = 1 << 32 + self.pobj = GPUAllocator(agx, "Pipelines", self.pipeline_base + 0x10000, self.pipeline_size, + ctx=None, guard_pages=1, nG=1, AP=0, PXN=1, UXN=1) + + def bind(self, ctx_id): + self.ctx = ctx_id + self.uobj.ctx = ctx_id + self.gobj.ctx = ctx_id + self.pobj.ctx = ctx_id + self.uat.bind_context(ctx_id, self.ttbr0_base) + self.thing = self.buf_at(0x6fffff8000, 0, 0x4000, "thing") + + def make_stream(self, base): + return self.uat.iostream(self.ctx, base, recurse=False) + + def new_at(self, addr, objtype, name=None, track=True, **flags): + obj = GPUObject(self, objtype) + obj._stream = self.make_stream + 
if name is not None: + obj._name = name + + size_align = align_up(obj._size, self.agx.PAGE_SIZE) + obj._addr = addr + + obj._paddr = self.agx.u.memalign(self.agx.PAGE_SIZE, size_align) + #if isinstance(obj.val, ConstructClassBase): + #obj.val._addr = obj._addr + + self.agx.log(f"[Context@{self.gpu_context._addr:#x}] Map {obj._name} size {obj._size:#x} @ {obj._addr:#x} ({obj._paddr:#x})") + + flags2 = {"AttrIndex": MemoryAttr.Shared} + flags2.update(flags) + obj._map_flags = flags2 + + obj._size_align = size_align + self.agx.uat.iomap_at(self.ctx, obj._addr, obj._paddr, size_align, **flags2) + self.objects[obj._addr] = obj + self.agx.reg_object(obj, track=track) + + return obj + + def buf_at(self, addr, is_pipeline, size, name=None, track=True): + return self.new_at(addr, Bytes(size), name, track=track, + AttrIndex=MemoryAttr.Shared, PXN=1, + nG=1, AP=(1 if is_pipeline else 0)) + + def load_blob(self, addr, is_pipeline, filename, track=True): + data = open(filename, "rb").read() + obj = self.new_at(addr, Bytes(len(data)), filename, track=track, + AttrIndex=MemoryAttr.Shared, PXN=1, + nG=1, AP=(1 if is_pipeline else 0)) + obj.val = data + obj.push() + + return obj + + def free(self, obj): + obj._dead = True + self.agx.uat.iomap_at(self.ctx, obj._addr, 0, obj._size_align, VALID=0) + del self.objects[obj._addr] + self.agx.unreg_object(obj) + + def free_at(self, addr): + self.free(self.objects[obj._addr]) + +class GPUWorkQueue: + def __init__(self, agx, context, job_list): + self.agx = agx + self.u = agx.u + self.p = agx.p + self.context = context + + self.info = agx.kobj.new(CommandQueueInfo) + + self.pointers = agx.kshared.new(CommandQueuePointers).push() + self.pmap = CommandQueuePointerMap(self.u, self.pointers._paddr) + + self.rb_size = self.pointers.rb_size + self.ring = agx.kobj.new_buf(8 * self.rb_size, "GPUWorkQueue.RB") + + self.info.pointers = self.pointers + self.info.rb_addr = self.ring._addr + self.info.job_list = job_list + self.info.gpu_buf_addr = agx.kobj.buf(0x2c18, "GPUWorkQueue.gpu_buf") + self.info.gpu_context = context.gpu_context + self.info.push() + + self.wptr = 0 + self.first_time = True + + self.agx.uat.flush_dirty() + + def submit(self, work): + work.push() + + self.p.write64(self.ring._paddr + 8 * self.wptr, work._addr) + self.wptr = (self.wptr + 1) % self.rb_size + self.agx.uat.flush_dirty() + self.pmap.CPU_WPTR.val = self.wptr + + def wait_empty(self): + while self.wptr != self.pmap.GPU_DONEPTR.val: + self.agx.work() + +class GPU3DWorkQueue(GPUWorkQueue): + TYPE = 1 + +class GPUTAWorkQueue(GPUWorkQueue): + TYPE = 0 + +class GPUMicroSequence: + def __init__(self, agx): + self.agx = agx + self.off = 0 + self.ops = [] + self.obj = None + + def append(self, op): + off = self.off + self.ops.append(op) + self.off += op.sizeof() + return off + + def finalize(self): + self.ops.append(EndCmd()) + self.size = sum(i.sizeof() for i in self.ops) + self.obj = self.agx.kobj.new_buf(self.size, "GPUMicroSequence", track=False) + self.obj.val = b"".join(i.build() for i in self.ops) + self.obj.push() + return self.obj + + def dump(self): + chexdump(self.agx.iface.readmem(self.obj._paddr, self.size)) + print(MicroSequence.parse_stream(self.agx.uat.iostream(0, self.obj._addr))) + + def __str__(self): + s = f"GPUMicroSequence: {len(self.ops)} ops\n" + for i, op in enumerate(self.ops): + op_s = textwrap.indent(str(op), ' ' * 4) + s += f"[{i:2}:{op.sizeof():#x}] = {op!s}\n" + return s + +class GPUBufferManager: + def __init__(self, agx, context, blocks=8): + self.agx = agx + self.ctx = 
context + + self.block_ctl_obj = agx.kshared.new(BufferManagerBlockControl) + self.block_ctl_obj.total = blocks + self.block_ctl_obj.wptr = 0 + self.block_ctl_obj.unk = 0 + self.block_ctl = self.block_ctl_obj.push().regmap() + + self.counter_obj = agx.kshared.new(BufferManagerCounter) + self.counter_obj.count = 0 + self.counter = self.counter_obj.push().regmap() + + self.misc_obj = agx.kshared.new(BufferManagerMisc) + self.misc_obj.cpu_flag = 1 + self.misc = self.misc_obj.push().regmap() + + self.page_size = 0x8000 + self.pages_per_block = 4 + self.block_size = self.pages_per_block * self.page_size + + self.page_list = context.uobj.new(Array(0x10000 // 4, Int32ul), "BM PageList", track=False) + self.block_list = context.uobj.new(Array(0x8000 // 4, Int32ul), "BM BlockList", track=False) + + self.info = info = agx.kobj.new(BufferManagerInfo) + info.page_list_addr = self.page_list._addr + info.page_list_size = self.page_list._size + info.page_count = self.block_ctl_obj.total * 4 + info.block_count = self.block_ctl_obj.total + + info.block_list_addr = self.block_list._addr + info.block_ctl = self.block_ctl_obj + info.last_page = info.page_count - 1 + info.block_size = self.block_size + + info.counter = self.counter_obj + + self.populate() + self.block_ctl_obj.pull() + self.block_list.push() + self.page_list.push() + + info.push() + + def increment(self): + self.counter_obj.count += 1 + self.counter_obj.push() + + def populate(self): + idx = self.block_ctl.wptr.val + total = self.block_ctl.total.val + while idx < total: + block = self.ctx.uobj.new_buf(self.block_size, "BM Block", track=False) + self.block_list[idx * 2] = block._addr // self.page_size + + page_idx = idx * self.pages_per_block + for i in range(self.pages_per_block): + self.page_list[page_idx + i] = block._addr // self.page_size + i + + idx += 1 + self.block_ctl.wptr.val = idx + diff --git a/tools/proxyclient/m1n1/agx/event.py b/tools/proxyclient/m1n1/agx/event.py new file mode 100644 index 0000000..693f3a5 --- /dev/null +++ b/tools/proxyclient/m1n1/agx/event.py @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: MIT +from ..utils import chexdump +from ..malloc import Heap +from construct.core import * +from ..fw.agx.channels import * +from ..fw.agx.cmdqueue import * +from ..fw.agx.microsequence import * +from ..hw.uat import MemoryAttr +from .object import * +import textwrap + +class GPUEventManager: + MAX_EVENTS = 128 + + def __init__(self, agx): + self.agx = agx + + self.event_count = 0 + self.free_events = set(range(self.MAX_EVENTS)) + self.events = [None] * self.MAX_EVENTS + + def allocate_event(self): + if not self.free_events: + raise Exception("No free events") + ev_id = self.free_events.pop() + + ev = GPUEvent(ev_id) + self.events[ev_id] = ev + + return ev + + def free_event(self, ev): + self.events[ev.id] = None + self.free_events.add(ev.id) + + def fired(self, flags): + self.agx.log("= Events fired =") + for i, v in enumerate(flags): + for j in range(64): + if v & (1 << j): + ev_id = i * 64 + j + ev = self.events[ev_id] + self.agx.log(f"Event fired: {ev_id}") + if ev is None: + raise Exception("Received spurious notification for event ID {ev}") + ev.fire() + self.event_count += 1 + +class GPUEvent: + def __init__(self, ev_id): + self.id = ev_id + self.fired = False + + def fire(self): + self.fired = True + + def rearm(self): + self.fired = False diff --git a/tools/proxyclient/m1n1/agx/initdata.py b/tools/proxyclient/m1n1/agx/initdata.py new file mode 100644 index 0000000..d6fa76a --- /dev/null +++ 
b/tools/proxyclient/m1n1/agx/initdata.py @@ -0,0 +1,387 @@ +# SPDX-License-Identifier: MIT +from ..fw.agx.initdata import * +from ..fw.agx.channels import ChannelInfo +from ..hw.uat import MemoryAttr + +from construct import Container + +def build_iomappings(agx, chip_id): + def iomap(phys, size, range_size, rw): + off = phys & 0x3fff + virt = agx.io_allocator.malloc(size + 0x4000 + off) + agx.uat.iomap_at(0, virt, phys - off, size + off, AttrIndex=MemoryAttr.Device) + return IOMapping(phys, virt + off, size, range_size, rw) + + # for t8103 + if chip_id == 0x8103: + return [ + iomap(0x204d00000, 0x1c000, 0x1c000, 1), # Fender + iomap(0x20e100000, 0x4000, 0x4000, 0), # AICTimer + iomap(0x23b104000, 0x4000, 0x4000, 1), # AICSWInt + iomap(0x204000000, 0x20000, 0x20000, 1), # RGX + IOMapping(), # UVD + IOMapping(), # unused + IOMapping(), # DisplayUnderrunWA + iomap(0x23b2e8000, 0x1000, 0x1000, 0), # AnalogTempSensorControllerRegs + iomap(0x23bc00000, 0x1000, 0x1000, 1), # PMPDoorbell + iomap(0x204d80000, 0x5000, 0x5000, 1), # MetrologySensorRegs + iomap(0x204d61000, 0x1000, 0x1000, 1), # GMGIFAFRegs + iomap(0x200000000, 0xd6400, 0xd6400, 1), # MCache registers + IOMapping(), # AICBankedRegisters + iomap(0x23b738000, 0x1000, 0x1000, 1), # PMGRScratch + IOMapping(), # NIA Special agent idle register die 0 + IOMapping(), # NIA Special agent idle register die 1 + IOMapping(), # CRE registers + IOMapping(), # Streaming codec registers + IOMapping(), # + IOMapping(), # + ] + elif chip_id == 0x8112: + return [ + iomap(0x204d00000, 0x14000, 0x14000, 1), # Fender + iomap(0x20e100000, 0x4000, 0x4000, 0), # AICTimer + iomap(0x23b0c4000, 0x4000, 0x4000, 1), # AICSWInt + iomap(0x204000000, 0x20000, 0x20000, 1), # RGX + IOMapping(), # UVD + IOMapping(), # unused + IOMapping(), # DisplayUnderrunWA + iomap(0x23b2c0000, 0x1000, 0x1000, 0), # AnalogTempSensorControllerRegs + IOMapping(), # PMPDoorbell + iomap(0x204d80000, 0x8000, 0x8000, 1), # MetrologySensorRegs + iomap(0x204d61000, 0x1000, 0x1000, 1), # GMGIFAFRegs + iomap(0x200000000, 0xd6400, 0xd6400, 1), # MCache registers + IOMapping(), # AICBankedRegisters + IOMapping(), # PMGRScratch + IOMapping(), # NIA Special agent idle register die 0 + IOMapping(), # NIA Special agent idle register die 1 + iomap(0x204e00000, 0x10000, 0x10000, 0), # CRE registers + iomap(0x27d050000, 0x4000, 0x4000, 0), # Streaming codec registers + iomap(0x23b3d0000, 0x1000, 0x1000, 0), # + iomap(0x23b3c0000, 0x1000, 0x1000, 0), # + ] + elif chip_id in (0x6000, 0x6001, 0x6002): + mcc_cnt = {0x6002: 16, 0x6001: 8, 0x6000: 4} + return [ + iomap(0x404d00000, 0x1c000, 0x1c000, 1), # Fender + iomap(0x20e100000, 0x4000, 0x4000, 0), # AICTimer + iomap(0x28e104000, 0x4000, 0x4000, 1), # AICSWInt + iomap(0x404000000, 0x20000, 0x20000, 1), # RGX + IOMapping(), # UVD + IOMapping(), # unused + IOMapping(), # DisplayUnderrunWA + iomap(0x28e494000, 0x1000, 0x1000, 0), # AnalogTempSensorControllerRegs + IOMapping(), # PMPDoorbell + iomap(0x404d80000, 0x8000, 0x8000, 1), # MetrologySensorRegs + iomap(0x204d61000, 0x1000, 0x1000, 1), # GMGIFAFRegs + iomap(0x200000000, mcc_cnt[chip_id] * 0xd8000, 0xd8000, 1), # MCache registers + IOMapping(), # AICBankedRegisters + IOMapping(), # PMPDoorbell + iomap(0x2643c4000, 0x1000, 0x1000, 1), # NIA Special agent idle register die 0 + iomap(0x22643c4000, 0x1000, 0x1000, 1) if chip_id == 0x6002 else IOMapping(), # NIA Special agent idle register die 1 + IOMapping(), # CRE registers + IOMapping(), # Streaming codec registers + iomap(0x28e3d0000, 0x1000, 0x1000, 
1), + iomap(0x28e3c0000, 0x2000, 0x2000, 0), + ] + + +CHIP_INFO = { + 0x8103: Container( + chip_id = 0x8103, + min_sram_volt = 850, + max_power = 19551, + max_freq_mhz = 1278, + unk_87c = -220, + unk_8cc = 9880, + unk_924 = [[0] * 8] * 8, + unk_e48 = [[0] * 8] * 8, + unk_e24 = 112, + gpu_fast_die0_sensor_mask64 = 0x12, + gpu_fast_die0_sensor_mask64_alt = 0x12, + gpu_fast_die0_sensor_present = 0x01, + shared1_tab = [ + -1, 0x7282, 0x50ea, 0x370a, 0x25be, 0x1c1f, 0x16fb + ] + ([-1] * 10), + shared1_a4 = 0xffff, + shared2_tab = [0x800, 0x1555, -1, -1, -1, -1, -1, -1, 0, 0], + shared2_unk_508 = 0xc0007, + unk_3cf4 = [1000.0, 0, 0, 0, 0, 0, 0, 0], + unk_3d14 = [45.0, 0, 0, 0, 0, 0, 0, 0], + unk_118ec = None, + hwdb_4e0 = 0, + hwdb_534 = 0, + num_cores = 8, + gpu_core = 11, + gpu_rev = 4, + hwdb_ab8 = 0x48, + hwdb_abc = 0x8, + hwdb_b30 = 0, + rel_max_powers = [0, 19, 26, 38, 60, 87, 100], + ), + 0x6001: Container( + chip_id = 0x6001, + min_sram_volt = 790, + max_power = 81415, + max_freq_mhz = 1296, + unk_87c = 900, + unk_8cc = 11000, + unk_924 = [[i, *([0] * 7)] for i in [ + 9.838, 9.819, 9.826, 9.799, + 0, 0, 0, 0, + ]], + unk_e48 = [[i, *([0] * 7)] for i in [ + 13, 13, 13, 13, 0, 0, 0, 0, + ]], + unk_e24 = 125, + gpu_fast_die0_sensor_mask64 = 0x80808080, + gpu_fast_die0_sensor_mask64_alt = 0x90909090, + gpu_fast_die0_sensor_present = 0x0f, + shared1_tab = [0] + ([0xffff] * 16), + shared1_a4 = 0xffff, + shared2_tab = [-1, -1, -1, -1, 0x2aa, 0xaaa, -1, -1, 0, 0], + shared2_unk_508 = 0xcc00001, + unk_3cf4 = [1314.0, 1330.0, 1314.0, 1288.0, 0, 0, 0, 0], + unk_3d14 = [21.0, 21.0, 22.0, 21.0, 0, 0, 0, 0], + unk_118ec = [ + 0, 1, 2, + 1, 1, 90, 75, 1, 1, + 1, 2, 90, 75, 1, 1, + 1, 1, 90, 75, 1, 1 + ], + hwdb_4e0 = 4, + hwdb_534 = 1, + num_cores = 32, + gpu_core = 13, + gpu_rev = 5, + hwdb_ab8 = 0x2084, + hwdb_abc = 0x80, + hwdb_b30 = 0, + rel_max_powers = [0, 15, 20, 27, 36, 52, 100], + ), + 0x6002: Container( + chip_id = 0x6002, + min_sram_volt = 790, + max_power = 166743, + max_freq_mhz = 1296, + unk_87c = 900, + unk_8cc = 11000, + unk_924 = [[i, *([0] * 7)] for i in [ + 9.838, 9.819, 9.826, 9.799, + 9.799, 9.826, 9.819, 9.838, + ]], + unk_c30 = 0, + unk_e48 = [[i, *([0] * 7)] for i in [ + 13, 13, 13, 13, 13, 13, 13, 13, + ]], + unk_e24 = 125, + gpu_fast_die0_sensor_mask64 = 0x8080808080808080, + gpu_fast_die0_sensor_mask64_alt = 0x9090909090909090, + gpu_fast_die0_sensor_present = 0xff, + shared1_tab = [0] + ([0xffff] * 16), + shared1_a4 = 0xffff, + shared2_tab = [-1, -1, -1, -1, 0x2aa, 0xaaa, -1, -1, 0, 0], + shared2_unk_508 = 0xcc00001, + unk_3cf4 = [1244.0, 1260.0, 1242.0, 1214.0, + 1072.0, 1066.0, 1044.0, 1042.0], + unk_3d14 = [18.0, 18.0, 18.0, 17.0, 15.0, 15.0, 15.0, 14.0], + unk_8924 = 0, + unk_118ec = [ + 0, 1, 2, + 1, 1, 90, 75, 1, 1, + 1, 2, 90, 75, 1, 1, + 1, 1, 90, 75, 1, 1 + ], + hwdb_4e0 = 4, + hwdb_534 = 1, + num_cores = 64, + gpu_core = 13, + gpu_rev = 5, + hwdb_ab8 = 0x2084, + hwdb_abc = 0x80, + hwdb_b30 = 0, + rel_max_powers = [0, 15, 19, 25, 34, 50, 100], + ), + 0x8112: Container( + chip_id = 0x8112, + min_sram_volt = 780, + max_power = 22800, + max_freq_mhz = 1398, + unk_87c = 900, + unk_8cc = 11000, + unk_924 = [[ + 0.0, 0.0, 0.0, 0.0, + 5.3, 0.0, 5.3, 6.6, + ]] + ([[0] * 8] * 7), + unk_e48 = [[ + 0.0, 0.0, 0.0, 0.0, + 5.3, 0.0, 5.3, 6.6, + ]] + ([[0] * 8] * 7), + unk_e24 = 125, + gpu_fast_die0_sensor_mask64 = 0x6800, + gpu_fast_die0_sensor_mask64_alt = 0x6800, + gpu_fast_die0_sensor_present = 0x02, + shared1_tab = [0] + ([0xffff] * 16), + shared1_a4 = 0, + shared2_tab = [-1, 
-1, -1, -1, -1, -1, -1, -1, 0xaa5aa, 0], + shared2_unk_508 = 0xc00000, + unk_3cf4 = [1920.0, 0, 0, 0, 0, 0, 0, 0], + unk_3d14 = [74.0, 0, 0, 0, 0, 0, 0, 0], + unk_118ec = None, + hwdb_4e0 = 4, + hwdb_534 = 0, + num_cores = 10, + gpu_core = 15, + gpu_rev = 3, + hwdb_ab8 = 0x2048, + hwdb_abc = 0x4000, + hwdb_b30 = 1, + rel_max_powers = [0, 18, 27, 37, 52, 66, 82, 96, 100], + ), +} +def build_initdata(agx): + sgx = agx.u.adt["/arm-io/sgx"] + chosen = agx.u.adt["/chosen"] + chip_info = CHIP_INFO[chosen.chip_id] + + initdata = agx.kshared.new(InitData) + + initdata.ver_info = (1, 1, 16, 1) + + initdata.regionA = agx.kshared.new_buf(0x4000, "InitData_RegionA").push() + + regionB = agx.kobj.new(InitData_RegionB) + + regionB.channels = agx.ch_info + + regionB.stats_ta = agx.kobj.new(InitData_GPUGlobalStatsTA).push() + regionB.stats_3d = agx.kobj.new(InitData_GPUGlobalStats3D).push() + + # size: 0x180, Empty + # 13.0: grew + #regionB.stats_cp = agx.kobj.new_buf(0x180, "RegionB.unkptr_180").push() + regionB.stats_cp = agx.kobj.new_buf(0x980, "RegionB.unkptr_180").push() + + # size: 0x3b80, few floats, few ints, needed for init + regionB.hwdata_a = agx.kobj.new(AGXHWDataA(sgx, chip_info), track=False) + + # size: 0x80, empty + regionB.unk_190 = agx.kobj.new_buf(0x80, "RegionB.unkptr_190").push() + + # size: 0xc0, fw writes timestamps into this + regionB.unk_198 = agx.kobj.new_buf(0xc0, "RegionB.unkptr_198").push() + + # size: 0xb80, io stuff + hwdata = agx.kobj.new(AGXHWDataB(sgx, chip_info), track=False) + hwdata.io_mappings = build_iomappings(agx, chosen.chip_id) + + k = 1.02 #? + count = sgx.perf_state_count + table_count = sgx.perf_state_table_count + base_pstate = sgx.getprop("gpu-perf-base-pstate", 3) + base_freq = sgx.perf_states[base_pstate].freq + max_freq = sgx.perf_states[count - 1].freq + for i in range(count): + ps = sgx.perf_states[i] + hwdata.frequencies[i] = ps.freq // 1000000 + + volt = [ps.volt] * 8 + for j in range(1, table_count): + volt[j] = sgx.perf_states[count * j + i].volt + sram_volt = [max(chip_info.min_sram_volt, i) for i in volt] + + hwdata.voltages[i] = volt + hwdata.voltages_sram[i] = sram_volt + + regionB.hwdata_a.unk_74[i] = k + hwdata.unk_9b4[i] = k + hwdata.rel_max_powers[i] = chip_info.rel_max_powers[i] + hwdata.rel_boost_freqs[i] = max(0, int((ps.freq - base_freq) / (max_freq - base_freq) * 100)) + + regionB.hwdata_a.push() + + regionB.hwdata_b = hwdata.push() + regionB.hwdata_b_addr2 = hwdata._addr + + regionB.fwlog_ring2 = agx.fwlog_ring + + # Unallocated, Size 0x1000 + regionB.unk_1b8 = agx.kobj.new_buf(0x1000, "RegionB.unkptr_1b8").push() + + # Unallocated, size 0x300 + regionB.unk_1c0 = agx.kobj.new_buf(0x300, "RegionB.unkptr_1c0").push() + + # Unallocated, unknown size + regionB.unk_1c8 = agx.kobj.new_buf(0x1000, "RegionB.unkptr_1c8").push() + + # Size: 0x4000 + regionB.buffer_mgr_ctl = agx.kshared2.new(InitData_BufferMgrCtl).push() + regionB.buffer_mgr_ctl_addr2 = regionB.buffer_mgr_ctl._addr + + regionB.unk_6a80 = 0 + regionB.gpu_idle = 0 + regionB.unk_6a9c = 0 + regionB.unk_ctr0 = 0 + regionB.unk_ctr1 = 0 + regionB.unk_6aa8 = 0 + regionB.unk_6aac = 0 + regionB.unk_ctr2 = 0 + regionB.unk_6ab4 = 0 + regionB.unk_6ab8 = 0 + regionB.unk_6abc = 0 + regionB.unk_6ac0 = 0 + regionB.unk_6ac4 = 0 + regionB.unk_ctr3 = 0 + regionB.unk_6acc = 0 + regionB.unk_6ad0 = 0 + regionB.unk_6ad4 = 0 + regionB.unk_6ad8 = 0 + regionB.unk_6adc = 0 + regionB.unk_6ae0 = 0 + regionB.unk_6ae4 = 0 + regionB.unk_6ae8 = 0 + regionB.unk_6aec = 0 + regionB.unk_6af0 = 0 + regionB.unk_ctr4 = 
0 + regionB.unk_ctr5 = 0 + regionB.unk_6afc = 0 + + initdata.regionB = regionB.push() + + initdata.regionC = agx.kshared.new(InitData_RegionC(sgx, chip_info), track=False).push() + + #self.regionC_addr = agx.ksharedshared_heap.malloc(0x88000) + + initdata.fw_status = agx.kobj.new(InitData_FWStatus) + initdata.fw_status.fwctl_channel = agx.fwctl_chinfo + initdata.fw_status.push() + + ## This section seems to be data that would be used by firmware side page allocation + ## But the current firmware doesn't have this functionality enabled, so it's not used? + initdata.uat_num_levels = 3 + initdata.uat_page_bits = 14 + initdata.uat_page_size = 0x4000 + + if chip_info.chip_id in (0x8103, 0x8112): + phys_mask = 0xffffffc000 + else: + phys_mask = 0x3ffffffc000 + + initdata.uat_level_info = [ + UatLevelInfo(36, 8, phys_mask), + UatLevelInfo(25, 2048, phys_mask), + UatLevelInfo(14, 2048, phys_mask), + ] + + # Host handles FW allocations for existing firmware versions + initdata.host_mapped_fw_allocations = 1 + + + #initdata.regionC.idle_ts = agx.u.mrs("CNTPCT_EL0") + 24000000 + #initdata.regionC.idle_unk = 0x5b2e8 + #initdata.regionC.idle_to_off_timeout_ms = 20000 + + initdata.regionC.push() + initdata.push() + + #print(InitData.parse_stream(agx.uat.iostream(0, initdata._addr))) + return initdata diff --git a/tools/proxyclient/m1n1/agx/object.py b/tools/proxyclient/m1n1/agx/object.py new file mode 100644 index 0000000..8f382f9 --- /dev/null +++ b/tools/proxyclient/m1n1/agx/object.py @@ -0,0 +1,263 @@ +# SPDX-License-Identifier: MIT +import io, time + +from ..malloc import Heap +from ..utils import * +from ..constructutils import ConstructClassBase, str_value +from construct import Bytes, Container, HexDump +from ..hw.uat import MemoryAttr + +class GPUObject: + def __init__(self, allocator, objtype): + self._raw = False + if isinstance(objtype, int): + self.val = bytes(objtype) + self._size = objtype + self._name = b"Bytes({objtype})" + self._raw = True + elif isinstance(objtype, ConstructClassBase): + self.val = objtype + objtype = type(objtype) + self._size = objtype.sizeof() + self._name = objtype.__name__ + elif isinstance(objtype, type) and issubclass(objtype, ConstructClassBase): + self._size = objtype.sizeof() + self.val = objtype() + self._name = objtype.__name__ + else: + self._size = objtype.sizeof() + self.val = objtype.parse(bytes(self._size)) + self._name = type(objtype).__name__ + + self._alloc = allocator + self._type = objtype + self._addr = None + self._data = None + self._dead = False + self._map_flags = {} + self._mon_val = None + self._skipped_pushes = 0 + self._compress_threshold = 65536 + self._strm = None + self._read_phys = False + + def push(self, if_needed=False): + self._mon_val = self.val + assert self._addr is not None + + if self._raw: + data = self.val + else: + context = Container() + context._parsing = False + context._building = True + context._sizing = False + context._params = context + # build locally and push as a block for efficiency + ios = io.BytesIO() + self._type._build(self.val, ios, context, "(pushing)") + data = ios.getvalue() + + #if self._alloc.verbose: + #t = time.time() + #self._alloc.agx.log(f"[{self._name} @{self._addr:#x}] chk {self._size} bytes") + if if_needed and data[:] == self._data: + self._skipped_pushes += 1 + #if self._alloc.verbose: + #t2 = time.time() + #mbs = self._size / (t2 - t) / 1000000 + #self._alloc.agx.log(f"[{self._name} @{self._addr:#x}] chk done ({mbs:.02f} MB/s)") + return self + + self._skipped_pushes = 0 + + t = time.time() 
+ if data == bytes(self._size): + if self._alloc.verbose: + self._alloc.agx.log(f"[{self._name} @{self._addr:#x}] zeroing {self._size} bytes") + self._alloc.agx.p.memset8(self._paddr, 0, self._size) + elif self._size > self._compress_threshold: + if self._alloc.verbose: + self._alloc.agx.log(f"[{self._name} @{self._addr:#x}] pushing {self._size} bytes (compressed)") + self._alloc.agx.u.compressed_writemem(self._paddr, data) + else: + if self._alloc.verbose: + self._alloc.agx.log(f"[{self._name} @{self._addr:#x}] pushing {self._size} bytes") + self._alloc.agx.iface.writemem(self._paddr, data) + if self._alloc.verbose: + t2 = time.time() + mbs = self._size / (t2 - t) / 1000000 + self._alloc.agx.log(f"[{self._name} @{self._addr:#x}] push done ({mbs:.02f} MB/s)") + #stream.write(data) + if isinstance(self._type, type) and issubclass(self._type, ConstructClassBase): + if self._strm is None: + self._strm = self._alloc.make_stream(self._addr) + self.val.set_addr(self._addr, self._strm) + + self._data = bytes(data) + return self + + def _pull(self): + if self._raw: + assert self._paddr is not None + return self._alloc.agx.iface.readmem(self._paddr, self._size) + + assert self._addr is not None + context = Container() + context._parsing = True + context._building = False + context._sizing = False + context._params = context + if self._alloc.verbose: + self._alloc.agx.log(f"[{self._name} @{self._addr:#x}] pulling {self._size} bytes") + if self._read_phys: + stream = io.BytesIO() + stream.write(self._alloc.agx.iface.readmem(self._paddr, self._size)) + stream.seek(0) + else: + stream = self._alloc.make_stream(self._addr) + return self._type._parse(stream, context, f"(pulling {self._name})") + + def pull(self): + self._mon_val = self.val = self._pull() + return self + + def poll(self): + prev_val = self._mon_val + self._mon_val = cur_val = self._pull() + if not hasattr(cur_val, "diff"): + return None + if cur_val != prev_val: + diff = cur_val.diff(prev_val) + assert diff is not None + return f"GPUObject {self._name} ({self._size:#x} @ {self._addr:#x}): " + diff + else: + return None + + @property + def _ctx(self): + return self._alloc.ctx + + def add_to_mon(self, mon): + mon.add(self._addr, self._size, self._name, offset=0, + readfn=lambda a, s: self._alloc.agx.iface.readmem(a - self._addr + self._paddr, s)) + + def _set_addr(self, addr, paddr=None): + self._addr = addr + self._paddr = paddr + if isinstance(self.val, ConstructClassBase): + self.val.set_addr(addr) + + def __getitem__(self, item): + return self.val[item] + def __setitem__(self, item, value): + self.val[item] = value + + def __getattr__(self, attr): + return getattr(self.val, attr) + + def __setattr__(self, attr, val): + if attr.startswith("_") or attr == "val": + self.__dict__[attr] = val + return + + setattr(self.val, attr, val) + + def __str__(self): + if isinstance(self.val, bytes) and len(self.val) > 128: + s_val = f"<{len(self.val)} bytes>" + else: + s_val = str_value(self.val) + return f"GPUObject {self._name} ({self._size:#x} @ {self._addr:#x}): " + s_val + + def free(self): + if self._dead: + return + self._dead = True + self._alloc.free(self) + +class GPUAllocator: + def __init__(self, agx, name, start, size, + ctx=0, page_size=16384, va_block=None, guard_pages=1, **kwargs): + self.page_size = page_size + if va_block is None: + va_block = page_size + self.agx = agx + self.ctx = ctx + self.name = name + self.va = Heap(start, start + size, block=va_block) + self.verbose = 0 + self.guard_pages = guard_pages + self.objects = {} + 
self.flags = kwargs + self.align_to_end = True + + def make_stream(self, base): + return self.agx.uat.iostream(self.ctx, base, recurse=False) + + def new(self, objtype, name=None, track=True, **kwargs): + obj = GPUObject(self, objtype) + obj._stream = self.make_stream + if name is not None: + obj._name = name + + guard_size = self.page_size * self.guard_pages + + size_align = align_up(obj._size, self.page_size) + addr = self.va.malloc(size_align + guard_size) + paddr = self.agx.u.memalign(self.page_size, size_align) + off = 0 + if self.align_to_end: + off = size_align - obj._size + + flags = dict(self.flags) + flags.update(kwargs) + + obj._addr_align = addr + obj._paddr_align = paddr + obj._size_align = size_align + self.agx.uat.iomap_at(self.ctx, addr, paddr, size_align, **flags) + obj._set_addr(addr + off, paddr + off) + obj._map_flags = flags + + self.objects[obj._addr] = obj + + if self.verbose: + self.agx.log(f"[{self.name}] Alloc {obj._name} size {obj._size:#x} @ {obj._addr:#x} ({obj._paddr:#x})") + + self.agx.reg_object(obj, track=track) + return obj + + def new_buf(self, size, name, track=True): + return self.new(HexDump(Bytes(size)), name=name, track=track) + + def buf(self, size, name, track=True): + return self.new_buf(size, name, track).push()._addr + + def free(self, obj): + obj._dead = True + is_private = obj._map_flags.get("AttrIndex", MemoryAttr.Normal) != MemoryAttr.Shared + if is_private and obj._addr_align > 0xf8000000000: + flags2 = dict(obj._map_flags) + flags2["AttrIndex"] = MemoryAttr.Shared + self.agx.uat.iomap_at(self.ctx, obj._addr_align, obj._paddr_align, + obj._size_align, **flags2) + self.agx.uat.flush_dirty() + self.agx.uat.handoff.prepare_cacheflush(obj._addr_align, obj._size_align) + self.agx.ch.fwctl.send_inval(0x40, obj._addr_align) + self.agx.uat.handoff.wait_cacheflush() + + self.agx.uat.iomap_at(self.ctx, obj._addr_align, 0, + obj._size_align, VALID=0) + + if is_private and obj._addr_align > 0xf8000000000: + self.agx.uat.flush_dirty() + self.agx.uat.handoff.complete_cacheflush() + + self.agx.u.free(obj._paddr_align) + self.va.free(obj._addr_align) + del self.objects[obj._addr] + self.agx.unreg_object(obj) + + if self.verbose: + self.agx.log(f"[{self.name}] Free {obj._name} size {obj._size:#x} @ {obj._addr:#x} ({obj._paddr:#x})") diff --git a/tools/proxyclient/m1n1/agx/render.py b/tools/proxyclient/m1n1/agx/render.py new file mode 100644 index 0000000..b29683b --- /dev/null +++ b/tools/proxyclient/m1n1/agx/render.py @@ -0,0 +1,1075 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT +import sys, json, zipfile + +json.c_make_encoder = None + +from m1n1.proxy import * +from .context import * +from .event import GPUEventManager +from .uapi import * +from m1n1.constructutils import ConstructClass, Ver + +def unswizzle(agx, addr, w, h, psize, dump=None, grid=False): + iface = agx.u.iface + + tw = 64 + th = 64 + ntx = (w + tw - 1) // 64 + nty = (h + th - 1) // 64 + data = iface.readmem(addr, ntx * nty * psize * tw * th) + new_data = [] + for y in range(h): + ty = y // th + for x in range(w): + tx = x // tw + toff = tw * th * psize * (ty * ntx + tx) + j = x & (tw - 1) + i = y & (th - 1) + off = ( + ((j & 1) << 0) | ((i & 1) << 1) | + ((j & 2) << 1) | ((i & 2) << 2) | + ((j & 4) << 2) | ((i & 4) << 3) | + ((j & 8) << 3) | ((i & 8) << 4) | + ((j & 16) << 4) | ((i & 16) << 5) | + ((j & 32) << 5) | ((i & 32) << 6)) + r,g,b,a = data[toff + psize*off: toff + psize*(off+1)] + if grid: + if x % 64 == 0 or y % 64 == 0: + r,g,b,a = 255,255,255,255 + elif x % 32 == 
0 or y % 32 == 0: + r,g,b,a = 128,128,128,255 + new_data.append(bytes([b, g, r, a])) + data = b"".join(new_data) + if dump: + open(dump, "wb").write(data[:w*h*psize]) + #iface.writemem(addr, data) + +class GPUFrame: + def __init__(self, context, filename=None, track=False): + self.ctx = context + self.agx = context.agx + self.objects = [] + self.cmdbuf = None + self.track = track + if filename is not None: + self.load(filename) + + def add_object(self, obj): + self.objects.append(obj) + + def save(self, filename): + cmdbuf = self.cmdbuf + with zipfile.ZipFile(filename, "w") as zf: + cmdbuf_data = json.dumps(cmdbuf, indent=4).encode("utf-8") + zf.writestr("cmdbuf.json", cmdbuf_data) + + obj_info = [] + for obj in self.objects: + if obj._data == bytes(obj._size): + filename = None + else: + filename = f"obj_{obj._addr:x}.bin" + zf.writestr(filename, obj._data) + obj_info.append({ + "file": filename, + "name": obj._name, + "addr": obj._addr, + "size": obj._size, + "map_flags": obj._map_flags, + }) + + obj_info_data = json.dumps(obj_info, indent=4).encode("utf-8") + zf.writestr("objects.json", obj_info_data) + + def load(self, filename): + with zipfile.ZipFile(filename, "r") as zf: + with zf.open("cmdbuf.json", "r") as fd: + self.cmdbuf = drm_asahi_cmdbuf_t.from_json(fd) + with zf.open("objects.json", "r") as fd: + obj_info = json.load(fd) + + self.objects = [] + for i in obj_info: + filename = i["file"] + obj = self.ctx.new_at(i["addr"], Bytes(i["size"]), name=i["name"], track=self.track, + **i["map_flags"]) + if filename is not None: + with zf.open(i["file"], "r") as fd: + data = fd.read() + obj.val = data + obj.push() + else: + obj.val = bytes(i["size"]) + obj.push() + self.objects.append(obj) + +class GPUWork: + def __init__(self, renderer): + self.objects = [] + self.renderer = renderer + + def add(self, obj): + self.objects.append(obj) + + def free(self): + for obj in self.objects: + obj.free() + self.objects = [] + +class GPURenderer: + def __init__(self, ctx, buffers=16, bm_slot=0, queue=0): + self.agx = agx = ctx.agx + self.queue = queue + + # 0..63 + self.ctx = ctx + self.ctx_id = ctx.ctx + + # 0..255 + self.buffers = buffers + self.buffer_mgr_slot = bm_slot + + ## These MUST go together + self.buffer_mgr = GPUBufferManager(agx, ctx, buffers) + self.buffer_mgr_initialized = False + self.unk_emptybuf = agx.kobj.new_buf(0x40, "unk_emptybuf") + self.tpc_size = 0 + + ##### Job group + + self.job_list = agx.kshared.new(JobList) + self.job_list.first_job = 0 + self.job_list.last_head = self.job_list._addr # Empty list has self as last_head + self.job_list.unkptr_10 = 0 + self.job_list.push() + + ##### Work Queues + + self.ts3d_1 = agx.kshared.new(Int64ul, name="3D timestamp 1") + self.ts3d_2 = agx.kshared.new(Int64ul, name="3D timestamp 2") + self.tsta_1 = agx.kshared.new(Int64ul, name="TA timestamp 1") + self.tsta_2 = agx.kshared.new(Int64ul, name="TA timestamp 2") + + self.wq_3d = GPU3DWorkQueue(agx, ctx, self.job_list) + self.wq_ta = GPUTAWorkQueue(agx, ctx, self.job_list) + + self.wq_3d.info.uuid = 0x3D0000 | bm_slot + self.wq_3d.info.push() + self.wq_ta.info.uuid = 0x7A0000 | bm_slot + self.wq_ta.info.push() + + self.stamp_value_3d = 0x3D000000 | (bm_slot << 16) + self.stamp_value_ta = 0x7A000000 | (bm_slot << 16) + + ##### TA stamps + + # start? + self.stamp_ta1 = agx.kshared.new(StampCounter, name="TA stamp 1") + self.stamp_ta1.value = self.stamp_value_ta + self.stamp_ta1.push() + + # complete? 
+ self.stamp_ta2 = agx.kobj.new(StampCounter, name="TA stamp 2") + self.stamp_ta2.value = self.stamp_value_ta + self.stamp_ta2.push() + + ##### 3D stamps + + # start? + self.stamp_3d1 = agx.kshared.new(StampCounter, name="3D stamp 1") + self.stamp_3d1.value = self.stamp_value_3d + self.stamp_3d1.push() + + # complete? + self.stamp_3d2 = agx.kobj.new(StampCounter, name="3D stamp 2") + self.stamp_3d2.value = self.stamp_value_3d + self.stamp_3d2.push() + + + ##### Things userspace deals with for macOS + + #self.aux_fb = ctx.uobj.new_buf(0x8000, "Aux FB thing") + ##self.deflake_1 = ctx.uobj.new_buf(0x20, "Deflake 1") + ##self.deflake_2 = ctx.uobj.new_buf(0x280, "Deflake 2") + ##self.deflake_3 = ctx.uobj.new_buf(0x540, "Deflake 3") + #self.deflake = ctx.uobj.new_buf(0x7e0, "Deflake") + #self.unk_buf = ctx.uobj.new(Array(0x800, Int64ul), "Unknown Buffer") + #self.unk_buf.val = [0, *range(1, 0x400), *(0x400 * [0])] + #self.unk_buf.push() + + ##### Some kind of feedback/status buffer, GPU managed? + + self.event_control = agx.kobj.new(EventControl) + self.event_control.event_count = agx.kobj.new(Int32ul, "event_count") + self.event_control.event_count.val = 0 + self.event_control.event_count.push() + + self.event_control.generation = 0 + self.event_control.cur_count = 0 + self.event_control.unk_10 = 0x50 + self.event_control.push() + + self.frames = 0 + + self.ev_ta = ev_ta = self.agx.event_mgr.allocate_event() + self.ev_3d = ev_3d = self.agx.event_mgr.allocate_event() + + self.work = [] + + def submit(self, cmdbuf, wait_for=None): + nclusters = 8 + + work = GPUWork(self) + self.work.append(work) + + self.buffer_mgr.increment() + + aux_fb = self.ctx.uobj.new_buf(0x20000, "Aux FB thing", track=False) + work.add(aux_fb) + + # t8103 + deflake_1_size = 0x540 + deflake_2_size = 0x280 + deflake_3_size = 0x20 + + # t6002 - 9 times larger instead of 8? works with 8... 
+ deflake_1_size *= nclusters + deflake_2_size *= nclusters + deflake_3_size *= nclusters + + deflake_1 = self.ctx.uobj.new_buf(deflake_1_size, "Deflake 1", track=True) + deflake_2 = self.ctx.uobj.new_buf(deflake_2_size, "Deflake 2", track=True) + deflake_3 = self.ctx.uobj.new_buf(deflake_3_size, "Deflake 3", track=True) + work.add(deflake_1) + work.add(deflake_2) + work.add(deflake_3) + + unk_buf = self.ctx.uobj.new(Array(0x800, Int64ul), "Unknown Buffer", track=False) + work.add(unk_buf) + + unk_buf.val = [0, *range(2, 0x401), *(0x400 * [0])] + unk_buf.push() + + work.cmdbuf = cmdbuf + + self.frames += 1 + + work.ev_ta = ev_ta = self.ev_ta + work.ev_3d = ev_3d = self.ev_3d + + self.ev_ta.rearm() + self.ev_3d.rearm() + + self.agx.log(f"ev_ta: {ev_ta.id}") + self.agx.log(f"ev_3d: {ev_3d.id}") + + #self.event_control.base_stamp = self.stamp_value >> 8 + #self.event_control.push() + + self.prev_stamp_value_3d = self.stamp_value_3d + self.prev_stamp_value_ta = self.stamp_value_ta + self.stamp_value_3d += 0x100 + self.stamp_value_ta += 0x100 + self.event_control.event_count.val += 2 + self.event_control.event_count.push() + + work.stamp_value_3d = self.stamp_value_3d + work.stamp_value_ta = self.stamp_value_ta + + agx = self.agx + ctx = self.ctx + + work.width = width = cmdbuf.fb_width + work.height = height = cmdbuf.fb_height + + ##### TVB allocations / Tiler config + + tile_width = 32 + tile_height = 32 + tiles_x = ((width + tile_width - 1) // tile_width) + tiles_y = ((height + tile_height - 1) // tile_height) + tiles = tiles_x * tiles_y + + mtiles_x = 4 + mtiles_y = 4 + + mtile_x1 = align(((tiles_x + mtiles_x - 1) // mtiles_x), 4) + mtile_x2 = 2 * mtile_x1 + mtile_x3 = 3 * mtile_x1 + mtile_y1 = align(((tiles_y + mtiles_y - 1) // mtiles_y), 4) + mtile_y2 = 2 * mtile_y1 + mtile_y3 = 3 * mtile_y1 + + mtile_stride = mtile_x1 * mtile_y1 + + ## TODO: *samples + tiles_per_mtile_x = mtile_x1 + tiles_per_mtile_y = mtile_y1 + + tile_blocks_x = (tiles_x + 15) // 16 + tile_blocks_y = (tiles_y + 15) // 16 + tile_blocks = tile_blocks_x * tile_blocks_y + + tiling_params = TilingParameters() + # rgn_header_size + rgn_entry_size = 5 + tiling_params.size1 = (rgn_entry_size * tiles_per_mtile_x * tiles_per_mtile_y + 3) // 4 + # PPP_MULTISAMPLECTL + tiling_params.unk_4 = 0x88 + # PPP_CTRL + tiling_params.unk_8 = 0x203 # bit 0: GL clip mode + # PPP_SCREEN + tiling_params.x_max = width - 1 + tiling_params.y_max = height - 1 + # TE_SCREEN + tiling_params.tile_count = ((tiles_y-1) << 12) | (tiles_x-1) + # TE_MTILE1 + tiling_params.x_blocks = mtile_x3 | (mtile_x2 << 9) | (mtile_x1 << 18) + # TE_MTILE2 + tiling_params.y_blocks = mtile_y3 | (mtile_y2 << 9) | (mtile_y1 << 18) + tiling_params.size2 = mtile_stride + tiling_params.size3 = 2 * mtile_stride + tiling_params.unk_24 = 0x100 + tiling_params.unk_28 = 0x8000 + + tilemap_size = (4 * tiling_params.size1 * mtiles_x * mtiles_y) + + tmtiles_x = tiles_per_mtile_x * mtiles_x + tmtiles_y = tiles_per_mtile_y * mtiles_y + + tpc_entry_size = 8 + tpc_size = tpc_entry_size * tmtiles_x * tmtiles_y * nclusters + + if self.tpc_size < tpc_size: + self.tpc = ctx.uobj.new_buf(tpc_size, "TPC", track=True).push() + self.tpc_size = tpc_size + + depth_aux_buffer_addr = 0 + if cmdbuf.depth_buffer: + size = align_pot(max(width, tile_width)) * align_pot(max(height, tile_width)) // 32 + depth_aux_buffer = self.ctx.uobj.new_buf(size, "Depth Aux", track=True) + work.add(depth_aux_buffer) + depth_aux_buffer_addr = depth_aux_buffer._addr + + stencil_aux_buffer_addr = 0 + if 
cmdbuf.stencil_buffer: + size = align_pot(max(width, tile_width)) * align_pot(max(height, tile_width)) // 32 + stencil_aux_buffer = self.ctx.uobj.new_buf(size, "Stencil Aux", track=False) + work.add(stencil_aux_buffer) + stencil_aux_buffer_addr = stencil_aux_buffer._addr + + #tvb_tilemap_size = 0x80 * mtile_stride + tvb_tilemap_size = tilemap_size + tvb_tilemap = ctx.uobj.new_buf(tvb_tilemap_size, "TVB Tilemap", track=True).push() + work.tvb_tilemap_size = tvb_tilemap_size + work.tvb_tilemap = tvb_tilemap + work.add(tvb_tilemap) + + # rogue: 0x180 * 4? + tvb_heapmeta_size = 0x200 + #tvb_heapmeta_size = 0x600 + tvb_heapmeta = ctx.uobj.new_buf(tvb_heapmeta_size, "TVB Heap Meta", track=False).push() + work.add(tvb_heapmeta) + + unk_tile_buf1 = self.ctx.uobj.new_buf(tvb_tilemap_size * nclusters, "Unk tile buf 1", track=True) + print("tvb_tilemap_size", hex(tvb_tilemap_size)) + unk_tile_buf2 = self.ctx.uobj.new_buf(0x4 * nclusters, "Unk tile buf 2", track=True) + #size = 0xc0 * nclusters + size = 0xc80 + unk_tile_buf3 = self.ctx.uobj.new_buf(size, "Unk tile buf 3", track=True) + unk_tile_buf4 = self.ctx.uobj.new_buf(0x280 * nclusters, "Unk tile buf 4", track=True) + unk_tile_buf5 = self.ctx.uobj.new_buf(0x30 * nclusters, "Unk tile buf 5", track=True) + work.add(unk_tile_buf1) + work.add(unk_tile_buf2) + work.add(unk_tile_buf3) + work.add(unk_tile_buf4) + work.add(unk_tile_buf5) + + ##### Buffer stuff? + + # buffer related? + bufferthing_buf = ctx.uobj.new_buf(0x80, "BufferThing.unkptr_18", track=True) + work.add(bufferthing_buf) + + work.buf_desc = buf_desc = agx.kobj.new(BufferThing, track=False) + work.add(buf_desc) + buf_desc.unk_0 = 0x0 + buf_desc.unk_8 = 0x0 + buf_desc.unk_10 = 0x0 + buf_desc.unkptr_18 = bufferthing_buf._addr + buf_desc.unk_20 = 0x0 + buf_desc.bm_misc_addr = self.buffer_mgr.misc_obj._addr + buf_desc.unk_2c = 0x0 + buf_desc.unk_30 = 0x0 + buf_desc.unk_38 = 0x0 + buf_desc.push() + + uuid_3d = cmdbuf.cmd_3d_id + uuid_ta = cmdbuf.cmd_ta_id + encoder_id = cmdbuf.encoder_id + + #print(barrier_cmd) + + #self.wq_ta.submit(ta_barrier_cmd) + + ##### 3D barrier command + + barrier_cmd = agx.kobj.new(WorkCommandBarrier, track=False) + work.add(barrier_cmd) + barrier_cmd.stamp = self.stamp_ta2 + barrier_cmd.wait_value = self.stamp_value_ta + barrier_cmd.stamp_self = self.stamp_value_3d + barrier_cmd.event = ev_ta.id + barrier_cmd.uuid = uuid_3d + + #print(barrier_cmd) + + self.wq_3d.submit(barrier_cmd) + + ##### 3D execution + + work.wc_3d = wc_3d = agx.kobj.new(WorkCommand3D, track=False) + work.add(work.wc_3d) + wc_3d.counter = 0 + wc_3d.context_id = self.ctx_id + wc_3d.unk_8 = 0 + wc_3d.event_control = self.event_control + wc_3d.buffer_mgr = self.buffer_mgr.info + wc_3d.buf_thing = buf_desc + wc_3d.unk_emptybuf_addr = self.unk_emptybuf._addr + wc_3d.tvb_tilemap = tvb_tilemap._addr + wc_3d.unk_40 = 0x88 + wc_3d.unk_48 = 0x1 + wc_3d.tile_blocks_y = mtile_y1 + wc_3d.tile_blocks_x = mtile_x1 + wc_3d.unk_50 = 0x0 + wc_3d.unk_58 = 0x0 + + TAN_60 = 1.732051 + wc_3d.merge_upper_x = TAN_60 / width + wc_3d.merge_upper_y = TAN_60 / height + wc_3d.unk_68 = 0x0 + wc_3d.tile_count = tiles + + wc_3d.unk_758 = Flag() + wc_3d.unk_75c = Flag() + wc_3d.unk_buf = WorkCommand1_UnkBuf() + wc_3d.busy_flag = Flag() + wc_3d.unk_buf2 = WorkCommand1_UnkBuf2() + wc_3d.unk_buf2.unk_0 = 0 + wc_3d.unk_buf2.unk_8 = 0 + wc_3d.unk_buf2.unk_10 = 1 + wc_3d.ts1 = TimeStamp(0) + wc_3d.ts2 = TimeStamp(self.ts3d_1._addr) + wc_3d.ts3 = TimeStamp(self.ts3d_2._addr) + wc_3d.unk_914 = 0 + wc_3d.unk_918 = 0 + wc_3d.unk_920 = 0 
+ wc_3d.unk_924 = 1 + # Ventura + wc_3d.unk_928_0 = 0 + wc_3d.unk_928_4 = 0 + wc_3d.ts_flag = TsFlag() + + # cmdbuf.ds_flags + # 0 - no depth + # 0x80000 - depth store enable + # 0x08000 - depth load enable + + # 0x00044 - compressed depth + + # 0x40000 - stencil store enable + # 0x04000 - stencil load enable + # 0x00110 - compressed stencil + + # Z store format + # 0x4000000 - Depth16Unorm + + # For Depth16Unorm: 0x40000 here also + # AFBI.[ 0. 4] unk1 = 0x4c000 + + # ASAHI_CMDBUF_SET_WHEN_RELOADING_Z_OR_S + # Actually set when loading *and* storing Z, OR loading *and* storing S + + # Structures embedded in WorkCommand3D + if True: + wc_3d.struct_1 = Start3DStruct1() + wc_3d.struct_1.store_pipeline_bind = cmdbuf.store_pipeline_bind + wc_3d.struct_1.store_pipeline_addr = cmdbuf.store_pipeline | 4 + wc_3d.struct_1.unk_8 = 0x0 + wc_3d.struct_1.unk_c = 0x0 + + TAN_60 = 1.732051 + wc_3d.struct_1.merge_upper_x = TAN_60 / width + wc_3d.struct_1.merge_upper_y = TAN_60 / height + + wc_3d.struct_1.unk_18 = 0x0 + # ISP_MTILE_SIZE + wc_3d.struct_1.tile_blocks_y = mtile_y1 + wc_3d.struct_1.tile_blocks_x = mtile_x1 + wc_3d.struct_1.unk_24 = 0x0 + wc_3d.struct_1.tile_counts = ((tiles_y-1) << 12) | (tiles_x-1) + wc_3d.struct_1.unk_2c = 0x8 + wc_3d.struct_1.depth_clear_val1 = cmdbuf.depth_clear_value + wc_3d.struct_1.stencil_clear_val1 = cmdbuf.stencil_clear_value + wc_3d.struct_1.unk_35 = 0x7 # clear flags? 2 = depth 4 = stencil? + wc_3d.struct_1.unk_36 = 0x0 + wc_3d.struct_1.unk_38 = 0x0 + wc_3d.struct_1.unk_3c = 0x1 + wc_3d.struct_1.unk_40 = 0 + wc_3d.struct_1.unk_44_padding = bytes(0xac) + wc_3d.struct_1.depth_bias_array = Start3DArrayAddr(cmdbuf.depth_bias_array) + wc_3d.struct_1.scissor_array = Start3DArrayAddr(cmdbuf.scissor_array) + wc_3d.struct_1.visibility_result_buffer = 0x0 + wc_3d.struct_1.unk_118 = 0x0 + wc_3d.struct_1.unk_120 = [0] * 37 + wc_3d.struct_1.unk_reload_pipeline = Start3DClearPipelineBinding( + cmdbuf.partial_reload_pipeline_bind, cmdbuf.partial_reload_pipeline | 4) + wc_3d.struct_1.unk_258 = 0 + wc_3d.struct_1.unk_260 = 0 + wc_3d.struct_1.unk_268 = 0 + wc_3d.struct_1.unk_270 = 0 + wc_3d.struct_1.reload_pipeline = Start3DClearPipelineBinding( + cmdbuf.partial_reload_pipeline_bind, cmdbuf.partial_reload_pipeline | 4) + wc_3d.struct_1.depth_flags = cmdbuf.ds_flags | 0x44 + wc_3d.struct_1.unk_290 = 0x0 + wc_3d.struct_1.depth_buffer_ptr1 = cmdbuf.depth_buffer + wc_3d.struct_1.unk_2a0 = 0x0 + wc_3d.struct_1.unk_2a8 = 0x0 + wc_3d.struct_1.depth_buffer_ptr2 = cmdbuf.depth_buffer + wc_3d.struct_1.depth_buffer_ptr3 = cmdbuf.depth_buffer + wc_3d.struct_1.depth_aux_buffer_ptr = depth_aux_buffer_addr + wc_3d.struct_1.stencil_buffer_ptr1 = cmdbuf.stencil_buffer + wc_3d.struct_1.unk_2d0 = 0x0 + wc_3d.struct_1.unk_2d8 = 0x0 + wc_3d.struct_1.stencil_buffer_ptr2 = cmdbuf.stencil_buffer + wc_3d.struct_1.stencil_buffer_ptr3 = cmdbuf.stencil_buffer + wc_3d.struct_1.stencil_aux_buffer_ptr = stencil_aux_buffer_addr + wc_3d.struct_1.unk_2f8 = [0x0, 0x0] + wc_3d.struct_1.aux_fb_unk0 = 4 #0x8 # sometimes 4 + wc_3d.struct_1.unk_30c = 0x0 + wc_3d.struct_1.aux_fb = AuxFBInfo(0xc000, 0, width, height) + wc_3d.struct_1.unk_320_padding = bytes(0x10) + wc_3d.struct_1.unk_partial_store_pipeline = Start3DStorePipelineBinding( + cmdbuf.partial_store_pipeline_bind, cmdbuf.partial_store_pipeline | 4) + wc_3d.struct_1.partial_store_pipeline = Start3DStorePipelineBinding( + cmdbuf.partial_store_pipeline_bind, cmdbuf.partial_store_pipeline | 4) + wc_3d.struct_1.depth_clear_val2 = cmdbuf.depth_clear_value + 
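# The ds_flags bit assignments documented in the comments above are reverse
# engineered and likely incomplete; a helper like this (hypothetical, not part
# of the driver) just restates those bits so the combinations are readable.

DEPTH_LOAD, DEPTH_STORE = 0x08000, 0x80000
STENCIL_LOAD, STENCIL_STORE = 0x04000, 0x40000
DEPTH_COMPRESSED, STENCIL_COMPRESSED = 0x00044, 0x00110

def make_ds_flags(depth_load=False, depth_store=False,
                  stencil_load=False, stencil_store=False):
    flags = 0
    if depth_load:    flags |= DEPTH_LOAD
    if depth_store:   flags |= DEPTH_STORE
    if stencil_load:  flags |= STENCIL_LOAD
    if stencil_store: flags |= STENCIL_STORE
    return flags

assert make_ds_flags(depth_load=True, depth_store=True) == 0x88000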
wc_3d.struct_1.stencil_clear_val2 = cmdbuf.stencil_clear_value + wc_3d.struct_1.unk_375 = 3 + wc_3d.struct_1.unk_376 = 0x0 + wc_3d.struct_1.unk_378 = 0x10 + wc_3d.struct_1.unk_37c = 0x0 + wc_3d.struct_1.unk_380 = 0x0 + wc_3d.struct_1.unk_388 = 0x0 + wc_3d.struct_1.unk_390_0 = 0x0 # Ventura + wc_3d.struct_1.depth_dimensions = (width - 1) | ((height - 1) << 15) + + if True: + wc_3d.struct_2 = Start3DStruct2() + wc_3d.struct_2.unk_0 = 0xa000 + wc_3d.struct_2.clear_pipeline = Start3DClearPipelineBinding( + cmdbuf.load_pipeline_bind, cmdbuf.load_pipeline | 4) + wc_3d.struct_2.unk_18 = 0x88 + wc_3d.struct_2.scissor_array = cmdbuf.scissor_array + wc_3d.struct_2.depth_bias_array = cmdbuf.depth_bias_array + wc_3d.struct_2.aux_fb = wc_3d.struct_1.aux_fb + # ISP_ZLS_PIXELS + wc_3d.struct_2.depth_dimensions = wc_3d.struct_1.depth_dimensions + wc_3d.struct_2.visibility_result_buffer = 0x0 + # ISP_ZLSCTL + wc_3d.struct_2.depth_flags = cmdbuf.ds_flags + wc_3d.struct_2.unk_58_g14_0 = 0x4040404 + wc_3d.struct_2.unk_58_g14_8 = 0 + wc_3d.struct_2.depth_buffer_ptr1 = cmdbuf.depth_buffer + wc_3d.struct_2.depth_buffer_ptr2 = cmdbuf.depth_buffer + wc_3d.struct_2.unk_68_g14_0 = 0 + wc_3d.struct_2.stencil_buffer_ptr1 = cmdbuf.stencil_buffer + wc_3d.struct_2.stencil_buffer_ptr2 = cmdbuf.stencil_buffer + wc_3d.struct_2.unk_78 = [0] * 4 + wc_3d.struct_2.depth_aux_buffer_ptr1 = depth_aux_buffer_addr + wc_3d.struct_2.unk_a0 = 0 + wc_3d.struct_2.depth_aux_buffer_ptr2 = depth_aux_buffer_addr + wc_3d.struct_2.unk_b0 = 0 + wc_3d.struct_2.stencil_aux_buffer_ptr1 = stencil_aux_buffer_addr + wc_3d.struct_2.unk_c0 = 0 + wc_3d.struct_2.stencil_aux_buffer_ptr2 = stencil_aux_buffer_addr + wc_3d.struct_2.unk_d0 = 0 + wc_3d.struct_2.tvb_tilemap = tvb_tilemap._addr + wc_3d.struct_2.tvb_heapmeta_addr = tvb_heapmeta._addr + wc_3d.struct_2.unk_e8 = tiling_params.size1 << 24 + wc_3d.struct_2.tvb_heapmeta_addr2 = tvb_heapmeta._addr + # 0x10000 - clear empty tiles + # ISP_CTL (but bits seem to have moved) + wc_3d.struct_2.unk_f8 = 0x10280 #0x10280 # TODO: varies 0, 0x280, 0x10000, 0x10280 + wc_3d.struct_2.aux_fb_ptr = aux_fb._addr + wc_3d.struct_2.unk_108 = [0x0, 0x0, 0x0, 0x0, 0x0, 0x0] + wc_3d.struct_2.pipeline_base = self.ctx.pipeline_base + wc_3d.struct_2.unk_140 = 0x8c60 + wc_3d.struct_2.unk_148 = 0x0 + wc_3d.struct_2.unk_150 = 0x0 + wc_3d.struct_2.unk_158 = 0x1c + wc_3d.struct_2.unk_160 = 0 + wc_3d.struct_2.unk_168_padding = bytes(0x1d8) + wc_3d.struct_2.unk_198_padding = bytes(0x1a8) + + if True: + wc_3d.struct_6 = Start3DStruct6() + wc_3d.struct_6.tvb_overflow_count = 0x0 + wc_3d.struct_6.unk_8 = 0x0 # 1? + wc_3d.struct_6.unk_c = 0x0 # 1? 
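# struct_1.tile_counts and the depth_dimensions field (ISP_ZLS_PIXELS per the
# comments above) each pack two biased values into one word. Pack/unpack
# helpers matching the exact expressions used above:

def pack_tile_counts(tiles_x, tiles_y):
    return ((tiles_y - 1) << 12) | (tiles_x - 1)

def unpack_tile_counts(v):
    return (v & 0xfff) + 1, (v >> 12) + 1   # (tiles_x, tiles_y)

def pack_depth_dimensions(width, height):
    return (width - 1) | ((height - 1) << 15)

def unpack_depth_dimensions(v):
    return (v & 0x7fff) + 1, (v >> 15) + 1  # (width, height)

assert unpack_tile_counts(pack_tile_counts(60, 34)) == (60, 34)
assert unpack_depth_dimensions(pack_depth_dimensions(1920, 1080)) == (1920, 1080)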
+ wc_3d.struct_6.unk_10 = 0x0 + wc_3d.struct_6.encoder_id = cmdbuf.encoder_id + wc_3d.struct_6.unk_1c = 0xffffffff + wc_3d.struct_6.unknown_buffer = unk_buf._addr + wc_3d.struct_6.unk_28 = 0x0 + wc_3d.struct_6.unk_30 = 0x0 + wc_3d.struct_6.unk_34 = 0x0 + + if True: + wc_3d.struct_7 = Start3DStruct7() + wc_3d.struct_7.unk_0 = 0x0 + wc_3d.struct_7.stamp1 = self.stamp_3d1 + wc_3d.struct_7.stamp2 = self.stamp_3d2 + wc_3d.struct_7.stamp_value = self.stamp_value_3d + wc_3d.struct_7.ev_3d = ev_3d.id + wc_3d.struct_7.evctl_index = 0x0 + wc_3d.struct_7.unk_24 = 1 + wc_3d.struct_7.uuid = uuid_3d + wc_3d.struct_7.prev_stamp_value = self.prev_stamp_value_3d >> 8 + wc_3d.struct_7.unk_30 = 0x0 + + wc_3d.set_addr() # Update inner structure addresses + #print("WC3D", hex(wc_3d._addr)) + #print(" s1", hex(wc_3d.struct_1._addr)) + #print(" s2", hex(wc_3d.struct_2._addr)) + #print(" s6", hex(wc_3d.struct_6._addr)) + #print(" s7", hex(wc_3d.struct_7._addr)) + + ms = GPUMicroSequence(agx) + + start_3d = Start3DCmd() + start_3d.struct1 = wc_3d.struct_1 # 0x44 bytes! + start_3d.struct2 = wc_3d.struct_2 # 0x168 bytes! + start_3d.buf_thing = buf_desc + start_3d.stats_ptr = agx.initdata.regionB.stats_3d.stats._addr + start_3d.busy_flag_ptr = wc_3d.busy_flag._addr + start_3d.struct6 = wc_3d.struct_6 # 4 bytes! + start_3d.struct7 = wc_3d.struct_7 # 4 bytes! + start_3d.cmdqueue_ptr = self.wq_3d.info._addr + start_3d.workitem_ptr = wc_3d._addr + start_3d.context_id = self.ctx_id + start_3d.unk_50 = 0x1 + start_3d.event_generation = self.event_control.generation + start_3d.buffer_mgr_slot = self.buffer_mgr_slot + start_3d.unk_5c = 0x0 + start_3d.prev_stamp_value = self.prev_stamp_value_3d >> 8 + start_3d.unk_68 = 0x0 + start_3d.unk_buf_ptr = wc_3d.unk_758._addr + start_3d.unk_buf2_ptr = wc_3d.unk_buf2._addr + start_3d.unk_7c = 0x0 + start_3d.unk_80 = 0x0 + start_3d.unk_84 = 0x0 + start_3d.uuid = uuid_3d + start_3d.attachments = [] + start_3d.unk_194 = 0 + start_3d.unkptr_19c = self.event_control.unk_buf._addr + + work.fb = None + work.depth = None + + for i in cmdbuf.attachments[:cmdbuf.attachment_count]: + cache_lines = align_up(i.size, 128) // 128 + order = 1 # FIXME + start_3d.attachments.append(Attachment(i.pointer, cache_lines, 0x17, order)) # FIXME check + if work.fb is None and i.type == ASAHI_ATTACHMENT_C: + work.fb = i.pointer + if work.depth is None and i.type == ASAHI_ATTACHMENT_Z: + work.depth = i.pointer + start_3d.attachments += [Attachment(0, 0, 0, 0)] * (16 - len(start_3d.attachments)) + start_3d.num_attachments = cmdbuf.attachment_count + start_3d.unk_190 = 0x0 + + start_3d_offset = ms.append(start_3d) + + ts1 = TimestampCmd() + ts1.unk_1 = 0x0 + ts1.unk_2 = 0x0 + ts1.unk_3 = 0x80 + ts1.ts0_addr = wc_3d.ts1._addr + ts1.ts1_addr = wc_3d.ts2._addr + ts1.ts2_addr = wc_3d.ts2._addr + ts1.cmdqueue_ptr = self.wq_3d.info._addr + ts1.unk_24 = 0x0 + if Ver.check("V >= V13_0B4"): + ts1.unkptr_2c_0 = wc_3d.ts_flag._addr + ts1.uuid = uuid_3d + ts1.unk_30_padding = 0x0 + ms.append(ts1) + + ms.append(WaitForInterruptCmd(0, 1, 0)) + + ts2 = TimestampCmd() + ts2.unk_1 = 0x0 + ts2.unk_2 = 0x0 + ts2.unk_3 = 0x0 + ts2.ts0_addr = wc_3d.ts1._addr + ts2.ts1_addr = wc_3d.ts2._addr + ts2.ts2_addr = wc_3d.ts3._addr + ts2.cmdqueue_ptr = self.wq_3d.info._addr + ts2.unk_24 = 0x0 + if Ver.check("V >= V13_0B4"): + ts2.unkptr_2c_0 = wc_3d.ts_flag._addr + ts2.uuid = uuid_3d + ts2.unk_30_padding = 0x0 + ms.append(ts2) + + finish_3d = Finalize3DCmd() + finish_3d.uuid = uuid_3d + finish_3d.unk_8 = 0 + finish_3d.stamp = self.stamp_3d2 + 
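# The microsequence assembled above runs Start3D -> Timestamp ->
# WaitForInterrupt -> Timestamp -> Finalize3D. A toy model of the offset
# bookkeeping (command sizes invented for illustration): append() returns the
# byte offset of each command, and the finalize command's restart_branch_offset
# (set just below) is the negative distance back to the start command,
# apparently letting the firmware rerun the pass, e.g. after a TVB overflow.

class ToyMicroSequence:
    def __init__(self):
        self.off = 0
        self.cmds = []

    def append(self, name, size):
        off = self.off
        self.cmds.append((off, name))
        self.off += size
        return off

ms_model = ToyMicroSequence()
start_off = ms_model.append("Start3D", 0x1a0)
ms_model.append("Timestamp", 0x34)
ms_model.append("WaitForInterrupt", 0x4)
ms_model.append("Timestamp", 0x34)
restart_branch_offset = start_off - ms_model.off
assert restart_branch_offset == -0x20c     # branch backwards to Start3D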
finish_3d.stamp_value = self.stamp_value_3d + finish_3d.unk_18 = 0 + finish_3d.buf_thing = buf_desc + finish_3d.buffer_mgr = self.buffer_mgr.info + finish_3d.unk_2c = 1 + finish_3d.stats_ptr = agx.initdata.regionB.stats_3d.stats._addr + finish_3d.struct7 = wc_3d.struct_7 + finish_3d.busy_flag_ptr = wc_3d.busy_flag._addr + finish_3d.cmdqueue_ptr = self.wq_3d.info._addr + finish_3d.workitem_ptr = wc_3d._addr + finish_3d.unk_5c = self.ctx_id + finish_3d.unk_buf_ptr = wc_3d.unk_758._addr + finish_3d.unk_6c = 0 + finish_3d.unk_74 = 0 + finish_3d.unk_7c = 0 + finish_3d.unk_84 = 0 + finish_3d.unk_8c = 0 + finish_3d.unk_8c_g14 = 0 + finish_3d.restart_branch_offset = start_3d_offset - ms.off + finish_3d.unk_98 = 0 + finish_3d.unk_9c = bytes(0x10) + ms.append(finish_3d) + ms.finalize() + + work.add(ms.obj) + + wc_3d.microsequence_ptr = ms.obj._addr + wc_3d.microsequence_size = ms.size + + print(wc_3d) + self.wq_3d.submit(wc_3d) + + ##### TA init + + #print(ctx_info) + if wait_for is not None: + barrier_cmd = agx.kobj.new(WorkCommandBarrier, track=False) + work.add(barrier_cmd) + if not isinstance(wait_for, tuple): + barrier_cmd.stamp = wait_for.renderer.stamp_3d2 + barrier_cmd.wait_value = wait_for.stamp_value_3d + barrier_cmd.event = wait_for.ev_3d.id + else: + barrier_cmd.stamp_addr = wait_for[0] + barrier_cmd.wait_value = wait_for[1] + barrier_cmd.event = wait_for[2] + + barrier_cmd.stamp_self = self.stamp_value_ta + barrier_cmd.uuid = uuid_ta + + self.wq_ta.submit(barrier_cmd) + + if not self.buffer_mgr_initialized: + wc_initbm = agx.kobj.new(WorkCommandInitBM, track=False) + work.add(wc_initbm) + wc_initbm.context_id = self.ctx_id + wc_initbm.buffer_mgr_slot = self.buffer_mgr_slot + wc_initbm.unk_c = 0 + wc_initbm.unk_10 = self.buffer_mgr.info.block_count + wc_initbm.buffer_mgr = self.buffer_mgr.info + wc_initbm.stamp_value = self.stamp_value_ta + + self.wq_ta.submit(wc_initbm) + + self.buffer_mgr_initialized = True + + ##### TA execution + + work.wc_ta = wc_ta = agx.kobj.new(WorkCommandTA, track=False) + work.add(work.wc_ta) + wc_ta.context_id = self.ctx_id + wc_ta.counter = 1 + wc_ta.unk_8 = 0 + wc_ta.event_control = self.event_control + wc_ta.buffer_mgr_slot = self.buffer_mgr_slot + wc_ta.buffer_mgr = self.buffer_mgr.info + wc_ta.buf_thing = buf_desc + wc_ta.unk_emptybuf_addr = wc_3d.unk_emptybuf_addr + wc_ta.unk_34 = 0x0 + + wc_ta.unk_154 = bytes(0x268) + wc_ta.unk_3e8 = bytes(0x74) + wc_ta.unk_594 = WorkCommand0_UnkBuf() + + wc_ta.ts1 = TimeStamp(0) + wc_ta.ts2 = TimeStamp(self.tsta_1._addr) + wc_ta.ts3 = TimeStamp(self.tsta_2._addr) + wc_ta.unk_5c4 = 0 + wc_ta.unk_5c8 = 0 + wc_ta.unk_5cc = 0 + wc_ta.unk_5d0 = 0 + wc_ta.unk_5d4 = 1 #0x27 #1 + # Ventura + wc_ta.unk_5e0 = 0 + wc_ta.unk_5e4 = 0 + wc_ta.ts_flag = TsFlag() + + # Structures embedded in WorkCommandTA + if True: + wc_ta.tiling_params = tiling_params + + if True: + wc_ta.struct_2 = StartTACmdStruct2() + wc_ta.struct_2.unk_0 = 0x200 + wc_ta.struct_2.unk_8 = 0x1e3ce508 # fixed + wc_ta.struct_2.unk_c = 0x1e3ce508 # fixed + wc_ta.struct_2.tvb_tilemap = tvb_tilemap._addr + wc_ta.struct_2.tvb_cluster_tilemaps = unk_tile_buf1._addr + wc_ta.struct_2.tpc = self.tpc._addr + wc_ta.struct_2.tvb_heapmeta_addr = tvb_heapmeta._addr | 0x8000_0000_0000_0000 + wc_ta.struct_2.iogpu_unk_54 = 0x6b0003 # fixed + wc_ta.struct_2.iogpu_unk_55 = 0x3a0012 # fixed + wc_ta.struct_2.iogpu_unk_56 = 0x1 # fixed + wc_ta.struct_2.tvb_cluster_meta1 = unk_tile_buf2._addr | 0x4_0000_0000_0000 + wc_ta.struct_2.unk_48 = 0xa000 + wc_ta.struct_2.unk_50 = 0x88 # fixed + 
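# The wait_for argument handled earlier in this submit path is accepted in two
# forms; a small helper (hypothetical, mirroring that branch) makes the
# convention explicit: either a previous render whose 3D completion we wait
# on, or a raw (stamp or stamp address, wait_value, event_id) triple.

def barrier_source(wait_for):
    if isinstance(wait_for, tuple):
        stamp_addr, wait_value, event_id = wait_for
        return stamp_addr, wait_value, event_id
    # Renderer-like object: wait for its 3D pass via its second 3D stamp.
    return (wait_for.renderer.stamp_3d2, wait_for.stamp_value_3d,
            wait_for.ev_3d.id)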
wc_ta.struct_2.tvb_heapmeta_addr2 = tvb_heapmeta._addr + wc_ta.struct_2.unk_60 = 0x0 # fixed + wc_ta.struct_2.core_mask = 0xffffffffffffffff + #wc_ta.struct_2.unk_68 = 0xff << (8 * (self.buffer_mgr_slot % 8)) + wc_ta.struct_2.iogpu_deflake_1 = deflake_1._addr + wc_ta.struct_2.iogpu_deflake_2 = deflake_2._addr + wc_ta.struct_2.unk_80 = 0x1 # fixed + wc_ta.struct_2.iogpu_deflake_3 = deflake_3._addr | 0x4_0000_0000_0000 # check + wc_ta.struct_2.encoder_addr = cmdbuf.encoder_ptr + wc_ta.struct_2.tvb_cluster_meta2 = unk_tile_buf3._addr + wc_ta.struct_2.tvb_cluster_meta3 = unk_tile_buf4._addr + wc_ta.struct_2.tiling_control = 0xa040 #0xa041 # fixed + wc_ta.struct_2.unk_b0 = [0x0, 0x0, 0x0, 0x0, 0x0, 0x0] # fixed + wc_ta.struct_2.pipeline_base = self.ctx.pipeline_base + wc_ta.struct_2.tvb_cluster_meta4 = unk_tile_buf5._addr | 0x3000_0000_0000_0000 + wc_ta.struct_2.unk_f0 = 0x20 # fixed + wc_ta.struct_2.unk_f8 = 0x8c60 # fixed + wc_ta.struct_2.unk_100 = [0x0, 0x0, 0x0] # fixed + wc_ta.struct_2.unk_118 = 0x1c # fixed + + if True: + wc_ta.struct_3 = StartTACmdStruct3() + wc_ta.struct_3.unk_480 = [0x0, 0x0, 0x0, 0x0, 0x0, 0x0] # fixed + wc_ta.struct_3.unk_498 = 0x0 # fixed + wc_ta.struct_3.unk_4a0 = 0x0 # fixed + wc_ta.struct_3.iogpu_deflake_1 = deflake_1._addr + wc_ta.struct_3.unk_4ac = 0x0 # fixed + wc_ta.struct_3.unk_4b0 = 0x0 # fixed + wc_ta.struct_3.unk_4b8 = 0x0 # fixed + wc_ta.struct_3.unk_4bc = 0x0 # fixed + wc_ta.struct_3.unk_4c4_padding = bytes(0x48) + wc_ta.struct_3.unk_50c = 0x0 # fixed + wc_ta.struct_3.unk_510 = 0x0 # fixed + wc_ta.struct_3.unk_518 = 0x0 # fixed + wc_ta.struct_3.unk_520 = 0x0 # fixed + wc_ta.struct_3.unk_528 = 0x0 # fixed + wc_ta.struct_3.unk_52c = 0x0 # fixed + wc_ta.struct_3.unk_530 = 0x0 # fixed + wc_ta.struct_3.encoder_id = cmdbuf.encoder_id + wc_ta.struct_3.unk_538 = 0x0 # fixed + wc_ta.struct_3.unk_53c = 0xffffffff + wc_ta.struct_3.unknown_buffer = wc_3d.struct_6.unknown_buffer + wc_ta.struct_3.unk_548 = 0x0 # fixed + wc_ta.struct_3.unk_550 = [ + 0x0, 0x0, # fixed + 0x0, # 1 for boot stuff? + 0x0, 0x0, 0x0] # fixed + wc_ta.struct_3.stamp1 = self.stamp_ta1 + wc_ta.struct_3.stamp2 = self.stamp_ta2 + wc_ta.struct_3.stamp_value = self.stamp_value_ta + wc_ta.struct_3.ev_ta = ev_ta.id + wc_ta.struct_3.evctl_index = 0 + wc_ta.struct_3.unk_584 = 0x0 # 1 for boot stuff? + wc_ta.struct_3.uuid2 = uuid_ta + wc_ta.struct_3.prev_stamp_value = self.prev_stamp_value_ta >> 8 + wc_ta.struct_3.unk_590 = 0 # sometimes 1? + + wc_ta.set_addr() # Update inner structure addresses + #print("wc_ta", wc_ta) + + ms = GPUMicroSequence(agx) + + start_ta = StartTACmd() + start_ta.tiling_params = wc_ta.tiling_params + start_ta.struct2 = wc_ta.struct_2 # len 0x120 + start_ta.buffer_mgr = self.buffer_mgr.info + start_ta.buf_thing = buf_desc + start_ta.stats_ptr = agx.initdata.regionB.stats_ta.stats._addr + start_ta.cmdqueue_ptr = self.wq_ta.info._addr + start_ta.context_id = self.ctx_id + start_ta.unk_38 = 1 + start_ta.event_generation = self.event_control.generation + start_ta.buffer_mgr_slot = self.buffer_mgr_slot + start_ta.unk_48 = 0#1 #0 + start_ta.unk_50 = 0 + start_ta.struct3 = wc_ta.struct_3 + + start_ta.unkptr_5c = wc_ta.unk_594._addr + start_ta.unk_64 = 0x0 # fixed + start_ta.unk_68 = 0x0 # sometimes 1? 
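# Several StartTACmdStruct2 addresses above carry flag bits OR'd into the
# otherwise unused top bits of the 64-bit GPU VA (1 << 63, 0x4_0000_0000_0000,
# 0x3000_0000_0000_0000); their meaning is unknown. Helpers that make the
# tagging explicit, assuming (true for the user allocations tagged here, but
# an assumption, not a spec) that real VAs fit in the low 44 bits:

VA_MASK = (1 << 44) - 1

def tag_addr(addr, flags):
    assert addr == addr & VA_MASK, "address overlaps flag bits"
    return addr | flags

def untag_addr(tagged):
    return tagged & VA_MASK, tagged & ~VA_MASK

addr, flags = untag_addr(tag_addr(0x15_0000_4000, 1 << 63))
assert addr == 0x15_0000_4000 and flags == 1 << 63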
+ start_ta.uuid = uuid_ta + start_ta.unk_70 = 0x0 # fixed + start_ta.unk_74 = [ # fixed + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + ] + start_ta.unk_15c = 0x0 # fixed + start_ta.unk_160 = 0x0 # fixed + start_ta.unk_168 = 0x0 # fixed + start_ta.unk_16c = 0x0 # fixed + start_ta.unk_170 = 0x0 # fixed + start_ta.unk_178 = 0x0 # fixed? + start_ta.unk_17c = 0x0 + start_ta.unkptr_180 = self.event_control.unk_buf._addr + start_ta.unk_188 = 0x0 + + start_ta_offset = ms.append(start_ta) + + ts1 = TimestampCmd() + ts1.unk_1 = 0x0 + ts1.unk_2 = 0x0 + ts1.unk_3 = 0x80 + ts1.ts0_addr = wc_ta.ts1._addr + ts1.ts1_addr = wc_ta.ts2._addr + ts1.ts2_addr = wc_ta.ts2._addr + ts1.cmdqueue_ptr = self.wq_ta.info._addr + ts1.unk_24 = 0x0 + if Ver.check("V >= V13_0B4"): + ts1.unkptr_2c_0 = wc_ta.ts_flag._addr + ts1.uuid = uuid_ta + ts1.unk_30_padding = 0x0 + ms.append(ts1) + + ms.append(WaitForInterruptCmd(1, 0, 0)) + + ts2 = TimestampCmd() + ts2.unk_1 = 0x0 + ts2.unk_2 = 0x0 + ts2.unk_3 = 0x0 + ts2.ts0_addr = wc_ta.ts1._addr + ts2.ts1_addr = wc_ta.ts2._addr + ts2.ts2_addr = wc_ta.ts3._addr + ts2.cmdqueue_ptr = self.wq_ta.info._addr + ts2.unk_24 = 0x0 + if Ver.check("V >= V13_0B4"): + ts2.unkptr_2c_0 = wc_ta.ts_flag._addr + ts2.uuid = uuid_ta + ts2.unk_30_padding = 0x0 + ms.append(ts2) + + finish_ta = FinalizeTACmd() + finish_ta.buf_thing = buf_desc + finish_ta.buffer_mgr = self.buffer_mgr.info + finish_ta.stats_ptr = agx.initdata.regionB.stats_ta.stats._addr + finish_ta.cmdqueue_ptr = self.wq_ta.info._addr + finish_ta.context_id = self.ctx_id + finish_ta.unk_28 = 0x0 # fixed + finish_ta.struct3 = wc_ta.struct_3 + finish_ta.unk_34 = 0x0 # fixed + finish_ta.uuid = uuid_ta + finish_ta.stamp = self.stamp_ta2 + finish_ta.stamp_value = self.stamp_value_ta + finish_ta.unk_48 = 0x0 # fixed + finish_ta.unk_50 = 0x0 # fixed + finish_ta.unk_54 = 0x0 # fixed + finish_ta.unk_58 = 0x0 # fixed + finish_ta.unk_60 = 0x0 # fixed + finish_ta.unk_64 = 0x0 # fixed + finish_ta.unk_68 = 0x0 # fixed + finish_ta.unk_6c_g14 = 0 # fixed + finish_ta.restart_branch_offset = start_ta_offset - ms.off + finish_ta.unk_70 = 0x0 # fixed + finish_ta.unk_74 = bytes(0x10) # Ventura + ms.append(finish_ta) + + ms.finalize() + + work.add(ms.obj) + + wc_ta.unkptr_45c = self.tpc._addr + wc_ta.tvb_size = tpc_size + wc_ta.microsequence_ptr = ms.obj._addr + wc_ta.microsequence_size = ms.size + wc_ta.ev_3d = ev_3d.id + wc_ta.stamp_value = self.stamp_value_ta + + print(wc_ta) + self.wq_ta.submit(wc_ta) + + self.agx.log("Submit done") + return work + + def run(self): + ##### Run queues + self.agx.log("Run queues") + self.agx.ch.queue[self.queue].q_3D.run(self.wq_3d, self.ev_3d.id) + self.agx.ch.queue[self.queue].q_TA.run(self.wq_ta, self.ev_ta.id) + self.agx.log("Run done") + + def wait(self): + self.agx.log("Waiting...") + work = self.work[-1] + + ##### Wait for work completion + while not self.ev_3d.fired: + self.agx.wait_for_events(timeout=2.0) + + if not self.ev_3d.fired: + self.agx.log("3D event didn't fire") + + self.agx.log(f"Event {self.ev_3d.id} fired") + #print("Stamps:") + #print(self.stamp_ta1.pull()) + #print(self.stamp_ta2.pull()) + #print(self.stamp_3d1.pull()) + #print(self.stamp_3d2.pull()) + + #print("WCs:") + #print(work.wc_3d.pull()) + #print(work.wc_ta.pull()) + + #if work.fb is not None and work.width and work.height: + if work.fb is not None and work.width and work.height and work.width == 1920: + agx = self.agx + 
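# Shape of the completion path used by wait() above: wait_for_events() pumps
# the GPU->CPU channels until the event manager flips ev.fired. A schematic
# equivalent (names hypothetical) of that polling loop:

import time

def wait_for_event(ev, pump_channels, timeout=2.0):
    deadline = time.time() + timeout
    while not ev.fired and time.time() < deadline:
        pump_channels()     # drain Event/FWLog/Stats channel messages
    return ev.fired

# Note that the caller above logs "3D event didn't fire" and carries on
# rather than raising, so a hung GPU still leaves the tooling responsive.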
self.agx.log(f"Render {work.width}x{work.height} @ {work.fb:#x}") + base, obj = self.agx.find_object(work.fb, self.ctx_id) + + #unswizzle(agx, obj._paddr, work.width, work.height, 4, "fb.bin", grid=False) + #open("fb.bin", "wb").write(self.agx.u.iface.readmem(obj._paddr, work.width*work.height*4)) + #os.system(f"convert -size {work.width}x{work.height} -depth 8 rgba:fb.bin -alpha off frame{self.frames}.png") + self.agx.p.fb_blit(0, 0, work.width, work.height, obj._paddr, work.width, PIX_FMT.XBGR) + + if False: #work.depth is not None: + base, obj = self.agx.find_object(work.depth, self.ctx_id) + + width = align_up(work.width, 64) + height = align_up(work.height, 64) + + obj.pull() + chexdump(obj.val) + + unswizzle(self.agx, obj._paddr, work.width, work.height, 4, "depth.bin", grid=False) + os.system(f"convert -size {work.width}x{work.height} -depth 8 rgba:depth.bin -alpha off depth.png") + + for i in self.work: + i.free() + + self.work = [] diff --git a/tools/proxyclient/m1n1/agx/shim.py b/tools/proxyclient/m1n1/agx/shim.py new file mode 100644 index 0000000..253812a --- /dev/null +++ b/tools/proxyclient/m1n1/agx/shim.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT + +import errno, ctypes, sys, atexit, os, os.path, mmap +from construct import * + +from m1n1 import malloc +from m1n1.utils import Register32 +from m1n1.agx import AGX +from m1n1.agx.render import * +from m1n1.agx.uapi import * +from m1n1.proxyutils import * +from m1n1.utils import * + +PAGE_SIZE = 32768 +SHIM_MEM_SIZE = 4 * 1024 * 1024 * 1024 + +class IOCTL(Register32): + NR = 7, 0 + TYPE = 15, 8 + SIZE = 29, 16 + DIR = 31, 30 + +_IOC_NONE = 0 +_IOC_WRITE = 1 +_IOC_READ = 2 + +_IO = lambda type, nr: IOCTL(TYPE=type, NR=nr, SIZE=0, DIR=_IOC_NONE) +_IOR = lambda type, nr, size: IOCTL(TYPE=type, NR=nr, SIZE=size, DIR=_IOC_READ) +_IOW = lambda type, nr, size: IOCTL(TYPE=type, NR=nr, SIZE=size, DIR=_IOC_WRITE) +_IOWR = lambda type, nr, size: IOCTL(TYPE=type, NR=nr, SIZE=size, DIR=_IOC_READ|_IOC_WRITE) + +DRM_IOCTL_BASE = ord('d') + +def IO(nr): + def dec(f): + f._ioctl = _IO(DRM_IOCTL_BASE, nr) + return f + return dec + +def IOR(nr, cls): + def dec(f): + f._ioctl = _IOR(DRM_IOCTL_BASE, nr, cls.sizeof()) + f._arg_cls = cls + return f + return dec + +def IOW(nr, cls): + def dec(f): + f._ioctl = _IOW(DRM_IOCTL_BASE, nr, cls.sizeof()) + f._arg_cls = cls + return f + return dec + +def IOWR(nr, cls): + def dec(f): + f._ioctl = _IOWR(DRM_IOCTL_BASE, nr, cls.sizeof()) + f._arg_cls = cls + return f + return dec + +class DRMAsahiShim: + def __init__(self, memfd): + self.memfd = memfd + self.initialized = False + self.ioctl_map = {} + for key in dir(self): + f = getattr(self, key) + ioctl = getattr(f, "_ioctl", None) + if ioctl is not None: + self.ioctl_map[ioctl.value] = ioctl, f + self.bos = {} + self.pull_buffers = bool(os.getenv("ASAHI_SHIM_PULL")) + self.dump_frames = bool(os.getenv("ASAHI_SHIM_DUMP")) + self.frame = 0 + self.agx = None + + def read_buf(self, ptr, size): + return ctypes.cast(ptr, ctypes.POINTER(ctypes.c_ubyte * size))[0] + + def init_agx(self): + from m1n1.setup import p, u, iface + + p.pmgr_adt_clocks_enable("/arm-io/gfx-asc") + p.pmgr_adt_clocks_enable("/arm-io/sgx") + + self.agx = agx = AGX(u) + + mon = RegMonitor(u, ascii=True, bufsize=0x8000000) + agx.mon = mon + + sgx = agx.sgx_dev + #mon.add(sgx.gpu_region_base, sgx.gpu_region_size, "contexts") + #mon.add(sgx.gfx_shared_region_base, sgx.gfx_shared_region_size, "gfx-shared") + #mon.add(sgx.gfx_handoff_base, sgx.gfx_handoff_size, 
"gfx-handoff") + + #mon.add(agx.initdasgx.gfx_handoff_base, sgx.gfx_handoff_size, "gfx-handoff") + + atexit.register(p.reboot) + agx.start() + + def init(self): + if self.initialized: + return + + self.init_agx() + self.ctx = GPUContext(self.agx) + self.ctx.bind(0x17) + self.renderer = GPURenderer(self.ctx, 0x40, bm_slot=10, queue=1) + + self.initialized = True + + @IOW(DRM_COMMAND_BASE + 0x00, drm_asahi_submit_t) + def submit(self, fd, args): + sys.stdout.write(".") + sys.stdout.flush() + + size = drm_asahi_cmdbuf_t.sizeof() + cmdbuf = drm_asahi_cmdbuf_t.parse(self.read_buf(args.cmdbuf, size)) + + self.log("Pushing objects...") + for obj in self.bos.values(): + #if obj._skipped_pushes > 64:# and obj._addr > 0x1200000000 and obj._size > 131072: + #continue + obj.push(True) + self.log("Push done") + + attachment_objs = [] + for i in cmdbuf.attachments: + for obj in self.bos.values(): + if obj._addr == i.pointer: + attachment_objs.append(obj) + + if self.dump_frames: + name = f"shim_frame{self.frame:03d}.agx" + f = GPUFrame(self.renderer.ctx) + f.cmdbuf = cmdbuf + for obj in self.bos.values(): + f.add_object(obj) + f.save(name) + + self.renderer.submit(cmdbuf) + self.renderer.run() + self.renderer.wait() + + if self.pull_buffers: + self.log("Pulling buffers...") + for obj in attachment_objs: + obj.pull() + obj._map[:] = obj.val + obj.val = obj._map + self.log("Pull done") + + #print("HEAP STATS") + #self.ctx.uobj.va.check() + #self.ctx.gobj.va.check() + #self.ctx.pobj.va.check() + #self.agx.kobj.va.check() + #self.agx.cmdbuf.va.check() + #self.agx.kshared.va.check() + #self.agx.kshared2.va.check() + + self.frame += 1 + return 0 + + @IOW(DRM_COMMAND_BASE + 0x01, drm_asahi_wait_bo_t) + def wait_bo(self, fd, args): + self.log("Wait BO!", args) + return 0 + + @IOWR(DRM_COMMAND_BASE + 0x02, drm_asahi_create_bo_t) + def create_bo(self, fd, args): + memfd_offset = args.offset + + if args.flags & ASAHI_BO_PIPELINE: + alloc = self.renderer.ctx.pobj + else: + alloc = self.renderer.ctx.gobj + + obj = alloc.new(args.size, name=f"GBM offset {memfd_offset:#x}", track=False) + obj._memfd_offset = memfd_offset + obj._pushed = False + obj.val = obj._map = mmap.mmap(self.memfd, args.size, offset=memfd_offset) + self.bos[memfd_offset] = obj + args.offset = obj._addr + + if args.flags & ASAHI_BO_PIPELINE: + args.offset -= self.renderer.ctx.pipeline_base + + self.log(f"Create BO @ {memfd_offset:#x}") + return 0 + + @IOWR(DRM_COMMAND_BASE + 0x04, drm_asahi_get_param_t) + def get_param(self, fd, args): + self.log("Get Param!", args) + return 0 + + @IOWR(DRM_COMMAND_BASE + 0x05, drm_asahi_get_bo_offset_t) + def get_bo_offset(self, fd, args): + self.log("Get BO Offset!", args) + return 0 + + def bo_free(self, memfd_offset): + self.log(f"Free BO @ {memfd_offset:#x}") + self.bos[memfd_offset].free() + del self.bos[memfd_offset] + sys.stdout.flush() + + def ioctl(self, fd, request, p_arg): + self.init() + + p_arg = ctypes.c_void_p(p_arg) + + if request not in self.ioctl_map: + self.log(f"Unknown ioctl: fd={fd} request={IOCTL(request)} arg={p_arg:#x}") + return -errno.ENOSYS + + ioctl, f = self.ioctl_map[request] + + size = ioctl.SIZE + if ioctl.DIR & _IOC_WRITE: + args = f._arg_cls.parse(self.read_buf(p_arg, size)) + ret = f(fd, args) + elif ioctl.DIR & _IOC_READ: + args = f._arg_cls.parse(bytes(size)) + ret = f(fd, args) + else: + ret = f(fd) + + if ioctl.DIR & _IOC_READ: + data = args.build() + assert len(data) == size + ctypes.memmove(p_arg, data, size) + + sys.stdout.flush() + return ret + + def log(self, s): + if 
+        if self.agx is None:
+            print("[Shim] " + s)
+        else:
+            self.agx.log("[Shim] " + s)
+
+Shim = DRMAsahiShim
diff --git a/tools/proxyclient/m1n1/agx/uapi.py b/tools/proxyclient/m1n1/agx/uapi.py
new file mode 100644
index 0000000..75850cb
--- /dev/null
+++ b/tools/proxyclient/m1n1/agx/uapi.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+from construct import *
+from m1n1.constructutils import ConstructClass
+
+__all__ = []
+
+DRM_COMMAND_BASE = 0x40
+
+ASAHI_BO_PIPELINE = 1
+
+class drm_asahi_submit_t(ConstructClass):
+    subcon = Struct(
+        "cmdbuf" / Int64ul,
+        "in_syncs" / Int64ul,
+        "in_sync_count" / Int32ul,
+        "out_sync" / Int32ul,
+    )
+
+class drm_asahi_wait_bo_t(ConstructClass):
+    subcon = Struct(
+        "handle" / Int32ul,
+        Padding(4),
+        "timeout_ns" / Int64sl,
+    )
+
+class drm_asahi_create_bo_t(ConstructClass):
+    subcon = Struct(
+        "size" / Int32ul,
+        "flags" / Int32ul,
+        "handle" / Int32ul,
+        Padding(4),
+        "offset" / Int64ul,
+    )
+
+#class drm_asahi_mmap_bo_t(ConstructClass):
+    #subcon = Struct(
+        #"handle" / Int32ul,
+        #"flags" / Int32ul,
+        #"offset" / Int64ul,
+    #)
+
+class drm_asahi_get_param_t(ConstructClass):
+    subcon = Struct(
+        "param" / Int32ul,
+        Padding(4),
+        "value" / Int64ul,
+    )
+
+class drm_asahi_get_bo_offset_t(ConstructClass):
+    subcon = Struct(
+        "handle" / Int32ul,
+        Padding(4),
+        "offset" / Int64ul,
+    )
+
+ASAHI_MAX_ATTACHMENTS = 16
+
+ASAHI_ATTACHMENT_C = 0
+ASAHI_ATTACHMENT_Z = 1
+ASAHI_ATTACHMENT_S = 2
+
+class drm_asahi_attachment_t(ConstructClass):
+    subcon = Struct(
+        "type" / Int32ul,
+        "size" / Int32ul,
+        "pointer" / Int64ul,
+    )
+
+ASAHI_CMDBUF_LOAD_C = (1 << 0)
+ASAHI_CMDBUF_LOAD_Z = (1 << 1)
+ASAHI_CMDBUF_LOAD_S = (1 << 2)
+
+class drm_asahi_cmdbuf_t(ConstructClass):
+    subcon = Struct(
+        "flags" / Int64ul,
+
+        "encoder_ptr" / Int64ul,
+        "encoder_id" / Int32ul,
+
+        "cmd_ta_id" / Int32ul,
+        "cmd_3d_id" / Int32ul,
+
+        "ds_flags" / Int32ul,
+        "depth_buffer" / Int64ul,
+        "stencil_buffer" / Int64ul,
+
+        "scissor_array" / Int64ul,
+        "depth_bias_array" / Int64ul,
+
+        "fb_width" / Int32ul,
+        "fb_height" / Int32ul,
+
+        "load_pipeline" / Int32ul,
+        "load_pipeline_bind" / Int32ul,
+
+        "store_pipeline" / Int32ul,
+        "store_pipeline_bind" / Int32ul,
+
+        "partial_reload_pipeline" / Int32ul,
+        "partial_reload_pipeline_bind" / Int32ul,
+
+        "partial_store_pipeline" / Int32ul,
+        "partial_store_pipeline_bind" / Int32ul,
+
+        "depth_clear_value" / Float32l,
+        "stencil_clear_value" / Int8ul,
+        Padding(3),
+
+        "attachments" / Array(ASAHI_MAX_ATTACHMENTS, drm_asahi_attachment_t),
+        "attachment_count" / Int32ul,
+    )
+
+__all__.extend(k for k, v in globals().items()
+               if ((callable(v) or isinstance(v, type)) and v.__module__ == __name__) or isinstance(v, int))
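# How the pieces above fit together in the shim's dispatch path: the 32-bit
# ioctl request packs DIR/SIZE/TYPE/NR (the standard Linux _IOC layout, same
# fields as the IOCTL Register32 in shim.py), and the argument buffer
# round-trips through construct. A self-contained sketch:

from construct import Struct, Int32ul, Int64ul, Padding

_IOC_WRITE, _IOC_READ = 1, 2

def ioc(dir_, type_, nr, size):
    return (dir_ << 30) | (size << 16) | (type_ << 8) | nr

def ioc_fields(request):
    return dict(NR=request & 0xff, TYPE=(request >> 8) & 0xff,
                SIZE=(request >> 16) & 0x3fff, DIR=(request >> 30) & 0x3)

create_bo = Struct(
    "size" / Int32ul, "flags" / Int32ul, "handle" / Int32ul,
    Padding(4), "offset" / Int64ul,
)

# DRM_ASAHI_CREATE_BO is command 0x02 on top of DRM_COMMAND_BASE (0x40).
req = ioc(_IOC_READ | _IOC_WRITE, ord('d'), 0x40 + 0x02, create_bo.sizeof())
assert ioc_fields(req)["SIZE"] == create_bo.sizeof() == 24

buf = create_bo.build(dict(size=0x4000, flags=0, handle=3, offset=0))
assert create_bo.parse(buf).handle == 3 and len(buf) == 24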
