diff --git a/src/audiomanager.cpp b/src/audiomanager.cpp
index 884265e..2689cf0 100644
--- a/src/audiomanager.cpp
+++ b/src/audiomanager.cpp
@@ -1,6 +1,8 @@
 #include "audiomanager.hh"
 
+#include "common/hardware/dma.h"
 #include "common/hardware/spu.h"
+#include <psyqo/kernel.hh>
 #include <psyqo/spu.hh>
 #include <psyqo/xprintf.h>
 
@@ -65,25 +67,34 @@ bool AudioManager::loadClip(int clipIndex, const uint8_t* adpcmData, uint32_t si
         return false;
     }
 
-    // psyqo::SPU::dmaWrite takes dataSize as uint16_t so upload in chunks
-    // for clips larger than 65532 bytes (largest multiple-of-4 that fits).
+    // psyqo::SPU::dmaWrite takes dataSize as uint16_t and uses blockSize=4:
+    //   BCR = blockSize | ((dataSize / blockSize) << 16)
+    //   block_count = dataSize / blockSize (integer division - truncates!)
+    //   actual bytes = block_count * blockSize * 4
     //
-    // psyqo DMA math: BCR = blockSize | ((dataSize/blockSize) << 16)
-    //   blockSize=4 → 4 words per block = 16 bytes per block
-    //   block count = dataSize/blockSize
-    //   total bytes = blockSize × (dataSize/blockSize) × 4 = dataSize × 4
-    // So dataSize = bytesThisRound / 4 gives the correct byte count.
+    // With blockSize=4, each block is 16 bytes. Largest block_count whose byte
+    // total still fits in a uint16_t: 4095, i.e. 4095 * 16 = 65520 bytes.
+    // bytesThisRound MUST be a multiple of 16 to avoid the integer division
+    // truncation causing fewer bytes to be DMA'd than the pointer advances.
     const uint8_t* src = adpcmData;
     uint32_t remaining = alignedSize;
     uint32_t dstAddr = addr;
     while (remaining > 0) {
-        // Max transfer per call: 65532 bytes (16383 blocks × 4 bytes each).
-        uint32_t bytesThisRound = (remaining > 65532u) ? 65532u : remaining;
-        bytesThisRound &= ~3u;  // DMA alignment
+        // Max transfer per call: 65520 bytes (4095 blocks * 16 bytes each).
+        uint32_t bytesThisRound = (remaining > 65520u) ? 65520u : remaining;
+        bytesThisRound &= ~15u;  // 16-byte block alignment
         if (bytesThisRound == 0) break;
 
         uint16_t dmaSizeParam = (uint16_t)(bytesThisRound / 4);
         psyqo::SPU::dmaWrite(dstAddr, src, dmaSizeParam, 4);
+
+        // PSYQo's internal waitForStatus only spins ~10000 iterations (~1.8ms).
+        // On real hardware, SPU DMA for 65KB takes tens of milliseconds.
+        // The timeout fires, the function returns, and the next chunk starts
+        // while the previous transfer is still in progress - corrupting data.
+        // Spin here until the DMA controller's busy bit actually clears.
+        while (DMA_CTRL[DMA_SPU].CHCR & (1 << 24)) {}
+
         src += bytesThisRound;
         dstAddr += bytesThisRound;
         remaining -= bytesThisRound;
@@ -92,6 +103,12 @@ bool AudioManager::loadClip(int clipIndex, const uint8_t* adpcmData, uint32_t si
     // dmaWrite() now properly restores transfer mode to idle after each
     // DMA transfer, so no manual SPU_CTRL fix-up is needed here.
 
+    // Reset the SPU transfer mode (SPU_CTRL bits 5-4) to idle after upload.
+    // psyqo::SPU::dmaWrite sets SPU_CTRL bit 5 (DMA write mode) but never
+    // clears it. On real hardware, voice register writes (pitch, volume, etc.)
+    // may be ignored while the SPU bus is still in DMA mode.
+    SPU_CTRL &= ~(0b11 << 4);
+
     m_clips[clipIndex].spuAddr = addr;
     m_clips[clipIndex].size = sizeBytes;
     m_clips[clipIndex].sampleRate = sampleRate;
@@ -125,21 +142,10 @@ int AudioManager::play(int clipIndex, int volume, int pan) {
         rightVol = (uint16_t)((uint32_t)vol * p / 127);
     }
 
-    psyqo::SPU::ChannelPlaybackConfig config;
-    config.sampleRate.value = static_cast<uint16_t>(((uint32_t)clip.sampleRate << 12) / 44100);
-    config.volumeLeft  = leftVol;
-    config.volumeRight = rightVol;
-    config.adsr = DEFAULT_ADSR;
-
     // Set the repeat address depending on loop mode.
-    // The new psyqo::SPU::getNextFreeChannel() uses the ENDX register:
-    // a channel is "free" when its ENDX bit is set (voice reached loop-end).
-    // silenceChannels() points voices at psyqo's silent dummy sample at 0x1000
-    // that immediately sets ENDX, so stopped channels are detected as free.
-    //
-    // Looping clips:     repeat → clip start  (loop back to beginning).
-    // Non-looping clips: repeat → dummy 0x1000 (go silent after clip ends,
-    //                    dummy's loop-end flag re-sets ENDX → channel freed).
+    // Looping clips:     repeat -> clip start  (loop back to beginning).
+    // Non-looping clips: repeat -> dummy 0x1000 (go silent after clip ends,
+    //                    dummy's loop-end flag re-sets ENDX -> channel freed).
     constexpr uint16_t DUMMY_SPU_ADDR = 0x1000;
     if (clip.loop) {
         SPU_VOICES[ch].sampleRepeatAddr = static_cast<uint16_t>(clip.spuAddr / 8);
@@ -147,9 +153,38 @@ int AudioManager::play(int clipIndex, int volume, int pan) {
         SPU_VOICES[ch].sampleRepeatAddr = DUMMY_SPU_ADDR / 8;
     }
 
-    psyqo::SPU::playADPCM(static_cast<uint8_t>(ch),
-                           static_cast<uint16_t>(clip.spuAddr),
-                           config, true);
+    // Build playback config
+    psyqo::SPU::ChannelPlaybackConfig config;
+    config.sampleRate.value = static_cast<uint16_t>(((uint32_t)clip.sampleRate << 12) / 44100);
+    config.volumeLeft  = leftVol;
+    config.volumeRight = rightVol;
+    config.adsr = DEFAULT_ADSR;
+
+    // Write SPU voice registers directly instead of PSYQo's playADPCM(),
+    // which truncates addresses above 64KB (uint16_t parameter).
+    // The sampleStartAddr register stores addr/8, so uint16_t covers
+    // the full 512KB SPU RAM range.
+
+    // KEY_OFF (hard cut)
+    if (ch > 15) {
+        SPU_KEY_OFF_HIGH = 1 << (ch - 16);
+    } else {
+        SPU_KEY_OFF_LOW = 1 << ch;
+    }
+
+    SPU_VOICES[ch].volumeLeft  = config.volumeLeft;
+    SPU_VOICES[ch].volumeRight = config.volumeRight;
+    SPU_VOICES[ch].sampleRate  = config.sampleRate.value;
+    SPU_VOICES[ch].sampleStartAddr = static_cast<uint16_t>(clip.spuAddr / 8);
+    SPU_VOICES[ch].ad = config.adsr & 0xFFFF;
+    SPU_VOICES[ch].sr = (config.adsr >> 16) & 0xFFFF;
+
+    // KEY_ON
+    if (ch > 15) {
+        SPU_KEY_ON_HIGH = 1 << (ch - 16);
+    } else {
+        SPU_KEY_ON_LOW = 1 << ch;
+    }
 
     return static_cast<int>(ch);
 }
diff --git a/src/audiomanager.hh b/src/audiomanager.hh
index 52c753b..5ca8c14 100644
--- a/src/audiomanager.hh
+++ b/src/audiomanager.hh
@@ -15,10 +15,12 @@ static constexpr int MAX_VOICES = 24;
 /// psyqo places a 16-byte silent dummy sample at 0x1000.
 /// User clips start at 0x1010.
 ///
-/// Upper bound is 0x10000 (64KB) because psyqo::SPU::playADPCM()
-/// takes a uint16_t for the SPU RAM address.
+/// Note: psyqo::SPU::playADPCM() takes a uint16_t for the address, which
+/// would limit clips to the first 64KB of SPU RAM. We bypass it and write
+/// the SPU voice registers directly to reach the full 512KB range (the
+/// register stores addr/8, so a uint16_t covers 0-0x7FFF8).
 static constexpr uint32_t SPU_RAM_START = 0x1010;
-static constexpr uint32_t SPU_RAM_END = 0x10000;
+static constexpr uint32_t SPU_RAM_END = 0x80000;
 
 /// Default ADSR: instant attack, sustain at max, ~46ms linear release.
 ///  Lower 16-bit (AD): attack linear shift=0 step=0("+7"), decay shift=0,
diff --git a/third_party/nugget b/third_party/nugget
index 338ec49..6681630 160000
--- a/third_party/nugget
+++ b/third_party/nugget
@@ -1 +1 @@
-Subproject commit 338ec49a574d0eb0a4218795bf526dcf2b43ecda
+Subproject commit 668163091e2e10bcbb7704985a579f7a0be39692