// Cross-platform sprite codegen runtime: spriteCompile uses the
// per-CPU emit function selected at compile time, allocates a slot
// in the codegen arena, copies the emitted bytes in, and populates
// sp->slot + sp->routineOffsets. spriteCompiledDraw casts the slot
// address to a function pointer and calls it through cdecl.
//
// Each per-CPU emitter (src/codegen/spriteEmit{X86,68k,Iigs}.c)
// just produces bytes; this file is the only consumer of the
// codegen arena from the sprite side.

#include <stdlib.h>
#include <string.h>

#include "joey/sprite.h"
#include "joey/surface.h"
#include "codegenArenaInternal.h"
#include "spriteEmitter.h"
#include "spriteInternal.h"
#include "surfaceInternal.h"

// Largest scratch buffer needed for any single emit call. 16 KB
// covers a 32x32 sprite even on 68k (the biggest mixed-RMW byte-
// emit at 16 bytes/byte * (16*17 dest bytes per shift) ~= 4.5 KB,
// times shift count 2). Round up generously.
#define SPRITE_EMIT_SCRATCH_BYTES (16u * 1024u)

// Compile-time selection of the per-CPU emitter. One src/codegen/
// spriteEmit*.c file is built per platform, but the dispatch lives
// in this file so spriteCompile + spriteCompiledDraw aren't
// duplicated three times.
static uint16_t emitDrawForTarget(uint8_t *out, const SpriteT *sp, uint8_t shift) { #if defined(JOEYLIB_PLATFORM_DOS) return spriteEmitDrawX86(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_AMIGA) || defined(JOEYLIB_PLATFORM_ATARIST) return spriteEmitDraw68k(out, sp, shift); #elif defined(JOEYLIB_PLATFORM_IIGS) return spriteEmitDrawIigs(out, sp, shift); #else # error "spriteCompile: no emitter selected for this platform" #endif } bool spriteCompile(SpriteT *sp) { uint8_t *scratch; uint32_t totalSize; uint8_t shift; ArenaSlotT *slot; uint8_t *dst; uint16_t written; uint16_t offset; if (sp == NULL) { return false; } if (sp->slot != NULL) { return true; } if (sp->tileData == NULL) { return false; } scratch = (uint8_t *)malloc(SPRITE_EMIT_SCRATCH_BYTES); if (scratch == NULL) { return false; } totalSize = 0; for (shift = 0; shift < JOEY_SPRITE_SHIFT_COUNT; shift++) { written = emitDrawForTarget(scratch, sp, shift); totalSize += written; } if (totalSize > 0xFFFFu) { free(scratch); return false; } slot = codegenArenaAlloc(totalSize); if (slot == NULL) { free(scratch); return false; } dst = codegenArenaBase() + slot->offset; offset = 0; for (shift = 0; shift < JOEY_SPRITE_SHIFT_COUNT; shift++) { written = emitDrawForTarget(dst + offset, sp, shift); sp->routineOffsets[shift][SPRITE_OP_DRAW] = offset; sp->routineOffsets[shift][SPRITE_OP_SAVE] = 0; sp->routineOffsets[shift][SPRITE_OP_RESTORE] = 0; offset = (uint16_t)(offset + written); } sp->slot = slot; free(scratch); return true; } #if defined(JOEYLIB_PLATFORM_IIGS) // IIgs uses inline asm + a self-modifying call stub instead of a C // function-pointer cast. The build uses ORCA-C large memory model // (-b for sprite demos) so pointers are 24-bit and JSL works // cross-bank. // // `sta abs,Y` on 65816 uses the data bank register (DBR) for the // high byte of the effective address, so we need DBR = dst's bank // during the body. 
malloc under -b can return memory in any bank, // so we don't trust DBR to already match -- the stub explicitly // sets DBR from the dst pointer's bank byte and restores it before // returning to C. // // Stub layout (14 bytes): // 00: 8B PHB ; save caller DBR // 01: A9 bk LDA #destBank ; A = dst bank (8-bit M) // 03: 48 PHA // 04: AB PLB ; DBR = dst bank // 05: A0 lo hi LDY #destOffset ; Y = low 16 of dst (X=16) // 08: 22 lo mid bk JSL routine // 0C: AB PLB ; restore caller DBR // 0D: 6B RTL // // Patched per call: byte 2 (destBank), bytes 6-7 (destOffset16), // bytes 9-11 (target 24-bit). The compiled routine assumes // M=8 / X=16 / Y=destOffset on entry; the stub arranges that. static unsigned char gSpriteCallStub[14]; void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) { uint8_t shift; uint32_t destAddr; uint16_t destOffset; uint8_t destBank; uint32_t fnAddr; { uint8_t *destPtr; uint8_t destBytes[4]; shift = (uint8_t)(x & 1); destPtr = &dst->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; memcpy(destBytes, &destPtr, 4); destAddr = (uint32_t)destBytes[0] | ((uint32_t)destBytes[1] << 8) | ((uint32_t)destBytes[2] << 16); destOffset = (uint16_t)(destAddr & 0xFFFFu); destBank = (uint8_t)((destAddr >> 16) & 0xFFu); fnAddr = codegenArenaBaseAddr() + sp->slot->offset + (uint32_t)sp->routineOffsets[shift][SPRITE_OP_DRAW]; } (void)destAddr; gSpriteCallStub[ 0] = 0x8B; gSpriteCallStub[ 1] = 0xA9; gSpriteCallStub[ 2] = destBank; gSpriteCallStub[ 3] = 0x48; gSpriteCallStub[ 4] = 0xAB; gSpriteCallStub[ 5] = 0xA0; gSpriteCallStub[ 6] = (unsigned char)(destOffset & 0xFFu); gSpriteCallStub[ 7] = (unsigned char)((destOffset >> 8) & 0xFFu); gSpriteCallStub[ 8] = 0x22; gSpriteCallStub[ 9] = (unsigned char)(fnAddr & 0xFFu); gSpriteCallStub[10] = (unsigned char)((fnAddr >> 8) & 0xFFu); gSpriteCallStub[11] = (unsigned char)((fnAddr >> 16) & 0xFFu); gSpriteCallStub[12] = 0xAB; gSpriteCallStub[13] = 0x6B; // ORCA-C compiles this function 
under `longa on` (M=16) and emits // the function epilogue assuming M=16 at exit -- the deallocation // ADC takes a 2-byte immediate. The byte writes to gSpriteCallStub // above leave M=8, so PHP captured M=8 and PLP would restore M=8. // That mode mismatch caused the epilogue's `ADC #imm; TCS` bytes // to be re-decoded as a wider ADC swallowing the TCS, S never // adjusted, RTL popped the wrong bytes, control fell into BSS and // BRK'd. Use REP/SEP without PHP/PLP and explicitly restore M=16 // before returning to compiled C. asm { rep #0x30 sep #0x20 jsl gSpriteCallStub rep #0x20 } } #else void spriteCompiledDraw(SurfaceT *dst, const SpriteT *sp, int16_t x, int16_t y) { typedef void (*DrawFn)(uint8_t *destRow); uint8_t shift; uint8_t *destRow; DrawFn fn; shift = (uint8_t)(x & 1); destRow = &dst->pixels[(uint16_t)y * SURFACE_BYTES_PER_ROW + ((uint16_t)x >> 1)]; fn = (DrawFn)(codegenArenaBase() + sp->slot->offset + sp->routineOffsets[shift][SPRITE_OP_DRAW]); fn(destRow); } #endif