// taskswitch.c -- Cooperative task switching library for DJGPP // // Uses inline assembly for context switching (i386 and x86_64). The // scheduler uses credit-based weighted round-robin so all tasks run, // but higher-priority tasks run proportionally more often. // // Task storage is a stb_ds dynamic array that grows as needed. // Terminated task slots are recycled by tsCreate(). // // Why inline asm instead of setjmp/longjmp for context switching: // setjmp/longjmp only save callee-saved registers and don't give us // control over the stack pointer in a portable way. We need to set up // a brand-new stack for each task and jump into a trampoline -- setjmp // can't bootstrap a fresh stack. The asm approach also avoids ABI // differences in jmp_buf layout across DJGPP versions. // // Why stb_ds dynamic array instead of a linked list: // Tasks are indexed by integer ID for O(1) lookup (tsGetState, tsKill, // etc.). A linked list would require O(n) traversal for every ID-based // operation. The array also has better cache locality during the // scheduler's linear scan. The downside (holes after termination) is // mitigated by slot recycling in findFreeSlot(). #define STB_DS_IMPLEMENTATION #include "thirdparty/stb_ds.h" #include "taskswitch.h" #include #include // ============================================================================ // Internal types // ============================================================================ // Only callee-saved registers need to be in the context struct because // the C calling convention guarantees the caller has already saved // everything else. The compiler treats contextSwitch() as a normal // function call, so caller-saved regs (eax/ecx/edx on i386, // rax/rcx/rdx/r8-r11 on x86_64) are spilled by the compiler before // the call. This minimizes context size and switch overhead. // // Field order is critical: the asm uses hardcoded byte offsets into // this struct. 
Reordering fields will silently corrupt context switches. #if defined(__x86_64__) // Saved CPU context for x86_64 (field order matches asm byte offsets) typedef struct { uintptr_t rbx; // offset 0 uintptr_t r12; // offset 8 uintptr_t r13; // offset 16 uintptr_t r14; // offset 24 uintptr_t r15; // offset 32 uintptr_t rbp; // offset 40 uintptr_t rsp; // offset 48 uintptr_t rip; // offset 56 } TaskContextT; #else // Saved CPU context for i386 (field order matches asm byte offsets) typedef struct { uintptr_t ebx; // offset 0 uintptr_t esi; // offset 4 uintptr_t edi; // offset 8 uintptr_t ebp; // offset 12 uintptr_t esp; // offset 16 uintptr_t eip; // offset 20 } TaskContextT; #endif // Task control block -- one per task slot. The 'allocated' flag tracks // whether the slot is live or recyclable, separate from the state enum, // because we need to distinguish "never used" from "terminated and reaped". // The 'isMain' flag protects task 0 from kill/pause -- destroying the // main task would orphan all other tasks with no scheduler to resume them. typedef struct { char name[TS_NAME_MAX]; TaskContextT context; uint8_t *stack; uint32_t stackSize; TaskStateE state; int32_t priority; int32_t credits; TaskEntryT entry; void *arg; bool isMain; bool allocated; // true if slot is in use, false if free for reuse } TaskBlockT; // ============================================================================ // Module state // ============================================================================ // stb_ds dynamic array of task control blocks. Slot 0 is always the main // task. Slots 1..N are app tasks. Terminated slots have allocated=false // and are reused by findFreeSlot() to prevent unbounded growth. static TaskBlockT *tasks = NULL; // Index of the currently executing task. Updated only by tsYield, tsExit, // tsPause (self-pause), and tsRecoverToMain. 
static uint32_t currentIdx = 0;

// True once tsInit() has registered the main task; all public entry points
// bail out (or return an error) when this is false.
static bool initialized = false;

// ============================================================================
// Forward declarations
// ============================================================================

// Static helpers
static void contextSwitch(TaskContextT *save, TaskContextT *restore);
static int32_t findFreeSlot(void);
static uint32_t scheduleNext(void);
static void taskTrampoline(void);

// Public API prototypes are provided by taskswitch.h via #include.
// Explicit prototypes repeated here per project convention:
uint32_t tsActiveCount(void);
int32_t tsCreate(const char *name, TaskEntryT entry, void *arg, uint32_t stackSize, int32_t priority);
uint32_t tsCurrentId(void);
void tsExit(void);
const char *tsGetName(uint32_t taskId);
int32_t tsGetPriority(uint32_t taskId);
TaskStateE tsGetState(uint32_t taskId);
int32_t tsInit(void);
int32_t tsKill(uint32_t taskId);
int32_t tsPause(uint32_t taskId);
void tsRecoverToMain(void);
int32_t tsResume(uint32_t taskId);
int32_t tsSetPriority(uint32_t taskId, int32_t priority);
void tsShutdown(void);
void tsYield(void);

// ============================================================================
// Static functions (alphabetical)
// ============================================================================

// Switch execution from the current task to another by saving and restoring
// callee-saved registers and the stack pointer. The return address is
// captured as a local label so that when another task switches back to us,
// execution resumes right after the save point.
//
// The mechanism:
// 1. Save all callee-saved regs + esp/rsp into *save
// 2. Capture the address of local label "1:" as the saved EIP/RIP
// 3. Load all regs + esp/rsp from *restore
// 4. Jump to the restored EIP/RIP (which is label "1:" in the other task)
//
// For a newly created task, the restored EIP points to taskTrampoline
// (set up in tsCreate) rather than label "1:", so the first switch into
// a task bootstraps it into its entry function.
//
// noinline is critical: if the compiler inlines this, the callee-saved
// register assumptions break because the enclosing function may use
// different register allocation. The asm clobber list tells GCC which
// registers we destroy so it spills them before the call.
//
// The "memory" clobber acts as a compiler fence, ensuring all memory
// writes are flushed before the switch and re-read after resumption.
#if defined(__x86_64__)
// x86_64: save rbx, r12-r15, rbp, rsp, rip.
// Inputs via GCC constraints: %rdi = save ptr, %rsi = restore ptr.
//
// NOTE(review): "D"/"S" are plain *input* operands, so GCC is entitled to
// assume %rdi/%rsi still hold save/restore after the asm. When we resume at
// label "1:" (via another task's jmp), those registers actually hold the
// OTHER task's operand values. This only stays safe if the compiler never
// reuses the inputs after the asm -- consider "+D"/"+S" or adding them to
// the clobber-equivalent (input/output) list. TODO confirm against GCC
// extended-asm semantics before changing.
static void __attribute__((noinline)) contextSwitch(TaskContextT *save, TaskContextT *restore)
{
    __asm__ __volatile__(
        // Save current context (offsets match TaskContextT field layout)
        "movq %%rbx, 0(%%rdi)\n\t"
        "movq %%r12, 8(%%rdi)\n\t"
        "movq %%r13, 16(%%rdi)\n\t"
        "movq %%r14, 24(%%rdi)\n\t"
        "movq %%r15, 32(%%rdi)\n\t"
        "movq %%rbp, 40(%%rdi)\n\t"
        "movq %%rsp, 48(%%rdi)\n\t"
        // RIP-relative lea captures the resume point address
        "leaq 1f(%%rip), %%rax\n\t"
        "movq %%rax, 56(%%rdi)\n\t"
        // Restore new context -- once rsp is swapped we're on the other
        // task's stack. The jmp completes the switch.
        "movq 0(%%rsi), %%rbx\n\t"
        "movq 8(%%rsi), %%r12\n\t"
        "movq 16(%%rsi), %%r13\n\t"
        "movq 24(%%rsi), %%r14\n\t"
        "movq 32(%%rsi), %%r15\n\t"
        "movq 40(%%rsi), %%rbp\n\t"
        "movq 48(%%rsi), %%rsp\n\t"
        "movq 56(%%rsi), %%rax\n\t"
        "jmp *%%rax\n\t"
        // Resume point: when someone switches back to us, execution
        // continues here as if contextSwitch() just returned normally.
        "1:\n\t"
        :
        : "D" (save), "S" (restore)
        : "rax", "rcx", "rdx", "r8", "r9", "r10", "r11", "memory", "cc"
    );
}
#else
// i386: save ebx, esi, edi, ebp, esp, eip.
// Inputs via GCC constraints: %eax = save ptr, %edx = restore ptr.
static void __attribute__((noinline)) contextSwitch(TaskContextT *save, TaskContextT *restore) { __asm__ __volatile__( // Save current context "movl %%ebx, 0(%%eax)\n\t" "movl %%esi, 4(%%eax)\n\t" "movl %%edi, 8(%%eax)\n\t" "movl %%ebp, 12(%%eax)\n\t" "movl %%esp, 16(%%eax)\n\t" // i386 can't do RIP-relative lea, so use an absolute label address "movl $1f, 20(%%eax)\n\t" // Restore new context "movl 0(%%edx), %%ebx\n\t" "movl 4(%%edx), %%esi\n\t" "movl 8(%%edx), %%edi\n\t" "movl 12(%%edx), %%ebp\n\t" "movl 16(%%edx), %%esp\n\t" "movl 20(%%edx), %%eax\n\t" "jmp *%%eax\n\t" "1:\n\t" : : "a" (save), "d" (restore) : "ecx", "memory", "cc" ); } #endif // Find a free (terminated or unallocated) slot in the task array. // Returns the index, or -1 if no free slot exists. // Starts at 1 because slot 0 is always the main task and cannot be reused. // Linear scan is fine -- SHELL_MAX_APPS caps the practical limit at ~32 tasks. static int32_t findFreeSlot(void) { ptrdiff_t count = arrlen(tasks); for (ptrdiff_t i = 1; i < count; i++) { if (!tasks[i].allocated) { return (int32_t)i; } } return -1; } // Find the next task to run using credit-based weighted round-robin. // Each ready task holds (priority + 1) credits. One credit is consumed // per scheduling turn. When no ready task has credits left, every // ready task is refilled. This guarantees all tasks run while giving // higher-priority tasks proportionally more turns. // // Algorithm (variant of Linux 2.4's goodness() scheduler): // 1. Scan forward from currentIdx looking for a ready task with credits > 0 // 2. If found, decrement its credits and select it // 3. If no task has credits, refill ALL ready tasks (one "epoch") // 4. Scan again after refill // // The round-robin scan starts at (currentIdx + 1) and wraps, ensuring // fairness among tasks with equal priority -- no task gets picked twice // in a row unless it's the only ready task. 
// // If no ready tasks exist at all (everything paused/terminated), return // currentIdx so the caller stays on the current task (always task 0 in // practice, since task 0 is the shell's main loop and never pauses). static uint32_t scheduleNext(void) { uint32_t count = (uint32_t)arrlen(tasks); // First pass: look for a ready task with remaining credits for (uint32_t i = 1; i <= count; i++) { uint32_t idx = (currentIdx + i) % count; if (tasks[idx].allocated && tasks[idx].state == TaskStateReady && tasks[idx].credits > 0) { tasks[idx].credits--; return idx; } } // All credits exhausted -- start a new epoch by refilling every ready task bool anyReady = false; for (uint32_t i = 0; i < count; i++) { if (tasks[i].allocated && tasks[i].state == TaskStateReady) { tasks[i].credits = tasks[i].priority + 1; anyReady = true; } } if (!anyReady) { return currentIdx; } // Pick the first ready task after refill for (uint32_t i = 1; i <= count; i++) { uint32_t idx = (currentIdx + i) % count; if (tasks[idx].allocated && tasks[idx].state == TaskStateReady && tasks[idx].credits > 0) { tasks[idx].credits--; return idx; } } return currentIdx; } // Entry point for every new task. The first context switch into a new task // jumps here (via the EIP/RIP set up in tsCreate). This is a trampoline // rather than calling entry directly because we need to call tsExit() when // the entry function returns -- if we just set EIP to the entry function, // it would return to a garbage address (the dummy 0 on the stack). // The trampoline ensures clean task termination even if the app forgets // to call tsExit() explicitly. 
static void taskTrampoline(void) { TaskBlockT *task = &tasks[currentIdx]; task->entry(task->arg); tsExit(); } // ============================================================================ // Public API (alphabetical, main-equivalent functions last if applicable) // ============================================================================ uint32_t tsActiveCount(void) { if (!initialized) { return 0; } uint32_t active = 0; ptrdiff_t count = arrlen(tasks); for (ptrdiff_t i = 0; i < count; i++) { if (tasks[i].allocated && tasks[i].state != TaskStateTerminated) { active++; } } return active; } int32_t tsCreate(const char *name, TaskEntryT entry, void *arg, uint32_t stackSize, int32_t priority) { if (!initialized || !entry) { return TS_ERR_PARAM; } if (stackSize == 0) { stackSize = TS_DEFAULT_STACK_SIZE; } // Reuse a terminated/free slot, or append a new one. // Recycling avoids unbounded array growth when apps are repeatedly // launched and terminated over the lifetime of the shell. int32_t id = findFreeSlot(); if (id < 0) { TaskBlockT blank = {0}; arrput(tasks, blank); id = (int32_t)(arrlen(tasks) - 1); } TaskBlockT *task = &tasks[id]; memset(task, 0, sizeof(*task)); task->stack = (uint8_t *)malloc(stackSize); if (!task->stack) { return TS_ERR_NOMEM; } if (name) { strncpy(task->name, name, TS_NAME_MAX - 1); task->name[TS_NAME_MAX - 1] = '\0'; } task->stackSize = stackSize; task->state = TaskStateReady; task->priority = priority; task->credits = priority + 1; task->entry = entry; task->arg = arg; task->isMain = false; task->allocated = true; // Set up initial stack (grows downward, 16-byte aligned). // The ABI requires 16-byte stack alignment at function entry. We align // the top, then push a dummy return address (0) to simulate a CALL // instruction -- this keeps the stack aligned for the trampoline. 
// The dummy address is never used because taskTrampoline calls tsExit() // which switches away without returning, but it satisfies debuggers // and ABI checkers that expect a return address at the bottom of each frame. uintptr_t top = (uintptr_t)(task->stack + stackSize); top &= ~(uintptr_t)0xF; top -= sizeof(uintptr_t); *(uintptr_t *)top = 0; // dummy return address; trampoline never returns #if defined(__x86_64__) task->context.rsp = top; task->context.rbp = 0; task->context.rbx = 0; task->context.r12 = 0; task->context.r13 = 0; task->context.r14 = 0; task->context.r15 = 0; task->context.rip = (uintptr_t)taskTrampoline; #else task->context.esp = top; task->context.ebp = 0; task->context.ebx = 0; task->context.esi = 0; task->context.edi = 0; task->context.eip = (uintptr_t)taskTrampoline; #endif return id; } uint32_t tsCurrentId(void) { return currentIdx; } // Self-termination. Frees resources and switches to the next task. // This function never returns -- the terminated task's context is abandoned. // We save to tasks[prev].context even though we'll never restore it because // contextSwitch always writes to the save pointer; the data is harmless // and will be overwritten when the slot is recycled. void tsExit(void) { if (!initialized || tasks[currentIdx].isMain) { return; } tasks[currentIdx].state = TaskStateTerminated; // Free the stack immediately -- safe because we're about to switch // away and never return. The context switch itself doesn't touch // the old stack after swapping ESP/RSP. 
free(tasks[currentIdx].stack); tasks[currentIdx].stack = NULL; tasks[currentIdx].allocated = false; uint32_t next = scheduleNext(); uint32_t prev = currentIdx; currentIdx = next; tasks[next].state = TaskStateRunning; contextSwitch(&tasks[prev].context, &tasks[next].context); // Terminated task never resumes here } const char *tsGetName(uint32_t taskId) { if (!initialized || taskId >= (uint32_t)arrlen(tasks)) { return NULL; } if (!tasks[taskId].allocated) { return NULL; } return tasks[taskId].name; } int32_t tsGetPriority(uint32_t taskId) { if (!initialized || taskId >= (uint32_t)arrlen(tasks)) { return TS_ERR_PARAM; } if (!tasks[taskId].allocated) { return TS_ERR_PARAM; } return tasks[taskId].priority; } TaskStateE tsGetState(uint32_t taskId) { if (!initialized || taskId >= (uint32_t)arrlen(tasks)) { return TaskStateTerminated; } if (!tasks[taskId].allocated) { return TaskStateTerminated; } return tasks[taskId].state; } // Register the calling context as task 0 (main). No stack is allocated // because the main task uses the process stack. The main task's context // struct is filled in lazily by contextSwitch on the first tsYield() -- // until then, the saved EIP/ESP are zero, which is fine because we // never restore task 0 from a cold start. int32_t tsInit(void) { if (initialized) { return TS_ERR_PARAM; } // Start with the main task at slot 0 TaskBlockT main = {0}; strncpy(main.name, "main", TS_NAME_MAX - 1); main.state = TaskStateRunning; main.priority = TS_PRIORITY_NORMAL; main.credits = TS_PRIORITY_NORMAL + 1; main.isMain = true; main.stack = NULL; main.allocated = true; arrput(tasks, main); currentIdx = 0; initialized = true; return TS_OK; } // Forcibly terminate another task. This is safe in a cooperative system // because the target is guaranteed to be suspended at a yield point -- it // cannot be in the middle of a critical section. The stack is freed and // the slot is recycled immediately. 
// // Cannot kill self (use tsExit instead) -- killing self would free the // stack we're currently executing on. Cannot kill main (task 0) because // the shell's main loop must always be runnable for crash recovery. // // The shell uses this for two purposes: // 1. shellForceKillApp: "End Task" from the task manager // 2. Crash recovery: after a signal handler longjmps to main, the // crashed task's slot is cleaned up via tsKill int32_t tsKill(uint32_t taskId) { if (!initialized || taskId >= (uint32_t)arrlen(tasks)) { return TS_ERR_PARAM; } if (!tasks[taskId].allocated) { return TS_ERR_PARAM; } if (tasks[taskId].isMain) { return TS_ERR_STATE; } if (taskId == currentIdx) { return TS_ERR_STATE; } if (tasks[taskId].state == TaskStateTerminated) { return TS_ERR_STATE; } tasks[taskId].state = TaskStateTerminated; free(tasks[taskId].stack); tasks[taskId].stack = NULL; tasks[taskId].allocated = false; return TS_OK; } int32_t tsPause(uint32_t taskId) { if (!initialized || taskId >= (uint32_t)arrlen(tasks)) { return TS_ERR_PARAM; } if (!tasks[taskId].allocated) { return TS_ERR_PARAM; } if (tasks[taskId].isMain) { return TS_ERR_STATE; } if (tasks[taskId].state != TaskStateRunning && tasks[taskId].state != TaskStateReady) { return TS_ERR_STATE; } tasks[taskId].state = TaskStatePaused; // If we paused ourselves, must yield immediately -- a paused task // won't be selected by scheduleNext, so staying on CPU would deadlock. // If pausing another task, no yield needed; it will simply be skipped // the next time the scheduler scans. if (taskId == currentIdx) { uint32_t next = scheduleNext(); if (next != currentIdx) { uint32_t prev = currentIdx; currentIdx = next; tasks[next].state = TaskStateRunning; contextSwitch(&tasks[prev].context, &tasks[next].context); } } return TS_OK; } // Emergency recovery after a crash in an app task. When a signal handler // fires (e.g., SIGSEGV), DJGPP's signal dispatch saves the exception // state and calls our handler. 
The handler does longjmp back to the // shell's setjmp point in main(), which restores the main task's stack. // However, the task switcher's currentIdx still points to the crashed // app task. This function fixes the bookkeeping so the scheduler treats // task 0 as the running task again. // // The crashed task's slot is NOT freed here -- its stack is corrupt and // the caller (shellMain's crash recovery) must call shellForceKillApp // to clean it up properly (destroying windows, closing DXE, etc.). void tsRecoverToMain(void) { if (!initialized) { return; } currentIdx = 0; tasks[0].state = TaskStateRunning; } int32_t tsResume(uint32_t taskId) { if (!initialized || taskId >= (uint32_t)arrlen(tasks)) { return TS_ERR_PARAM; } if (!tasks[taskId].allocated) { return TS_ERR_PARAM; } if (tasks[taskId].state != TaskStatePaused) { return TS_ERR_STATE; } // Transition from Paused back to Ready and refill credits immediately. // Without the refill, a resumed task might have 0 credits and would have // to wait for the next epoch to run, making resume feel sluggish. tasks[taskId].state = TaskStateReady; tasks[taskId].credits = tasks[taskId].priority + 1; return TS_OK; } int32_t tsSetPriority(uint32_t taskId, int32_t priority) { if (!initialized || taskId >= (uint32_t)arrlen(tasks)) { return TS_ERR_PARAM; } if (!tasks[taskId].allocated) { return TS_ERR_PARAM; } if (tasks[taskId].state == TaskStateTerminated) { return TS_ERR_STATE; } tasks[taskId].priority = priority; tasks[taskId].credits = priority + 1; return TS_OK; } void tsShutdown(void) { if (!initialized) { return; } ptrdiff_t count = arrlen(tasks); for (ptrdiff_t i = 0; i < count; i++) { free(tasks[i].stack); } arrfree(tasks); tasks = NULL; currentIdx = 0; initialized = false; } // The core cooperative yield. Called explicitly by app code (or implicitly // via the shell's idle callback and main loop). If no other task is ready, // returns immediately -- no context switch overhead when running solo. 
// // The state transition: current task moves Running -> Ready (still // schedulable), next task moves Ready -> Running. The previous task will // resume here when someone else yields and the scheduler picks it again. void tsYield(void) { if (!initialized) { return; } uint32_t next = scheduleNext(); if (next == currentIdx) { return; } uint32_t prev = currentIdx; // Only transition to Ready if still Running -- a task that paused itself // will already be in Paused state when tsYield is called from tsPause. if (tasks[prev].state == TaskStateRunning) { tasks[prev].state = TaskStateReady; } currentIdx = next; tasks[next].state = TaskStateRunning; contextSwitch(&tasks[prev].context, &tasks[next].context); }