sched: Implement basic scheduling and signal handling system

Note: this is probably 25% broken, but it works right now as written, so I hope it all works. - Added a new scheduler header file (scheduler.h) defining task structures, scheduling policies, and signal handling mechanisms. - Integrated scheduling functions into the syscall interface, including SYS_GETPID, SYS_GETPPID, SYS_EXIT, SYS_SCHED_YIELD, SYS_NICE, SYS_KILL, SYS_SIGACTION, SYS_SIGPROCMASK, SYS_SCHED_GETSCHEDULER, and SYS_SCHED_SETSCHEDULER. - Updated syscall handler to manage new scheduling-related syscalls and signal actions. Signed-off-by: kaguya <vpshinomiya@protonmail.com>
2026-04-26 22:46:28 -04:00
parent 336af1c2ad
commit 7d99745ff9
20 changed files with 1561 additions and 53 deletions
@@ -5,24 +5,53 @@
 #include "mm/memory.h"
 #include "libk/stdio.h"
 #include "fs/elf.h"
+#include "sched/scheduler.h"

 extern uintptr_t g_hhdm_offset;

 #define USER_STACK_TOP 0x00007FFFFFFFE000ULL
-#define USER_STACK_PAGES 4
+#define USER_STACK_PAGES 8
 #define USER_STACK_SIZE   (USER_STACK_PAGES * PAGE_SIZE)

-#define PTE_PRESENT  (1ULL << 0)
-#define PTE_WRITABLE (1ULL << 1)
-#define PTE_USER     (1ULL << 2)
+



 static uint64_t user_stack_phys_base = 0;

-extern struct pagemap *kernel_pagemap;
+//extern struct pagemap *kernel_pagemap;

-uintptr_t setup_user_stack(void)
+/*
+ * Create a fresh address space for a user process.
+ *
+ * Allocates a pagemap descriptor and a zeroed PML4, then copies PML4 entries
+ * 256..511 from kernel_pagemap so the new address space shares the kernel's
+ * higher-half mappings.  Entries 0..255 stay zero (empty user half).
+ *
+ * Returns the new pagemap, or NULL if either allocation fails.  The caller
+ * owns the returned pagemap.
+ */
+struct pagemap *create_user_pagemap(void)
+{
+    struct pagemap *pm = kmalloc(sizeof *pm);
+    if (!pm) {
+        printf("Failed to allocate user pagemap struct!\n");
+        return NULL;
+    }
+
+    spinlock_init(&pm->lock);
+
+    /*
+     * Allocate a fresh PML4 (physical page).  The failure check must be
+     * done on the raw physical address: once MEM_PHYS_OFFSET has been
+     * added the value can no longer be zero, so a failed allocation would
+     * go unnoticed and the copy loop below would write through a bogus
+     * pointer.  (Assumes pmm_allocz reports failure with 0/NULL.)
+     */
+    uintptr_t pml4_phys = (uintptr_t)pmm_allocz(1);
+    if (!pml4_phys) {
+        printf("Failed to allocate user PML4!\n");
+        kfree(pm);
+        return NULL;
+    }
+    pm->top_level = (uint64_t *)(pml4_phys + MEM_PHYS_OFFSET);
+
+    /* Copy kernel higher-half mappings (kernel + HHDM) */
+    for (size_t i = 256; i < 512; i++) {
+        pm->top_level[i] = kernel_pagemap->top_level[i];
+    }
+
+    /* Lower half remains zero (user address space) */
+    printf("[usermode] user pagemap created (PML4 phys = 0x%lx)\n",
+           (uint64_t)pml4_phys);
+
+    return pm;
+}
+
+uintptr_t setup_user_stack(struct pagemap *pagemap)
 {
    user_stack_phys_base = (uint64_t)pmm_alloc(USER_STACK_PAGES);

@@ -37,7 +66,7 @@ uintptr_t setup_user_stack(void)
        uintptr_t virt = stack_bottom + i * PAGE_SIZE;
        uintptr_t phys = user_stack_phys_base + i * PAGE_SIZE;

-        if (!vmm_map_page(kernel_pagemap,
+        if (!vmm_map_page(pagemap,
                          virt,
                          phys,
                          PAGE_READ | PAGE_WRITE | PAGE_USER,
@@ -47,19 +76,13 @@ uintptr_t setup_user_stack(void)
            for (;;);
        }

-        // zero physical page through HHDM
-        //memset((void *)(phys + g_hhdm_offset), 0, PAGE_SIZE);
    }

    uintptr_t rsp = USER_STACK_TOP;
-
    rsp &= ~0xFULL;
-
    return rsp;
 }

-
-// usermode.c
 __attribute__((naked))
 void enter_user_mode(uint64_t rip, uint64_t rsp)
 {
@@ -98,21 +121,27 @@ void enter_user_mode(uint64_t rip, uint64_t rsp)

 void start_userspace(void)
 {
-    void *entry = NULL;

-    if (!ELF_Read("init.elf", &entry)) {
+    struct pagemap *user_pagemap = create_user_pagemap();
+    if (!user_pagemap) {
+        printf("Failed to create user pagemap\n");
+        for (;;);
+    }
+
+    void *elf_entry = NULL;
+    if (!ELF_Read("init.elf", &elf_entry, user_pagemap)) {
        printf("Failed to load init.elf\n");
        for(;;);
    }

-    if (!entry) {
+    if (!elf_entry) {
        printf("ELF has no entry point\n");
        for(;;);
    }

-    uintptr_t rsp = setup_user_stack();
+    uintptr_t user_rsp = setup_user_stack(user_pagemap);

-    printf("Entering usermode RIP=%p RSP=%p\n", entry, (void*)rsp);
+    printf("Entering usermode RIP=%p RSP=%p\n", elf_entry, (void*)user_rsp);

-    enter_user_mode((uint64_t)entry, (rsp & ~0xFULL));
+    sched_create_user_task("init", (uint64_t)elf_entry, user_rsp, user_pagemap);
 }
@@ -6,6 +6,7 @@
 #include "e9.h"
 #include "limine.h"
 #include "apic.h"
+#include "sched/scheduler.h"

 __attribute__((used, section(".limine_requests")))
 volatile struct limine_date_at_boot_request boot_request = {
@@ -39,7 +40,7 @@ void PIT_IRQ_Handler(Registers* regs)
        lapic_eoi();
    }
    
-    // You can add scheduler / time logic here later
+    sched_tick();
 }

 /* ========================= */
@@ -7,16 +7,13 @@
 #include "fs/ext2.h"

 extern uintptr_t g_hhdm_offset;
-extern struct pagemap *kernel_pagemap;

 #define ELF_BUFFER_SIZE (1024 * 1024)


-#define PTE_PRESENT  (1ULL << 0)
-#define PTE_WRITABLE (1ULL << 1)
-#define PTE_USER     (1ULL << 2)

-bool ELF_Read(const char* path, void** entryPoint)
+
+bool ELF_Read(const char* path, void** entryPoint, struct pagemap *target_pagemap)
 {
    uint32_t size;

@@ -41,6 +38,15 @@ bool ELF_Read(const char* path, void** entryPoint)

    ELFHeader* header = (ELFHeader*)elf_buffer;

+    printf("=== ELF DEBUG ===\n");
+    printf("Entry point VA      = 0x%lx\n", header->ProgramEntryPosition);
+    printf("PHDR offset         = 0x%lx\n", header->ProgramHeaderTablePosition);
+    printf("PHDR count          = %u\n", header->ProgramHeaderTableEntryCount);
+
+
+
+    printf("=== END ELF DEBUG ===\n");
+
    // ── validate ELF ──────────────────────────────────
    if (memcmp(header->Magic, ELF_MAGIC, 4) != 0) {
        printf("ELF: bad magic\n");
@@ -82,8 +88,11 @@ bool ELF_Read(const char* path, void** entryPoint)
        ELFProgramHeader* ph = (ELFProgramHeader*)(ph_table +
            i * header->ProgramHeaderTableEntrySize);

-        if (ph->Type != ELF_PROGRAM_TYPE_LOAD)
+        if (ph->Type != ELF_PROGRAM_TYPE_LOAD) {
+            printf("LOAD segment: VA=0x%lx  FileSz=0x%lx  MemSz=0x%lx\n",
+                ph->VirtualAddress, ph->FileSize, ph->MemorySize);
            continue;
+        }

        uint64_t virt   = ph->VirtualAddress;
        uint64_t offset = ph->Offset;
@@ -114,7 +123,7 @@ bool ELF_Read(const char* path, void** entryPoint)
            uint64_t phys_addr = phys_base + p * PAGE_SIZE;

            bool success = vmm_map_page(
-                kernel_pagemap,
+                target_pagemap,
                virt_addr,
                phys_addr,
                PAGE_READ | PAGE_WRITE | PAGE_USER,   // RW + User mode
@@ -1,6 +1,7 @@
 #pragma once
 #include <stdint.h>
 #include <stdbool.h>
+#include "mm/vmm.h"

 #define ELF_MAGIC ("\x7F" "ELF")

@@ -114,4 +115,4 @@ enum ELFProgramType {
 };


-bool ELF_Read(const char* path, void** entryPoint);
+bool ELF_Read(const char* path, void** entryPoint, struct pagemap *target_pagemap);
@@ -131,13 +131,19 @@ int VFS_Read_internal(fd_t fd, uint8_t* buf, size_t size)

        // naive: read whole file then slice
        uint8_t* tmp = kmalloc(file_size);
-        if (!ext2_read_file(&file->ext2.inode, tmp))
+        if (!tmp) {
            return -1;
-
+        }
+        if (!ext2_read_file(&file->ext2.inode, tmp)) {
+            kfree(tmp);
+            return -1;
+        }
+            
        for (size_t i = 0; i < size; i++)
            buf[i] = tmp[file->offset + i];

        file->offset += size;
+        kfree(tmp);
        return size;
    }

@@ -31,6 +31,7 @@
 #include "arch/x86_64/sys/apic.h"
 #include "arch/x86_64/sys/ioapic.h"
 #include "drivers/input/ps2.h"
+#include "sched/scheduler.h"


 uintptr_t g_hhdm_offset;
@@ -128,7 +129,7 @@ static uacpi_interrupt_ret handle_power_button(uacpi_handle ctx) {


 void kmain(void) {
-    if (LIMINE_BASE_REVISION_SUPPORTED(limine_base_revision) == false) {
+    if (LIMINE_BASE_REVISION_SUPPORTED(limine_base_revision) == false) { 
        hcf();
    }

@@ -350,8 +351,19 @@ void kmain(void) {


    syscall_init();
+
+    sched_init();
+
    start_userspace();

+    sched_yield();
+
+
+    for (;;) {
+        sched_yield();
+    }
+
    // We're done, just hang...
-    hcf();
-}
+    //hcf();
+}
+
@@ -23,6 +23,10 @@ extern volatile struct limine_executable_address_request kernel_address_request;
 #define PAGE_USER     (1ULL << 2)
 #define PAGE_NO_EXECUTE (1ULL << 63)

+#define PTE_PRESENT  (1ULL << 0)
+#define PTE_WRITABLE (1ULL << 1)
+#define PTE_USER     (1ULL << 2)
+
 struct pagemap {
    spinlock_t lock;
    uint64_t *top_level;
@@ -36,4 +40,6 @@ void vmm_init(struct limine_memmap_entry **memmap, size_t memmap_entries);
 void vmm_switch_pagemap(struct pagemap *pagemap);
 bool vmm_map_page(struct pagemap *pagemap, uint64_t virt, uint64_t phys,
                  uint64_t flags, enum page_size pg_size);
-uint64_t vmm_virt_to_phys(struct pagemap *pagemap, uint64_t virt);
+uint64_t vmm_virt_to_phys(struct pagemap *pagemap, uint64_t virt);
+uint64_t *vmm_virt_to_pte(struct pagemap *pagemap, uintptr_t virt_addr,
+						  bool allocate);
@@ -3,7 +3,11 @@
 #include <stdint.h>

 struct cpu_local {
-    uint64_t self;        // +0x00  (GS:0x00)
-    uint64_t user_rsp;    // +0x08  (GS:0x08) — saved user RSP on syscall entry
-    uint64_t kernel_rsp;  // +0x10  (GS:0x10) — kernel stack for syscall handler
+    uint64_t  self;        /* +0x00  (GS:0x00) — points to this struct  */
+    uint64_t  user_rsp;    /* +0x08  (GS:0x08) — saved user RSP on SYSCALL */
+    uint64_t  kernel_rsp;  /* +0x10  (GS:0x10) — kernel stack for syscall  */
+    /* Future SMP fields: */
+    uint32_t  cpu_id;      /* +0x18  logical CPU index                   */
+    uint32_t  _pad;
+    void     *current;     /* +0x20  pointer to current task_t           */
 };
@@ -0,0 +1,75 @@
+
+# void sched_context_switch(struct cpu_context *from,
+#                           struct cpu_context *to);
+#
+# struct cpu_context layout (from scheduler.h):
+#   offset 0 : rsp  (uint64_t)
+#   offset 8 : cr3  (uint64_t)
+#
+# Strategy
+# --------
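+# Only callee-saved registers need explicit saving.  Caller-saved registers
+# (rax, rcx, rdx, rsi, rdi, r8–r11) are the C caller's responsibility under
+# the System V ABI: the compiler has already preserved any it still needs
+# before making the call, so they may be clobbered freely here.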
+#
+# 1. Push all callee-saved GPRs onto the *current* kernel stack.
+# 2. Save RSP → from->rsp.
+# 3. If to->cr3 is non-zero and differs from the current CR3, load it
+#    (switches address space for user processes).
+# 4. Load RSP ← to->rsp  (switch to next task's kernel stack).
+# 5. Pop all callee-saved GPRs from the *new* stack.
+# 6. ret — pops the return address placed there during task creation
+#    (kthread_trampoline / user_task_trampoline) for a brand-new task,
+#    or returns into the schedule() call-site for a resumed task.
+#
+# NOTE: This function is called with IF=0 (interrupts disabled) and must
+# NOT modify IF itself.  The trampolines re-enable interrupts after the
+# first-ever schedule-in of a task.
+
+.section .text
+.global sched_context_switch
+.type   sched_context_switch, @function
+
+# Entry: rdi = from (outgoing context), rsi = to (incoming context).
+# Clobbers rax and rcx, both caller-saved under the System V ABI.
+sched_context_switch:
+    # ── Save outgoing task ─────────────────────────────────────────────
+    # Push order here defines the frame layout; the pops below are its
+    # exact mirror, and any pre-built stack for a new task must match it.
+    pushq   %rbx
+    pushq   %rbp
+    pushq   %r12
+    pushq   %r13
+    pushq   %r14
+    pushq   %r15
+
+    # from->rsp = RSP  (rdi = struct cpu_context *from, offset 0 = rsp)
+    movq    %rsp, 0(%rdi)
+
+    # ── Switch address space (CR3) if needed ───────────────────────────
+    # to->cr3 is at offset 8 in struct cpu_context
+    movq    8(%rsi), %rax
+    testq   %rax, %rax          # 0 means "keep current CR3" (kernel thread)
+    jz      .Lno_cr3
+
+    movq    %cr3, %rcx
+    cmpq    %rax, %rcx          # Same CR3? Don't flush the TLB needlessly.
+    je      .Lno_cr3
+
+    movq    %rax, %cr3          # Load new page table root (flushes TLB)
+
+.Lno_cr3:
+    # ── Switch to incoming task's kernel stack ─────────────────────────
+    # to->rsp is at offset 0  (rsi = struct cpu_context *to)
+    # From this instruction on, every stack access refers to the incoming
+    # task's stack.
+    movq    0(%rsi), %rsp
+
+    # ── Restore incoming task's callee-saved registers ─────────────────
+    # Reverse of the push sequence above: r15 was pushed last, so it sits
+    # at the lowest address and is popped first.
+    popq    %r15
+    popq    %r14
+    popq    %r13
+    popq    %r12
+    popq    %rbp
+    popq    %rbx
+
+    # ret pops the "return address" from the new task's stack.
+    #  • Brand-new task  → jumps to kthread_trampoline or user_task_trampoline
+    #  • Resumed task    → returns into schedule(), then back up the call chain
+    ret
+
+.size sched_context_switch, . - sched_context_switch
@@ -0,0 +1,935 @@
+#include "scheduler.h"
+#include "mm/memory.h"
+#include "mm/pmm.h"
+#include "libk/stdio.h"
+#include "arch/x86_64/cpu/io.h"
+#include "arch/x86_64/sys/pit.h"
+#include "string.h"
+
+/* =====================================================================
+ *  Forward declarations for GDT/TSS (defined in gdt.c)
+ * ===================================================================== */
+/*
+ * Local declaration of the 64-bit Task State Segment layout, needed so this
+ * file can write kernel_tss.rsp0.  The struct is packed because the
+ * hardware-defined layout places the 64-bit fields at offsets that are not
+ * naturally aligned (the first field is only 32 bits wide).
+ *
+ * NOTE(review): this duplicates a type that presumably also exists next to
+ * the kernel_tss definition; the two must stay in sync — confirm, or move
+ * the typedef into a shared header.
+ */
+typedef struct {
+    uint32_t reserved0;
+    uint64_t rsp0;          /* written by schedule() for user tasks        */
+    uint64_t rsp1;
+    uint64_t rsp2;
+    uint64_t reserved1;
+    uint64_t ist[7];        /* seven interrupt-stack-table slots           */
+    uint64_t reserved2;
+    uint16_t reserved3;
+    uint16_t iopb_offset;   /* offset of the I/O permission bitmap         */
+} __attribute__((packed)) TSS;
+
+extern TSS kernel_tss;
+
+/* =====================================================================
+ *  Globals
+ * ===================================================================== */
+/* The global run queue.  Zero-initialised here; sched_init() sets up its
+ * lock, min_vruntime, current and idle fields. */
+struct runqueue g_runqueue  = {0};
+/* Task currently executing.  NULL until sched_init() installs the boot
+ * task; schedule() updates it on every switch. */
+task_t         *g_current_task = NULL;
+
+/* PIT tick counter (defined in pit.c) */
+extern volatile uint64_t g_Ticks;
+
+/* =====================================================================
+ *  Linux-compatible nice → CPU weight table  (NICE_0_LOAD = 1024)
+ *
+ *  vruntime_delta = actual_ticks * NICE_0_LOAD / weight[nice + 20]
+ *  timeslice      = BASE * weight[nice + 20] / NICE_0_LOAD
+ *
+ *  Low weight (high nice) → vruntime accumulates faster → scheduled less.
+ * ===================================================================== */
+/* Weight of a nice-0 task; the reference value all other weights are
+ * scaled against (see the nice 0 row of the table below). */
+#define NICE_0_LOAD  1024u
+
+/* One weight per nice level, indexed by (nice + 20); weight_for_nice()
+ * performs the lookup and clamps the index.  Each row holds five
+ * consecutive nice levels, starting at the level named in its comment. */
+static const uint32_t nice_to_weight[40] = {
+    /* nice -20 */ 88761, 71755, 56483, 46273, 36291,
+    /* nice -15 */ 29154, 23254, 18705, 14949, 11916,
+    /* nice -10 */  9548,  7620,  6100,  4904,  3906,
+    /* nice  -5 */  3121,  2501,  1991,  1586,  1277,
+    /* nice   0 */  1024,   820,   655,   526,   423,
+    /* nice  +5 */   335,   272,   215,   172,   137,
+    /* nice +10 */   110,    87,    70,    56,    45,
+    /* nice +15 */    36,    29,    23,    18,    15,
+};
+
+/* Map a nice value to its scheduling weight.  Values outside the table's
+ * range are clamped to the first or last entry. */
+static inline uint32_t weight_for_nice(int nice) {
+    int slot = nice + 20;
+
+    if (slot < 0) {
+        slot = 0;
+    } else if (slot > 39) {
+        slot = 39;
+    }
+
+    return nice_to_weight[slot];
+}
+
+/* =====================================================================
+ *  PID allocator  (simple monotonic counter, single-CPU safe)
+ * ===================================================================== */
+static pid_t g_next_pid = 1;
+
+/* Hand out the next PID and advance the counter.  No locking and no reuse
+ * of freed PIDs. */
+static pid_t alloc_pid(void) {
+    pid_t fresh = g_next_pid;
+    g_next_pid = fresh + 1;
+    return fresh;
+}
+
+/* =====================================================================
+ *  RT bitmap helpers  (O(1) find-first-set)
+ * ===================================================================== */
+/* Mark priority `prio` as having at least one queued task.
+ * Priority p is stored at bit (p - 1), split across 64-bit words. */
+static inline void rt_bitmap_set(struct rt_prio_array *arr, int prio) {
+    const int pos = prio - 1;
+    const int word = pos / 64;
+    const int shift = pos % 64;
+
+    arr->bitmap[word] |= (1ULL << shift);
+}
+
+/* Mark priority `prio` as empty (inverse of rt_bitmap_set). */
+static inline void rt_bitmap_clear(struct rt_prio_array *arr, int prio) {
+    const int pos = prio - 1;
+    const int word = pos / 64;
+    const int shift = pos % 64;
+
+    arr->bitmap[word] &= ~(1ULL << shift);
+}
+
+/* Scan the two bitmap words for the lowest set bit and translate it back
+ * to a priority number (bit b ↔ priority b + 1).  The lowest-numbered
+ * non-empty priority wins; returns -1 when both words are zero. */
+static inline int rt_bitmap_first(const struct rt_prio_array *arr) {
+    for (int word = 0; word < 2; word++) {
+        if (arr->bitmap[word]) {
+            return __builtin_ctzll(arr->bitmap[word]) + word * 64 + 1;
+        }
+    }
+    return -1;
+}
+
+/* =====================================================================
+ *  RT FIFO queue operations
+ * ===================================================================== */
+/* Append `task` to the tail of the FIFO for its static priority, flag that
+ * priority as non-empty and bump the RT and global runnable counters.
+ * Does no locking itself. */
+static void rt_enqueue(struct runqueue *rq, task_t *task) {
+    struct rt_prio_array *arr = &rq->rt;
+    const int level = task->static_prio;
+    task_t *old_tail = arr->tail[level];
+
+    task->rq_next = NULL;
+    task->rq_prev = old_tail;
+
+    if (old_tail == NULL) {
+        arr->head[level] = task;        /* list was empty */
+    } else {
+        old_tail->rq_next = task;
+    }
+    arr->tail[level] = task;
+
+    rt_bitmap_set(arr, level);
+    arr->total += 1;
+    rq->nr_running += 1;
+}
+
+/* Pop and return the first task queued at priority `prio`, or NULL if that
+ * FIFO is empty.  When the last task leaves, the tail pointer and the
+ * bitmap bit for the priority are cleared. */
+static task_t *rt_dequeue_head(struct runqueue *rq, int prio) {
+    struct rt_prio_array *arr = &rq->rt;
+    task_t *first = arr->head[prio];
+
+    if (first == NULL) {
+        return NULL;
+    }
+
+    task_t *successor = first->rq_next;
+    arr->head[prio] = successor;
+
+    if (successor == NULL) {
+        arr->tail[prio] = NULL;
+        rt_bitmap_clear(arr, prio);
+    } else {
+        successor->rq_prev = NULL;
+    }
+
+    first->rq_next = NULL;
+    first->rq_prev = NULL;
+    arr->total -= 1;
+    rq->nr_running -= 1;
+    return first;
+}
+
+/* Unlink `task` from the FIFO of its static priority in constant time
+ * using its prev/next links.  The caller must ensure the task is actually
+ * queued there; no check is made. */
+static void rt_remove(struct runqueue *rq, task_t *task) {
+    struct rt_prio_array *arr = &rq->rt;
+    const int level = task->static_prio;
+    task_t *before = task->rq_prev;
+    task_t *after  = task->rq_next;
+
+    if (before != NULL) {
+        before->rq_next = after;
+    } else {
+        arr->head[level] = after;       /* task was the head */
+    }
+
+    if (after != NULL) {
+        after->rq_prev = before;
+    } else {
+        arr->tail[level] = before;      /* task was the tail */
+    }
+
+    if (arr->head[level] == NULL) {
+        rt_bitmap_clear(arr, level);
+    }
+
+    task->rq_next = NULL;
+    task->rq_prev = NULL;
+    arr->total -= 1;
+    rq->nr_running -= 1;
+}
+
+/* =====================================================================
+ *  CFS (normal) queue operations  —  sorted ascending by vruntime
+ * ===================================================================== */
+
+/*
+ * Place `task` into the vruntime-ordered CFS list.
+ *
+ * The task's vruntime is first raised to rq->min_vruntime if it is lower.
+ * The list is then walked linearly past every entry whose vruntime is
+ * less than or equal to the task's, so tasks with equal vruntime keep
+ * their arrival order.  Linear cost in the number of queued tasks.
+ */
+static void cfs_enqueue(struct runqueue *rq, task_t *task) {
+    if (task->vruntime < rq->min_vruntime) {
+        task->vruntime = rq->min_vruntime;
+    }
+
+    task_t *before = NULL;
+    task_t *cursor = rq->cfs_head;
+
+    while (cursor != NULL && cursor->vruntime <= task->vruntime) {
+        before = cursor;
+        cursor = cursor->rq_next;
+    }
+
+    /* Splice between `before` and `cursor` (either may be NULL). */
+    task->rq_next = cursor;
+    task->rq_prev = before;
+    if (cursor != NULL) {
+        cursor->rq_prev = task;
+    }
+    if (before != NULL) {
+        before->rq_next = task;
+    } else {
+        rq->cfs_head = task;
+    }
+
+    rq->cfs_count += 1;
+    rq->nr_running += 1;
+}
+
+/* Detach and return the head of the CFS list (smallest vruntime), or NULL
+ * if the list is empty.  If another task remains, min_vruntime is set to
+ * the new head's vruntime; otherwise it is left untouched. */
+static task_t *cfs_dequeue_min(struct runqueue *rq) {
+    task_t *first = rq->cfs_head;
+
+    if (first == NULL) {
+        return NULL;
+    }
+
+    task_t *new_head = first->rq_next;
+    rq->cfs_head = new_head;
+
+    if (new_head != NULL) {
+        new_head->rq_prev = NULL;
+        rq->min_vruntime = new_head->vruntime;
+    }
+
+    first->rq_next = NULL;
+    first->rq_prev = NULL;
+    rq->cfs_count -= 1;
+    rq->nr_running -= 1;
+    return first;
+}
+
+/* Unlink `task` from the CFS list via its prev/next links.  The caller
+ * must ensure the task is on the list; min_vruntime is not adjusted. */
+static void cfs_remove(struct runqueue *rq, task_t *task) {
+    task_t *before = task->rq_prev;
+    task_t *after  = task->rq_next;
+
+    if (before != NULL) {
+        before->rq_next = after;
+    } else {
+        rq->cfs_head = after;           /* task was the head */
+    }
+
+    if (after != NULL) {
+        after->rq_prev = before;
+    }
+
+    task->rq_next = NULL;
+    task->rq_prev = NULL;
+    rq->cfs_count -= 1;
+    rq->nr_running -= 1;
+}
+
+/* =====================================================================
+ *  Timeslice calculation
+ * ===================================================================== */
+/*
+ * Compute the time slice for `task` from its scheduling policy.
+ *
+ *   SCHED_FIFO  → UINT64_MAX (never expires on its own)
+ *   SCHED_RR    → SCHED_RR_SLICE_MS
+ *   SCHED_IDLE  → SCHED_BASE_SLICE_MS
+ *   otherwise   → SCHED_BASE_SLICE_MS scaled by weight / NICE_0_LOAD,
+ *                 clamped to [SCHED_MIN_SLICE_MS, SCHED_MAX_SLICE_MS]
+ */
+static uint64_t calc_timeslice(const task_t *task) {
+    if (task->policy == SCHED_FIFO) {
+        return UINT64_MAX;  /* Runs until it yields or blocks */
+    }
+    if (task->policy == SCHED_RR) {
+        return SCHED_RR_SLICE_MS;
+    }
+    if (task->policy == SCHED_IDLE) {
+        return SCHED_BASE_SLICE_MS;
+    }
+
+    /* SCHED_NORMAL / SCHED_BATCH and anything unrecognised */
+    uint32_t weight = weight_for_nice(task->nice);
+    uint64_t slice  = (uint64_t)SCHED_BASE_SLICE_MS * weight / NICE_0_LOAD;
+
+    if (slice < SCHED_MIN_SLICE_MS) {
+        slice = SCHED_MIN_SLICE_MS;
+    }
+    if (slice > SCHED_MAX_SLICE_MS) {
+        slice = SCHED_MAX_SLICE_MS;
+    }
+    return slice;
+}
+
+/* Charge `elapsed_ticks` of CPU time to `task`.
+ *
+ * sum_exec_runtime always grows by the raw tick count.  For SCHED_NORMAL
+ * and SCHED_BATCH the vruntime increment is scaled by
+ * NICE_0_LOAD / weight, so heavier (lower-nice) tasks age more slowly.
+ * For every other policy vruntime simply accumulates raw ticks. */
+static void update_vruntime(task_t *task, uint64_t elapsed_ticks) {
+    const bool weighted = (task->policy == SCHED_NORMAL ||
+                           task->policy == SCHED_BATCH);
+    uint64_t charge = elapsed_ticks;
+
+    task->sum_exec_runtime += elapsed_ticks;
+
+    if (weighted) {
+        charge = elapsed_ticks * NICE_0_LOAD / weight_for_nice(task->nice);
+    }
+    task->vruntime += charge;
+}
+
+/* =====================================================================
+ *  sched_enqueue / sched_dequeue  (public, uses run-queue lock)
+ * ===================================================================== */
+/* Mark `task` runnable and hand it to the queue that matches its policy,
+ * under the run-queue lock with the interrupt state saved and restored.
+ * FIFO/RR tasks go to the RT array, a SCHED_IDLE task becomes the run
+ * queue's idle task, and everything else goes on the CFS list. */
+void sched_enqueue(task_t *task) {
+    uint64_t irq_flags;
+    spinlock_acquire_irqsave(&g_runqueue.lock, &irq_flags);
+
+    task->state = TASK_RUNNING;
+
+    switch (task->policy) {
+    case SCHED_FIFO:
+    case SCHED_RR:
+        rt_enqueue(&g_runqueue, task);
+        break;
+    case SCHED_IDLE:
+        /* Only one idle task exists; it lives outside the queues. */
+        g_runqueue.idle = task;
+        break;
+    default:
+        cfs_enqueue(&g_runqueue, task);
+        break;
+    }
+
+    spinlock_release_irqrestore(&g_runqueue.lock, irq_flags);
+}
+
+/* Take `task` off whichever queue its policy maps to, under the run-queue
+ * lock.  A SCHED_IDLE task is not on any queue, so nothing is done for it.
+ * The task must currently be queued; this is not verified. */
+void sched_dequeue(task_t *task) {
+    uint64_t irq_flags;
+    spinlock_acquire_irqsave(&g_runqueue.lock, &irq_flags);
+
+    switch (task->policy) {
+    case SCHED_FIFO:
+    case SCHED_RR:
+        rt_remove(&g_runqueue, task);
+        break;
+    case SCHED_IDLE:
+        break;
+    default:
+        cfs_remove(&g_runqueue, task);
+        break;
+    }
+
+    spinlock_release_irqrestore(&g_runqueue.lock, irq_flags);
+}
+
+/* =====================================================================
+ *  pick_next_task  (called with interrupts OFF and lock held)
+ *
+ *  Priority order:
+ *    1. Highest RT priority with a runnable task
+ *    2. Normal task with smallest vruntime
+ *    3. Idle task (always exists, never NULL)
+ * ===================================================================== */
+/* Choose the next task and remove it from its queue.  Real-time tasks are
+ * tried first, then the CFS task with the smallest vruntime; when both
+ * are empty the idle task is returned (it is never dequeued). */
+static task_t *pick_next_task(struct runqueue *rq) {
+    const int best_rt = rt_bitmap_first(&rq->rt);
+
+    if (best_rt > 0) {
+        return rt_dequeue_head(rq, best_rt);
+    }
+
+    return (rq->cfs_count > 0) ? cfs_dequeue_min(rq) : rq->idle;
+}
+
+/* =====================================================================
+ *  Trampolines
+ *  These are the "return addresses" pushed onto a new task's kernel
+ *  stack.  When the task is scheduled for the first time, ret inside
+ *  sched_context_switch() jumps here.
+ * ===================================================================== */
+/* First code a new kernel thread executes: turn interrupts back on (the
+ * context switch that brought us here ran with them off), call the
+ * thread's entry function with its argument, and exit with status 0 if
+ * that function ever returns. */
+static void kthread_trampoline(void) {
+    x86_64_EnableInterrupts();
+
+    task_t *me = g_current_task;
+    void (*entry)(void *) = me->kthread_entry;
+
+    entry(me->kthread_arg);
+
+    /* Entry function came back — finish the thread cleanly. */
+    sched_exit(0);
+}
+
+/*
+ * First code a new user task executes in kernel mode.  Reads the entry
+ * point and user stack pointer from the current task, then drops to
+ * ring 3 with iretq.  Never returns.
+ *
+ * NOTE(review): interrupts are enabled before the iretq frame is built,
+ * although the frame's RFLAGS image already has IF set.  Confirm that an
+ * interrupt arriving in that window is acceptable.
+ * NOTE(review): no swapgs is visible on this path; confirm this matches
+ * the GS-base convention the syscall entry code expects.
+ */
+static void user_task_trampoline(void) {
+    x86_64_EnableInterrupts();
+
+    task_t *self = g_current_task;
+
+    /*
+     * Build an iretq frame on the current (kernel) stack and enter
+     * user mode.  We reset the stack pointer to the very top of the
+     * kernel stack first, so the iretq frame doesn't sit below a
+     * pile of stale context-switch frames.
+     *
+     * Segment selectors (from your GDT / STAR setup):
+     *   User CS = 0x23  (GDT index 4, RPL 3)
+     *   User SS = 0x1B  (GDT index 3, RPL 3)
+     */
+    uint64_t kstack_top = (uint64_t)self->kernel_stack + self->kernel_stack_size;
+    uint64_t user_rip   = self->user_entry;
+    uint64_t user_rsp   = self->user_stack_top;
+
+    /* iretq pops RIP, CS, RFLAGS, RSP, SS in that order, so the frame is
+     * pushed in the reverse order below.  All three operands are held in
+     * registers, so overwriting RSP first does not invalidate them. */
+    asm volatile(
+        "movq %0, %%rsp\n\t"        /* Reset kernel RSP to stack top    */
+        "pushq $0x1B\n\t"           /* SS  – user data segment          */
+        "pushq %1\n\t"              /* RSP – user stack pointer         */
+        "pushfq\n\t"                /* RFLAGS                           */
+        "orq  $0x200, (%%rsp)\n\t"  /* Set IF so user code runs with    */
+                                    /* interrupts enabled               */
+        "pushq $0x23\n\t"           /* CS  – user code segment          */
+        "pushq %2\n\t"              /* RIP – user entry point           */
+        "iretq\n\t"
+        :
+        : "r"(kstack_top), "r"(user_rsp), "r"(user_rip)
+        : "memory"
+    );
+
+    /* iretq transfers control to user mode; execution never reaches here. */
+    __builtin_unreachable();
+}
+
+/* =====================================================================
+ *  Kernel stack setup for a new task
+ *
+ *  Builds the frame that sched_context_switch expects to find when it
+ *  switches to a task for the first time.  From high to low addresses:
+ *
+ *      [0 = fake return address]
+ *      [trampoline]
+ *      [rbx=0] [rbp=0] [r12=0] [r13=0] [r14=0] [r15=0]   ← ctx.rsp
+ *
+ *  sched_context_switch pushes rbx, rbp, r12, r13, r14, r15 and therefore
+ *  pops r15, r14, r13, r12, rbp, rbx (lowest address first), then
+ *  ret → trampoline.
+ *
+ *  The stack top is rounded down to 16 bytes and one extra zero slot is
+ *  placed above the trampoline address.  After the `ret`, RSP is then
+ *  (16n + 8), which is what the System V ABI guarantees at function
+ *  entry, and the zero slot acts as a terminating return address.
+ *
+ *  task       — task whose kernel_stack / kernel_stack_size are already set
+ *  trampoline — function the task starts executing in
+ * ===================================================================== */
+static void setup_initial_kstack(task_t *task, void *trampoline) {
+    uintptr_t top = (uintptr_t)task->kernel_stack + task->kernel_stack_size;
+    top &= ~(uintptr_t)0xF;             /* 16-byte align the stack top   */
+
+    uint64_t *sp = (uint64_t *)top;
+
+    *--sp = 0;                    /* fake return address (alignment)     */
+    *--sp = (uint64_t)trampoline; /* "return address" → trampoline      */
+    *--sp = 0;                    /* rbx  (popped last)                  */
+    *--sp = 0;                    /* rbp                                 */
+    *--sp = 0;                    /* r12                                 */
+    *--sp = 0;                    /* r13                                 */
+    *--sp = 0;                    /* r14                                 */
+    *--sp = 0;                    /* r15  (popped first)                 */
+
+    task->ctx.rsp = (uint64_t)sp;
+}
+
+/* =====================================================================
+ *  Task allocation helper
+ * ===================================================================== */
+/*
+ * Allocate and pre-fill a task descriptor plus its kernel stack.
+ *
+ * The descriptor is zeroed, given `name` (truncated to fit), a fresh PID,
+ * SCHED_NORMAL policy at the default nice level, and the current task as
+ * parent (ppid 0 when there is no current task).  Every signal action is
+ * reset to SIG_DFL.  The kernel stack is KSTACK_SIZE bytes, filled with
+ * 0xCC as a poison pattern.
+ *
+ * Returns NULL if either allocation fails.  The task is not enqueued.
+ */
+static task_t *alloc_task(const char *name, bool is_user) {
+    task_t *t = kmalloc(sizeof(task_t));
+    if (t == NULL) {
+        return NULL;
+    }
+    memset(t, 0, sizeof(task_t));
+
+    /* Buffer was just zeroed, so copying size-1 bytes keeps the NUL. */
+    strncpy(t->name, name, sizeof(t->name) - 1);
+
+    t->is_user     = is_user;
+    t->state       = TASK_RUNNING;
+    t->policy      = SCHED_NORMAL;
+    t->nice        = NICE_DEFAULT;
+    t->static_prio = NICE_TO_PRIO(NICE_DEFAULT);
+    t->prio        = t->static_prio;
+    t->pid         = alloc_pid();
+
+    task_t *creator = g_current_task;
+    t->ppid   = (creator != NULL) ? creator->pid : 0;
+    t->parent = creator;
+
+    /* Default: all signals use SIG_DFL */
+    for (int sig = 0; sig < _NSIG; sig++) {
+        t->sigactions[sig].sa_handler = SIG_DFL;
+    }
+
+    /* Allocate kernel stack */
+    t->kernel_stack_size = KSTACK_SIZE;
+    t->kernel_stack      = kmalloc(KSTACK_SIZE);
+    if (t->kernel_stack == NULL) {
+        kfree(t);
+        return NULL;
+    }
+    memset(t->kernel_stack, 0xCC, KSTACK_SIZE); /* poison */
+
+    return t;
+}
+
+/* =====================================================================
+ *  Public task-creation API
+ * ===================================================================== */
+/*
+ * Create a kernel thread and make it runnable.
+ *
+ * name  — thread name, copied into the task
+ * entry — function the thread runs
+ * arg   — opaque argument passed to `entry`
+ *
+ * The thread keeps whatever address space is active when it is scheduled
+ * (ctx.cr3 == 0).  Returns the new task, or NULL on allocation failure.
+ */
+task_t *sched_create_kthread(const char *name,
+                              void (*entry)(void *), void *arg)
+{
+    task_t *kt = alloc_task(name, false);
+    if (kt == NULL) {
+        return NULL;
+    }
+
+    kt->ctx.cr3       = 0;   /* Kernel threads share kernel_pagemap */
+    kt->kthread_arg   = arg;
+    kt->kthread_entry = entry;
+
+    setup_initial_kstack(kt, kthread_trampoline);
+
+    kt->vruntime   = g_runqueue.min_vruntime;
+    kt->time_slice = calc_timeslice(kt);
+
+    sched_enqueue(kt);
+    printf("[sched] kthread '%s' pid=%d created\n", kt->name, kt->pid);
+    return kt;
+}
+
+/*
+ * Create a user-mode task and make it runnable.
+ *
+ * name      — task name, copied into the task
+ * entry_rip — user-space address execution starts at
+ * user_rsp  — initial user stack pointer
+ * pm        — address space of the task; must be non-NULL
+ *
+ * The task's CR3 is derived from pm->top_level by subtracting
+ * MEM_PHYS_OFFSET.  Returns the new task, or NULL on allocation failure.
+ */
+task_t *sched_create_user_task(const char *name,
+                                uint64_t entry_rip, uint64_t user_rsp,
+                                struct pagemap *pm)
+{
+    task_t *ut = alloc_task(name, true);
+    if (ut == NULL) {
+        return NULL;
+    }
+
+    ut->user_stack_top = user_rsp;
+    ut->user_entry     = entry_rip;
+    ut->pagemap        = pm;
+
+    /* CR3 = physical address of PML4 */
+    ut->ctx.cr3 = (uint64_t)pm->top_level - MEM_PHYS_OFFSET;
+
+    setup_initial_kstack(ut, user_task_trampoline);
+
+    ut->vruntime   = g_runqueue.min_vruntime;
+    ut->time_slice = calc_timeslice(ut);
+
+    sched_enqueue(ut);
+    printf("[sched] user task '%s' pid=%d created, entry=0x%lx\n",
+           ut->name, ut->pid, entry_rip);
+    return ut;
+}
+
+/* =====================================================================
+ *  Idle task
+ * ===================================================================== */
+/* Body of the idle task: sleep the CPU until the next interrupt, forever.
+ * Each iteration enables interrupts, halts, and disables them again once
+ * an interrupt has woken the CPU.  The argument is unused. */
+static void idle_entry(void *arg) {
+    (void)arg;
+
+    while (1) {
+        /* Rescheduling, if any, is driven from the interrupt that
+         * ended the hlt — this loop never calls the scheduler itself. */
+        asm volatile("sti; hlt; cli" ::: "memory");
+    }
+}
+
+/* =====================================================================
+ *  sched_init  —  set up the idle task and initialise the run queue
+ * ===================================================================== */
+/*
+ * Initialise the scheduler.
+ *
+ *  1. Set up the run-queue lock and reset min_vruntime.
+ *  2. Wrap the already-running boot flow in a task descriptor and make it
+ *     the current task, so the first schedule() has a valid `prev`.
+ *  3. Create the idle task and park it in g_runqueue.idle (it is never
+ *     placed on the RT or CFS queues).
+ *
+ * Halts forever if either descriptor cannot be allocated.
+ */
+void sched_init(void) {
+    spinlock_init(&g_runqueue.lock);
+    g_runqueue.min_vruntime = 0;
+
+    /* Synthesise a "current" descriptor for the boot thread so that
+     * the first schedule() call has something valid in g_current_task. */
+    task_t *boot = kmalloc(sizeof(task_t));
+    if (!boot) {
+        printf("[sched] FATAL: cannot allocate boot task\n");
+        while (1) asm volatile("hlt");
+    }
+    memset(boot, 0, sizeof(task_t));
+    /* Bound the copy by the real buffer size: strncpy zero-pads up to the
+     * given length, so a hard-coded count would overrun a shorter name
+     * field.  The struct was just zeroed, so the last byte stays NUL. */
+    strncpy(boot->name, "boot", sizeof(boot->name) - 1);
+    boot->pid         = alloc_pid();   /* first PID handed out */
+    boot->state       = TASK_RUNNING;
+    boot->policy      = SCHED_NORMAL;
+    boot->nice        = NICE_DEFAULT;
+    boot->static_prio = NICE_TO_PRIO(NICE_DEFAULT);
+    boot->prio        = boot->static_prio;
+    boot->time_slice  = calc_timeslice(boot);
+    boot->slice_start = g_Ticks;
+    /* kernel_stack: we're already running on it; RSP will be saved by
+     * the first sched_context_switch() call, so no setup needed. */
+    boot->is_user     = false;
+
+    g_current_task   = boot;
+    g_runqueue.current = boot;
+
+    /* Create the idle task but do NOT go through sched_create_kthread
+     * because we store it separately (not on the CFS/RT queues). */
+    task_t *idle = alloc_task("idle", false);
+    if (!idle) {
+        printf("[sched] FATAL: cannot allocate idle task\n");
+        while (1) asm volatile("hlt");
+    }
+    idle->policy      = SCHED_IDLE;
+    idle->static_prio = IDLE_PRIO;
+    idle->prio        = IDLE_PRIO;
+    idle->kthread_entry = idle_entry;
+    idle->kthread_arg   = NULL;
+    idle->ctx.cr3       = 0;   /* 0 = keep whatever CR3 is loaded */
+    setup_initial_kstack(idle, kthread_trampoline);
+
+    g_runqueue.idle = idle;
+
+    printf("[sched] initialised; boot pid=%d\n", boot->pid);
+}
+
+/* =====================================================================
+ *  schedule()  —  the heart of the scheduler
+ *
+ *  Must be called with interrupts disabled (IF=0).  Restores IF when
+ *  the scheduled-in task next runs (via its own stack context or via
+ *  the kthread_trampoline which calls x86_64_EnableInterrupts).
+ * ===================================================================== */
+void schedule(void) {
+    /*
+     * We deliberately do NOT use spinlock_acquire_irqsave here because
+     * we're already called with IF=0 (either from an ISR or from
+     * sched_yield/sched_block which do cli first).
+     * We use a plain spinlock_acquire_or_wait so that on SMP (future)
+     * another CPU spinning on the lock eventually gets it.
+     */
+    spinlock_acquire_or_wait(&g_runqueue.lock);
+
+    task_t *prev = g_runqueue.current;
+    task_t *next = pick_next_task(&g_runqueue);
+
+    /*
+     * NOTE(review): this early return does not look at prev->state, so
+     * a task that just marked itself blocked or ZOMBIE keeps executing
+     * when pick_next_task() yields NULL or prev itself.  It also leaves
+     * prev->time_slice untouched: a slice that already reached 0 stays
+     * 0, so sched_tick() will call schedule() again on the next tick.
+     */
+    if (next == prev || next == NULL) {
+        /* Nothing to switch to; keep running current task. */
+        spinlock_drop(&g_runqueue.lock);
+        return;
+    }
+
+    /* Account for time the current task actually ran.  The comparison
+     * guards against slice_start lying ahead of the tick counter, in
+     * which case no time is charged. */
+    uint64_t now     = g_Ticks;
+    uint64_t elapsed = (prev->slice_start <= now)
+                       ? (now - prev->slice_start)
+                       : 0;
+    update_vruntime(prev, elapsed);
+
+    /* Re-enqueue the outgoing task if it is still runnable (preempted).
+     * If it blocked/exited, its state is no longer TASK_RUNNING.
+     * The idle task is never queued; it lives in g_runqueue.idle. */
+    if (prev->state == TASK_RUNNING && prev != g_runqueue.idle) {
+        prev->time_slice = calc_timeslice(prev); /* refresh slice */
+
+        if (prev->policy == SCHED_FIFO || prev->policy == SCHED_RR) {
+            rt_enqueue(&g_runqueue, prev);
+        } else if (prev->policy != SCHED_IDLE) {
+            cfs_enqueue(&g_runqueue, prev);
+        }
+    }
+
+    /* Set up the incoming task */
+    next->state        = TASK_RUNNING;
+    next->need_reschedule = false;
+    next->slice_start  = now;
+
+    g_runqueue.current = next;
+    g_current_task     = next;
+    g_runqueue.nr_switches++;
+
+    /* Update TSS.RSP0 so that user-mode interrupts for this task use
+     * the correct kernel stack.  kernel_stack points at the bottom of
+     * the allocation, so the initial stack top is bottom + size.
+     * Kernel threads leave RSP0 as it was. */
+    if (next->is_user && next->kernel_stack) {
+        kernel_tss.rsp0 = (uint64_t)next->kernel_stack
+                          + next->kernel_stack_size;
+    }
+
+    /* The run-queue lock is released BEFORE switching stacks, so the
+     * incoming task never resumes with the lock held. */
+    spinlock_drop(&g_runqueue.lock);
+
+    /* ---- Context switch -------------------------------------------- */
+    sched_context_switch(&prev->ctx, &next->ctx);
+
+    /*
+     * When we return here we are BACK in the context of `prev`
+     * (which has just been rescheduled).  Process any pending signals
+     * before returning to user space.
+     */
+    task_handle_pending_signals();
+}
+
+/* =====================================================================
+ *  sched_tick()  —  called from the PIT/LAPIC IRQ every millisecond
+ * ===================================================================== */
+void sched_tick(void) {
+    task_t *running = g_current_task;
+
+    /* No current task is set: nothing to account. */
+    if (running == NULL)
+        return;
+
+    /* Consume one tick of the remaining slice, saturating at zero. */
+    if (running->time_slice != 0)
+        running->time_slice -= 1;
+
+    /*
+     * Keep running while there is slice left and nobody asked for a
+     * reschedule (need_reschedule is raised when a higher-priority
+     * task is woken).
+     */
+    if (running->time_slice != 0 && !running->need_reschedule)
+        return;
+
+    /* schedule() wants IF=0; that holds here because this runs inside
+     * an IRQ handler and the CPU cleared IF on entry. */
+    schedule();
+}
+
+/* =====================================================================
+ *  sched_yield()  —  voluntary CPU release
+ * ===================================================================== */
+void sched_yield(void) {
+    x86_64_DisableInterrupts();
+
+    /* Zero the remaining slice to mark the caller as out of time
+     * before asking the scheduler to pick the next task. */
+    task_t *self = g_current_task;
+    self->time_slice = 0;
+
+    schedule();
+
+    /* Reached once this task runs again; interrupts are switched back
+     * on unconditionally. */
+    x86_64_EnableInterrupts();
+}
+
+/* =====================================================================
+ *  sched_block()  —  put current task to sleep
+ *
+ *  Sets the current task's state to `reason` (overwriting whatever
+ *  state was set before the call), then calls schedule() with
+ *  interrupts disabled and re-enables them once the task resumes.
+ * ===================================================================== */
+void sched_block(task_state_t reason) {
+    x86_64_DisableInterrupts();
+
+    /* Record why we are going off-CPU; schedule() only re-enqueues
+     * tasks that are still TASK_RUNNING. */
+    task_t *self = g_current_task;
+    self->state = reason;
+
+    schedule();
+
+    /* Execution continues here after the task has been woken up. */
+    x86_64_EnableInterrupts();
+}
+
+/* =====================================================================
+ *  sched_wake()  —  wake a sleeping task
+ *  Safe to call from interrupt context.
+ * ===================================================================== */
+void sched_wake(task_t *task) {
+    if (!task) return;
+
+    uint64_t flags;
+    spinlock_acquire_irqsave(&g_runqueue.lock, &flags);
+
+    /*
+     * Only sleeping/stopped tasks are woken.  A task that is already
+     * TASK_RUNNING is left alone, and a task that has exited
+     * (TASK_ZOMBIE / TASK_DEAD) must never be put back on a run queue:
+     * a late wake-up would otherwise resurrect it.
+     */
+    if (task->state != TASK_RUNNING &&
+        task->state != TASK_ZOMBIE &&
+        task->state != TASK_DEAD) {
+        task->state      = TASK_RUNNING;
+        task->time_slice = calc_timeslice(task);
+
+        /* SCHED_IDLE tasks are not queued on either class. */
+        if (task->policy == SCHED_FIFO || task->policy == SCHED_RR) {
+            rt_enqueue(&g_runqueue, task);
+        } else if (task->policy != SCHED_IDLE) {
+            cfs_enqueue(&g_runqueue, task);
+        }
+
+        /*
+         * Preempt the current task if the woken task has strictly
+         * higher priority (lower numeric priority value).  The flag is
+         * acted upon by sched_tick().
+         */
+        task_t *cur = g_runqueue.current;
+        if (cur && task->prio < cur->prio) {
+            cur->need_reschedule = true;
+        }
+    }
+
+    spinlock_release_irqrestore(&g_runqueue.lock, flags);
+}
+
+/* =====================================================================
+ *  sched_exit()  —  terminate the current task (noreturn)
+ * ===================================================================== */
+void sched_exit(int exit_code) {
+    x86_64_DisableInterrupts();
+
+    /* Mark ourselves as a zombie carrying the exit status.  schedule()
+     * will not re-enqueue a task whose state is not TASK_RUNNING. */
+    task_t *dying = g_current_task;
+    dying->exit_code = exit_code;
+    dying->state     = TASK_ZOMBIE;
+
+    /* Tell the parent, if any, that a child changed state. */
+    if (dying->parent != NULL) {
+        task_send_signal(dying->parent, SIGCHLD);
+    }
+
+    printf("[sched] task '%s' pid=%d exited with code %d\n",
+           dying->name, dying->pid, exit_code);
+
+    /* Switch away; a zombie is not expected to be scheduled again. */
+    schedule();
+
+    /* Safety net in case schedule() came back anyway: park the CPU. */
+    while (1) {
+        asm volatile("hlt");
+    }
+    __builtin_unreachable();
+}
+
+/* =====================================================================
+ *  Signal delivery
+ * ===================================================================== */
+
+/* Default signal actions */
+typedef enum {
+    SIG_ACTION_TERM,   /* terminate the task                      */
+    SIG_ACTION_CORE,   /* terminate (core dump not implemented)   */
+    SIG_ACTION_IGN,    /* discard the signal                      */
+    SIG_ACTION_STOP,   /* stop the task                           */
+    SIG_ACTION_CONT    /* continue a stopped task                 */
+} sig_default_action_t;
+
+/*
+ * Map a signal number to the action taken when its handler is SIG_DFL.
+ * Any number not listed explicitly falls back to SIG_ACTION_TERM.
+ */
+static sig_default_action_t default_action(int signum) {
+    sig_default_action_t action;
+
+    switch (signum) {
+    /* Continue */
+    case SIGCONT:
+        action = SIG_ACTION_CONT;
+        break;
+
+    /* Job-control stops */
+    case SIGSTOP:
+    case SIGTSTP:
+    case SIGTTIN:
+    case SIGTTOU:
+        action = SIG_ACTION_STOP;
+        break;
+
+    /* Ignored unless a handler is installed */
+    case SIGCHLD:
+    case SIGURG:
+    case SIGWINCH:
+    case SIGIO:
+    case SIGPWR:
+        action = SIG_ACTION_IGN;
+        break;
+
+    /* Fatal, nominally with a core dump (handled like TERM for now) */
+    case SIGQUIT:
+    case SIGILL:
+    case SIGABRT:
+    case SIGFPE:
+    case SIGSEGV:
+    case SIGBUS:
+    case SIGSYS:
+    case SIGTRAP:
+    case SIGXCPU:
+    case SIGXFSZ:
+        action = SIG_ACTION_CORE;
+        break;
+
+    /* Plain termination */
+    case SIGHUP:
+    case SIGINT:
+    case SIGKILL:
+    case SIGPIPE:
+    case SIGALRM:
+    case SIGTERM:
+    case SIGUSR1:
+    case SIGUSR2:
+    case SIGPROF:
+    case SIGVTALRM:
+    case SIGSTKFLT:
+    default:
+        action = SIG_ACTION_TERM;
+        break;
+    }
+
+    return action;
+}
+
+/*
+ * Make `task` runnable because a signal arrived for it.
+ * Caller must hold g_runqueue.lock.
+ *
+ * Mirrors the wake-up path of sched_wake(): the timeslice is refreshed
+ * so the task does not resume with a stale (possibly zero) slice, and
+ * the running task is flagged for preemption when the woken task has a
+ * numerically lower (= higher) priority.
+ */
+static void signal_wake_locked(task_t *task) {
+    task->state      = TASK_RUNNING;
+    task->time_slice = calc_timeslice(task);
+
+    /* SCHED_IDLE tasks are not queued on either class. */
+    if (task->policy == SCHED_FIFO || task->policy == SCHED_RR)
+        rt_enqueue(&g_runqueue, task);
+    else if (task->policy != SCHED_IDLE)
+        cfs_enqueue(&g_runqueue, task);
+
+    task_t *cur = g_runqueue.current;
+    if (cur && task->prio < cur->prio)
+        cur->need_reschedule = true;
+}
+
+/*
+ * Post signal `signum` to `task`.
+ *
+ * Sets bit `signum` in task->pending_signals and, where appropriate,
+ * makes a sleeping target runnable so it can act on the signal:
+ *   - SIGKILL / SIGCONT wake a target that is INTERRUPTIBLE,
+ *     UNINTERRUPTIBLE or STOPPED, regardless of its signal mask;
+ *   - any other signal wakes only an INTERRUPTIBLE target, and only
+ *     if the signal is not blocked in task->signal_mask.
+ *
+ * Returns 0 on success, -1 if `task` is NULL or `signum` is outside
+ * 1 .. _NSIG-1.
+ */
+int task_send_signal(task_t *task, int signum) {
+    if (!task) return -1;
+    if (signum <= 0 || signum >= _NSIG) return -1;
+
+    const uint64_t sigbit = 1ULL << signum;
+
+    uint64_t flags;
+    spinlock_acquire_irqsave(&g_runqueue.lock, &flags);
+
+    /* Set pending bit */
+    task->pending_signals |= sigbit;
+
+    bool wake;
+    if (signum == SIGKILL || signum == SIGCONT) {
+        /* SIGKILL and SIGCONT always wake a sleeping/stopped target */
+        wake = (task->state == TASK_INTERRUPTIBLE ||
+                task->state == TASK_STOPPED ||
+                task->state == TASK_UNINTERRUPTIBLE);
+    } else {
+        /* Unblocked signal: wake an interruptible sleeper */
+        wake = !(task->signal_mask & sigbit) &&
+               task->state == TASK_INTERRUPTIBLE;
+    }
+
+    if (wake)
+        signal_wake_locked(task);
+
+    spinlock_release_irqrestore(&g_runqueue.lock, flags);
+    return 0;
+}
+
+/*
+ * Handle pending signals for the current task.
+ * Called just before returning to user space (end of schedule(), syscall
+ * return path, or end of IRQ handler for user-mode tasks).
+ */
+void task_handle_pending_signals(void) {
+    task_t *self = g_current_task;
+    if (!self) return;
+
+    /*
+     * Bit numbering: bit N of pending_signals / signal_mask stands for
+     * signal N.  That is how task_send_signal() posts a signal
+     * (1ULL << signum) and how the sigprocmask syscall builds its mask,
+     * so the bit index found here IS the signal number — no +1/-1
+     * adjustment, otherwise every signal would be delivered as the
+     * next-higher one and the wrong pending bit would be cleared.
+     */
+    uint64_t deliverable;
+    while ((deliverable = self->pending_signals & ~self->signal_mask) != 0) {
+        /* Lowest-numbered pending, unblocked signal */
+        int signum = __builtin_ctzll(deliverable);
+
+        /* Clear the pending bit before acting on it */
+        self->pending_signals &= ~(1ULL << signum);
+
+        /* Bit 0 and bits >= _NSIG do not name a signal; discard them
+         * so the loop always makes progress. */
+        if (signum <= 0 || signum >= _NSIG)
+            continue;
+
+        sighandler_t handler = self->sigactions[signum].sa_handler;
+
+        if (handler == SIG_IGN) {
+            /* Explicitly ignored */
+            continue;
+
+        } else if (handler != SIG_DFL) {
+            /*
+             * User-defined handler.
+             *
+             * A full POSIX implementation would build a signal frame on
+             * the user stack and set registers so that iretq delivers
+             * the signal; that requires knowing the saved RFLAGS/RIP
+             * from the ISR frame.  We leave this as a TODO and just
+             * call the handler directly for kernel threads.
+             *
+             * For user tasks this is the point where you would push a
+             * ucontext_t / sigframe onto the user stack and adjust the
+             * saved user RIP in the ISR frame.
+             */
+            if (!self->is_user) {
+                handler(signum);
+            } else {
+                /* TODO: build user-space signal frame */
+                printf("[signal] TODO: deliver signal %d to user task '%s'\n",
+                       signum, self->name);
+            }
+
+        } else {
+            /* SIG_DFL */
+            switch (default_action(signum)) {
+            case SIG_ACTION_TERM:
+            case SIG_ACTION_CORE:
+                printf("[signal] task '%s' pid=%d killed by signal %d\n",
+                       self->name, self->pid, signum);
+                sched_exit(128 + signum);
+                break; /* unreachable */
+
+            case SIG_ACTION_STOP:
+                self->state = TASK_STOPPED;
+                /* Notify parent */
+                if (self->parent) task_send_signal(self->parent, SIGCHLD);
+                /* Goes off-CPU here; resumes once made runnable again */
+                sched_block(TASK_STOPPED);
+                break;
+
+            case SIG_ACTION_CONT:
+                /* Already running (we were woken to handle this) */
+                break;
+
+            case SIG_ACTION_IGN:
+                break;
+            }
+        }
+    }
+}
+
+/* =====================================================================
+ *  sched_find_task  (linear scan — O(n), suitable for small task counts)
+ * ===================================================================== */
+task_t *sched_find_task(pid_t pid) {
+    /*
+     * Linear search over everything the run queue can reach: the CFS
+     * list first, then each RT priority level, and finally the task
+     * that is currently executing.  Only these three places are
+     * examined, and the run-queue lock is not taken here.
+     */
+    for (task_t *it = g_runqueue.cfs_head; it != NULL; it = it->rq_next) {
+        if (it->pid == pid)
+            return it;
+    }
+
+    for (int level = RT_PRIO_MIN; level <= RT_PRIO_MAX; level++) {
+        for (task_t *it = g_runqueue.rt.head[level]; it != NULL;
+             it = it->rq_next) {
+            if (it->pid == pid)
+                return it;
+        }
+    }
+
+    task_t *running = g_runqueue.current;
+    if (running != NULL && running->pid == pid)
+        return running;
+
+    return NULL;
+}
+
+/* =====================================================================
+ *  Priority / scheduler controls
+ * ===================================================================== */
+/*
+ * Set the nice value of `task`, clamped to [NICE_MIN, NICE_MAX].
+ *
+ * The nice value is always recorded.  The derived priority fields
+ * (static_prio, prio) and the timeslice are recomputed only for
+ * non-real-time policies: for SCHED_FIFO / SCHED_RR, static_prio holds
+ * the RT priority (RT_PRIO_MIN..RT_PRIO_MAX, as set by
+ * task_set_scheduler) and must not be replaced by a 100..139 value,
+ * which would silently drop the RT priority and leave the range the
+ * RT queues are sized for.  The stored nice value takes effect when
+ * the task later switches back to a non-RT policy.
+ *
+ * Returns the previous nice value.
+ */
+int task_set_nice(task_t *task, int nice) {
+    if (nice < NICE_MIN) nice = NICE_MIN;
+    if (nice > NICE_MAX) nice = NICE_MAX;
+
+    int old_nice = task->nice;
+
+    task->nice = nice;
+
+    /* Real-time tasks keep their RT priority and timeslice. */
+    if (task->policy == SCHED_FIFO || task->policy == SCHED_RR)
+        return old_nice;
+
+    task->static_prio = NICE_TO_PRIO(nice);
+    task->prio        = task->static_prio;
+
+    /*
+     * Recompute the timeslice.  If the task is currently on a queue we
+     * would need to re-sort it (out of scope here — next schedule() will
+     * pick the right slot when it re-enqueues).
+     */
+    task->time_slice = calc_timeslice(task);
+
+    return old_nice;
+}
+
+/*
+ * Switch `task` to scheduling policy `policy`.
+ *
+ * `rt_prio` is validated and used only for SCHED_FIFO / SCHED_RR, where
+ * it must lie in RT_PRIO_MIN..RT_PRIO_MAX; other policies derive their
+ * priority from the task's nice value.  A task that is runnable but not
+ * the one currently executing is taken off its queue for the change and
+ * put back afterwards.
+ *
+ * Returns 0 on success, -1 on an unknown policy or out-of-range rt_prio.
+ */
+int task_set_scheduler(task_t *task, int policy, int rt_prio) {
+    switch (policy) {
+    case SCHED_NORMAL:
+    case SCHED_FIFO:
+    case SCHED_RR:
+    case SCHED_BATCH:
+    case SCHED_IDLE:
+        break;
+    default:
+        return -1;
+    }
+
+    const bool realtime = (policy == SCHED_FIFO || policy == SCHED_RR);
+    if (realtime && (rt_prio < RT_PRIO_MIN || rt_prio > RT_PRIO_MAX))
+        return -1;
+
+    /* Runnable and not current means the task sits on a run queue. */
+    const bool requeue = (task->state == TASK_RUNNING &&
+                          task != g_runqueue.current);
+    if (requeue)
+        sched_dequeue(task);
+
+    task->policy = policy;
+    if (realtime)
+        task->static_prio = rt_prio;
+    else
+        task->static_prio = NICE_TO_PRIO(task->nice);
+    task->prio       = task->static_prio;
+    task->time_slice = calc_timeslice(task);
+
+    if (requeue)
+        sched_enqueue(task);
+
+    return 0;
+}
@@ -0,0 +1,312 @@
+#pragma once
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include "mm/vmm.h"
+#include "mp/spinlock.h"
+
+/* =====================================================================
+ *  POSIX signal numbers
+ * ===================================================================== */
+#define SIGHUP     1
+#define SIGINT     2
+#define SIGQUIT    3
+#define SIGILL     4
+#define SIGTRAP    5
+#define SIGABRT    6
+#define SIGBUS     7
+#define SIGFPE     8
+#define SIGKILL    9   /* cannot be caught or ignored */
+#define SIGUSR1    10
+#define SIGSEGV    11
+#define SIGUSR2    12
+#define SIGPIPE    13
+#define SIGALRM    14
+#define SIGTERM    15
+#define SIGSTKFLT  16
+#define SIGCHLD    17
+#define SIGCONT    18
+#define SIGSTOP    19  /* cannot be caught or ignored */
+#define SIGTSTP    20
+#define SIGTTIN    21
+#define SIGTTOU    22
+#define SIGURG     23
+#define SIGXCPU    24
+#define SIGXFSZ    25
+#define SIGVTALRM  26
+#define SIGPROF    27
+#define SIGWINCH   28
+#define SIGIO      29
+#define SIGPWR     30
+#define SIGSYS     31
+#define _NSIG      32
+
+typedef void (*sighandler_t)(int signum);
+#define SIG_DFL ((sighandler_t)0)  /* default action */
+#define SIG_IGN ((sighandler_t)1)  /* ignore signal  */
+#define SIG_ERR ((sighandler_t)-1) /* error return   */
+
+#define SA_NOCLDSTOP  0x00000001
+#define SA_NOCLDWAIT  0x00000002
+#define SA_SIGINFO    0x00000004
+#define SA_RESTORER   0x04000000
+#define SA_ONSTACK    0x08000000
+#define SA_RESTART    0x10000000
+#define SA_NODEFER    0x40000000
+#define SA_RESETHAND  0x80000000
+
+/*
+ * Per-signal disposition, stored in task->sigactions[] and copied
+ * to/from the caller by the SYS_SIGACTION syscall.
+ * NOTE(review): the delivery code in this change reads only sa_handler;
+ * sa_mask and sa_flags are stored but not yet consulted — confirm.
+ */
+struct sigaction {
+    sighandler_t sa_handler; /* SIG_DFL, SIG_IGN, or a handler function */
+    uint64_t     sa_mask;    /* signals blocked while handler runs */
+    int          sa_flags;   /* SA_* flag bits */
+};
+
+/* =====================================================================
+ *  Scheduling policies (POSIX)
+ * ===================================================================== */
+#define SCHED_NORMAL  0   /* Fair time-sharing (CFS-like, nice values)  */
+#define SCHED_FIFO    1   /* Real-time FIFO – runs until yield or block  */
+#define SCHED_RR      2   /* Real-time round-robin with fixed timeslice  */
+#define SCHED_BATCH   3   /* CPU-bound variant of NORMAL, no preemption boost */
+#define SCHED_IDLE    5   /* Only runs when nothing else is runnable     */
+
+/* Priority ranges:
+ *   RT tasks:     static_prio  1 .. 99  (1 = highest)
+ *   Normal tasks: static_prio 100 .. 139  (maps from nice -20 .. +19)
+ *   Idle:         static_prio 140
+ */
+/* Numerically lower priority values are treated as higher priority
+ * (see the preemption check in sched_wake). */
+#define MAX_RT_PRIO      100
+#define MAX_PRIO         140
+#define RT_PRIO_MIN      1
+#define RT_PRIO_MAX      99
+#define NICE_MIN         (-20)
+#define NICE_MAX         19
+#define NICE_DEFAULT     0
+#define IDLE_PRIO        140
+
+/* nice ↔ static_prio conversions for SCHED_NORMAL:
+ * NICE_TO_PRIO(-20) == 100, NICE_TO_PRIO(0) == 120, NICE_TO_PRIO(19) == 139 */
+#define NICE_TO_PRIO(n)  (MAX_RT_PRIO + (n) + 20)
+#define PRIO_TO_NICE(p)  ((p) - MAX_RT_PRIO - 20)
+
+/* Timeslice constants (ticks, PIT at 1000 Hz → 1 tick = 1 ms) */
+#define SCHED_BASE_SLICE_MS  10   /* base timeslice for NICE_DEFAULT    */
+#define SCHED_MIN_SLICE_MS   1    /* minimum timeslice (1 ms)           */
+#define SCHED_MAX_SLICE_MS   100  /* maximum timeslice (100 ms)         */
+#define SCHED_RR_SLICE_MS    10   /* fixed timeslice for SCHED_RR       */
+
+/* =====================================================================
+ *  Task states
+ * ===================================================================== */
+/*
+ * Life-cycle state of a task.  Only TASK_RUNNING tasks are re-enqueued
+ * by schedule(); every other value keeps the task off the run queues.
+ */
+typedef enum task_state {
+    TASK_RUNNING         = 0,  /* Runnable – on a run queue or executing */
+    TASK_INTERRUPTIBLE   = 1,  /* Sleeping, can be woken by signal       */
+    TASK_UNINTERRUPTIBLE = 2,  /* Sleeping, ignores signals (D state);
+                                * NOTE: task_send_signal still wakes it
+                                * for SIGKILL and SIGCONT                */
+    TASK_STOPPED         = 4,  /* Halted by SIGSTOP / SIGTSTP            */
+    TASK_ZOMBIE          = 8,  /* Exited, waiting for parent to wait()   */
+    TASK_DEAD            = 16, /* Fully reaped, memory can be freed      */
+} task_state_t;
+
+/* =====================================================================
+ *  Minimal CPU context
+ *
+ *  Only RSP and CR3 live here; all callee-saved GPRs are pushed onto
+ *  the kernel stack by sched_context_switch() before RSP is saved.
+ *  This keeps the struct tiny and the assembly dead simple.
+ * ===================================================================== */
+/* Saved per-task CPU state handed to sched_context_switch(); it is the
+ * first member of struct task, so field order and size must not change
+ * without updating the assembly. */
+struct cpu_context {
+    uint64_t rsp;  /* Saved kernel stack pointer                         */
+    uint64_t cr3;  /* Physical address of PML4 (0 = stay on kernel map)  */
+};
+
+/* =====================================================================
+ *  Task / Process descriptor
+ * ===================================================================== */
+typedef int    pid_t;
+typedef struct task task_t;
+
+struct task {
+    /* ---- CPU context (must stay first – asm references it at offset 0) */
+    struct cpu_context ctx;
+
+    /* ---- Identity ---------------------------------------------------- */
+    pid_t  pid;
+    pid_t  ppid;
+    char   name[64];
+    bool   is_user;           /* true  = user process, false = kernel thread */
+
+    /* ---- Scheduling policy and priority ------------------------------ */
+    int    policy;            /* SCHED_NORMAL / SCHED_FIFO / SCHED_RR / … */
+    int    static_prio;       /* Base priority; rewritten by task_set_nice
+                               * and task_set_scheduler                    */
+    int    nice;              /* -20 .. +19, SCHED_NORMAL only             */
+    int    prio;              /* Effective priority (may be boosted);
+                               * lower value = higher priority             */
+
+    /* ---- State ------------------------------------------------------- */
+    volatile task_state_t state;
+    bool   need_reschedule;   /* Set when a higher-priority task wakes up;
+                               * checked by sched_tick, cleared when the
+                               * task is switched in                       */
+
+    /* ---- Time accounting (ticks, 1 tick = 1 ms at 1000 Hz PIT) ------- */
+    uint64_t vruntime;          /* Virtual runtime (tick-equivalents, weighted) */
+    uint64_t sum_exec_runtime;  /* Total CPU time consumed (raw ticks)     */
+    uint64_t time_slice;        /* Remaining timeslice (ticks)             */
+    uint64_t slice_start;       /* Tick when the current slice began       */
+
+    /* ---- Memory ------------------------------------------------------ */
+    struct pagemap *pagemap;    /* NULL → use kernel_pagemap               */
+    void   *kernel_stack;       /* Pointer to bottom of kernel-stack alloc */
+    size_t  kernel_stack_size;  /* bottom + size is loaded into TSS.RSP0
+                                 * for user tasks                          */
+
+    /* ---- Entry points ------------------------------------------------ */
+    uint64_t user_entry;        /* User-space RIP for user tasks           */
+    uint64_t user_stack_top;    /* User-space RSP for user tasks           */
+    void   (*kthread_entry)(void *arg); /* Kernel thread entry point       */
+    void   *kthread_arg;
+
+    /* ---- Signals -----------------------------------------------------
+     * Bit N of both masks stands for signal N: task_send_signal sets
+     * (1ULL << signum), so bit 0 is unused.                              */
+    uint64_t         pending_signals;  /* Bitmask of unhandled signals     */
+    uint64_t         signal_mask;      /* Blocked (SIG_BLOCK) signals      */
+    struct sigaction sigactions[_NSIG]; /* indexed by signal number        */
+
+    /* ---- Exit status ------------------------------------------------- */
+    int exit_code;              /* Stored by sched_exit                    */
+
+    /* ---- Run-queue linkage (doubly-linked, intrusive) ---------------- */
+    task_t *rq_next;
+    task_t *rq_prev;
+
+    /* ---- Process tree ------------------------------------------------ */
+    task_t *parent;             /* Receives SIGCHLD when this task exits
+                                 * or stops                                */
+    task_t *first_child;
+    task_t *next_sibling;
+};
+
+/* =====================================================================
+ *  Run queue
+ *
+ *  Two sub-queues per CPU (single CPU for now, MP-ready by design):
+ *
+ *  1. RT array  – 99 FIFO lists indexed by RT priority.  Highest
+ *                 priority with a runnable task is O(1) via bitmap.
+ *
+ *  2. CFS list  – Tasks sorted by vruntime (ascending).  Pick-next
+ *                 is O(1) (front of list); insert is O(n) – good
+ *                 enough for now, swap in an rb-tree later.
+ *
+ *  Idle task is stored separately and is returned only when both
+ *  sub-queues are empty.
+ * ===================================================================== */
+#define RT_QUEUE_LEVELS  MAX_RT_PRIO   /* 100 levels (index 0 = unused, 1–99 used) */
+
+struct rt_prio_array {
+    /*
+     * Bitmap: bit N is set ↔ the FIFO list at head[N] is non-empty.
+     * Two 64-bit words cover 128 bits, enough for 100 levels.
+     */
+    uint64_t bitmap[2];
+    task_t  *head[RT_QUEUE_LEVELS]; /* FIFO queue heads, indexed by RT priority */
+    task_t  *tail[RT_QUEUE_LEVELS]; /* FIFO queue tails, indexed by RT priority */
+    int      total;                  /* Total RT tasks enqueued */
+};
+
+struct runqueue {
+    /* Protects the queues below.  schedule() takes it with interrupts
+     * already off; sched_wake and task_send_signal use the irqsave
+     * variants. */
+    spinlock_t lock;
+
+    /* Real-time (SCHED_FIFO / SCHED_RR) */
+    struct rt_prio_array rt;
+
+    /* Normal (SCHED_NORMAL / SCHED_BATCH) — sorted ascending by vruntime */
+    task_t  *cfs_head;         /* First task; linked through rq_next */
+    int      cfs_count;
+    uint64_t min_vruntime;     /* Lower bound; new tasks start from here */
+
+    /* Idle fallback (SCHED_IDLE); never placed on the RT or CFS queues */
+    task_t  *idle;
+
+    /* Currently executing task on this CPU (kept in step with
+     * g_current_task by schedule()) */
+    task_t  *current;
+
+    /* Statistics */
+    uint64_t nr_switches;      /* Incremented on every context switch */
+    uint64_t nr_running;       /* Total runnable tasks (all classes) */
+};
+
+/* =====================================================================
+ *  Globals (single-CPU; extend to per-cpu array for SMP)
+ * ===================================================================== */
+extern struct runqueue g_runqueue;
+extern task_t         *g_current_task;  /* Pointer to currently-running task */
+
+/* =====================================================================
+ *  Public scheduler API
+ * ===================================================================== */
+
+/* Initialise the scheduler (call after PMM + VMM + PIT are ready) */
+void sched_init(void);
+
+/* Create a kernel thread and enqueue it immediately */
+task_t *sched_create_kthread(const char *name,
+                              void (*entry)(void *), void *arg);
+
+/* Create a user-space task and enqueue it immediately */
+task_t *sched_create_user_task(const char *name,
+                                uint64_t entry_rip, uint64_t user_rsp,
+                                struct pagemap *pm);
+
+/* Add a task to the appropriate run queue */
+void sched_enqueue(task_t *task);
+
+/* Remove a task from its run queue (does NOT free it) */
+void sched_dequeue(task_t *task);
+
+/* Pick next task and perform context switch (call with IF=0) */
+void schedule(void);
+
+/* Called from the PIT/LAPIC timer IRQ every tick (1 ms) */
+void sched_tick(void);
+
+/* Voluntarily give up the CPU */
+void sched_yield(void);
+
+/* Block current task: sets its state to `reason`, disables interrupts,
+ * calls schedule(), and re-enables interrupts once the task resumes */
+void sched_block(task_state_t reason);
+
+/* Wake a sleeping task (safe to call from IRQ context) */
+void sched_wake(task_t *task);
+
+/* Terminate the current task (noreturn) */
+void sched_exit(int exit_code) __attribute__((noreturn));
+
+/* ---- Signal API ------------------------------------------------------- */
+
+/* Send signal signum to task (safe from any context) */
+int  task_send_signal(task_t *task, int signum);
+
+/* Find a task by PID (NULL if not found) */
+task_t *sched_find_task(pid_t pid);
+
+/* Process pending signals for the current task (call before returning to user) */
+void task_handle_pending_signals(void);
+
+/* ---- Priority / policy control --------------------------------------- */
+
+/* Set nice value [-20, +19] for a SCHED_NORMAL task; returns old nice */
+int task_set_nice(task_t *task, int nice);
+
+/* Change scheduling policy + RT priority; returns 0 on success */
+int task_set_scheduler(task_t *task, int policy, int rt_prio);
+
+/* ---- Convenience ----------------------------------------------------- */
+static inline task_t *sched_current(void)
+{
+    /* Accessor for the task currently executing on this CPU. */
+    return g_current_task;
+}
+
+/* ---- Assembly context switch (defined in sched_switch.S) ------------- */
+/*
+ * Save callee-saved registers of the current context onto its kernel
+ * stack and record RSP in *from.  Then switch to *to's stack, restore
+ * its callee-saved registers, and return — which resumes wherever *to
+ * last called schedule().  For first-time tasks the "return" jumps to
+ * the appropriate trampoline.
+ */
+void sched_context_switch(struct cpu_context *from,
+                          struct cpu_context *to);
+
+/* Kernel stack size for each task */
+#define KSTACK_SIZE  (32 * 1024)   /* 32 KiB — comfortable headroom */
@@ -3,6 +3,7 @@
 #include "mp/percpu.h"
 #include "fs/vfs.h"
 #include "syscall.h"
+#include "sched/scheduler.h"

 #define MSR_EFER          0xC0000080
 #define MSR_STAR          0xC0000081
@@ -41,7 +42,7 @@ uint64_t syscall_handler(uint64_t num,
            uint8_t* buf    = (uint8_t*)arg2;
            size_t len      = (size_t)arg3;

-            return (uint64_t)VFS_Read(fd, buf, len);
+            return VFS_Read(fd, buf, len);
        }

        case SYS_WRITE:
@@ -65,6 +66,99 @@ uint64_t syscall_handler(uint64_t num,
            return (uint64_t)VFS_Close(fd);
        }

+        case SYS_GETPID:
+            return (uint64_t)sched_current()->pid;
+ 
+        case SYS_GETPPID:
+            return (uint64_t)sched_current()->ppid;
+ 
+        case SYS_EXIT:
+        case SYS_EXIT_GROUP:
+            sched_exit((int)arg1);
+            //noreturn 
+ 
+        case SYS_SCHED_YIELD:
+            sched_yield();
+            return 0;
+ 
+        case SYS_NICE:
+        {
+            int increment = (int)arg1;
+            int old_nice  = sched_current()->nice;
+            int new_nice  = old_nice + increment;
+            return (uint64_t)task_set_nice(sched_current(), new_nice);
+        }
+ 
+        case SYS_KILL:
+        {
+            pid_t   target = (pid_t)arg1;
+            int     sig    = (int)arg2;
+            task_t *t      = sched_find_task(target);
+            if (!t) return (uint64_t)-1;
+            return (uint64_t)task_send_signal(t, sig);
+        }
+ 
+        case SYS_SIGACTION:
+        {
+            int signum = (int)arg1;
+            const struct sigaction *act  = (const struct sigaction *)arg2;
+            struct sigaction       *oact = (struct sigaction *)arg3;
+ 
+            if (signum <= 0 || signum >= _NSIG)
+                return (uint64_t)-1;
+            if (signum == SIGKILL || signum == SIGSTOP)
+                return (uint64_t)-1; // cannot override
+ 
+            task_t *cur = sched_current();
+            if (oact)
+                *oact = cur->sigactions[signum];
+            if (act)
+                cur->sigactions[signum] = *act;
+            return 0;
+        }
+ 
+        case SYS_SIGPROCMASK:
+        {
+            // how: 0=SIG_BLOCK, 1=SIG_UNBLOCK, 2=SIG_SETMASK
+            int      how     = (int)arg1;
+            uint64_t new_set = arg2;
+            uint64_t *old    = (uint64_t *)arg3;
+ 
+            task_t *cur = sched_current();
+            if (old) *old = cur->signal_mask;
+ 
+            // SIGKILL and SIGSTOP can never be blocked
+            new_set &= ~((1ULL << SIGKILL) | (1ULL << SIGSTOP));
+ 
+            switch (how) {
+            case 0: cur->signal_mask |=  new_set; break; // SIG_BLOCK
+            case 1: cur->signal_mask &= ~new_set; break; // SIG_UNBLOCK
+            case 2: cur->signal_mask  =  new_set; break; // SIG_SETMASK
+            default: return (uint64_t)-1;
+            }
+            return 0;
+        }
+ 
+        case SYS_SCHED_GETSCHEDULER:
+        {
+            pid_t   target = (pid_t)arg1;
+            task_t *t      = target ? sched_find_task(target)
+                                     : sched_current();
+            if (!t) return (uint64_t)-1;
+            return (uint64_t)t->policy;
+        }
+ 
+        case SYS_SCHED_SETSCHEDULER:
+        {
+            pid_t   target  = (pid_t)arg1;
+            int     policy  = (int)arg2;
+            int     rt_prio = (int)arg3;
+            task_t *t       = target ? sched_find_task(target)
+                                      : sched_current();
+            if (!t) return (uint64_t)-1;
+            return (uint64_t)task_set_scheduler(t, policy, rt_prio);
+        }
+
        default:
            return (uint64_t)-1;
    }
@@ -6,5 +6,17 @@
 #define SYS_OPEN    2
 #define SYS_CLOSE   3

+/*
+ * Scheduler / signal syscall numbers dispatched by syscall_handler().
+ * NOTE(review): most values match the Linux x86_64 syscall table, but
+ * 34 is `pause` there (x86_64 has no `nice` syscall), and 138/139 are
+ * `fstatfs`/`sysfs` (sched_setscheduler/sched_getscheduler are 144/145).
+ * Confirm whether Linux ABI compatibility is intended before relying
+ * on these numbers from user space.
+ */
+#define SYS_SCHED_YIELD    24
+#define SYS_GETPID         39
+#define SYS_GETPPID        110
+#define SYS_NICE           34
+#define SYS_KILL           62
+#define SYS_SIGACTION      13    /* rt_sigaction on Linux */
+#define SYS_SIGPROCMASK    14    /* rt_sigprocmask on Linux */
+#define SYS_EXIT           60
+#define SYS_EXIT_GROUP     231
+#define SYS_SCHED_GETSCHEDULER  138
+#define SYS_SCHED_SETSCHEDULER  139
+

 void syscall_init(void);
@@ -27,7 +27,9 @@ syscall_entry:
    pop  %rsi                # rsi = a1  (2nd param)
    mov  %rax, %rdi          # rdi = num (1st param)

+    sub $8, %rsp
    call syscall_handler
+    add $8, %rsp

    # ── Restore user context ───────────────────────────────────────────
    pop %r11                 # user RFLAGS