From 7d99745ff93945e71d1f2b32f15b2e64e24e6d05 Mon Sep 17 00:00:00 2001 From: kaguya Date: Sun, 26 Apr 2026 22:46:28 -0400 Subject: [PATCH] sched: Implement basic scheduling and signal handling system Note: this is probably 25% broken, but it works right now as written, so I hope it all works. - Added a new scheduler header file (scheduler.h) defining task structures, scheduling policies, and signal handling mechanisms. - Integrated scheduling functions into the syscall interface, including SYS_GETPID, SYS_GETPPID, SYS_EXIT, SYS_SCHED_YIELD, SYS_NICE, SYS_KILL, SYS_SIGACTION, SYS_SIGPROCMASK, SYS_SCHED_GETSCHEDULER, and SYS_SCHED_SETSCHEDULER. - Updated syscall handler to manage new scheduling-related syscalls and signal actions. Signed-off-by: kaguya --- .gitignore | 2 +- ext2_root/init.elf | Bin 5288 -> 5384 bytes src/arch/x86_64/cpu/usermode.c | 67 ++- src/arch/x86_64/sys/pit.c | 3 +- src/fs/elf.c | 23 +- src/fs/elf.h | 3 +- src/fs/vfs.c | 10 +- src/main.c | 18 +- src/mm/vmm.h | 8 +- src/mp/percpu.h | 10 +- src/sched/context_switch.S | 75 +++ src/sched/scheduler.c | 935 +++++++++++++++++++++++++++++++++ src/sched/scheduler.h | 312 +++++++++++ src/syscall/syscall.c | 96 +++- src/syscall/syscall.h | 12 + src/syscall/syscall_entry.S | 2 + user/build/init.elf | Bin 5288 -> 5384 bytes user/build/init.o | Bin 1936 -> 2040 bytes user/include/syscalls.h | 18 +- user/programs/init.c | 20 +- 20 files changed, 1561 insertions(+), 53 deletions(-) create mode 100644 src/sched/context_switch.S create mode 100644 src/sched/scheduler.c create mode 100644 src/sched/scheduler.h diff --git a/.gitignore b/.gitignore index 709eb78..201c107 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ build/ -user/build/* \ No newline at end of file +user/build diff --git a/ext2_root/init.elf b/ext2_root/init.elf index f98c8589f2ebfdedfc22f92011d846ad1146ccd4..613b512992b19cca8224e72463b77bd071dfd25b 100755 GIT binary patch delta 498 
zcmZ3X*`YN-Lhv(#0|N*^NCt)z6E)3^jxd777+^Gm0EEv1q(Im~5W;t0bO19P7$=^R zo4A2P2vcI?sWkq2$I$%@K&6f$p?jHt8iEaPdvw0==)C9AdCQ~ujeuw8CC|Lt-=zsuw*%x1B*`7jKt4npk|@|{pm|6_yBfd>k9+i%sC0OA`T)&_+u+f8 z6let0s9Q)9VB7YCtb}TNg(Ug<-wTWX|Np=K$1r)dz*LihfB*k$166{63J@CraSu@J zFi60OPoS5{oll~N*^y77jm3>mqnXv8&th_dpvmMJf(neEC+`%LXFNRlrr>+VUz1M? z=?j7a2MnO$YBhPGpzvfFVHrVq2t#EIz%nsFnF6Su0;o&~SZ0o}3geN<2Zik!A53Nx Tv1f`9n(QQE$5=eMQA8a8X4#F| delta 407 zcmeCsTA?{XLU0L#0|N*^NCt+CiJIm{35;Md1{lpC0O7L$DG>eu<$r)``@k^ql-$G( z973298&9S2*LyU-5!k^5)VO~GP>10+!%O_E`wjq^9-YT`X#&M}fGp%^-N6i`Akqls z{H(hgfWknf9=#lock); + + /* Allocate a fresh PML4 (physical page) */ + pm->top_level = (uint64_t *)((uintptr_t)pmm_allocz(1) + MEM_PHYS_OFFSET); + if (!pm->top_level) { + printf("Failed to allocate user PML4!\n"); + kfree(pm); + return NULL; + } + + /* Copy kernel higher-half mappings (kernel + HHDM) */ + for (size_t i = 256; i < 512; i++) { + pm->top_level[i] = kernel_pagemap->top_level[i]; + } + + /* Lower half remains zero (user address space) */ + printf("[usermode] user pagemap created (PML4 phys = 0x%lx)\n", + (uint64_t)pm->top_level - MEM_PHYS_OFFSET); + + return pm; +} + +uintptr_t setup_user_stack(struct pagemap *pagemap) { user_stack_phys_base = (uint64_t)pmm_alloc(USER_STACK_PAGES); @@ -37,7 +66,7 @@ uintptr_t setup_user_stack(void) uintptr_t virt = stack_bottom + i * PAGE_SIZE; uintptr_t phys = user_stack_phys_base + i * PAGE_SIZE; - if (!vmm_map_page(kernel_pagemap, + if (!vmm_map_page(pagemap, virt, phys, PAGE_READ | PAGE_WRITE | PAGE_USER, @@ -47,19 +76,13 @@ uintptr_t setup_user_stack(void) for (;;); } - // zero physical page through HHDM - //memset((void *)(phys + g_hhdm_offset), 0, PAGE_SIZE); } uintptr_t rsp = USER_STACK_TOP; - rsp &= ~0xFULL; - return rsp; } - -// usermode.c __attribute__((naked)) void enter_user_mode(uint64_t rip, uint64_t rsp) { @@ -98,21 +121,27 @@ void enter_user_mode(uint64_t rip, uint64_t rsp) void start_userspace(void) { - void *entry = NULL; - if (!ELF_Read("init.elf", 
&entry)) { + struct pagemap *user_pagemap = create_user_pagemap(); + if (!user_pagemap) { + printf("Failed to create user pagemap\n"); + for (;;); + } + + void *elf_entry = NULL; + if (!ELF_Read("init.elf", &elf_entry, user_pagemap)) { printf("Failed to load init.elf\n"); for(;;); } - if (!entry) { + if (!elf_entry) { printf("ELF has no entry point\n"); for(;;); } - uintptr_t rsp = setup_user_stack(); + uintptr_t user_rsp = setup_user_stack(user_pagemap); - printf("Entering usermode RIP=%p RSP=%p\n", entry, (void*)rsp); + printf("Entering usermode RIP=%p RSP=%p\n", elf_entry, (void*)user_rsp); - enter_user_mode((uint64_t)entry, (rsp & ~0xFULL)); + sched_create_user_task("init", (uint64_t)elf_entry, user_rsp, user_pagemap); } \ No newline at end of file diff --git a/src/arch/x86_64/sys/pit.c b/src/arch/x86_64/sys/pit.c index c6708db..7b90331 100644 --- a/src/arch/x86_64/sys/pit.c +++ b/src/arch/x86_64/sys/pit.c @@ -6,6 +6,7 @@ #include "e9.h" #include "limine.h" #include "apic.h" +#include "sched/scheduler.h" __attribute__((used, section(".limine_requests"))) volatile struct limine_date_at_boot_request boot_request = { @@ -39,7 +40,7 @@ void PIT_IRQ_Handler(Registers* regs) lapic_eoi(); } - // You can add scheduler / time logic here later + sched_tick(); } /* ========================= */ diff --git a/src/fs/elf.c b/src/fs/elf.c index 310001c..9c9a772 100644 --- a/src/fs/elf.c +++ b/src/fs/elf.c @@ -7,16 +7,13 @@ #include "fs/ext2.h" extern uintptr_t g_hhdm_offset; -extern struct pagemap *kernel_pagemap; #define ELF_BUFFER_SIZE (1024 * 1024) -#define PTE_PRESENT (1ULL << 0) -#define PTE_WRITABLE (1ULL << 1) -#define PTE_USER (1ULL << 2) -bool ELF_Read(const char* path, void** entryPoint) + +bool ELF_Read(const char* path, void** entryPoint, struct pagemap *target_pagemap) { uint32_t size; @@ -41,6 +38,15 @@ bool ELF_Read(const char* path, void** entryPoint) ELFHeader* header = (ELFHeader*)elf_buffer; + printf("=== ELF DEBUG ===\n"); + printf("Entry point VA = 
0x%lx\n", header->ProgramEntryPosition); + printf("PHDR offset = 0x%lx\n", header->ProgramHeaderTablePosition); + printf("PHDR count = %u\n", header->ProgramHeaderTableEntryCount); + + + + printf("=== END ELF DEBUG ===\n"); + // ── validate ELF ────────────────────────────────── if (memcmp(header->Magic, ELF_MAGIC, 4) != 0) { printf("ELF: bad magic\n"); @@ -82,8 +88,11 @@ bool ELF_Read(const char* path, void** entryPoint) ELFProgramHeader* ph = (ELFProgramHeader*)(ph_table + i * header->ProgramHeaderTableEntrySize); - if (ph->Type != ELF_PROGRAM_TYPE_LOAD) + if (ph->Type != ELF_PROGRAM_TYPE_LOAD) { + printf("LOAD segment: VA=0x%lx FileSz=0x%lx MemSz=0x%lx\n", + ph->VirtualAddress, ph->FileSize, ph->MemorySize); continue; + } uint64_t virt = ph->VirtualAddress; uint64_t offset = ph->Offset; @@ -114,7 +123,7 @@ bool ELF_Read(const char* path, void** entryPoint) uint64_t phys_addr = phys_base + p * PAGE_SIZE; bool success = vmm_map_page( - kernel_pagemap, + target_pagemap, virt_addr, phys_addr, PAGE_READ | PAGE_WRITE | PAGE_USER, // RW + User mode diff --git a/src/fs/elf.h b/src/fs/elf.h index bec6d4f..029eb8a 100644 --- a/src/fs/elf.h +++ b/src/fs/elf.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include "mm/vmm.h" #define ELF_MAGIC ("\x7F" "ELF") @@ -114,4 +115,4 @@ enum ELFProgramType { }; -bool ELF_Read(const char* path, void** entryPoint); \ No newline at end of file +bool ELF_Read(const char* path, void** entryPoint, struct pagemap *target_pagemap); \ No newline at end of file diff --git a/src/fs/vfs.c b/src/fs/vfs.c index b148d7f..a0b2783 100644 --- a/src/fs/vfs.c +++ b/src/fs/vfs.c @@ -131,13 +131,19 @@ int VFS_Read_internal(fd_t fd, uint8_t* buf, size_t size) // naive: read whole file then slice uint8_t* tmp = kmalloc(file_size); - if (!ext2_read_file(&file->ext2.inode, tmp)) + if (!tmp) { return -1; - + } + if (!ext2_read_file(&file->ext2.inode, tmp)) { + kfree(tmp); + return -1; + } + for (size_t i = 0; i < size; i++) buf[i] = tmp[file->offset + i]; 
file->offset += size; + kfree(tmp); return size; } diff --git a/src/main.c b/src/main.c index ce36f14..decb2b1 100644 --- a/src/main.c +++ b/src/main.c @@ -31,6 +31,7 @@ #include "arch/x86_64/sys/apic.h" #include "arch/x86_64/sys/ioapic.h" #include "drivers/input/ps2.h" +#include "sched/scheduler.h" uintptr_t g_hhdm_offset; @@ -128,7 +129,7 @@ static uacpi_interrupt_ret handle_power_button(uacpi_handle ctx) { void kmain(void) { - if (LIMINE_BASE_REVISION_SUPPORTED(limine_base_revision) == false) { + if (LIMINE_BASE_REVISION_SUPPORTED(limine_base_revision) == false) { hcf(); } @@ -350,8 +351,19 @@ void kmain(void) { syscall_init(); + + sched_init(); + start_userspace(); + sched_yield(); + + + for (;;) { + sched_yield(); + } + // We're done, just hang... - hcf(); -} \ No newline at end of file + //hcf(); +} + diff --git a/src/mm/vmm.h b/src/mm/vmm.h index dd8732c..370cf7f 100644 --- a/src/mm/vmm.h +++ b/src/mm/vmm.h @@ -23,6 +23,10 @@ extern volatile struct limine_executable_address_request kernel_address_request; #define PAGE_USER (1ULL << 2) #define PAGE_NO_EXECUTE (1ULL << 63) +#define PTE_PRESENT (1ULL << 0) +#define PTE_WRITABLE (1ULL << 1) +#define PTE_USER (1ULL << 2) + struct pagemap { spinlock_t lock; uint64_t *top_level; @@ -36,4 +40,6 @@ void vmm_init(struct limine_memmap_entry **memmap, size_t memmap_entries); void vmm_switch_pagemap(struct pagemap *pagemap); bool vmm_map_page(struct pagemap *pagemap, uint64_t virt, uint64_t phys, uint64_t flags, enum page_size pg_size); -uint64_t vmm_virt_to_phys(struct pagemap *pagemap, uint64_t virt); \ No newline at end of file +uint64_t vmm_virt_to_phys(struct pagemap *pagemap, uint64_t virt); +uint64_t *vmm_virt_to_pte(struct pagemap *pagemap, uintptr_t virt_addr, + bool allocate); \ No newline at end of file diff --git a/src/mp/percpu.h b/src/mp/percpu.h index b922ce5..b312718 100644 --- a/src/mp/percpu.h +++ b/src/mp/percpu.h @@ -3,7 +3,11 @@ #include struct cpu_local { - uint64_t self; // +0x00 (GS:0x00) - 
uint64_t user_rsp; // +0x08 (GS:0x08) — saved user RSP on syscall entry - uint64_t kernel_rsp; // +0x10 (GS:0x10) — kernel stack for syscall handler + uint64_t self; /* +0x00 (GS:0x00) — points to this struct */ + uint64_t user_rsp; /* +0x08 (GS:0x08) — saved user RSP on SYSCALL */ + uint64_t kernel_rsp; /* +0x10 (GS:0x10) — kernel stack for syscall */ + /* Future SMP fields: */ + uint32_t cpu_id; /* +0x18 logical CPU index */ + uint32_t _pad; + void *current; /* +0x20 pointer to current task_t */ }; \ No newline at end of file diff --git a/src/sched/context_switch.S b/src/sched/context_switch.S new file mode 100644 index 0000000..7d44db2 --- /dev/null +++ b/src/sched/context_switch.S @@ -0,0 +1,75 @@ + +# void sched_context_switch(struct cpu_context *from, +# struct cpu_context *to); +# +# struct cpu_context layout (from scheduler.h): +# offset 0 : rsp (uint64_t) +# offset 8 : cr3 (uint64_t) +# +# Strategy +# -------- +# Only callee-saved registers need explicit saving; caller-saved registers +# (rax, rcx, rdx, rsi, rdi, r8–r11) are the caller's job to preserve under +# the System V ABI, so any live ones were already spilled before the call. +# +# 1. Push all callee-saved GPRs onto the *current* kernel stack. +# 2. Save RSP → from->rsp. +# 3. If to->cr3 is non-zero and differs from the current CR3, load it +# (switches address space for user processes). +# 4. Load RSP ← to->rsp (switch to next task's kernel stack). +# 5. Pop all callee-saved GPRs from the *new* stack. +# 6. ret — pops the return address placed there during task creation +# (kthread_trampoline / user_task_trampoline) for a brand-new task, +# or returns into the schedule() call-site for a resumed task. +# +# NOTE: This function is called with IF=0 (interrupts disabled) and must +# NOT modify IF itself. The trampolines re-enable interrupts after the +# first-ever schedule-in of a task.
+ +.section .text +.global sched_context_switch +.type sched_context_switch, @function + +sched_context_switch: + # ── Save outgoing task ───────────────────────────────────────────── + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + # from->rsp = RSP (rdi = struct cpu_context *from, offset 0 = rsp) + movq %rsp, 0(%rdi) + + # ── Switch address space (CR3) if needed ─────────────────────────── + # to->cr3 is at offset 8 in struct cpu_context + movq 8(%rsi), %rax + testq %rax, %rax # 0 means "keep current CR3" (kernel thread) + jz .Lno_cr3 + + movq %cr3, %rcx + cmpq %rax, %rcx # Same CR3? Don't flush the TLB needlessly. + je .Lno_cr3 + + movq %rax, %cr3 # Load new page table root (flushes TLB) + +.Lno_cr3: + # ── Switch to incoming task's kernel stack ───────────────────────── + # to->rsp is at offset 0 (rsi = struct cpu_context *to) + movq 0(%rsi), %rsp + + # ── Restore incoming task's callee-saved registers ───────────────── + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + + # ret pops the "return address" from the new task's stack. + # • Brand-new task → jumps to kthread_trampoline or user_task_trampoline + # • Resumed task → returns into schedule(), then back up the call chain + ret + +.size sched_context_switch, . 
- sched_context_switch \ No newline at end of file diff --git a/src/sched/scheduler.c b/src/sched/scheduler.c new file mode 100644 index 0000000..fa4cd02 --- /dev/null +++ b/src/sched/scheduler.c @@ -0,0 +1,935 @@ +#include "scheduler.h" +#include "mm/memory.h" +#include "mm/pmm.h" +#include "libk/stdio.h" +#include "arch/x86_64/cpu/io.h" +#include "arch/x86_64/sys/pit.h" +#include "string.h" + +/* ===================================================================== + * Forward declarations for GDT/TSS (defined in gdt.c) + * ===================================================================== */ +typedef struct { + uint32_t reserved0; + uint64_t rsp0; + uint64_t rsp1; + uint64_t rsp2; + uint64_t reserved1; + uint64_t ist[7]; + uint64_t reserved2; + uint16_t reserved3; + uint16_t iopb_offset; +} __attribute__((packed)) TSS; + +extern TSS kernel_tss; + +/* ===================================================================== + * Globals + * ===================================================================== */ +struct runqueue g_runqueue = {0}; +task_t *g_current_task = NULL; + +/* PIT tick counter (defined in pit.c) */ +extern volatile uint64_t g_Ticks; + +/* ===================================================================== + * Linux-compatible nice → CPU weight table (NICE_0_LOAD = 1024) + * + * vruntime_delta = actual_ticks * NICE_0_LOAD / weight[nice + 20] + * timeslice = BASE * weight[nice + 20] / NICE_0_LOAD + * + * Low weight (high nice) → vruntime accumulates faster → scheduled less. 
+ * ===================================================================== */ +#define NICE_0_LOAD 1024u + +static const uint32_t nice_to_weight[40] = { + /* nice -20 */ 88761, 71755, 56483, 46273, 36291, + /* nice -15 */ 29154, 23254, 18705, 14949, 11916, + /* nice -10 */ 9548, 7620, 6100, 4904, 3906, + /* nice -5 */ 3121, 2501, 1991, 1586, 1277, + /* nice 0 */ 1024, 820, 655, 526, 423, + /* nice +5 */ 335, 272, 215, 172, 137, + /* nice +10 */ 110, 87, 70, 56, 45, + /* nice +15 */ 36, 29, 23, 18, 15, +}; + +static inline uint32_t weight_for_nice(int nice) { + int idx = nice + 20; + if (idx < 0) idx = 0; + if (idx > 39) idx = 39; + return nice_to_weight[idx]; +} + +/* ===================================================================== + * PID allocator (simple monotonic counter, single-CPU safe) + * ===================================================================== */ +static pid_t g_next_pid = 1; + +static pid_t alloc_pid(void) { + return g_next_pid++; +} + +/* ===================================================================== + * RT bitmap helpers (O(1) find-first-set) + * ===================================================================== */ +static inline void rt_bitmap_set(struct rt_prio_array *arr, int prio) { + /* prio is 1..99; store at bit (prio-1) */ + int bit = prio - 1; + arr->bitmap[bit / 64] |= (1ULL << (bit % 64)); +} + +static inline void rt_bitmap_clear(struct rt_prio_array *arr, int prio) { + int bit = prio - 1; + arr->bitmap[bit / 64] &= ~(1ULL << (bit % 64)); +} + +/* Returns the highest-priority (lowest numeric) non-empty RT queue, + * or -1 if all are empty. 
*/ +static inline int rt_bitmap_first(const struct rt_prio_array *arr) { + if (arr->bitmap[0]) return __builtin_ctzll(arr->bitmap[0]) + 1; + if (arr->bitmap[1]) return __builtin_ctzll(arr->bitmap[1]) + 65; + return -1; +} + +/* ===================================================================== + * RT FIFO queue operations + * ===================================================================== */ +static void rt_enqueue(struct runqueue *rq, task_t *task) { + int prio = task->static_prio; + struct rt_prio_array *arr = &rq->rt; + + task->rq_next = NULL; + task->rq_prev = arr->tail[prio]; + + if (arr->tail[prio]) + arr->tail[prio]->rq_next = task; + else + arr->head[prio] = task; + + arr->tail[prio] = task; + rt_bitmap_set(arr, prio); + arr->total++; + rq->nr_running++; +} + +/* Dequeue the head of the given priority's FIFO */ +static task_t *rt_dequeue_head(struct runqueue *rq, int prio) { + struct rt_prio_array *arr = &rq->rt; + task_t *task = arr->head[prio]; + if (!task) return NULL; + + arr->head[prio] = task->rq_next; + if (arr->head[prio]) + arr->head[prio]->rq_prev = NULL; + else { + arr->tail[prio] = NULL; + rt_bitmap_clear(arr, prio); + } + + task->rq_next = task->rq_prev = NULL; + arr->total--; + rq->nr_running--; + return task; +} + +/* Remove a specific task from an RT queue (O(1) with doubly-linked list) */ +static void rt_remove(struct runqueue *rq, task_t *task) { + int prio = task->static_prio; + struct rt_prio_array *arr = &rq->rt; + + if (task->rq_prev) task->rq_prev->rq_next = task->rq_next; + else arr->head[prio] = task->rq_next; + + if (task->rq_next) task->rq_next->rq_prev = task->rq_prev; + else arr->tail[prio] = task->rq_prev; + + if (!arr->head[prio]) + rt_bitmap_clear(arr, prio); + + task->rq_next = task->rq_prev = NULL; + arr->total--; + rq->nr_running--; +} + +/* ===================================================================== + * CFS (normal) queue operations — sorted ascending by vruntime + * 
===================================================================== */ + +/* + * Insert task into the CFS list, keeping it sorted by vruntime. + * O(n) — acceptable for hobby-kernel task counts; replace with + * red-black tree if you hit performance issues. + */ +static void cfs_enqueue(struct runqueue *rq, task_t *task) { + /* New tasks start at min_vruntime so they don't starve incumbents + * but also don't get a massive head-start. */ + if (task->vruntime < rq->min_vruntime) + task->vruntime = rq->min_vruntime; + + task_t **pp = &rq->cfs_head; + task_t *prev = NULL; + + while (*pp && (*pp)->vruntime <= task->vruntime) { + prev = *pp; + pp = &(*pp)->rq_next; + } + + task->rq_next = *pp; + task->rq_prev = prev; + if (*pp) (*pp)->rq_prev = task; + *pp = task; + + rq->cfs_count++; + rq->nr_running++; +} + +/* Remove the task with the smallest vruntime (head of list) */ +static task_t *cfs_dequeue_min(struct runqueue *rq) { + task_t *task = rq->cfs_head; + if (!task) return NULL; + + rq->cfs_head = task->rq_next; + if (rq->cfs_head) { + rq->cfs_head->rq_prev = NULL; + rq->min_vruntime = rq->cfs_head->vruntime; + } + + task->rq_next = task->rq_prev = NULL; + rq->cfs_count--; + rq->nr_running--; + return task; +} + +/* Remove a specific task from the CFS queue */ +static void cfs_remove(struct runqueue *rq, task_t *task) { + if (task->rq_prev) task->rq_prev->rq_next = task->rq_next; + else rq->cfs_head = task->rq_next; + + if (task->rq_next) task->rq_next->rq_prev = task->rq_prev; + + task->rq_next = task->rq_prev = NULL; + rq->cfs_count--; + rq->nr_running--; +} + +/* ===================================================================== + * Timeslice calculation + * ===================================================================== */ +static uint64_t calc_timeslice(const task_t *task) { + switch (task->policy) { + case SCHED_FIFO: + return UINT64_MAX; /* Runs until it yields or blocks */ + + case SCHED_RR: + return SCHED_RR_SLICE_MS; + + case SCHED_IDLE: + 
return SCHED_BASE_SLICE_MS; + + default: /* SCHED_NORMAL / SCHED_BATCH */ + { + uint32_t w = weight_for_nice(task->nice); + uint64_t ms = (uint64_t)SCHED_BASE_SLICE_MS * w / NICE_0_LOAD; + if (ms < SCHED_MIN_SLICE_MS) ms = SCHED_MIN_SLICE_MS; + if (ms > SCHED_MAX_SLICE_MS) ms = SCHED_MAX_SLICE_MS; + return ms; + } + } +} + +/* Update a task's vruntime based on how many ticks it actually ran */ +static void update_vruntime(task_t *task, uint64_t elapsed_ticks) { + task->sum_exec_runtime += elapsed_ticks; + + if (task->policy == SCHED_NORMAL || task->policy == SCHED_BATCH) { + /* vruntime_delta = ticks * NICE_0_LOAD / weight + * High-weight (low nice) tasks accumulate vruntime slowly → more CPU. */ + uint32_t w = weight_for_nice(task->nice); + uint64_t delta = elapsed_ticks * NICE_0_LOAD / w; + task->vruntime += delta; + } else { + /* RT and idle: track raw ticks for accounting; vruntime unused */ + task->vruntime += elapsed_ticks; + } +} + +/* ===================================================================== + * sched_enqueue / sched_dequeue (public, uses run-queue lock) + * ===================================================================== */ +void sched_enqueue(task_t *task) { + uint64_t flags; + spinlock_acquire_irqsave(&g_runqueue.lock, &flags); + + task->state = TASK_RUNNING; + + if (task->policy == SCHED_FIFO || task->policy == SCHED_RR) { + rt_enqueue(&g_runqueue, task); + } else if (task->policy == SCHED_IDLE) { + /* SCHED_IDLE: only one idle task, stored separately */ + g_runqueue.idle = task; + } else { + cfs_enqueue(&g_runqueue, task); + } + + spinlock_release_irqrestore(&g_runqueue.lock, flags); +} + +void sched_dequeue(task_t *task) { + uint64_t flags; + spinlock_acquire_irqsave(&g_runqueue.lock, &flags); + + if (task->policy == SCHED_FIFO || task->policy == SCHED_RR) { + rt_remove(&g_runqueue, task); + } else if (task->policy != SCHED_IDLE) { + cfs_remove(&g_runqueue, task); + } + + spinlock_release_irqrestore(&g_runqueue.lock, flags); +} + +/* 
===================================================================== + * pick_next_task (called with interrupts OFF and lock held) + * + * Priority order: + * 1. Highest RT priority with a runnable task + * 2. Normal task with smallest vruntime + * 3. Idle task (always exists, never NULL) + * ===================================================================== */ +static task_t *pick_next_task(struct runqueue *rq) { + /* 1. Real-time */ + int rt_prio = rt_bitmap_first(&rq->rt); + if (rt_prio > 0) { + return rt_dequeue_head(rq, rt_prio); + } + + /* 2. CFS normal */ + if (rq->cfs_count > 0) { + return cfs_dequeue_min(rq); + } + + /* 3. Idle fallback */ + return rq->idle; +} + +/* ===================================================================== + * Trampolines + * These are the "return addresses" pushed onto a new task's kernel + * stack. When the task is scheduled for the first time, ret inside + * sched_context_switch() jumps here. + * ===================================================================== */ +static void kthread_trampoline(void) { + /* We arrive with interrupts disabled (just after a context switch). + * Re-enable them before calling the kernel thread's entry function. */ + x86_64_EnableInterrupts(); + + task_t *self = g_current_task; + self->kthread_entry(self->kthread_arg); + + /* Thread returned — treat it as a clean exit. */ + sched_exit(0); +} + +static void user_task_trampoline(void) { + x86_64_EnableInterrupts(); + + task_t *self = g_current_task; + + /* + * Build an iretq frame on the current (kernel) stack and enter + * user mode. We reset the stack pointer to the very top of the + * kernel stack first, so the iretq frame doesn't sit below a + * pile of stale context-switch frames.
+ * + * Segment selectors (from your GDT / STAR setup): + * User CS = 0x23 (GDT index 4, RPL 3) + * User SS = 0x1B (GDT index 3, RPL 3) + */ + uint64_t kstack_top = (uint64_t)self->kernel_stack + self->kernel_stack_size; + uint64_t user_rip = self->user_entry; + uint64_t user_rsp = self->user_stack_top; + + asm volatile( + "movq %0, %%rsp\n\t" /* Reset kernel RSP to stack top */ + "pushq $0x1B\n\t" /* SS – user data segment */ + "pushq %1\n\t" /* RSP – user stack pointer */ + "pushfq\n\t" /* RFLAGS */ + "orq $0x200, (%%rsp)\n\t" /* Set IF so user code runs with */ + /* interrupts enabled */ + "pushq $0x23\n\t" /* CS – user code segment */ + "pushq %2\n\t" /* RIP – user entry point */ + "iretq\n\t" + : + : "r"(kstack_top), "r"(user_rsp), "r"(user_rip) + : "memory" + ); + + __builtin_unreachable(); +} + +/* ===================================================================== + * Kernel stack setup for a new task + * + * Layout, high → low: [trampoline_addr] [rbx=0] [rbp=0] [r12=0] [r13=0] + * [r14=0] [r15=0] ← ctx.rsp points here + * + * sched_context_switch pops in order: r15, r14, r13, r12, rbp, rbx, + * then ret → trampoline.
+ * ===================================================================== */ +static void setup_initial_kstack(task_t *task, void *trampoline) { + uint64_t *sp = (uint64_t *)((uint8_t *)task->kernel_stack + + task->kernel_stack_size); + + *--sp = (uint64_t)trampoline; /* "return address" → trampoline */ + *--sp = 0; /* rbx */ + *--sp = 0; /* rbp */ + *--sp = 0; /* r12 */ + *--sp = 0; /* r13 */ + *--sp = 0; /* r14 */ + *--sp = 0; /* r15 */ + + task->ctx.rsp = (uint64_t)sp; +} + +/* ===================================================================== + * Task allocation helper + * ===================================================================== */ +static task_t *alloc_task(const char *name, bool is_user) { + task_t *task = kmalloc(sizeof(task_t)); + if (!task) return NULL; + memset(task, 0, sizeof(task_t)); + + strncpy(task->name, name, sizeof(task->name) - 1); + task->is_user = is_user; + task->state = TASK_RUNNING; + task->policy = SCHED_NORMAL; + task->nice = NICE_DEFAULT; + task->static_prio = NICE_TO_PRIO(NICE_DEFAULT); + task->prio = task->static_prio; + task->pid = alloc_pid(); + task->ppid = g_current_task ?
g_current_task->pid : 0; + task->parent = g_current_task; + + /* Default: all signals use SIG_DFL */ + for (int i = 0; i < _NSIG; i++) + task->sigactions[i].sa_handler = SIG_DFL; + + /* Allocate kernel stack */ + task->kernel_stack_size = KSTACK_SIZE; + task->kernel_stack = kmalloc(KSTACK_SIZE); + if (!task->kernel_stack) { + kfree(task); + return NULL; + } + memset(task->kernel_stack, 0xCC, KSTACK_SIZE); /* poison */ + + return task; +} + +/* ===================================================================== + * Public task-creation API + * ===================================================================== */ +task_t *sched_create_kthread(const char *name, + void (*entry)(void *), void *arg) +{ + task_t *task = alloc_task(name, false); + if (!task) return NULL; + + task->kthread_entry = entry; + task->kthread_arg = arg; + task->ctx.cr3 = 0; /* Kernel threads share kernel_pagemap */ + + setup_initial_kstack(task, kthread_trampoline); + + task->time_slice = calc_timeslice(task); + task->vruntime = g_runqueue.min_vruntime; + + sched_enqueue(task); + printf("[sched] kthread '%s' pid=%d created\n", task->name, task->pid); + return task; +} + +task_t *sched_create_user_task(const char *name, + uint64_t entry_rip, uint64_t user_rsp, + struct pagemap *pm) +{ + task_t *task = alloc_task(name, true); + if (!task) return NULL; + + task->pagemap = pm; + task->user_entry = entry_rip; + task->user_stack_top= user_rsp; + + /* CR3 = physical address of PML4 */ + task->ctx.cr3 = (uint64_t)pm->top_level - MEM_PHYS_OFFSET; + + setup_initial_kstack(task, user_task_trampoline); + + task->time_slice = calc_timeslice(task); + task->vruntime = g_runqueue.min_vruntime; + + sched_enqueue(task); + printf("[sched] user task '%s' pid=%d created, entry=0x%lx\n", + task->name, task->pid, entry_rip); + return task; +} + +/* ===================================================================== + * Idle task + * ===================================================================== */ +static 
void idle_entry(void *arg) { + (void)arg; + for (;;) { + asm volatile("sti; hlt; cli" ::: "memory"); + /* If we get here, an interrupt fired; schedule() will be + * called by sched_tick if a real task became runnable. */ + } +} + +/* ===================================================================== + * sched_init — set up the idle task and initialise the run queue + * ===================================================================== */ +void sched_init(void) { + spinlock_init(&g_runqueue.lock); + g_runqueue.min_vruntime = 0; + + /* Synthesise a "current" descriptor for the boot thread so that + * the first schedule() call has something valid in g_current_task. */ + task_t *boot = kmalloc(sizeof(task_t)); + if (!boot) { + printf("[sched] FATAL: cannot allocate boot task\n"); + while (1) asm volatile("hlt"); + } + memset(boot, 0, sizeof(task_t)); + strncpy(boot->name, "boot", 63); + boot->pid = alloc_pid(); /* pid 1 */ + boot->state = TASK_RUNNING; + boot->policy = SCHED_NORMAL; + boot->nice = NICE_DEFAULT; + boot->static_prio = NICE_TO_PRIO(NICE_DEFAULT); + boot->prio = boot->static_prio; + boot->time_slice = calc_timeslice(boot); + boot->slice_start = g_Ticks; + /* kernel_stack: we're already running on it; RSP will be saved by + * the first sched_context_switch() call, so no setup needed. */ + boot->is_user = false; + + g_current_task = boot; + g_runqueue.current = boot; + + /* Create the idle task but do NOT go through sched_create_kthread + * because we store it separately (not on the CFS/RT queues). 
*/ + task_t *idle = alloc_task("idle", false); + if (!idle) { + printf("[sched] FATAL: cannot allocate idle task\n"); + while (1) asm volatile("hlt"); + } + idle->policy = SCHED_IDLE; + idle->static_prio = IDLE_PRIO; + idle->prio = IDLE_PRIO; + idle->kthread_entry = idle_entry; + idle->kthread_arg = NULL; + idle->ctx.cr3 = 0; + setup_initial_kstack(idle, kthread_trampoline); + + g_runqueue.idle = idle; + + printf("[sched] initialised; boot pid=%d\n", boot->pid); +} + +/* ===================================================================== + * schedule() — the heart of the scheduler + * + * Must be called with interrupts disabled (IF=0). Restores IF when + * the scheduled-in task next runs (via its own stack context or via + * the kthread_trampoline which calls x86_64_EnableInterrupts). + * ===================================================================== */ +void schedule(void) { + /* + * We deliberately do NOT use spinlock_acquire_irqsave here because + * we're already called with IF=0 (either from an ISR or from + * sched_yield/sched_block which do cli first). + * We use a plain spinlock_acquire_or_wait so that on SMP (future) + * another CPU spinning on the lock eventually gets it. + */ + spinlock_acquire_or_wait(&g_runqueue.lock); + + task_t *prev = g_runqueue.current; + task_t *next = pick_next_task(&g_runqueue); + + if (next == prev || next == NULL) { + /* Nothing to switch to; keep running current task. */ + spinlock_drop(&g_runqueue.lock); + return; + } + + /* Account for time the current task actually ran */ + uint64_t now = g_Ticks; + uint64_t elapsed = (prev->slice_start <= now) + ? (now - prev->slice_start) + : 0; + update_vruntime(prev, elapsed); + + /* Re-enqueue the outgoing task if it is still runnable (preempted). + * If it blocked/exited, its state is no longer TASK_RUNNING. 
*/ + if (prev->state == TASK_RUNNING && prev != g_runqueue.idle) { + prev->time_slice = calc_timeslice(prev); /* refresh slice */ + + if (prev->policy == SCHED_FIFO || prev->policy == SCHED_RR) { + rt_enqueue(&g_runqueue, prev); + } else if (prev->policy != SCHED_IDLE) { + cfs_enqueue(&g_runqueue, prev); + } + } + + /* Set up the incoming task */ + next->state = TASK_RUNNING; + next->need_reschedule = false; + next->slice_start = now; + + g_runqueue.current = next; + g_current_task = next; + g_runqueue.nr_switches++; + + /* Update TSS.RSP0 so that user-mode interrupts for this task use + * the correct kernel stack. */ + if (next->is_user && next->kernel_stack) { + kernel_tss.rsp0 = (uint64_t)next->kernel_stack + + next->kernel_stack_size; + } + + spinlock_drop(&g_runqueue.lock); + + /* ---- Context switch -------------------------------------------- */ + sched_context_switch(&prev->ctx, &next->ctx); + + /* + * When we return here we are BACK in the context of `prev` + * (which has just been rescheduled). Process any pending signals + * before returning to user space. + */ + task_handle_pending_signals(); +} + +/* ===================================================================== + * sched_tick() — called from the PIT/LAPIC IRQ every millisecond + * ===================================================================== */ +void sched_tick(void) { + task_t *cur = g_current_task; + if (!cur) return; + + /* Decrement remaining timeslice */ + if (cur->time_slice > 0) + cur->time_slice--; + + /* + * Trigger a reschedule if: + * (a) The timeslice ran out, or + * (b) need_reschedule was set by a wakeup of a higher-prio task. + */ + if (cur->time_slice == 0 || cur->need_reschedule) { + /* schedule() expects IF=0 — guaranteed here because we are + * inside an IRQ handler; the CPU cleared IF on entry. 
*/ + schedule(); + } +} + +/* ===================================================================== + * sched_yield() — voluntary CPU release + * ===================================================================== */ +void sched_yield(void) { + x86_64_DisableInterrupts(); + g_current_task->time_slice = 0; /* Force preemption */ + schedule(); + x86_64_EnableInterrupts(); +} + +/* ===================================================================== + * sched_block() — put current task to sleep + * + * Caller must set the task state BEFORE calling (the function + * honours whatever state is already set). Alternatively pass the + * desired reason and we set it here. + * ===================================================================== */ +void sched_block(task_state_t reason) { + x86_64_DisableInterrupts(); + g_current_task->state = reason; + schedule(); + x86_64_EnableInterrupts(); + /* When we return here the task has been woken up. */ +} + +/* ===================================================================== + * sched_wake() — wake a sleeping task + * Safe to call from interrupt context. + * ===================================================================== */ +void sched_wake(task_t *task) { + if (!task) return; + + uint64_t flags; + spinlock_acquire_irqsave(&g_runqueue.lock, &flags); + + if (task->state != TASK_RUNNING) { + task->state = TASK_RUNNING; + task->time_slice = calc_timeslice(task); + + if (task->policy == SCHED_FIFO || task->policy == SCHED_RR) { + rt_enqueue(&g_runqueue, task); + } else if (task->policy != SCHED_IDLE) { + cfs_enqueue(&g_runqueue, task); + } + + /* + * Preempt the current task if the woken task has strictly + * higher priority (lower numeric priority value). 
+ */ + task_t *cur = g_runqueue.current; + if (cur && task->prio < cur->prio) { + cur->need_reschedule = true; + } + } + + spinlock_release_irqrestore(&g_runqueue.lock, flags); +} + +/* ===================================================================== + * sched_exit() — terminate the current task (noreturn) + * ===================================================================== */ +void sched_exit(int exit_code) { + x86_64_DisableInterrupts(); + + task_t *self = g_current_task; + self->exit_code = exit_code; + self->state = TASK_ZOMBIE; + + /* Notify parent (send SIGCHLD) */ + if (self->parent) + task_send_signal(self->parent, SIGCHLD); + + printf("[sched] task '%s' pid=%d exited with code %d\n", + self->name, self->pid, exit_code); + + /* Hand off to someone else; we will never return. */ + schedule(); + + /* schedule() should never return to a ZOMBIE task, but just in case: */ + for (;;) asm volatile("hlt"); + __builtin_unreachable(); +} + +/* ===================================================================== + * Signal delivery + * ===================================================================== */ + +/* Default signal actions */ +typedef enum { SIG_ACTION_TERM, SIG_ACTION_CORE, SIG_ACTION_IGN, + SIG_ACTION_STOP, SIG_ACTION_CONT } sig_default_action_t; + +static sig_default_action_t default_action(int signum) { + switch (signum) { + case SIGHUP: case SIGINT: case SIGKILL: case SIGPIPE: + case SIGALRM: case SIGTERM: case SIGUSR1: case SIGUSR2: + case SIGPROF: case SIGVTALRM: case SIGSTKFLT: + return SIG_ACTION_TERM; + case SIGQUIT: case SIGILL: case SIGABRT: case SIGFPE: + case SIGSEGV: case SIGBUS: case SIGSYS: case SIGTRAP: + case SIGXCPU: case SIGXFSZ: + return SIG_ACTION_CORE; /* we treat CORE same as TERM for now */ + case SIGCHLD: case SIGURG: case SIGWINCH: case SIGIO: case SIGPWR: + return SIG_ACTION_IGN; + case SIGSTOP: case SIGTSTP: case SIGTTIN: case SIGTTOU: + return SIG_ACTION_STOP; + case SIGCONT: + return SIG_ACTION_CONT; + default: + 
return SIG_ACTION_TERM; + } +} + +int task_send_signal(task_t *task, int signum) { + if (!task) return -1; + if (signum <= 0 || signum >= _NSIG) return -1; + + uint64_t flags; + spinlock_acquire_irqsave(&g_runqueue.lock, &flags); + + /* Set pending bit */ + task->pending_signals |= (1ULL << signum); + + /* SIGKILL and SIGCONT always wake the target */ + if (signum == SIGKILL || signum == SIGCONT) { + if (task->state == TASK_INTERRUPTIBLE || + task->state == TASK_STOPPED || + task->state == TASK_UNINTERRUPTIBLE) { + task->state = TASK_RUNNING; + /* Re-enqueue (simplified: call rt/cfs directly since lock held) */ + if (task->policy == SCHED_FIFO || task->policy == SCHED_RR) + rt_enqueue(&g_runqueue, task); + else if (task->policy != SCHED_IDLE) + cfs_enqueue(&g_runqueue, task); + } + } else if (!(task->signal_mask & (1ULL << signum))) { + /* Unblocked signal: wake an interruptible sleeper */ + if (task->state == TASK_INTERRUPTIBLE) { + task->state = TASK_RUNNING; + if (task->policy == SCHED_FIFO || task->policy == SCHED_RR) + rt_enqueue(&g_runqueue, task); + else if (task->policy != SCHED_IDLE) + cfs_enqueue(&g_runqueue, task); + } + } + + spinlock_release_irqrestore(&g_runqueue.lock, flags); + return 0; +} + +/* + * Handle pending signals for the current task. + * Called just before returning to user space (end of schedule(), syscall + * return path, or end of IRQ handler for user-mode tasks). 
+ */ +void task_handle_pending_signals(void) { + task_t *self = g_current_task; + if (!self) return; + + while (self->pending_signals & ~self->signal_mask) { + /* Find the lowest-numbered pending, unblocked signal */ + uint64_t deliverable = self->pending_signals & ~self->signal_mask; + int signum = __builtin_ctzll(deliverable) + 1; /* +1: bit 0 = sig 1 */ + if (signum >= _NSIG) break; + + /* Clear the pending bit */ + self->pending_signals &= ~(1ULL << (signum - 1)); + + sighandler_t handler = self->sigactions[signum].sa_handler; + + if (handler == SIG_IGN) { + /* Explicitly ignored */ + if (signum == SIGCHLD) continue; /* common: reap silently */ + continue; + + } else if (handler != SIG_DFL) { + /* + * User-defined handler. + * + * A full POSIX implementation would build a signal frame on + * the user stack and set registers so that iretq delivers + * the signal; that requires knowing the saved RFLAGS/RIP + * from the ISR frame. We leave this as a TODO and just + * call the handler directly for kernel threads. + * + * For user tasks this is the point where you would push a + * ucontext_t / sigframe onto the user stack and adjust the + * saved user RIP in the ISR frame. 
+ */ + if (!self->is_user) { + handler(signum); + } else { + /* TODO: build user-space signal frame */ + printf("[signal] TODO: deliver signal %d to user task '%s'\n", + signum, self->name); + } + + } else { + /* SIG_DFL */ + switch (default_action(signum)) { + case SIG_ACTION_TERM: + case SIG_ACTION_CORE: + printf("[signal] task '%s' pid=%d killed by signal %d\n", + self->name, self->pid, signum); + sched_exit(128 + signum); + break; /* unreachable */ + + case SIG_ACTION_STOP: + self->state = TASK_STOPPED; + /* Notify parent */ + if (self->parent) task_send_signal(self->parent, SIGCHLD); + sched_block(TASK_STOPPED); + break; + + case SIG_ACTION_CONT: + /* Already running (we were woken to handle this) */ + break; + + case SIG_ACTION_IGN: + break; + } + } + } +} + +/* ===================================================================== + * sched_find_task (linear scan — O(n), suitable for small task counts) + * ===================================================================== */ +task_t *sched_find_task(pid_t pid) { + /* + * Walk the CFS list and RT queues. In a production kernel this + * would be a hash table. For KirkOS this is fine. 
+ */ + task_t *t = g_runqueue.cfs_head; + while (t) { + if (t->pid == pid) return t; + t = t->rq_next; + } + for (int p = RT_PRIO_MIN; p <= RT_PRIO_MAX; p++) { + t = g_runqueue.rt.head[p]; + while (t) { + if (t->pid == pid) return t; + t = t->rq_next; + } + } + if (g_runqueue.current && g_runqueue.current->pid == pid) + return g_runqueue.current; + return NULL; +} + +/* ===================================================================== + * Priority / scheduler controls + * ===================================================================== */ +int task_set_nice(task_t *task, int nice) { + if (nice < NICE_MIN) nice = NICE_MIN; + if (nice > NICE_MAX) nice = NICE_MAX; + + int old_nice = task->nice; + + task->nice = nice; + task->static_prio = NICE_TO_PRIO(nice); + task->prio = task->static_prio; + + /* + * Recompute the timeslice. If the task is currently on a queue we + * would need to re-sort it (out of scope here — next schedule() will + * pick the right slot when it re-enqueues). + */ + task->time_slice = calc_timeslice(task); + + return old_nice; +} + +int task_set_scheduler(task_t *task, int policy, int rt_prio) { + if (policy != SCHED_NORMAL && policy != SCHED_FIFO && + policy != SCHED_RR && policy != SCHED_BATCH && + policy != SCHED_IDLE) + return -1; + + if ((policy == SCHED_FIFO || policy == SCHED_RR) && + (rt_prio < RT_PRIO_MIN || rt_prio > RT_PRIO_MAX)) + return -1; + + /* Remove from current queue, change policy, re-enqueue */ + bool was_queued = (task->state == TASK_RUNNING && + task != g_runqueue.current); + if (was_queued) + sched_dequeue(task); + + task->policy = policy; + task->static_prio = (policy == SCHED_FIFO || policy == SCHED_RR) + ? 
rt_prio + : NICE_TO_PRIO(task->nice); + task->prio = task->static_prio; + task->time_slice = calc_timeslice(task); + + if (was_queued) + sched_enqueue(task); + + return 0; +} \ No newline at end of file diff --git a/src/sched/scheduler.h b/src/sched/scheduler.h new file mode 100644 index 0000000..dd2fd28 --- /dev/null +++ b/src/sched/scheduler.h @@ -0,0 +1,312 @@ +#pragma once +#include +#include +#include +#include "mm/vmm.h" +#include "mp/spinlock.h" + +/* ===================================================================== + * POSIX signal numbers + * ===================================================================== */ +#define SIGHUP 1 +#define SIGINT 2 +#define SIGQUIT 3 +#define SIGILL 4 +#define SIGTRAP 5 +#define SIGABRT 6 +#define SIGBUS 7 +#define SIGFPE 8 +#define SIGKILL 9 /* cannot be caught or ignored */ +#define SIGUSR1 10 +#define SIGSEGV 11 +#define SIGUSR2 12 +#define SIGPIPE 13 +#define SIGALRM 14 +#define SIGTERM 15 +#define SIGSTKFLT 16 +#define SIGCHLD 17 +#define SIGCONT 18 +#define SIGSTOP 19 /* cannot be caught or ignored */ +#define SIGTSTP 20 +#define SIGTTIN 21 +#define SIGTTOU 22 +#define SIGURG 23 +#define SIGXCPU 24 +#define SIGXFSZ 25 +#define SIGVTALRM 26 +#define SIGPROF 27 +#define SIGWINCH 28 +#define SIGIO 29 +#define SIGPWR 30 +#define SIGSYS 31 +#define _NSIG 32 + +typedef void (*sighandler_t)(int signum); +#define SIG_DFL ((sighandler_t)0) /* default action */ +#define SIG_IGN ((sighandler_t)1) /* ignore signal */ +#define SIG_ERR ((sighandler_t)-1) /* error return */ + +#define SA_NOCLDSTOP 0x00000001 +#define SA_NOCLDWAIT 0x00000002 +#define SA_SIGINFO 0x00000004 +#define SA_RESTORER 0x04000000 +#define SA_ONSTACK 0x08000000 +#define SA_RESTART 0x10000000 +#define SA_NODEFER 0x40000000 +#define SA_RESETHAND 0x80000000 + +struct sigaction { + sighandler_t sa_handler; + uint64_t sa_mask; /* signals blocked while handler runs */ + int sa_flags; +}; + +/* 
/* =====================================================================
 * Scheduling policies (POSIX)
 *
 * Policy identifiers stored in task.policy.
 * ===================================================================== */
#define SCHED_NORMAL 0 /* Fair time-sharing (CFS-like, nice values) */
#define SCHED_FIFO 1 /* Real-time FIFO – runs until yield or block */
#define SCHED_RR 2 /* Real-time round-robin with fixed timeslice */
#define SCHED_BATCH 3 /* CPU-bound variant of NORMAL, no preemption boost */
#define SCHED_IDLE 5 /* Only runs when nothing else is runnable */

/* Priority ranges (a numerically lower value means a higher priority):
 *   RT tasks:     static_prio   1 ..  99  (1 = highest)
 *   Normal tasks: static_prio 100 .. 139  (maps from nice -20 .. +19)
 *   Idle:         static_prio 140
 */
#define MAX_RT_PRIO 100
#define MAX_PRIO 140
#define RT_PRIO_MIN 1
#define RT_PRIO_MAX 99
#define NICE_MIN (-20)
#define NICE_MAX 19
#define NICE_DEFAULT 0
#define IDLE_PRIO 140

/* nice ↔ static_prio conversions for SCHED_NORMAL:
 * nice -20 → 100, nice 0 → 120, nice +19 → 139 */
#define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20)
#define PRIO_TO_NICE(p) ((p) - MAX_RT_PRIO - 20)

/* Timeslice constants (ticks, PIT at 1000 Hz → 1 tick = 1 ms) */
#define SCHED_BASE_SLICE_MS 10 /* base timeslice for NICE_DEFAULT */
#define SCHED_MIN_SLICE_MS 1 /* minimum timeslice (1 ms) */
#define SCHED_MAX_SLICE_MS 100 /* maximum timeslice (100 ms) */
#define SCHED_RR_SLICE_MS 10 /* fixed timeslice for SCHED_RR */

/* =====================================================================
 * Task states
 *
 * Stored in task.state.  TASK_RUNNING means "runnable", not necessarily
 * "on the CPU right now".
 * ===================================================================== */
typedef enum task_state {
    TASK_RUNNING = 0, /* Runnable – on a run queue or executing */
    TASK_INTERRUPTIBLE = 1, /* Sleeping, can be woken by signal */
    TASK_UNINTERRUPTIBLE = 2, /* Sleeping, ignores signals (D state) */
    TASK_STOPPED = 4, /* Halted by SIGSTOP / SIGTSTP */
    TASK_ZOMBIE = 8, /* Exited, waiting for parent to wait() */
    TASK_DEAD = 16, /* Fully reaped, memory can be freed */
} task_state_t;
/* =====================================================================
 * Minimal CPU context
 *
 * Only RSP and CR3 live here; all callee-saved GPRs are pushed onto
 * the kernel stack by sched_context_switch() before RSP is saved.
 * This keeps the struct tiny and the assembly dead simple.
 * ===================================================================== */
struct cpu_context {
    uint64_t rsp; /* Saved kernel stack pointer */
    uint64_t cr3; /* Physical address of PML4 (0 = stay on kernel map) */
};

/* =====================================================================
 * Task / Process descriptor
 *
 * One instance per kernel thread or user process.
 * ===================================================================== */
typedef int pid_t;
typedef struct task task_t;

struct task {
    /* ---- CPU context (must stay first – asm references it at offset 0) */
    struct cpu_context ctx;

    /* ---- Identity ---------------------------------------------------- */
    pid_t pid;
    pid_t ppid;
    char name[64];
    bool is_user; /* true = user process, false = kernel thread */

    /* ---- Scheduling policy and priority ------------------------------ */
    int policy; /* SCHED_NORMAL / SCHED_FIFO / SCHED_RR / … */
    int static_prio; /* Base priority; rewritten by task_set_nice() and
                      * task_set_scheduler() */
    int nice; /* -20 .. +19, SCHED_NORMAL only */
    int prio; /* Effective priority; lower value = higher priority */

    /* ---- State ------------------------------------------------------- */
    volatile task_state_t state;
    bool need_reschedule; /* Set when a higher-priority task wakes up */

    /* ---- Time accounting (ticks, 1 tick = 1 ms at 1000 Hz PIT) ------- */
    uint64_t vruntime; /* Virtual runtime (tick-equivalents, weighted) */
    uint64_t sum_exec_runtime; /* Total CPU time consumed (raw ticks) */
    uint64_t time_slice; /* Remaining timeslice (ticks) */
    uint64_t slice_start; /* Tick when the current slice began */

    /* ---- Memory ------------------------------------------------------ */
    struct pagemap *pagemap; /* NULL → use kernel_pagemap */
    void *kernel_stack; /* Pointer to bottom of kernel-stack alloc */
    size_t kernel_stack_size;

    /* ---- Entry points ------------------------------------------------ */
    uint64_t user_entry; /* User-space RIP for user tasks */
    uint64_t user_stack_top; /* User-space RSP for user tasks */
    void (*kthread_entry)(void *arg); /* Kernel thread entry point */
    void *kthread_arg;

    /* ---- Signals ----------------------------------------------------- */
    /* Bitmasks; task_send_signal() sets bit N for signal N. */
    uint64_t pending_signals; /* Bitmask of unhandled signals */
    uint64_t signal_mask; /* Blocked (SIG_BLOCK) signals */
    struct sigaction sigactions[_NSIG]; /* Indexed by signal number */

    /* ---- Exit status ------------------------------------------------- */
    int exit_code;

    /* ---- Run-queue linkage (doubly-linked, intrusive) ---------------- */
    task_t *rq_next;
    task_t *rq_prev;

    /* ---- Process tree ------------------------------------------------ */
    task_t *parent;
    task_t *first_child;
    task_t *next_sibling;
};
/* =====================================================================
 * Run queue
 *
 * Two sub-queues per CPU (single CPU for now, MP-ready by design):
 *
 *   1. RT array  – FIFO lists indexed by RT priority (1..99).  Highest
 *                  priority with a runnable task is O(1) via bitmap.
 *
 *   2. CFS list  – Tasks sorted by vruntime (ascending).  Pick-next
 *                  is O(1) (front of list); insert is O(n) – good
 *                  enough for now, swap in an rb-tree later.
 *
 * Idle task is stored separately and is returned only when both
 * sub-queues are empty.
 * ===================================================================== */
#define RT_QUEUE_LEVELS MAX_RT_PRIO /* 100 levels (index 0 = unused, 1–99 used) */

struct rt_prio_array {
    /*
     * Bitmap: bit N is set ↔ the list at head[N] is non-empty.
     * Two 64-bit words cover 128 bits, enough for 100 levels.
     */
    uint64_t bitmap[2];
    task_t *head[RT_QUEUE_LEVELS]; /* FIFO queue heads */
    task_t *tail[RT_QUEUE_LEVELS]; /* FIFO queue tails */
    int total; /* Total RT tasks enqueued */
};

struct runqueue {
    spinlock_t lock; /* Taken by schedule(), sched_wake() and
                      * task_send_signal() */

    /* Real-time (SCHED_FIFO / SCHED_RR) */
    struct rt_prio_array rt;

    /* Normal (SCHED_NORMAL / SCHED_BATCH) — sorted ascending by vruntime */
    task_t *cfs_head;
    int cfs_count;
    uint64_t min_vruntime; /* Lower bound; new tasks start from here */

    /* Idle fallback (SCHED_IDLE) */
    task_t *idle;

    /* Currently executing task on this CPU */
    task_t *current;

    /* Statistics */
    uint64_t nr_switches; /* Incremented by schedule() on every switch */
    uint64_t nr_running; /* Total runnable tasks (all classes) */
};

/* =====================================================================
 * Globals (single-CPU; extend to per-cpu array for SMP)
 * ===================================================================== */
extern struct runqueue g_runqueue;
extern task_t *g_current_task; /* Pointer to currently-running task */
/* Create a user-space task and enqueue it immediately */
task_t *sched_create_user_task(const char *name,
                               uint64_t entry_rip, uint64_t user_rsp,
                               struct pagemap *pm);

/* Add a task to the appropriate run queue */
void sched_enqueue(task_t *task);

/* Remove a task from its run queue (does NOT free it) */
void sched_dequeue(task_t *task);

/* Pick next task and perform context switch (call with IF=0) */
void schedule(void);

/* Called from the PIT/LAPIC timer IRQ every tick (1 ms) */
void sched_tick(void);

/* Voluntarily give up the CPU (interrupts are enabled on return) */
void sched_yield(void);

/* Block the current task in state `reason`.  Disables interrupts itself
 * before switching away and enables them again once the task resumes. */
void sched_block(task_state_t reason);

/* Wake a sleeping task (safe to call from IRQ context) */
void sched_wake(task_t *task);

/* Terminate the current task (noreturn) */
void sched_exit(int exit_code) __attribute__((noreturn));

/* ---- Signal API ------------------------------------------------------- */

/* Send signal signum to task (safe from any context);
 * returns 0 on success, -1 on a NULL task or invalid signal number */
int task_send_signal(task_t *task, int signum);

/* Find a task by PID (NULL if not found).  Only the run queues and the
 * current task are searched, so a sleeping task is not found. */
task_t *sched_find_task(pid_t pid);

/* Process pending signals for the current task (call before returning to user) */
void task_handle_pending_signals(void);

/* ---- Priority / policy control --------------------------------------- */

/* Set nice value, clamped to [-20, +19]; returns old nice */
int task_set_nice(task_t *task, int nice);

/* Change scheduling policy + RT priority; returns 0 on success, -1 on an
 * unknown policy or out-of-range RT priority */
int task_set_scheduler(task_t *task, int policy, int rt_prio);

/* ---- Convenience ----------------------------------------------------- */
/* Returns the task currently running on this CPU (g_current_task). */
static inline task_t *sched_current(void) { return g_current_task; }

/* ---- Assembly context switch (defined in context_switch.S) ----------- */
record RSP in *from. Then switch to *to's stack, restore + * its callee-saved registers, and return — which resumes wherever *to + * last called schedule(). For first-time tasks the "return" jumps to + * the appropriate trampoline. + */ +void sched_context_switch(struct cpu_context *from, + struct cpu_context *to); + +/* Kernel stack size for each task */ +#define KSTACK_SIZE (32 * 1024) /* 32 KiB — comfortable headroom */ \ No newline at end of file diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index 198622f..fe6ddca 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -3,6 +3,7 @@ #include "mp/percpu.h" #include "fs/vfs.h" #include "syscall.h" +#include "sched/scheduler.h" #define MSR_EFER 0xC0000080 #define MSR_STAR 0xC0000081 @@ -41,7 +42,7 @@ uint64_t syscall_handler(uint64_t num, uint8_t* buf = (uint8_t*)arg2; size_t len = (size_t)arg3; - return (uint64_t)VFS_Read(fd, buf, len); + return VFS_Read(fd, buf, len); } case SYS_WRITE: @@ -65,6 +66,99 @@ uint64_t syscall_handler(uint64_t num, return (uint64_t)VFS_Close(fd); } + case SYS_GETPID: + return (uint64_t)sched_current()->pid; + + case SYS_GETPPID: + return (uint64_t)sched_current()->ppid; + + case SYS_EXIT: + case SYS_EXIT_GROUP: + sched_exit((int)arg1); + //noreturn + + case SYS_SCHED_YIELD: + sched_yield(); + return 0; + + case SYS_NICE: + { + int increment = (int)arg1; + int old_nice = sched_current()->nice; + int new_nice = old_nice + increment; + return (uint64_t)task_set_nice(sched_current(), new_nice); + } + + case SYS_KILL: + { + pid_t target = (pid_t)arg1; + int sig = (int)arg2; + task_t *t = sched_find_task(target); + if (!t) return (uint64_t)-1; + return (uint64_t)task_send_signal(t, sig); + } + + case SYS_SIGACTION: + { + int signum = (int)arg1; + const struct sigaction *act = (const struct sigaction *)arg2; + struct sigaction *oact = (struct sigaction *)arg3; + + if (signum <= 0 || signum >= _NSIG) + return (uint64_t)-1; + if (signum == SIGKILL || signum == SIGSTOP) + 
return (uint64_t)-1; // cannot override + + task_t *cur = sched_current(); + if (oact) + *oact = cur->sigactions[signum]; + if (act) + cur->sigactions[signum] = *act; + return 0; + } + + case SYS_SIGPROCMASK: + { + // how: 0=SIG_BLOCK, 1=SIG_UNBLOCK, 2=SIG_SETMASK + int how = (int)arg1; + uint64_t new_set = arg2; + uint64_t *old = (uint64_t *)arg3; + + task_t *cur = sched_current(); + if (old) *old = cur->signal_mask; + + // SIGKILL and SIGSTOP can never be blocked + new_set &= ~((1ULL << SIGKILL) | (1ULL << SIGSTOP)); + + switch (how) { + case 0: cur->signal_mask |= new_set; break; // SIG_BLOCK + case 1: cur->signal_mask &= ~new_set; break; // SIG_UNBLOCK + case 2: cur->signal_mask = new_set; break; // SIG_SETMASK + default: return (uint64_t)-1; + } + return 0; + } + + case SYS_SCHED_GETSCHEDULER: + { + pid_t target = (pid_t)arg1; + task_t *t = target ? sched_find_task(target) + : sched_current(); + if (!t) return (uint64_t)-1; + return (uint64_t)t->policy; + } + + case SYS_SCHED_SETSCHEDULER: + { + pid_t target = (pid_t)arg1; + int policy = (int)arg2; + int rt_prio = (int)arg3; + task_t *t = target ? 
sched_find_task(target) + : sched_current(); + if (!t) return (uint64_t)-1; + return (uint64_t)task_set_scheduler(t, policy, rt_prio); + } + default: return (uint64_t)-1; } diff --git a/src/syscall/syscall.h b/src/syscall/syscall.h index 8e7f4ab..a10f903 100644 --- a/src/syscall/syscall.h +++ b/src/syscall/syscall.h @@ -6,5 +6,17 @@ #define SYS_OPEN 2 #define SYS_CLOSE 3 +#define SYS_SCHED_YIELD 24 +#define SYS_GETPID 39 +#define SYS_GETPPID 110 +#define SYS_NICE 34 +#define SYS_KILL 62 +#define SYS_SIGACTION 13 /* rt_sigaction on Linux */ +#define SYS_SIGPROCMASK 14 /* rt_sigprocmask on Linux */ +#define SYS_EXIT 60 +#define SYS_EXIT_GROUP 231 +#define SYS_SCHED_GETSCHEDULER 138 +#define SYS_SCHED_SETSCHEDULER 139 + void syscall_init(void); \ No newline at end of file diff --git a/src/syscall/syscall_entry.S b/src/syscall/syscall_entry.S index c50b674..0763dda 100644 --- a/src/syscall/syscall_entry.S +++ b/src/syscall/syscall_entry.S @@ -27,7 +27,9 @@ syscall_entry: pop %rsi # rsi = a1 (2nd param) mov %rax, %rdi # rdi = num (1st param) + sub $8, %rsp call syscall_handler + add $8, %rsp # ── Restore user context ─────────────────────────────────────────── pop %r11 # user RFLAGS diff --git a/user/build/init.elf b/user/build/init.elf index f98c8589f2ebfdedfc22f92011d846ad1146ccd4..613b512992b19cca8224e72463b77bd071dfd25b 100755 GIT binary patch delta 498 zcmZ3X*`YN-Lhv(#0|N*^NCt)z6E)3^jxd777+^Gm0EEv1q(Im~5W;t0bO19P7$=^R zo4A2P2vcI?sWkq2$I$%@K&6f$p?jHt8iEaPdvw0==)C9AdCQ~ujeuw8CC|Lt-=zsuw*%x1B*`7jKt4npk|@|{pm|6_yBfd>k9+i%sC0OA`T)&_+u+f8 z6let0s9Q)9VB7YCtb}TNg(Ug<-wTWX|Np=K$1r)dz*LihfB*k$166{63J@CraSu@J zFi60OPoS5{oll~N*^y77jm3>mqnXv8&th_dpvmMJf(neEC+`%LXFNRlrr>+VUz1M? 
z=?j7a2MnO$YBhPGpzvfFVHrVq2t#EIz%nsFnF6Su0;o&~SZ0o}3geN<2Zik!A53Nx Tv1f`9n(QQE$5=eMQA8a8X4#F| delta 407 zcmeCsTA?{XLU0L#0|N*^NCt+CiJIm{35;Md1{lpC0O7L$DG>eu<$r)``@k^ql-$G( z973298&9S2*LyU-5!k^5)VO~GP>10+!%O_E`wjq^9-YT`X#&M}fGp%^-N6i`Akqls z{H(hgfWknf9=#$Ybi^&t2WhNhBT)=pEaw3yFBlF}&CUHq^kOm;&0%8_03qr(g-pEwX1XOCs zYR>p#awe-iWA@~gtoBUpj6ie5Cb4NSc1+e}v*!f65U3|&av_^JlL7PORcv-l0!))% YvDq<3~7!*7O{CSDNn;%D8#4CMluaA|(lT@65C zkIv&By(KCgh6ngr4ZnGI9x%KFRt{9k2$bFrBESlKIuA`wXH=Nn!5AYV02DL;Vh|_+ z5iWcZlNnhgCTlP)U@Vz@l1W~Y3nUB#AR1&n42Vu