sched: Implement basic scheduling and signal handling system

Note: this is probably 25% broken, but it works right now as written, so I hope it all works.

- Added a new scheduler header file (scheduler.h) defining task structures, scheduling policies, and signal handling mechanisms.
- Integrated scheduling functions into the syscall interface, including SYS_GETPID, SYS_GETPPID, SYS_EXIT, SYS_SCHED_YIELD, SYS_NICE, SYS_KILL, SYS_SIGACTION, SYS_SIGPROCMASK, SYS_SCHED_GETSCHEDULER, and SYS_SCHED_SETSCHEDULER.
- Updated syscall handler to manage new scheduling-related syscalls and signal actions.

Signed-off-by: kaguya <vpshinomiya@protonmail.com>
This commit is contained in:
kaguya
2026-04-26 22:46:28 -04:00
parent 336af1c2ad
commit 7d99745ff9
20 changed files with 1561 additions and 53 deletions
+48 -19
View File
@@ -5,24 +5,53 @@
#include "mm/memory.h"
#include "libk/stdio.h"
#include "fs/elf.h"
#include "sched/scheduler.h"
extern uintptr_t g_hhdm_offset;
#define USER_STACK_TOP 0x00007FFFFFFFE000ULL
#define USER_STACK_PAGES 4
#define USER_STACK_PAGES 8
#define USER_STACK_SIZE (USER_STACK_PAGES * PAGE_SIZE)
#define PTE_PRESENT (1ULL << 0)
#define PTE_WRITABLE (1ULL << 1)
#define PTE_USER (1ULL << 2)
static uint64_t user_stack_phys_base = 0;
extern struct pagemap *kernel_pagemap;
//extern struct pagemap *kernel_pagemap;
uintptr_t setup_user_stack(void)
/*
 * Create a fresh address space for a user process.
 *
 * Allocates a pagemap descriptor plus one page for its top-level table
 * (PML4), then copies entries 256..511 from kernel_pagemap so the new
 * address space shares the kernel's higher-half mappings.  Entries 0..255
 * are not written here (presumably zero, since the page comes from
 * pmm_allocz) and are left for the process's own mappings.
 *
 * Returns the new pagemap, or NULL if either allocation fails.  On failure
 * nothing is leaked.
 */
struct pagemap *create_user_pagemap(void)
{
    struct pagemap *pm = kmalloc(sizeof(struct pagemap));
    if (!pm) {
        printf("Failed to allocate user pagemap struct!\n");
        return NULL;
    }

    spinlock_init(&pm->lock);

    /*
     * Allocate a fresh PML4 (physical page).  The result must be tested
     * BEFORE MEM_PHYS_OFFSET is added: once the offset has been applied a
     * failed allocation is no longer zero and the error would go unnoticed,
     * leaving top_level pointing at the start of the direct map.
     */
    uintptr_t pml4_phys = (uintptr_t)pmm_allocz(1);
    if (pml4_phys == 0) {
        printf("Failed to allocate user PML4!\n");
        kfree(pm);
        return NULL;
    }
    pm->top_level = (uint64_t *)(pml4_phys + MEM_PHYS_OFFSET);

    /* Copy kernel higher-half mappings (kernel + HHDM) */
    for (size_t i = 256; i < 512; i++) {
        pm->top_level[i] = kernel_pagemap->top_level[i];
    }

    /* Lower half remains untouched (user address space) */
    printf("[usermode] user pagemap created (PML4 phys = 0x%lx)\n",
           (uint64_t)pm->top_level - MEM_PHYS_OFFSET);

    return pm;
}
uintptr_t setup_user_stack(struct pagemap *pagemap)
{
user_stack_phys_base = (uint64_t)pmm_alloc(USER_STACK_PAGES);
@@ -37,7 +66,7 @@ uintptr_t setup_user_stack(void)
uintptr_t virt = stack_bottom + i * PAGE_SIZE;
uintptr_t phys = user_stack_phys_base + i * PAGE_SIZE;
if (!vmm_map_page(kernel_pagemap,
if (!vmm_map_page(pagemap,
virt,
phys,
PAGE_READ | PAGE_WRITE | PAGE_USER,
@@ -47,19 +76,13 @@ uintptr_t setup_user_stack(void)
for (;;);
}
// zero physical page through HHDM
//memset((void *)(phys + g_hhdm_offset), 0, PAGE_SIZE);
}
uintptr_t rsp = USER_STACK_TOP;
rsp &= ~0xFULL;
return rsp;
}
// usermode.c
__attribute__((naked))
void enter_user_mode(uint64_t rip, uint64_t rsp)
{
@@ -98,21 +121,27 @@ void enter_user_mode(uint64_t rip, uint64_t rsp)
void start_userspace(void)
{
void *entry = NULL;
if (!ELF_Read("init.elf", &entry)) {
struct pagemap *user_pagemap = create_user_pagemap();
if (!user_pagemap) {
printf("Failed to create user pagemap\n");
for (;;);
}
void *elf_entry = NULL;
if (!ELF_Read("init.elf", &elf_entry, user_pagemap)) {
printf("Failed to load init.elf\n");
for(;;);
}
if (!entry) {
if (!elf_entry) {
printf("ELF has no entry point\n");
for(;;);
}
uintptr_t rsp = setup_user_stack();
uintptr_t user_rsp = setup_user_stack(user_pagemap);
printf("Entering usermode RIP=%p RSP=%p\n", entry, (void*)rsp);
printf("Entering usermode RIP=%p RSP=%p\n", elf_entry, (void*)user_rsp);
enter_user_mode((uint64_t)entry, (rsp & ~0xFULL));
sched_create_user_task("init", (uint64_t)elf_entry, user_rsp, user_pagemap);
}
+2 -1
View File
@@ -6,6 +6,7 @@
#include "e9.h"
#include "limine.h"
#include "apic.h"
#include "sched/scheduler.h"
__attribute__((used, section(".limine_requests")))
volatile struct limine_date_at_boot_request boot_request = {
@@ -39,7 +40,7 @@ void PIT_IRQ_Handler(Registers* regs)
lapic_eoi();
}
// You can add scheduler / time logic here later
sched_tick();
}
/* ========================= */
+16 -7
View File
@@ -7,16 +7,13 @@
#include "fs/ext2.h"
extern uintptr_t g_hhdm_offset;
extern struct pagemap *kernel_pagemap;
#define ELF_BUFFER_SIZE (1024 * 1024)
#define PTE_PRESENT (1ULL << 0)
#define PTE_WRITABLE (1ULL << 1)
#define PTE_USER (1ULL << 2)
bool ELF_Read(const char* path, void** entryPoint)
bool ELF_Read(const char* path, void** entryPoint, struct pagemap *target_pagemap)
{
uint32_t size;
@@ -41,6 +38,15 @@ bool ELF_Read(const char* path, void** entryPoint)
ELFHeader* header = (ELFHeader*)elf_buffer;
printf("=== ELF DEBUG ===\n");
printf("Entry point VA = 0x%lx\n", header->ProgramEntryPosition);
printf("PHDR offset = 0x%lx\n", header->ProgramHeaderTablePosition);
printf("PHDR count = %u\n", header->ProgramHeaderTableEntryCount);
printf("=== END ELF DEBUG ===\n");
// ── validate ELF ──────────────────────────────────
if (memcmp(header->Magic, ELF_MAGIC, 4) != 0) {
printf("ELF: bad magic\n");
@@ -82,8 +88,11 @@ bool ELF_Read(const char* path, void** entryPoint)
ELFProgramHeader* ph = (ELFProgramHeader*)(ph_table +
i * header->ProgramHeaderTableEntrySize);
if (ph->Type != ELF_PROGRAM_TYPE_LOAD)
if (ph->Type != ELF_PROGRAM_TYPE_LOAD) {
printf("LOAD segment: VA=0x%lx FileSz=0x%lx MemSz=0x%lx\n",
ph->VirtualAddress, ph->FileSize, ph->MemorySize);
continue;
}
uint64_t virt = ph->VirtualAddress;
uint64_t offset = ph->Offset;
@@ -114,7 +123,7 @@ bool ELF_Read(const char* path, void** entryPoint)
uint64_t phys_addr = phys_base + p * PAGE_SIZE;
bool success = vmm_map_page(
kernel_pagemap,
target_pagemap,
virt_addr,
phys_addr,
PAGE_READ | PAGE_WRITE | PAGE_USER, // RW + User mode
+2 -1
View File
@@ -1,6 +1,7 @@
#pragma once
#include <stdint.h>
#include <stdbool.h>
#include "mm/vmm.h"
#define ELF_MAGIC ("\x7F" "ELF")
@@ -114,4 +115,4 @@ enum ELFProgramType {
};
bool ELF_Read(const char* path, void** entryPoint);
bool ELF_Read(const char* path, void** entryPoint, struct pagemap *target_pagemap);
+8 -2
View File
@@ -131,13 +131,19 @@ int VFS_Read_internal(fd_t fd, uint8_t* buf, size_t size)
// naive: read whole file then slice
uint8_t* tmp = kmalloc(file_size);
if (!ext2_read_file(&file->ext2.inode, tmp))
if (!tmp) {
return -1;
}
if (!ext2_read_file(&file->ext2.inode, tmp)) {
kfree(tmp);
return -1;
}
for (size_t i = 0; i < size; i++)
buf[i] = tmp[file->offset + i];
file->offset += size;
kfree(tmp);
return size;
}
+15 -3
View File
@@ -31,6 +31,7 @@
#include "arch/x86_64/sys/apic.h"
#include "arch/x86_64/sys/ioapic.h"
#include "drivers/input/ps2.h"
#include "sched/scheduler.h"
uintptr_t g_hhdm_offset;
@@ -128,7 +129,7 @@ static uacpi_interrupt_ret handle_power_button(uacpi_handle ctx) {
void kmain(void) {
if (LIMINE_BASE_REVISION_SUPPORTED(limine_base_revision) == false) {
if (LIMINE_BASE_REVISION_SUPPORTED(limine_base_revision) == false) {
hcf();
}
@@ -350,8 +351,19 @@ void kmain(void) {
syscall_init();
sched_init();
start_userspace();
sched_yield();
for (;;) {
sched_yield();
}
// We're done, just hang...
hcf();
}
//hcf();
}
+7 -1
View File
@@ -23,6 +23,10 @@ extern volatile struct limine_executable_address_request kernel_address_request;
#define PAGE_USER (1ULL << 2)
#define PAGE_NO_EXECUTE (1ULL << 63)
#define PTE_PRESENT (1ULL << 0)
#define PTE_WRITABLE (1ULL << 1)
#define PTE_USER (1ULL << 2)
struct pagemap {
spinlock_t lock;
uint64_t *top_level;
@@ -36,4 +40,6 @@ void vmm_init(struct limine_memmap_entry **memmap, size_t memmap_entries);
void vmm_switch_pagemap(struct pagemap *pagemap);
bool vmm_map_page(struct pagemap *pagemap, uint64_t virt, uint64_t phys,
uint64_t flags, enum page_size pg_size);
uint64_t vmm_virt_to_phys(struct pagemap *pagemap, uint64_t virt);
uint64_t vmm_virt_to_phys(struct pagemap *pagemap, uint64_t virt);
uint64_t *vmm_virt_to_pte(struct pagemap *pagemap, uintptr_t virt_addr,
bool allocate);
+7 -3
View File
@@ -3,7 +3,11 @@
#include <stdint.h>
struct cpu_local {
uint64_t self; // +0x00 (GS:0x00)
uint64_t user_rsp; // +0x08 (GS:0x08) — saved user RSP on syscall entry
uint64_t kernel_rsp; // +0x10 (GS:0x10) — kernel stack for syscall handler
uint64_t self; /* +0x00 (GS:0x00) — points to this struct */
uint64_t user_rsp; /* +0x08 (GS:0x08) — saved user RSP on SYSCALL */
uint64_t kernel_rsp; /* +0x10 (GS:0x10) — kernel stack for syscall */
/* Future SMP fields: */
uint32_t cpu_id; /* +0x18 logical CPU index */
uint32_t _pad;
void *current; /* +0x20 pointer to current task_t */
};
+75
View File
@@ -0,0 +1,75 @@
# void sched_context_switch(struct cpu_context *from,
# struct cpu_context *to);
#
# struct cpu_context layout (from scheduler.h):
# offset 0 : rsp (uint64_t)
# offset 8 : cr3 (uint64_t)
#
# Strategy
# --------
# Only callee-saved registers need explicit saving; caller-saved registers
# (rax, rcx, rdx, rsi, rdi, r8-r11) are already on the caller's stack per
# the System V ABI.
#
# 1. Push all callee-saved GPRs onto the *current* kernel stack.
# 2. Save RSP into from->rsp.
# 3. If to->cr3 is non-zero and differs from the current CR3, load it
# (switches address space for user processes).
# 4. Load RSP from to->rsp (switch to next task's kernel stack).
# 5. Pop all callee-saved GPRs from the *new* stack.
# 6. ret: pops the return address placed there during task creation
# (kthread_trampoline / user_task_trampoline) for a brand-new task,
# or returns into the schedule() call-site for a resumed task.
#
# NOTE: This function is called with IF=0 (interrupts disabled) and must
# NOT modify IF itself. The trampolines re-enable interrupts after the
# first-ever schedule-in of a task.
.section .text
.global sched_context_switch
.type sched_context_switch, @function
# sched_context_switch(from, to)
#   rdi = from : context of the task being switched out (rsp is stored here)
#   rsi = to   : context of the task being switched in  (rsp/cr3 are read)
# Clobbers rax and rcx.  Does not touch RFLAGS.IF.
sched_context_switch:
# Save outgoing task: push the six callee-saved GPRs.  The pop sequence
# below is the exact mirror of this push sequence, so a hand-built initial
# stack must place rbx at the highest address and r15 at the lowest.
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
# from->rsp = RSP (rdi = struct cpu_context *from, offset 0 = rsp)
movq %rsp, 0(%rdi)
# Switch address space (CR3) if needed
# to->cr3 is at offset 8 in struct cpu_context
movq 8(%rsi), %rax
testq %rax, %rax # 0 means "keep current CR3" (kernel thread)
jz .Lno_cr3
movq %cr3, %rcx
cmpq %rax, %rcx # Same CR3? Don't flush the TLB needlessly.
je .Lno_cr3
movq %rax, %cr3 # Load new page table root (flushes TLB)
.Lno_cr3:
# Switch to incoming task's kernel stack
# to->rsp is at offset 0 (rsi = struct cpu_context *to)
# From this instruction on, every stack access refers to the incoming task.
movq 0(%rsi), %rsp
# Restore incoming task's callee-saved registers (reverse of the pushes)
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
# ret pops the "return address" from the new task's stack.
# Brand-new task: jumps to kthread_trampoline or user_task_trampoline.
# Resumed task: returns into schedule(), then back up the call chain.
ret
.size sched_context_switch, . - sched_context_switch
+935
View File
@@ -0,0 +1,935 @@
#include "scheduler.h"
#include "mm/memory.h"
#include "mm/pmm.h"
#include "libk/stdio.h"
#include "arch/x86_64/cpu/io.h"
#include "arch/x86_64/sys/pit.h"
#include "string.h"
/* =====================================================================
* Forward declarations for GDT/TSS (defined in gdt.c)
* ===================================================================== */
/*
 * Local description of the task-state segment, used only to reach
 * kernel_tss from this file.  schedule() writes rsp0 whenever it switches
 * to a user task.
 *
 * NOTE(review): the section banner says the real object is defined in
 * gdt.c; this typedef presumably duplicates a definition there and the two
 * must stay field-for-field identical.  Consider moving it to a shared
 * header.
 */
typedef struct {
    uint32_t reserved0;
    uint64_t rsp0;          /* updated by schedule() for user tasks */
    uint64_t rsp1;
    uint64_t rsp2;
    uint64_t reserved1;
    uint64_t ist[7];
    uint64_t reserved2;
    uint16_t reserved3;
    uint16_t iopb_offset;
} __attribute__((packed)) TSS;  /* packed: no padding after reserved0 */

/* The TSS instance itself; not defined in this file. */
extern TSS kernel_tss;
/* =====================================================================
* Globals
* ===================================================================== */
/* The single global run queue.  Starts zero-filled; sched_init() sets up
 * its lock, min_vruntime, current and idle fields. */
struct runqueue g_runqueue = {0};
/* Task currently executing.  NULL until sched_init() installs the boot
 * task; updated by schedule() on every switch. */
task_t *g_current_task = NULL;
/* PIT tick counter (defined in pit.c) */
extern volatile uint64_t g_Ticks;
/* =====================================================================
* Linux-compatible nice → CPU weight table (NICE_0_LOAD = 1024)
*
* vruntime_delta = actual_ticks * NICE_0_LOAD / weight[nice + 20]
* timeslice = BASE * weight[nice + 20] / NICE_0_LOAD
*
* Low weight (high nice) → vruntime accumulates faster → scheduled less.
* ===================================================================== */
#define NICE_0_LOAD 1024u

/* CPU weight per nice level; slot 0 is nice -20, slot 39 is nice +19. */
static const uint32_t nice_to_weight[40] = {
    /* -20 .. -16 */ 88761, 71755, 56483, 46273, 36291,
    /* -15 .. -11 */ 29154, 23254, 18705, 14949, 11916,
    /* -10 ..  -6 */  9548,  7620,  6100,  4904,  3906,
    /*  -5 ..  -1 */  3121,  2501,  1991,  1586,  1277,
    /*   0 ..  +4 */  1024,   820,   655,   526,   423,
    /*  +5 ..  +9 */   335,   272,   215,   172,   137,
    /* +10 .. +14 */   110,    87,    70,    56,    45,
    /* +15 .. +19 */    36,    29,    23,    18,    15,
};

/*
 * Look up the CPU weight for a nice value.
 * Values outside -20..+19 are clamped to the nearest end of the table.
 */
static inline uint32_t weight_for_nice(int nice) {
    int slot = nice + 20;

    slot = (slot < 0) ? 0 : slot;
    slot = (slot > 39) ? 39 : slot;

    return nice_to_weight[slot];
}
/* =====================================================================
* PID allocator (simple monotonic counter, single-CPU safe)
* ===================================================================== */
/* Next PID to hand out; PIDs start at 1 and only ever increase. */
static pid_t g_next_pid = 1;

/*
 * Return a fresh PID and advance the counter.
 * No locking is performed and PIDs are never reused.
 */
static pid_t alloc_pid(void) {
    pid_t issued = g_next_pid;

    g_next_pid = issued + 1;
    return issued;
}
/* =====================================================================
* RT bitmap helpers (O(1) find-first-set)
* ===================================================================== */
/*
 * Flag RT priority `prio` as having at least one queued task.
 * Priority p (expected range 1..99) is tracked at bit p - 1, spread over
 * 64-bit bitmap words.
 */
static inline void rt_bitmap_set(struct rt_prio_array *arr, int prio) {
    const int pos = prio - 1;
    const int word = pos / 64;
    const int shift = pos % 64;

    arr->bitmap[word] |= (1ULL << shift);
}
/*
 * Flag RT priority `prio` as empty.
 * Uses the same bit position (prio - 1) as rt_bitmap_set().
 */
static inline void rt_bitmap_clear(struct rt_prio_array *arr, int prio) {
    const int pos = prio - 1;
    const int word = pos / 64;
    const int shift = pos % 64;

    arr->bitmap[word] &= ~(1ULL << shift);
}
/*
 * Find the numerically lowest RT priority whose bit is set, i.e. the
 * highest-priority non-empty RT queue.  Scans the two bitmap words in
 * order and converts the lowest set bit back to a priority (bit + 1).
 * Returns -1 when both words are zero.
 */
static inline int rt_bitmap_first(const struct rt_prio_array *arr) {
    for (int word = 0; word < 2; word++) {
        if (arr->bitmap[word] != 0)
            return __builtin_ctzll(arr->bitmap[word]) + (word * 64) + 1;
    }
    return -1;
}
/* =====================================================================
* RT FIFO queue operations
* ===================================================================== */
/*
 * Append `task` to the tail of the RT FIFO for its static_prio, mark that
 * priority in the bitmap and bump the RT and run-queue counters.
 * The caller is responsible for any locking.
 */
static void rt_enqueue(struct runqueue *rq, task_t *task) {
    struct rt_prio_array *arr = &rq->rt;
    const int level = task->static_prio;
    task_t *old_tail = arr->tail[level];

    task->rq_prev = old_tail;
    task->rq_next = NULL;

    if (old_tail == NULL)
        arr->head[level] = task;        /* queue was empty */
    else
        old_tail->rq_next = task;
    arr->tail[level] = task;

    rt_bitmap_set(arr, level);
    arr->total += 1;
    rq->nr_running += 1;
}
/*
 * Pop the first task of the RT FIFO at priority `prio`.
 * Returns NULL if that FIFO is empty.  When the last task is removed the
 * tail pointer is reset and the priority's bitmap bit is cleared.
 */
static task_t *rt_dequeue_head(struct runqueue *rq, int prio) {
    struct rt_prio_array *arr = &rq->rt;
    task_t *first = arr->head[prio];

    if (first == NULL)
        return NULL;

    task_t *second = first->rq_next;
    arr->head[prio] = second;
    if (second != NULL) {
        second->rq_prev = NULL;
    } else {
        arr->tail[prio] = NULL;
        rt_bitmap_clear(arr, prio);
    }

    first->rq_prev = NULL;
    first->rq_next = NULL;
    arr->total -= 1;
    rq->nr_running -= 1;
    return first;
}
/*
 * Unlink `task` from the RT FIFO selected by its static_prio.
 * The list is doubly linked, so no traversal is needed.  The task must
 * currently be on that FIFO; this is not checked.
 */
static void rt_remove(struct runqueue *rq, task_t *task) {
    struct rt_prio_array *arr = &rq->rt;
    const int level = task->static_prio;
    task_t *before = task->rq_prev;
    task_t *after = task->rq_next;

    if (before != NULL)
        before->rq_next = after;
    else
        arr->head[level] = after;

    if (after != NULL)
        after->rq_prev = before;
    else
        arr->tail[level] = before;

    if (arr->head[level] == NULL)
        rt_bitmap_clear(arr, level);

    task->rq_prev = NULL;
    task->rq_next = NULL;
    arr->total -= 1;
    rq->nr_running -= 1;
}
/* =====================================================================
* CFS (normal) queue operations — sorted ascending by vruntime
* ===================================================================== */
/*
 * Insert `task` into the CFS list, which is kept in ascending vruntime
 * order.  A task whose vruntime is below the queue's min_vruntime is first
 * raised to min_vruntime.  Among equal vruntimes the new task is placed
 * last.  Linear in the number of queued tasks.
 */
static void cfs_enqueue(struct runqueue *rq, task_t *task) {
    if (task->vruntime < rq->min_vruntime)
        task->vruntime = rq->min_vruntime;

    /* Walk until `after` is the first node with a strictly larger vruntime. */
    task_t *before = NULL;
    task_t *after = rq->cfs_head;
    while (after != NULL && after->vruntime <= task->vruntime) {
        before = after;
        after = after->rq_next;
    }

    task->rq_prev = before;
    task->rq_next = after;
    if (after != NULL)
        after->rq_prev = task;
    if (before != NULL)
        before->rq_next = task;
    else
        rq->cfs_head = task;

    rq->cfs_count += 1;
    rq->nr_running += 1;
}
/* Remove the task with the smallest vruntime (head of list) */
static task_t *cfs_dequeue_min(struct runqueue *rq) {
task_t *task = rq->cfs_head;
if (!task) return NULL;
rq->cfs_head = task->rq_next;
if (rq->cfs_head) {
rq->cfs_head->rq_prev = NULL;
rq->min_vruntime = rq->cfs_head->vruntime;
}
task->rq_next = task->rq_prev = NULL;
rq->cfs_count--;
rq->nr_running--;
return task;
}
/*
 * Unlink `task` from the CFS list.  The task must currently be on the
 * list; this is not checked.  min_vruntime is not recomputed.
 */
static void cfs_remove(struct runqueue *rq, task_t *task) {
    task_t *before = task->rq_prev;
    task_t *after = task->rq_next;

    if (before != NULL)
        before->rq_next = after;
    else
        rq->cfs_head = after;

    if (after != NULL)
        after->rq_prev = before;

    task->rq_prev = NULL;
    task->rq_next = NULL;
    rq->cfs_count -= 1;
    rq->nr_running -= 1;
}
/* =====================================================================
* Timeslice calculation
* ===================================================================== */
/*
 * Compute the timeslice granted to `task`, in the same unit sched_tick()
 * counts down.
 *
 *   SCHED_FIFO  -> UINT64_MAX (effectively never expires)
 *   SCHED_RR    -> SCHED_RR_SLICE_MS
 *   SCHED_IDLE  -> SCHED_BASE_SLICE_MS
 *   otherwise   -> SCHED_BASE_SLICE_MS scaled by the nice weight, clamped
 *                  to [SCHED_MIN_SLICE_MS, SCHED_MAX_SLICE_MS]
 */
static uint64_t calc_timeslice(const task_t *task) {
    if (task->policy == SCHED_FIFO)
        return UINT64_MAX;
    if (task->policy == SCHED_RR)
        return SCHED_RR_SLICE_MS;
    if (task->policy == SCHED_IDLE)
        return SCHED_BASE_SLICE_MS;

    /* SCHED_NORMAL / SCHED_BATCH */
    uint32_t weight = weight_for_nice(task->nice);
    uint64_t slice = (uint64_t)SCHED_BASE_SLICE_MS * weight / NICE_0_LOAD;

    if (slice < SCHED_MIN_SLICE_MS)
        slice = SCHED_MIN_SLICE_MS;
    if (slice > SCHED_MAX_SLICE_MS)
        slice = SCHED_MAX_SLICE_MS;
    return slice;
}
/*
 * Charge `elapsed_ticks` of run time to `task`.
 *
 * sum_exec_runtime always grows by the raw tick count.  For SCHED_NORMAL
 * and SCHED_BATCH, vruntime grows by ticks * NICE_0_LOAD / weight, so a
 * heavier (lower-nice) task accumulates vruntime more slowly.  For every
 * other policy vruntime simply grows by the raw tick count.
 */
static void update_vruntime(task_t *task, uint64_t elapsed_ticks) {
    uint64_t charge = elapsed_ticks;

    task->sum_exec_runtime += elapsed_ticks;

    const bool weighted = (task->policy == SCHED_NORMAL) ||
                          (task->policy == SCHED_BATCH);
    if (weighted) {
        uint32_t weight = weight_for_nice(task->nice);
        charge = elapsed_ticks * NICE_0_LOAD / weight;
    }

    task->vruntime += charge;
}
/* =====================================================================
* sched_enqueue / sched_dequeue (public, uses run-queue lock)
* ===================================================================== */
/*
 * Make `task` runnable and place it where its policy dictates:
 * RT FIFO for SCHED_FIFO/SCHED_RR, the run queue's idle slot for
 * SCHED_IDLE, the CFS list for everything else.
 * Takes the run-queue lock with interrupts saved and restored.
 */
void sched_enqueue(task_t *task) {
    uint64_t irq_state;

    spinlock_acquire_irqsave(&g_runqueue.lock, &irq_state);
    task->state = TASK_RUNNING;

    switch (task->policy) {
    case SCHED_FIFO:
    case SCHED_RR:
        rt_enqueue(&g_runqueue, task);
        break;
    case SCHED_IDLE:
        /* Only one idle task exists; it lives outside the queues. */
        g_runqueue.idle = task;
        break;
    default:
        cfs_enqueue(&g_runqueue, task);
        break;
    }

    spinlock_release_irqrestore(&g_runqueue.lock, irq_state);
}
/*
 * Take `task` off whichever queue its policy uses.  SCHED_IDLE tasks are
 * not on any queue, so nothing is done for them.  The task's state is not
 * modified.  Takes the run-queue lock with interrupts saved and restored.
 */
void sched_dequeue(task_t *task) {
    uint64_t irq_state;

    spinlock_acquire_irqsave(&g_runqueue.lock, &irq_state);

    switch (task->policy) {
    case SCHED_FIFO:
    case SCHED_RR:
        rt_remove(&g_runqueue, task);
        break;
    case SCHED_IDLE:
        break;
    default:
        cfs_remove(&g_runqueue, task);
        break;
    }

    spinlock_release_irqrestore(&g_runqueue.lock, irq_state);
}
/* =====================================================================
* pick_next_task (called with interrupts OFF and lock held)
*
* Priority order:
* 1. Highest RT priority with a runnable task
* 2. Normal task with smallest vruntime
* 3. Idle task (always exists, never NULL)
* ===================================================================== */
/*
 * Choose the task to run next and remove it from its queue.
 * Preference order: head of the best non-empty RT FIFO, then the CFS task
 * with the smallest vruntime, then rq->idle (which is returned as-is and
 * is not dequeued from anywhere).
 */
static task_t *pick_next_task(struct runqueue *rq) {
    const int level = rt_bitmap_first(&rq->rt);

    if (level > 0)
        return rt_dequeue_head(rq, level);      /* 1. real-time */

    return (rq->cfs_count > 0)
        ? cfs_dequeue_min(rq)                   /* 2. CFS normal */
        : rq->idle;                             /* 3. idle fallback */
}
/* =====================================================================
* Trampolines
* These are the "return addresses" pushed onto a new task's kernel
* stack. When the task is scheduled for the first time, ret inside
* sched_context_switch() jumps here.
* ===================================================================== */
/*
 * First code executed by a new kernel thread: sched_context_switch()
 * "returns" here through the address planted by setup_initial_kstack().
 * Turns interrupts back on, runs the thread's entry function with its
 * argument, and terminates the task with exit code 0 if that function
 * ever returns.
 */
static void kthread_trampoline(void) {
    /* Interrupts are off after the context switch; enable them before
     * running the thread body. */
    x86_64_EnableInterrupts();

    task_t *me = g_current_task;
    me->kthread_entry(me->kthread_arg);

    /* The entry function returned: treat that as a clean exit. */
    sched_exit(0);
}
/*
 * First code executed by a new user task: sched_context_switch()
 * "returns" here through the address planted by setup_initial_kstack().
 * Reads the entry point and user stack recorded in the current task,
 * builds an iretq frame at the top of the task's kernel stack and drops to
 * user mode.  Never returns.
 *
 * NOTE(review): interrupts are enabled before RSP is replaced, so pushfq
 * presumably already captures IF=1 and the orq is a belt-and-braces step —
 * confirm x86_64_EnableInterrupts() is a plain sti.
 * NOTE(review): no swapgs is issued before iretq; confirm this matches
 * what the syscall/interrupt entry paths expect of GS.
 */
static void user_task_trampoline(void) {
    x86_64_EnableInterrupts();
    task_t *self = g_current_task;
    /*
     * Build an iretq frame on the current (kernel) stack and enter
     * user mode. We reset the stack pointer to the very top of the
     * kernel stack first, so the iretq frame doesn't sit below a
     * pile of stale context-switch frames.
     *
     * Segment selectors (from your GDT / STAR setup):
     * User CS = 0x23 (GDT index 4, RPL 3)
     * User SS = 0x1B (GDT index 3, RPL 3)
     */
    /* All three values are loaded into locals first: once RSP is replaced
     * below, nothing stack-relative may be read. */
    uint64_t kstack_top = (uint64_t)self->kernel_stack + self->kernel_stack_size;
    uint64_t user_rip = self->user_entry;
    uint64_t user_rsp = self->user_stack_top;
    /* Frame is pushed in the order iretq consumes it in reverse:
     * SS, RSP, RFLAGS, CS, RIP. */
    asm volatile(
        "movq %0, %%rsp\n\t" /* Reset kernel RSP to stack top */
        "pushq $0x1B\n\t" /* SS user data segment */
        "pushq %1\n\t" /* RSP user stack pointer */
        "pushfq\n\t" /* RFLAGS */
        "orq $0x200, (%%rsp)\n\t" /* Set IF so user code runs with */
        /* interrupts enabled */
        "pushq $0x23\n\t" /* CS user code segment */
        "pushq %2\n\t" /* RIP user entry point */
        "iretq\n\t"
        :
        : "r"(kstack_top), "r"(user_rsp), "r"(user_rip)
        : "memory"
    );
    __builtin_unreachable();
}
/* =====================================================================
* Kernel stack setup for a new task
*
* Lay out: [trampoline_addr] [rbx=0] [rbp=0] [r12=0] [r13=0]
* [r14=0] [r15=0] ← ctx.rsp points here
*
* sched_context_switch pops in order: r15, r14, r13, r12, rbp, rbx,
* then ret → trampoline.
* ===================================================================== */
/*
 * Hand-build the initial kernel stack of a task that has never run, so
 * that the first sched_context_switch() into it pops six zeroed
 * callee-saved registers and then "returns" into `trampoline`.
 *
 * task->kernel_stack and task->kernel_stack_size must already be set.
 * On return task->ctx.rsp points at the lowest slot written.
 *
 * Slot labels below follow the pop order in sched_context_switch()
 * (r15 first from the lowest address, rbx last).  All slots are zero, so
 * the labels are documentation only.
 *
 * NOTE(review): the trampoline is entered with RSP equal to the stack top;
 * whether that satisfies the ABI's call-time alignment depends on how
 * kmalloc aligns the stack — confirm if SSE is ever enabled in the kernel.
 */
static void setup_initial_kstack(task_t *task, void *trampoline) {
    /* Start one past the highest byte of the stack and grow downwards. */
    uint64_t *sp = (uint64_t *)((uint8_t *)task->kernel_stack
        + task->kernel_stack_size);
    *--sp = (uint64_t)trampoline; /* "return address" → trampoline */
    *--sp = 0; /* rbx (popped last) */
    *--sp = 0; /* rbp */
    *--sp = 0; /* r12 */
    *--sp = 0; /* r13 */
    *--sp = 0; /* r14 */
    *--sp = 0; /* r15 (popped first) */
    task->ctx.rsp = (uint64_t)sp;
}
/* =====================================================================
* Task allocation helper
* ===================================================================== */
static task_t *alloc_task(const char *name, bool is_user) {
task_t *task = kmalloc(sizeof(task_t));
if (!task) return NULL;
memset(task, 0, sizeof(task_t));
strncpy(task->name, name, sizeof(task->name) - 1);
task->is_user = is_user;
task->state = TASK_RUNNING;
task->policy = SCHED_NORMAL;
task->nice = NICE_DEFAULT;
task->static_prio = NICE_TO_PRIO(NICE_DEFAULT);
task->prio = task->static_prio;
task->pid = alloc_pid();
task->ppid = g_current_task ? g_current_task->pid : 0;
task->parent = g_current_task;
/* Default: all signals use SIG_DFL */
for (int i = 0; i < _NSIG; i++)
task->sigactions[i].sa_handler = SIG_DFL;
/* Allocate kernel stack */
task->kernel_stack_size = KSTACK_SIZE;
task->kernel_stack = kmalloc(KSTACK_SIZE);
if (!task->kernel_stack) {
kfree(task);
return NULL;
}
memset(task->kernel_stack, 0xCC, KSTACK_SIZE); /* poison */
return task;
}
/* =====================================================================
* Public task-creation API
* ===================================================================== */
/*
 * Create a kernel thread that will run entry(arg) and enqueue it.
 *
 * The thread keeps whatever address space is active when it is switched
 * in (ctx.cr3 == 0), starts in kthread_trampoline, and begins with the
 * run queue's current min_vruntime.
 *
 * Returns the new task, or NULL if allocation fails.
 */
task_t *sched_create_kthread(const char *name,
                             void (*entry)(void *), void *arg)
{
    task_t *kt = alloc_task(name, false);
    if (kt == NULL)
        return NULL;

    kt->kthread_arg = arg;
    kt->kthread_entry = entry;
    kt->ctx.cr3 = 0;            /* Kernel threads share kernel_pagemap */
    setup_initial_kstack(kt, kthread_trampoline);

    kt->vruntime = g_runqueue.min_vruntime;
    kt->time_slice = calc_timeslice(kt);

    sched_enqueue(kt);
    printf("[sched] kthread '%s' pid=%d created\n", kt->name, kt->pid);
    return kt;
}
/*
 * Create a user-mode task and enqueue it.
 *
 * name       copied (truncated) into the task descriptor
 * entry_rip  user-space address execution starts at
 * user_rsp   initial user-space stack pointer
 * pm         address space the task runs in; its top-level table's
 *            physical address becomes the task's CR3
 *
 * The task starts in user_task_trampoline and begins with the run queue's
 * current min_vruntime.
 *
 * Returns the new task, or NULL if `pm` is NULL, has no top-level table,
 * or allocation fails.
 */
task_t *sched_create_user_task(const char *name,
                               uint64_t entry_rip, uint64_t user_rsp,
                               struct pagemap *pm)
{
    /* Validate the address space before allocating anything: without this
     * check a NULL pagemap is dereferenced below, and a missing top-level
     * table would yield a bogus CR3.  Checking first also avoids burning
     * a PID and a kernel stack on a request that cannot succeed. */
    if (!pm || !pm->top_level)
        return NULL;

    task_t *task = alloc_task(name, true);
    if (!task) return NULL;

    task->pagemap = pm;
    task->user_entry = entry_rip;
    task->user_stack_top = user_rsp;

    /* CR3 = physical address of PML4 */
    task->ctx.cr3 = (uint64_t)pm->top_level - MEM_PHYS_OFFSET;

    setup_initial_kstack(task, user_task_trampoline);
    task->time_slice = calc_timeslice(task);
    task->vruntime = g_runqueue.min_vruntime;

    sched_enqueue(task);
    printf("[sched] user task '%s' pid=%d created, entry=0x%lx\n",
           task->name, task->pid, entry_rip);
    return task;
}
/* =====================================================================
* Idle task
* ===================================================================== */
/*
 * Body of the idle task: repeatedly enable interrupts, halt until one
 * arrives, then disable interrupts again.  Never returns.  `arg` is
 * ignored.
 */
static void idle_entry(void *arg) {
    (void)arg;

    while (1) {
        /* Execution resumes after hlt once an interrupt has been handled;
         * any rescheduling is done from sched_tick(), not here. */
        asm volatile("sti; hlt; cli" ::: "memory");
    }
}
/* =====================================================================
* sched_init — set up the idle task and initialise the run queue
* ===================================================================== */
/*
 * Initialise the scheduler.
 *
 * Sets up the run-queue lock, wraps the currently executing boot code in
 * a task descriptor (so the first schedule() has a valid "current" task),
 * and creates the idle task.  Halts forever if either allocation fails.
 *
 * The boot task gets no kernel stack of its own: it keeps running on the
 * stack it is already using.
 */
void sched_init(void) {
    spinlock_init(&g_runqueue.lock);
    g_runqueue.min_vruntime = 0;

    /* Synthesise a "current" descriptor for the boot thread so that
     * the first schedule() call has something valid in g_current_task. */
    task_t *boot = kmalloc(sizeof(task_t));
    if (!boot) {
        printf("[sched] FATAL: cannot allocate boot task\n");
        while (1) asm volatile("hlt");
    }
    memset(boot, 0, sizeof(task_t));

    /* Bound the copy by the real field size, as alloc_task() does, rather
     * than a hard-coded 63 that silently breaks if the field is resized.
     * The zero fill above keeps the name NUL-terminated. */
    strncpy(boot->name, "boot", sizeof(boot->name) - 1);
    boot->pid = alloc_pid(); /* first PID handed out */
    boot->state = TASK_RUNNING;
    boot->policy = SCHED_NORMAL;
    boot->nice = NICE_DEFAULT;
    boot->static_prio = NICE_TO_PRIO(NICE_DEFAULT);
    boot->prio = boot->static_prio;
    boot->time_slice = calc_timeslice(boot);
    boot->slice_start = g_Ticks;
    /* kernel_stack: we're already running on it; RSP will be saved by
     * the first sched_context_switch() call, so no setup needed. */
    boot->is_user = false;

    g_current_task = boot;
    g_runqueue.current = boot;

    /* Create the idle task but do NOT go through sched_create_kthread
     * because we store it separately (not on the CFS/RT queues). */
    task_t *idle = alloc_task("idle", false);
    if (!idle) {
        printf("[sched] FATAL: cannot allocate idle task\n");
        while (1) asm volatile("hlt");
    }
    idle->policy = SCHED_IDLE;
    idle->static_prio = IDLE_PRIO;
    idle->prio = IDLE_PRIO;
    idle->kthread_entry = idle_entry;
    idle->kthread_arg = NULL;
    idle->ctx.cr3 = 0;
    setup_initial_kstack(idle, kthread_trampoline);
    g_runqueue.idle = idle;

    printf("[sched] initialised; boot pid=%d\n", boot->pid);
}
/* =====================================================================
* schedule() — the heart of the scheduler
*
* Must be called with interrupts disabled (IF=0). Restores IF when
* the scheduled-in task next runs (via its own stack context or via
* the kthread_trampoline which calls x86_64_EnableInterrupts).
* ===================================================================== */
/*
 * Pick the next task and switch to it.
 *
 * Sequence: take the run-queue lock, dequeue the best candidate, charge
 * the outgoing task for the ticks it ran, put the outgoing task back on
 * its queue if it is still TASK_RUNNING (and is not the idle task), mark
 * the incoming task current, point TSS.RSP0 at its kernel stack if it is
 * a user task, drop the lock and context-switch.  Returns immediately
 * without switching if the candidate is the current task or NULL.
 *
 * NOTE(review): the candidate is picked BEFORE the outgoing task is put
 * back on its queue, so the outgoing task never competes with the queued
 * ones.  Consequences visible from this code: a task that is the only
 * runnable one is switched out for the idle task when its slice ends, and
 * a still-runnable RT task can be replaced by a queued CFS task.  Changing
 * the order also changes how a busy-yielding task shares the CPU, so it
 * needs testing rather than a drive-by edit.
 */
void schedule(void) {
    /*
     * We deliberately do NOT use spinlock_acquire_irqsave here because
     * we're already called with IF=0 (either from an ISR or from
     * sched_yield/sched_block which do cli first).
     * We use a plain spinlock_acquire_or_wait so that on SMP (future)
     * another CPU spinning on the lock eventually gets it.
     */
    spinlock_acquire_or_wait(&g_runqueue.lock);
    task_t *prev = g_runqueue.current;
    /* pick_next_task() dequeues its result, except for the idle task. */
    task_t *next = pick_next_task(&g_runqueue);
    if (next == prev || next == NULL) {
        /* Nothing to switch to; keep running current task. */
        /* next == prev can only be the idle task here: a running task is
         * not on any queue, so nothing has been dequeued on this path. */
        spinlock_drop(&g_runqueue.lock);
        return;
    }
    /* Account for time the current task actually ran */
    uint64_t now = g_Ticks;
    /* Guard against slice_start being ahead of the tick counter. */
    uint64_t elapsed = (prev->slice_start <= now)
        ? (now - prev->slice_start)
        : 0;
    update_vruntime(prev, elapsed);
    /* Re-enqueue the outgoing task if it is still runnable (preempted).
     * If it blocked/exited, its state is no longer TASK_RUNNING. */
    if (prev->state == TASK_RUNNING && prev != g_runqueue.idle) {
        prev->time_slice = calc_timeslice(prev); /* refresh slice */
        if (prev->policy == SCHED_FIFO || prev->policy == SCHED_RR) {
            rt_enqueue(&g_runqueue, prev);
        } else if (prev->policy != SCHED_IDLE) {
            cfs_enqueue(&g_runqueue, prev);
        }
    }
    /* Set up the incoming task */
    next->state = TASK_RUNNING;
    next->need_reschedule = false;
    next->slice_start = now;
    g_runqueue.current = next;
    g_current_task = next;
    g_runqueue.nr_switches++;
    /* Update TSS.RSP0 so that user-mode interrupts for this task use
     * the correct kernel stack. */
    if (next->is_user && next->kernel_stack) {
        kernel_tss.rsp0 = (uint64_t)next->kernel_stack
            + next->kernel_stack_size;
    }
    /* The lock is released before the switch, so it is never held across
     * sched_context_switch(). */
    spinlock_drop(&g_runqueue.lock);
    /* ---- Context switch -------------------------------------------- */
    sched_context_switch(&prev->ctx, &next->ctx);
    /*
     * When we return here we are BACK in the context of `prev`
     * (which has just been rescheduled). Process any pending signals
     * before returning to user space.
     */
    task_handle_pending_signals();
}
/* =====================================================================
* sched_tick() — called from the PIT/LAPIC IRQ every millisecond
* ===================================================================== */
/*
 * Per-tick scheduler hook, invoked from the timer IRQ handler.
 *
 * Burns one unit of the running task's timeslice and calls schedule()
 * when the slice reaches zero or the task has need_reschedule set.
 * Does nothing before a current task exists.
 */
void sched_tick(void) {
    task_t *running = g_current_task;

    if (running == NULL)
        return;

    /* Count the slice down, stopping at zero. */
    if (running->time_slice > 0)
        running->time_slice -= 1;

    const bool slice_spent = (running->time_slice == 0);
    if (slice_spent || running->need_reschedule) {
        /* schedule() wants IF=0; we are inside an IRQ handler, where the
         * CPU cleared IF on entry. */
        schedule();
    }
}
/* =====================================================================
* sched_yield() — voluntary CPU release
* ===================================================================== */
void sched_yield(void) {
x86_64_DisableInterrupts();
g_current_task->time_slice = 0; /* Force preemption */
schedule();
x86_64_EnableInterrupts();
}
/* =====================================================================
* sched_block() — put current task to sleep
*
* Caller must set the task state BEFORE calling (the function
* honours whatever state is already set). Alternatively pass the
* desired reason and we set it here.
* ===================================================================== */
/*
 * Put the current task to sleep: store `reason` as its state and run the
 * scheduler with interrupts disabled.  Control comes back here once the
 * task has been scheduled in again; interrupts are then unconditionally
 * enabled.
 */
void sched_block(task_state_t reason) {
    x86_64_DisableInterrupts();

    task_t *me = g_current_task;
    me->state = reason;
    schedule();

    /* Running again: the task has been woken up. */
    x86_64_EnableInterrupts();
}
/* =====================================================================
* sched_wake() — wake a sleeping task
* Safe to call from interrupt context.
* ===================================================================== */
/*
 * Wake a sleeping task: mark it TASK_RUNNING, give it a fresh timeslice
 * and put it back on the queue its policy uses.  If the woken task has a
 * strictly lower numeric priority than the running task, the running task
 * is flagged for rescheduling.
 *
 * No-op when `task` is NULL, already TASK_RUNNING, or TASK_ZOMBIE.
 * Takes the run-queue lock with interrupts saved and restored.
 */
void sched_wake(task_t *task) {
    if (!task) return;

    uint64_t flags;
    spinlock_acquire_irqsave(&g_runqueue.lock, &flags);

    /* An exited task must never be revived: it would resume inside
     * sched_exit() after schedule() and sit in the hlt loop with
     * interrupts disabled, hanging the CPU.  A stale wake-up for a task
     * that has since exited is therefore ignored. */
    if (task->state != TASK_RUNNING && task->state != TASK_ZOMBIE) {
        task->state = TASK_RUNNING;
        task->time_slice = calc_timeslice(task);

        if (task->policy == SCHED_FIFO || task->policy == SCHED_RR) {
            rt_enqueue(&g_runqueue, task);
        } else if (task->policy != SCHED_IDLE) {
            cfs_enqueue(&g_runqueue, task);
        }

        /*
         * Preempt the current task if the woken task has strictly
         * higher priority (lower numeric priority value).
         */
        task_t *cur = g_runqueue.current;
        if (cur && task->prio < cur->prio) {
            cur->need_reschedule = true;
        }
    }

    spinlock_release_irqrestore(&g_runqueue.lock, flags);
}
/* =====================================================================
* sched_exit() — terminate the current task (noreturn)
* ===================================================================== */
/*
 * Terminate the current task.  Does not return.
 *
 * With interrupts disabled: records the exit code, marks the task
 * TASK_ZOMBIE, sends SIGCHLD to the parent if there is one, logs the exit
 * and hands the CPU to the scheduler.  The descriptor and kernel stack
 * are not freed here.
 */
void sched_exit(int exit_code) {
    x86_64_DisableInterrupts();

    task_t *dying = g_current_task;
    dying->exit_code = exit_code;
    dying->state = TASK_ZOMBIE;

    /* Let the parent know a child has gone. */
    task_t *parent = dying->parent;
    if (parent != NULL)
        task_send_signal(parent, SIGCHLD);

    printf("[sched] task '%s' pid=%d exited with code %d\n",
           dying->name, dying->pid, exit_code);

    /* A zombie is not re-enqueued, so this should be the last thing the
     * task ever executes. */
    schedule();

    /* Safety net in case schedule() does come back. */
    while (1)
        asm volatile("hlt");
    __builtin_unreachable();
}
/* =====================================================================
* Signal delivery
* ===================================================================== */
/* What a signal does when its disposition is the default. */
typedef enum {
    SIG_ACTION_TERM,
    SIG_ACTION_CORE,
    SIG_ACTION_IGN,
    SIG_ACTION_STOP,
    SIG_ACTION_CONT
} sig_default_action_t;

/*
 * Map a signal number to its default action.
 * Any number not listed explicitly falls back to SIG_ACTION_TERM.
 */
static sig_default_action_t default_action(int signum) {
    sig_default_action_t action;

    switch (signum) {
    case SIGQUIT: case SIGILL:  case SIGABRT: case SIGFPE:
    case SIGSEGV: case SIGBUS:  case SIGSYS:  case SIGTRAP:
    case SIGXCPU: case SIGXFSZ:
        action = SIG_ACTION_CORE;   /* we treat CORE same as TERM for now */
        break;

    case SIGCHLD: case SIGURG: case SIGWINCH: case SIGIO: case SIGPWR:
        action = SIG_ACTION_IGN;
        break;

    case SIGSTOP: case SIGTSTP: case SIGTTIN: case SIGTTOU:
        action = SIG_ACTION_STOP;
        break;

    case SIGCONT:
        action = SIG_ACTION_CONT;
        break;

    case SIGHUP:  case SIGINT:  case SIGKILL: case SIGPIPE:
    case SIGALRM: case SIGTERM: case SIGUSR1: case SIGUSR2:
    case SIGPROF: case SIGVTALRM: case SIGSTKFLT:
    default:
        action = SIG_ACTION_TERM;
        break;
    }

    return action;
}
/*
 * Mark a blocked signal target runnable and put it on the queue matching
 * its policy.  Caller must hold g_runqueue.lock.  SCHED_IDLE tasks are
 * only marked TASK_RUNNING, never enqueued.
 */
static void signal_make_runnable(task_t *task) {
    task->state = TASK_RUNNING;

    switch (task->policy) {
    case SCHED_FIFO:
    case SCHED_RR:
        rt_enqueue(&g_runqueue, task);
        break;
    case SCHED_IDLE:
        break;
    default:
        cfs_enqueue(&g_runqueue, task);
        break;
    }
}

/*
 * Post signal `signum` to `task`.
 *
 * Sets bit `signum` in task->pending_signals and, when appropriate, wakes
 * the target so it can act on the signal:
 *   - SIGKILL / SIGCONT wake a task that is INTERRUPTIBLE, STOPPED or
 *     UNINTERRUPTIBLE, regardless of its signal mask;
 *   - any other signal wakes only an INTERRUPTIBLE task, and only if the
 *     signal is not blocked in task->signal_mask.
 *
 * Returns 0 on success, -1 if `task` is NULL or `signum` is outside
 * 1 .. _NSIG-1.
 */
int task_send_signal(task_t *task, int signum) {
    if (!task || signum <= 0 || signum >= _NSIG) {
        return -1;
    }

    const uint64_t sigbit = 1ULL << signum;
    uint64_t irq_flags;

    spinlock_acquire_irqsave(&g_runqueue.lock, &irq_flags);

    task->pending_signals |= sigbit;

    bool wake;
    if (signum == SIGKILL || signum == SIGCONT) {
        /* These two always wake a blocked or stopped target. */
        wake = task->state == TASK_INTERRUPTIBLE ||
               task->state == TASK_STOPPED ||
               task->state == TASK_UNINTERRUPTIBLE;
    } else {
        /* Everything else only disturbs an interruptible sleeper. */
        wake = !(task->signal_mask & sigbit) &&
               task->state == TASK_INTERRUPTIBLE;
    }

    if (wake) {
        signal_make_runnable(task);
    }

    spinlock_release_irqrestore(&g_runqueue.lock, irq_flags);
    return 0;
}
/*
 * Handle pending signals for the current task.
 * Called just before returning to user space (end of schedule(), syscall
 * return path, or end of IRQ handler for user-mode tasks).
 *
 * Bit layout: bit N of pending_signals / signal_mask stands for signal N
 * (bit 0 is unused).  This is the layout task_send_signal() writes with
 * `1ULL << signum`, so the decode below must not apply any +1/-1 offset;
 * doing so would deliver every signal as its neighbour.
 *
 * Signals are processed lowest number first until no pending, unblocked
 * signal is left.  For each one the pending bit is cleared, then:
 *   - SIG_IGN:      nothing happens;
 *   - user handler: called directly for kernel threads; for user tasks
 *                   delivery is still a TODO and only logged;
 *   - SIG_DFL:      the default action is applied (terminate, stop,
 *                   continue or ignore).
 */
void task_handle_pending_signals(void) {
    task_t *self = g_current_task;
    if (!self) return;

    /* Only bits 1 .. _NSIG-1 name a real signal (requires _NSIG < 64). */
    const uint64_t valid_signals = ((1ULL << _NSIG) - 1) & ~1ULL;

    for (;;) {
        /* Find the lowest-numbered pending, unblocked signal */
        uint64_t deliverable =
            self->pending_signals & ~self->signal_mask & valid_signals;
        if (!deliverable) break;

        int signum = __builtin_ctzll(deliverable);  /* bit N = signal N */

        /* Clear the pending bit */
        self->pending_signals &= ~(1ULL << signum);

        sighandler_t handler = self->sigactions[signum].sa_handler;

        if (handler == SIG_IGN) {
            /* Explicitly ignored */
            continue;
        } else if (handler != SIG_DFL) {
            /*
             * User-defined handler.
             *
             * A full POSIX implementation would build a signal frame on
             * the user stack and set registers so that iretq delivers
             * the signal; that requires knowing the saved RFLAGS/RIP
             * from the ISR frame.  We leave this as a TODO and just
             * call the handler directly for kernel threads.
             *
             * For user tasks this is the point where you would push a
             * ucontext_t / sigframe onto the user stack and adjust the
             * saved user RIP in the ISR frame.
             */
            if (!self->is_user) {
                handler(signum);
            } else {
                /* TODO: build user-space signal frame */
                printf("[signal] TODO: deliver signal %d to user task '%s'\n",
                       signum, self->name);
            }
        } else {
            /* SIG_DFL */
            switch (default_action(signum)) {
            case SIG_ACTION_TERM:
            case SIG_ACTION_CORE:
                printf("[signal] task '%s' pid=%d killed by signal %d\n",
                       self->name, self->pid, signum);
                sched_exit(128 + signum);
                break; /* unreachable */
            case SIG_ACTION_STOP:
                self->state = TASK_STOPPED;
                /* Notify parent */
                if (self->parent) task_send_signal(self->parent, SIGCHLD);
                sched_block(TASK_STOPPED);
                break;
            case SIG_ACTION_CONT:
                /* Already running (we were woken to handle this) */
                break;
            case SIG_ACTION_IGN:
                break;
            }
        }
    }
}
/* =====================================================================
 * sched_find_task (linear scan — O(n), suitable for small task counts)
 *
 * Looks for `pid` on the CFS list, then on every RT priority queue, and
 * finally checks the currently running task.  Returns the matching task
 * or NULL.  Only those three places are searched, and no lock is taken
 * here.
 *
 * In a production kernel this would be a hash table.  For KirkOS this
 * is fine.
 * ===================================================================== */
task_t *sched_find_task(pid_t pid) {
    /* Normal (CFS) tasks. */
    for (task_t *it = g_runqueue.cfs_head; it; it = it->rq_next) {
        if (it->pid == pid) {
            return it;
        }
    }

    /* Real-time tasks, one FIFO list per priority level. */
    for (int level = RT_PRIO_MIN; level <= RT_PRIO_MAX; level++) {
        for (task_t *it = g_runqueue.rt.head[level]; it; it = it->rq_next) {
            if (it->pid == pid) {
                return it;
            }
        }
    }

    /* The running task. */
    task_t *running = g_runqueue.current;
    return (running && running->pid == pid) ? running : NULL;
}
/* =====================================================================
* Priority / scheduler controls
* ===================================================================== */
/*
 * Set the nice value of `task`, clamped to [NICE_MIN, NICE_MAX].
 *
 * For non-real-time policies the static and effective priority are
 * recomputed from the new nice value and the timeslice is refreshed.
 *
 * For SCHED_FIFO / SCHED_RR tasks only the nice value is recorded:
 * task_set_scheduler() stores the RT priority in static_prio/prio, so
 * rewriting those fields here would silently replace the RT priority
 * with a 100..139 value.  The recorded nice value takes effect when the
 * task is later switched to a non-RT policy, because
 * task_set_scheduler() derives the priority from task->nice.
 *
 * Returns the previous nice value, or 0 if `task` is NULL.
 */
int task_set_nice(task_t *task, int nice) {
    if (!task) return 0;

    if (nice < NICE_MIN) nice = NICE_MIN;
    if (nice > NICE_MAX) nice = NICE_MAX;

    int old_nice = task->nice;
    task->nice = nice;

    /* Real-time priority is not derived from nice: leave it alone. */
    if (task->policy == SCHED_FIFO || task->policy == SCHED_RR)
        return old_nice;

    task->static_prio = NICE_TO_PRIO(nice);
    task->prio = task->static_prio;

    /*
     * Recompute the timeslice.  If the task is currently on a queue we
     * would need to re-sort it (out of scope here — next schedule() will
     * pick the right slot when it re-enqueues).
     */
    task->time_slice = calc_timeslice(task);
    return old_nice;
}
/*
 * Change the scheduling policy of `task`.
 *
 * policy:  one of SCHED_NORMAL, SCHED_FIFO, SCHED_RR, SCHED_BATCH,
 *          SCHED_IDLE.
 * rt_prio: RT priority, checked against [RT_PRIO_MIN, RT_PRIO_MAX] for
 *          SCHED_FIFO / SCHED_RR and otherwise unused.
 *
 * For RT policies the priority becomes rt_prio; for every other policy
 * it is derived from the task's nice value.  A runnable task that is not
 * the one currently executing is taken off its queue before the change
 * and put back afterwards.
 *
 * Returns 0 on success, -1 on an unknown policy or out-of-range rt_prio.
 */
int task_set_scheduler(task_t *task, int policy, int rt_prio) {
    switch (policy) {
    case SCHED_NORMAL:
    case SCHED_FIFO:
    case SCHED_RR:
    case SCHED_BATCH:
    case SCHED_IDLE:
        break;
    default:
        return -1;
    }

    const bool realtime = (policy == SCHED_FIFO || policy == SCHED_RR);
    if (realtime && (rt_prio < RT_PRIO_MIN || rt_prio > RT_PRIO_MAX)) {
        return -1;
    }

    /* Runnable but not executing: dequeue before its priority changes. */
    const bool requeue = (task->state == TASK_RUNNING &&
                          task != g_runqueue.current);
    if (requeue) {
        sched_dequeue(task);
    }

    task->policy = policy;
    if (realtime) {
        task->static_prio = rt_prio;
    } else {
        task->static_prio = NICE_TO_PRIO(task->nice);
    }
    task->prio = task->static_prio;
    task->time_slice = calc_timeslice(task);

    if (requeue) {
        sched_enqueue(task);
    }
    return 0;
}
+312
View File
@@ -0,0 +1,312 @@
#pragma once
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include "mm/vmm.h"
#include "mp/spinlock.h"
/* =====================================================================
* POSIX signal numbers
* ===================================================================== */
#define SIGHUP 1
#define SIGINT 2
#define SIGQUIT 3
#define SIGILL 4
#define SIGTRAP 5
#define SIGABRT 6
#define SIGBUS 7
#define SIGFPE 8
#define SIGKILL 9 /* cannot be caught or ignored */
#define SIGUSR1 10
#define SIGSEGV 11
#define SIGUSR2 12
#define SIGPIPE 13
#define SIGALRM 14
#define SIGTERM 15
#define SIGSTKFLT 16
#define SIGCHLD 17
#define SIGCONT 18
#define SIGSTOP 19 /* cannot be caught or ignored */
#define SIGTSTP 20
#define SIGTTIN 21
#define SIGTTOU 22
#define SIGURG 23
#define SIGXCPU 24
#define SIGXFSZ 25
#define SIGVTALRM 26
#define SIGPROF 27
#define SIGWINCH 28
#define SIGIO 29
#define SIGPWR 30
#define SIGSYS 31
#define _NSIG 32
typedef void (*sighandler_t)(int signum);
#define SIG_DFL ((sighandler_t)0) /* default action */
#define SIG_IGN ((sighandler_t)1) /* ignore signal */
#define SIG_ERR ((sighandler_t)-1) /* error return */
#define SA_NOCLDSTOP 0x00000001
#define SA_NOCLDWAIT 0x00000002
#define SA_SIGINFO 0x00000004
#define SA_RESTORER 0x04000000
#define SA_ONSTACK 0x08000000
#define SA_RESTART 0x10000000
#define SA_NODEFER 0x40000000
#define SA_RESETHAND 0x80000000
/*
 * Per-signal disposition.  One entry per signal number is kept in
 * task.sigactions[]; SYS_SIGACTION copies whole structs in and out.
 */
struct sigaction {
sighandler_t sa_handler; /* SIG_DFL, SIG_IGN, or a handler function */
uint64_t sa_mask; /* signals blocked while handler runs */
int sa_flags; /* SA_* bits; NOTE(review): SA_RESETHAND (0x80000000) does not fit in a signed int — confirm intended type */
};
/* =====================================================================
* Scheduling policies (POSIX)
* ===================================================================== */
#define SCHED_NORMAL 0 /* Fair time-sharing (CFS-like, nice values) */
#define SCHED_FIFO 1 /* Real-time FIFO: runs until it yields or blocks */
#define SCHED_RR 2 /* Real-time round-robin with fixed timeslice */
#define SCHED_BATCH 3 /* CPU-bound variant of NORMAL, no preemption boost */
#define SCHED_IDLE 5 /* Only runs when nothing else is runnable */
/* Priority ranges:
* RT tasks: static_prio 1 .. 99 (1 = highest)
* Normal tasks: static_prio 100 .. 139 (maps from nice -20 .. +19)
* Idle: static_prio 140
*/
#define MAX_RT_PRIO 100
#define MAX_PRIO 140
#define RT_PRIO_MIN 1
#define RT_PRIO_MAX 99
#define NICE_MIN (-20)
#define NICE_MAX 19
#define NICE_DEFAULT 0
#define IDLE_PRIO 140
/* nice ↔ static_prio conversions for SCHED_NORMAL */
#define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20)
#define PRIO_TO_NICE(p) ((p) - MAX_RT_PRIO - 20)
/* Timeslice constants (ticks, PIT at 1000 Hz → 1 tick = 1 ms) */
#define SCHED_BASE_SLICE_MS 10 /* base timeslice for NICE_DEFAULT */
#define SCHED_MIN_SLICE_MS 1 /* minimum timeslice (1 ms) */
#define SCHED_MAX_SLICE_MS 100 /* maximum timeslice (100 ms) */
#define SCHED_RR_SLICE_MS 10 /* fixed timeslice for SCHED_RR */
/* =====================================================================
* Task states
* ===================================================================== */
/*
 * Life-cycle state of a task.  TASK_RUNNING is 0; every other state is a
 * distinct power of two.
 */
typedef enum task_state {
TASK_RUNNING = 0, /* Runnable on a run queue or executing */
TASK_INTERRUPTIBLE = 1, /* Sleeping, can be woken by signal */
TASK_UNINTERRUPTIBLE = 2, /* Sleeping, ignores signals (D state) */
TASK_STOPPED = 4, /* Halted by SIGSTOP / SIGTSTP */
TASK_ZOMBIE = 8, /* Exited, waiting for parent to wait() */
TASK_DEAD = 16, /* Fully reaped, memory can be freed */
} task_state_t;
/* =====================================================================
* Minimal CPU context
*
* Only RSP and CR3 live here; all callee-saved GPRs are pushed onto
* the kernel stack by sched_context_switch() before RSP is saved.
* This keeps the struct tiny and the assembly dead simple.
* ===================================================================== */
/* Per-task state handed to sched_context_switch() as its from/to arguments. */
struct cpu_context {
uint64_t rsp; /* Saved kernel stack pointer */
uint64_t cr3; /* Physical address of PML4 (0 = stay on kernel map) */
};
/* =====================================================================
* Task / Process descriptor
* ===================================================================== */
typedef int pid_t;
typedef struct task task_t;
/*
 * Descriptor for one schedulable entity (kernel thread or user process).
 * Holds the saved CPU context, identity, scheduling parameters, memory
 * and signal state, and the intrusive links for the run queue and the
 * process tree.
 */
struct task {
/* ---- CPU context (must stay first: asm references it at offset 0) */
struct cpu_context ctx;
/* ---- Identity ---------------------------------------------------- */
pid_t pid;
pid_t ppid;
char name[64];
bool is_user; /* true = user process, false = kernel thread */
/* ---- Scheduling policy and priority ------------------------------ */
int policy; /* SCHED_NORMAL / SCHED_FIFO / SCHED_RR / … */
int static_prio; /* Base priority; rewritten by task_set_nice() / task_set_scheduler() */
int nice; /* -20 .. +19, SCHED_NORMAL only */
int prio; /* Effective priority (may be boosted) */
/* ---- State ------------------------------------------------------- */
volatile task_state_t state;
bool need_reschedule; /* Set when a higher-priority task wakes up */
/* ---- Time accounting (ticks, 1 tick = 1 ms at 1000 Hz PIT) ------- */
uint64_t vruntime; /* Virtual runtime (tick-equivalents, weighted) */
uint64_t sum_exec_runtime; /* Total CPU time consumed (raw ticks) */
uint64_t time_slice; /* Remaining timeslice (ticks) */
uint64_t slice_start; /* Tick when the current slice began */
/* ---- Memory ------------------------------------------------------ */
struct pagemap *pagemap; /* NULL → use kernel_pagemap */
void *kernel_stack; /* Pointer to bottom of kernel-stack alloc */
size_t kernel_stack_size;
/* ---- Entry points ------------------------------------------------ */
uint64_t user_entry; /* User-space RIP for user tasks */
uint64_t user_stack_top; /* User-space RSP for user tasks */
void (*kthread_entry)(void *arg); /* Kernel thread entry point */
void *kthread_arg;
/* ---- Signals (bit N of each mask = signal N, as set by task_send_signal) */
uint64_t pending_signals; /* Bitmask of unhandled signals */
uint64_t signal_mask; /* Blocked (SIG_BLOCK) signals */
struct sigaction sigactions[_NSIG]; /* Indexed by signal number; slot 0 unused */
/* ---- Exit status ------------------------------------------------- */
int exit_code;
/* ---- Run-queue linkage (doubly-linked, intrusive) ---------------- */
task_t *rq_next;
task_t *rq_prev;
/* ---- Process tree ------------------------------------------------ */
task_t *parent;
task_t *first_child;
task_t *next_sibling;
};
/* =====================================================================
* Run queue
*
* Two sub-queues per CPU (single CPU for now, MP-ready by design):
*
* 1. RT array — 99 FIFO lists indexed by RT priority. Finding the
* highest priority with a runnable task is O(1) via bitmap.
*
* 2. CFS list — Tasks sorted by vruntime (ascending). Pick-next
* is O(1) (front of list); insert is O(n), which is good
* enough for now; swap in an rb-tree later.
*
* Idle task is stored separately and is returned only when both
* sub-queues are empty.
* ===================================================================== */
#define RT_QUEUE_LEVELS MAX_RT_PRIO /* 100 levels (index 0 = unused, 1-99 used) */
/*
 * Real-time run queue: one FIFO list per RT priority level, with a
 * bitmap of the non-empty levels.
 */
struct rt_prio_array {
/*
* Bitmap: bit N is set ↔ the list at head[N] is non-empty.
* Two 64-bit words cover 128 bits, enough for 100 levels.
*/
uint64_t bitmap[2];
task_t *head[RT_QUEUE_LEVELS]; /* FIFO queue heads */
task_t *tail[RT_QUEUE_LEVELS]; /* FIFO queue tails */
int total; /* Total RT tasks enqueued */
};
/*
 * Scheduler state for one CPU: the RT and CFS sub-queues, the idle
 * fallback task, the task currently executing, and counters.
 */
struct runqueue {
spinlock_t lock; /* Taken (irqsave) by sched_wake() and task_send_signal() */
/* Real-time (SCHED_FIFO / SCHED_RR) */
struct rt_prio_array rt;
/* Normal (SCHED_NORMAL / SCHED_BATCH) — sorted ascending by vruntime */
task_t *cfs_head;
int cfs_count;
uint64_t min_vruntime; /* Lower bound; new tasks start from here */
/* Idle fallback (SCHED_IDLE) */
task_t *idle;
/* Currently executing task on this CPU */
task_t *current;
/* Statistics */
uint64_t nr_switches;
uint64_t nr_running; /* Total runnable tasks (all classes) */
};
/* =====================================================================
* Globals (single-CPU; extend to per-cpu array for SMP)
* ===================================================================== */
extern struct runqueue g_runqueue;
extern task_t *g_current_task; /* Pointer to currently-running task */
/* =====================================================================
* Public scheduler API
* ===================================================================== */
/* Initialise the scheduler (call after PMM + VMM + PIT are ready) */
void sched_init(void);
/* Create a kernel thread and enqueue it immediately */
task_t *sched_create_kthread(const char *name,
void (*entry)(void *), void *arg);
/* Create a user-space task and enqueue it immediately */
task_t *sched_create_user_task(const char *name,
uint64_t entry_rip, uint64_t user_rsp,
struct pagemap *pm);
/* Add a task to the appropriate run queue */
void sched_enqueue(task_t *task);
/* Remove a task from its run queue (does NOT free it) */
void sched_dequeue(task_t *task);
/* Pick next task and perform context switch (call with IF=0) */
void schedule(void);
/* Called from the PIT/LAPIC timer IRQ every tick (1 ms) */
void sched_tick(void);
/* Voluntarily give up the CPU */
void sched_yield(void);
/* Block current task (IF must be 0 on entry; IF restored by schedule) */
void sched_block(task_state_t reason);
/* Wake a sleeping task (safe to call from IRQ context) */
void sched_wake(task_t *task);
/* Terminate the current task (noreturn) */
void sched_exit(int exit_code) __attribute__((noreturn));
/* ---- Signal API ------------------------------------------------------- */
/* Send signal signum to task (safe from any context) */
int task_send_signal(task_t *task, int signum);
/* Find a task by PID (NULL if not found) */
task_t *sched_find_task(pid_t pid);
/* Process pending signals for the current task (call before returning to user) */
void task_handle_pending_signals(void);
/* ---- Priority / policy control --------------------------------------- */
/* Set nice value [-20, +19] for a SCHED_NORMAL task; returns old nice */
int task_set_nice(task_t *task, int nice);
/* Change scheduling policy + RT priority; returns 0 on success */
int task_set_scheduler(task_t *task, int policy, int rt_prio);
/* ---- Convenience ----------------------------------------------------- */
/* Return the currently-running task (the value of g_current_task). */
static inline task_t *sched_current(void)
{
    return g_current_task;
}
/* ---- Assembly context switch (defined in sched_switch.S) ------------- */
/*
* Save callee-saved registers of the current context onto its kernel
* stack and record RSP in *from. Then switch to *to's stack, restore
* its callee-saved registers, and return — which resumes wherever *to
* last called schedule(). For first-time tasks the "return" jumps to
* the appropriate trampoline.
*/
void sched_context_switch(struct cpu_context *from,
struct cpu_context *to);
/* Kernel stack size for each task */
#define KSTACK_SIZE (32 * 1024) /* 32 KiB — comfortable headroom */
+95 -1
View File
@@ -3,6 +3,7 @@
#include "mp/percpu.h"
#include "fs/vfs.h"
#include "syscall.h"
#include "sched/scheduler.h"
#define MSR_EFER 0xC0000080
#define MSR_STAR 0xC0000081
@@ -41,7 +42,7 @@ uint64_t syscall_handler(uint64_t num,
uint8_t* buf = (uint8_t*)arg2;
size_t len = (size_t)arg3;
return (uint64_t)VFS_Read(fd, buf, len);
return VFS_Read(fd, buf, len);
}
case SYS_WRITE:
@@ -65,6 +66,99 @@ uint64_t syscall_handler(uint64_t num,
return (uint64_t)VFS_Close(fd);
}
case SYS_GETPID:
return (uint64_t)sched_current()->pid;
case SYS_GETPPID:
return (uint64_t)sched_current()->ppid;
case SYS_EXIT:
case SYS_EXIT_GROUP:
sched_exit((int)arg1);
//noreturn
case SYS_SCHED_YIELD:
sched_yield();
return 0;
case SYS_NICE:
{
int increment = (int)arg1;
int old_nice = sched_current()->nice;
int new_nice = old_nice + increment;
return (uint64_t)task_set_nice(sched_current(), new_nice);
}
case SYS_KILL:
{
pid_t target = (pid_t)arg1;
int sig = (int)arg2;
task_t *t = sched_find_task(target);
if (!t) return (uint64_t)-1;
return (uint64_t)task_send_signal(t, sig);
}
case SYS_SIGACTION:
{
int signum = (int)arg1;
const struct sigaction *act = (const struct sigaction *)arg2;
struct sigaction *oact = (struct sigaction *)arg3;
if (signum <= 0 || signum >= _NSIG)
return (uint64_t)-1;
if (signum == SIGKILL || signum == SIGSTOP)
return (uint64_t)-1; // cannot override
task_t *cur = sched_current();
if (oact)
*oact = cur->sigactions[signum];
if (act)
cur->sigactions[signum] = *act;
return 0;
}
case SYS_SIGPROCMASK:
{
// how: 0=SIG_BLOCK, 1=SIG_UNBLOCK, 2=SIG_SETMASK
int how = (int)arg1;
uint64_t new_set = arg2;
uint64_t *old = (uint64_t *)arg3;
task_t *cur = sched_current();
if (old) *old = cur->signal_mask;
// SIGKILL and SIGSTOP can never be blocked
new_set &= ~((1ULL << SIGKILL) | (1ULL << SIGSTOP));
switch (how) {
case 0: cur->signal_mask |= new_set; break; // SIG_BLOCK
case 1: cur->signal_mask &= ~new_set; break; // SIG_UNBLOCK
case 2: cur->signal_mask = new_set; break; // SIG_SETMASK
default: return (uint64_t)-1;
}
return 0;
}
case SYS_SCHED_GETSCHEDULER:
{
pid_t target = (pid_t)arg1;
task_t *t = target ? sched_find_task(target)
: sched_current();
if (!t) return (uint64_t)-1;
return (uint64_t)t->policy;
}
case SYS_SCHED_SETSCHEDULER:
{
pid_t target = (pid_t)arg1;
int policy = (int)arg2;
int rt_prio = (int)arg3;
task_t *t = target ? sched_find_task(target)
: sched_current();
if (!t) return (uint64_t)-1;
return (uint64_t)task_set_scheduler(t, policy, rt_prio);
}
default:
return (uint64_t)-1;
}
+12
View File
@@ -6,5 +6,17 @@
#define SYS_OPEN 2
#define SYS_CLOSE 3
#define SYS_SCHED_YIELD 24
#define SYS_GETPID 39
#define SYS_GETPPID 110
#define SYS_NICE 34
#define SYS_KILL 62
#define SYS_SIGACTION 13 /* rt_sigaction on Linux */
#define SYS_SIGPROCMASK 14 /* rt_sigprocmask on Linux */
#define SYS_EXIT 60
#define SYS_EXIT_GROUP 231
#define SYS_SCHED_GETSCHEDULER 138
#define SYS_SCHED_SETSCHEDULER 139
void syscall_init(void);
+2
View File
@@ -27,7 +27,9 @@ syscall_entry:
pop %rsi # rsi = a1 (2nd param)
mov %rax, %rdi # rdi = num (1st param)
sub $8, %rsp
call syscall_handler
add $8, %rsp
# Restore user context
pop %r11 # user RFLAGS