sched: Implement basic scheduling and signal handling system

Note: this is probably 25% broken, but it works right now as written, so I hope it all works.

- Added a new scheduler header file (scheduler.h) defining task structures, scheduling policies, and signal handling mechanisms.
- Integrated scheduling functions into the syscall interface, including SYS_GETPID, SYS_GETPPID, SYS_EXIT, SYS_SCHED_YIELD, SYS_NICE, SYS_KILL, SYS_SIGACTION, SYS_SIGPROCMASK, SYS_SCHED_GETSCHEDULER, and SYS_SCHED_SETSCHEDULER.
- Updated syscall handler to manage new scheduling-related syscalls and signal actions.

Signed-off-by: kaguya <vpshinomiya@protonmail.com>
This commit is contained in:
kaguya
2026-04-26 22:46:28 -04:00
parent 336af1c2ad
commit 7d99745ff9
20 changed files with 1561 additions and 53 deletions
+1 -1
View File
@@ -1,2 +1,2 @@
build/
user/build/*
user/build
Binary file not shown.
+48 -19
View File
@@ -5,24 +5,53 @@
#include "mm/memory.h"
#include "libk/stdio.h"
#include "fs/elf.h"
#include "sched/scheduler.h"
extern uintptr_t g_hhdm_offset;
#define USER_STACK_TOP 0x00007FFFFFFFE000ULL
#define USER_STACK_PAGES 4
#define USER_STACK_PAGES 8
#define USER_STACK_SIZE (USER_STACK_PAGES * PAGE_SIZE)
#define PTE_PRESENT (1ULL << 0)
#define PTE_WRITABLE (1ULL << 1)
#define PTE_USER (1ULL << 2)
static uint64_t user_stack_phys_base = 0;
extern struct pagemap *kernel_pagemap;
//extern struct pagemap *kernel_pagemap;
uintptr_t setup_user_stack(void)
/*
 * Allocate a new address space for a user process.
 *
 * The new pagemap gets its own PML4 page. Entries 256..511 (the higher
 * half) are copied from kernel_pagemap so the kernel stays mapped while
 * this pagemap is active. Entries 0..255 are left as returned by
 * pmm_allocz(), which is expected to hand back a zeroed page.
 *
 * Returns the new pagemap, or NULL if either the pagemap struct or the
 * PML4 page could not be allocated.
 */
struct pagemap *create_user_pagemap(void)
{
    struct pagemap *pm = kmalloc(sizeof(*pm));
    if (!pm) {
        printf("Failed to allocate user pagemap struct!\n");
        return NULL;
    }
    spinlock_init(&pm->lock);

    /*
     * Test the physical address BEFORE adding MEM_PHYS_OFFSET: once the
     * offset is added a failed allocation no longer compares equal to
     * NULL and the error would go unnoticed.
     */
    uintptr_t pml4_phys = (uintptr_t)pmm_allocz(1);
    if (!pml4_phys) {
        printf("Failed to allocate user PML4!\n");
        kfree(pm);
        return NULL;
    }
    pm->top_level = (uint64_t *)(pml4_phys + MEM_PHYS_OFFSET);

    /* Copy kernel higher-half mappings (kernel + HHDM) */
    for (size_t i = 256; i < 512; i++) {
        pm->top_level[i] = kernel_pagemap->top_level[i];
    }
    /* Lower half remains zero (user address space) */

    printf("[usermode] user pagemap created (PML4 phys = 0x%lx)\n",
           (uint64_t)pm->top_level - MEM_PHYS_OFFSET);
    return pm;
}
uintptr_t setup_user_stack(struct pagemap *pagemap)
{
user_stack_phys_base = (uint64_t)pmm_alloc(USER_STACK_PAGES);
@@ -37,7 +66,7 @@ uintptr_t setup_user_stack(void)
uintptr_t virt = stack_bottom + i * PAGE_SIZE;
uintptr_t phys = user_stack_phys_base + i * PAGE_SIZE;
if (!vmm_map_page(kernel_pagemap,
if (!vmm_map_page(pagemap,
virt,
phys,
PAGE_READ | PAGE_WRITE | PAGE_USER,
@@ -47,19 +76,13 @@ uintptr_t setup_user_stack(void)
for (;;);
}
// zero physical page through HHDM
//memset((void *)(phys + g_hhdm_offset), 0, PAGE_SIZE);
}
uintptr_t rsp = USER_STACK_TOP;
rsp &= ~0xFULL;
return rsp;
}
// usermode.c
__attribute__((naked))
void enter_user_mode(uint64_t rip, uint64_t rsp)
{
@@ -98,21 +121,27 @@ void enter_user_mode(uint64_t rip, uint64_t rsp)
void start_userspace(void)
{
void *entry = NULL;
if (!ELF_Read("init.elf", &entry)) {
struct pagemap *user_pagemap = create_user_pagemap();
if (!user_pagemap) {
printf("Failed to create user pagemap\n");
for (;;);
}
void *elf_entry = NULL;
if (!ELF_Read("init.elf", &elf_entry, user_pagemap)) {
printf("Failed to load init.elf\n");
for(;;);
}
if (!entry) {
if (!elf_entry) {
printf("ELF has no entry point\n");
for(;;);
}
uintptr_t rsp = setup_user_stack();
uintptr_t user_rsp = setup_user_stack(user_pagemap);
printf("Entering usermode RIP=%p RSP=%p\n", entry, (void*)rsp);
printf("Entering usermode RIP=%p RSP=%p\n", elf_entry, (void*)user_rsp);
enter_user_mode((uint64_t)entry, (rsp & ~0xFULL));
sched_create_user_task("init", (uint64_t)elf_entry, user_rsp, user_pagemap);
}
+2 -1
View File
@@ -6,6 +6,7 @@
#include "e9.h"
#include "limine.h"
#include "apic.h"
#include "sched/scheduler.h"
__attribute__((used, section(".limine_requests")))
volatile struct limine_date_at_boot_request boot_request = {
@@ -39,7 +40,7 @@ void PIT_IRQ_Handler(Registers* regs)
lapic_eoi();
}
// You can add scheduler / time logic here later
sched_tick();
}
/* ========================= */
+16 -7
View File
@@ -7,16 +7,13 @@
#include "fs/ext2.h"
extern uintptr_t g_hhdm_offset;
extern struct pagemap *kernel_pagemap;
#define ELF_BUFFER_SIZE (1024 * 1024)
#define PTE_PRESENT (1ULL << 0)
#define PTE_WRITABLE (1ULL << 1)
#define PTE_USER (1ULL << 2)
bool ELF_Read(const char* path, void** entryPoint)
bool ELF_Read(const char* path, void** entryPoint, struct pagemap *target_pagemap)
{
uint32_t size;
@@ -41,6 +38,15 @@ bool ELF_Read(const char* path, void** entryPoint)
ELFHeader* header = (ELFHeader*)elf_buffer;
printf("=== ELF DEBUG ===\n");
printf("Entry point VA = 0x%lx\n", header->ProgramEntryPosition);
printf("PHDR offset = 0x%lx\n", header->ProgramHeaderTablePosition);
printf("PHDR count = %u\n", header->ProgramHeaderTableEntryCount);
printf("=== END ELF DEBUG ===\n");
// ── validate ELF ──────────────────────────────────
if (memcmp(header->Magic, ELF_MAGIC, 4) != 0) {
printf("ELF: bad magic\n");
@@ -82,8 +88,11 @@ bool ELF_Read(const char* path, void** entryPoint)
ELFProgramHeader* ph = (ELFProgramHeader*)(ph_table +
i * header->ProgramHeaderTableEntrySize);
if (ph->Type != ELF_PROGRAM_TYPE_LOAD)
if (ph->Type != ELF_PROGRAM_TYPE_LOAD) {
printf("LOAD segment: VA=0x%lx FileSz=0x%lx MemSz=0x%lx\n",
ph->VirtualAddress, ph->FileSize, ph->MemorySize);
continue;
}
uint64_t virt = ph->VirtualAddress;
uint64_t offset = ph->Offset;
@@ -114,7 +123,7 @@ bool ELF_Read(const char* path, void** entryPoint)
uint64_t phys_addr = phys_base + p * PAGE_SIZE;
bool success = vmm_map_page(
kernel_pagemap,
target_pagemap,
virt_addr,
phys_addr,
PAGE_READ | PAGE_WRITE | PAGE_USER, // RW + User mode
+2 -1
View File
@@ -1,6 +1,7 @@
#pragma once
#include <stdint.h>
#include <stdbool.h>
#include "mm/vmm.h"
#define ELF_MAGIC ("\x7F" "ELF")
@@ -114,4 +115,4 @@ enum ELFProgramType {
};
bool ELF_Read(const char* path, void** entryPoint);
bool ELF_Read(const char* path, void** entryPoint, struct pagemap *target_pagemap);
+7 -1
View File
@@ -131,13 +131,19 @@ int VFS_Read_internal(fd_t fd, uint8_t* buf, size_t size)
// naive: read whole file then slice
uint8_t* tmp = kmalloc(file_size);
if (!ext2_read_file(&file->ext2.inode, tmp))
if (!tmp) {
return -1;
}
if (!ext2_read_file(&file->ext2.inode, tmp)) {
kfree(tmp);
return -1;
}
for (size_t i = 0; i < size; i++)
buf[i] = tmp[file->offset + i];
file->offset += size;
kfree(tmp);
return size;
}
+14 -2
View File
@@ -31,6 +31,7 @@
#include "arch/x86_64/sys/apic.h"
#include "arch/x86_64/sys/ioapic.h"
#include "drivers/input/ps2.h"
#include "sched/scheduler.h"
uintptr_t g_hhdm_offset;
@@ -350,8 +351,19 @@ void kmain(void) {
syscall_init();
sched_init();
start_userspace();
// We're done, just hang...
hcf();
sched_yield();
for (;;) {
sched_yield();
}
// We're done, just hang...
//hcf();
}
+6
View File
@@ -23,6 +23,10 @@ extern volatile struct limine_executable_address_request kernel_address_request;
#define PAGE_USER (1ULL << 2)
#define PAGE_NO_EXECUTE (1ULL << 63)
#define PTE_PRESENT (1ULL << 0)
#define PTE_WRITABLE (1ULL << 1)
#define PTE_USER (1ULL << 2)
struct pagemap {
spinlock_t lock;
uint64_t *top_level;
@@ -37,3 +41,5 @@ void vmm_switch_pagemap(struct pagemap *pagemap);
bool vmm_map_page(struct pagemap *pagemap, uint64_t virt, uint64_t phys,
uint64_t flags, enum page_size pg_size);
uint64_t vmm_virt_to_phys(struct pagemap *pagemap, uint64_t virt);
uint64_t *vmm_virt_to_pte(struct pagemap *pagemap, uintptr_t virt_addr,
bool allocate);
+7 -3
View File
@@ -3,7 +3,11 @@
#include <stdint.h>
struct cpu_local {
uint64_t self; // +0x00 (GS:0x00)
uint64_t user_rsp; // +0x08 (GS:0x08) — saved user RSP on syscall entry
uint64_t kernel_rsp; // +0x10 (GS:0x10) — kernel stack for syscall handler
uint64_t self; /* +0x00 (GS:0x00) — points to this struct */
uint64_t user_rsp; /* +0x08 (GS:0x08) — saved user RSP on SYSCALL */
uint64_t kernel_rsp; /* +0x10 (GS:0x10) — kernel stack for syscall */
/* Future SMP fields: */
uint32_t cpu_id; /* +0x18 logical CPU index */
uint32_t _pad;
void *current; /* +0x20 pointer to current task_t */
};
+75
View File
@@ -0,0 +1,75 @@
# void sched_context_switch(struct cpu_context *from,
# struct cpu_context *to);
#
# struct cpu_context layout (from scheduler.h):
# offset 0 : rsp (uint64_t)
# offset 8 : cr3 (uint64_t)
#
# Strategy
# --------
# Only callee-saved registers need explicit saving; caller-saved registers
# (rax, rcx, rdx, rsi, rdi, r8-r11) are already preserved by the caller
# where needed, per the System V ABI.
#
# 1. Push all callee-saved GPRs onto the *current* kernel stack.
# 2. Save RSP into from->rsp.
# 3. If to->cr3 is non-zero and differs from the current CR3, load it
# (switches address space for user processes).
# 4. Load RSP from to->rsp (switch to next task's kernel stack).
# 5. Pop all callee-saved GPRs from the *new* stack.
# 6. ret: pops the return address placed there during task creation
# (kthread_trampoline / user_task_trampoline) for a brand-new task,
# or returns into the schedule() call-site for a resumed task.
#
# NOTE: This function is called with IF=0 (interrupts disabled) and must
# NOT modify IF itself. The trampolines re-enable interrupts after the
# first-ever schedule-in of a task.
.section .text
.global sched_context_switch
.type sched_context_switch, @function
sched_context_switch:
# Arguments: %rdi = struct cpu_context *from, %rsi = struct cpu_context *to.
# Clobbers %rax and %rcx (both caller-saved).
#
# Save outgoing task: push the callee-saved registers in the order
# rbx, rbp, r12, r13, r14, r15, so r15 ends up at the lowest address.
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
# from->rsp = RSP (rdi = struct cpu_context *from, offset 0 = rsp)
movq %rsp, 0(%rdi)
# Switch address space (CR3) if needed
# to->cr3 is at offset 8 in struct cpu_context
movq 8(%rsi), %rax
testq %rax, %rax # 0 means "keep current CR3" (kernel thread)
jz .Lno_cr3
movq %cr3, %rcx
cmpq %rax, %rcx # Same CR3? Don't flush the TLB needlessly.
je .Lno_cr3
movq %rax, %cr3 # Load new page table root (flushes TLB)
.Lno_cr3:
# Switch to incoming task's kernel stack
# to->rsp is at offset 0 (rsi = struct cpu_context *to)
movq 0(%rsi), %rsp
# Restore incoming task's callee-saved registers.
# Pop order is the exact reverse of the push order above: r15 first,
# rbx last. A hand-built initial stack must use the same layout.
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
# ret pops the "return address" from the new task's stack.
# Brand-new task: jumps to kthread_trampoline or user_task_trampoline.
# Resumed task: returns into schedule(), then back up the call chain.
ret
.size sched_context_switch, . - sched_context_switch
+935
View File
@@ -0,0 +1,935 @@
#include "scheduler.h"
#include "mm/memory.h"
#include "mm/pmm.h"
#include "libk/stdio.h"
#include "arch/x86_64/cpu/io.h"
#include "arch/x86_64/sys/pit.h"
#include "string.h"
/* =====================================================================
* Forward declarations for GDT/TSS (defined in gdt.c)
* ===================================================================== */
/*
 * Local copy of the Task State Segment layout so this file can write to
 * kernel_tss. The struct is packed, so every field sits at the exact byte
 * offset implied by the declaration order; it has to stay in step with the
 * definition used by gdt.c.
 */
typedef struct {
uint32_t reserved0;
uint64_t rsp0; /* written by schedule() with the top of the incoming user task's kernel stack */
uint64_t rsp1; /* not touched in this file */
uint64_t rsp2; /* not touched in this file */
uint64_t reserved1;
uint64_t ist[7]; /* presumably the interrupt-stack-table slots; unused here */
uint64_t reserved2;
uint16_t reserved3;
uint16_t iopb_offset; /* not touched in this file */
} __attribute__((packed)) TSS;
/* =====================================================================
* Globals
* ===================================================================== */
struct runqueue g_runqueue = {0};
task_t *g_current_task = NULL;
/* PIT tick counter (defined in pit.c) */
extern volatile uint64_t g_Ticks;
/* =====================================================================
* Linux-compatible nice → CPU weight table (NICE_0_LOAD = 1024)
*
* vruntime_delta = actual_ticks * NICE_0_LOAD / weight[nice + 20]
* timeslice = BASE * weight[nice + 20] / NICE_0_LOAD
*
* Low weight (high nice) → vruntime accumulates faster → scheduled less.
* ===================================================================== */
#define NICE_0_LOAD 1024u

/* One weight per nice level; index 0 is nice -20, index 39 is nice +19. */
static const uint32_t nice_to_weight[40] = {
    /* -20 .. -16 */ 88761, 71755, 56483, 46273, 36291,
    /* -15 .. -11 */ 29154, 23254, 18705, 14949, 11916,
    /* -10 ..  -6 */  9548,  7620,  6100,  4904,  3906,
    /*  -5 ..  -1 */  3121,  2501,  1991,  1586,  1277,
    /*   0 ..  +4 */  1024,   820,   655,   526,   423,
    /*  +5 ..  +9 */   335,   272,   215,   172,   137,
    /* +10 .. +14 */   110,    87,    70,    56,    45,
    /* +15 .. +19 */    36,    29,    23,    18,    15,
};

/*
 * Look up the scheduling weight for a nice value.
 * Values outside -20..+19 are clamped to the nearest table entry.
 */
static inline uint32_t weight_for_nice(int nice) {
    int slot = nice + 20;

    slot = (slot < 0) ? 0 : slot;
    slot = (slot > 39) ? 39 : slot;
    return nice_to_weight[slot];
}
/* =====================================================================
* PID allocator (simple monotonic counter, single-CPU safe)
* ===================================================================== */
static pid_t g_next_pid = 1;

/*
 * Return the next unused PID and advance the counter.
 * No locking and no wrap-around handling; PIDs are never reused.
 */
static pid_t alloc_pid(void) {
    pid_t assigned = g_next_pid;

    g_next_pid = assigned + 1;
    return assigned;
}
/* =====================================================================
* RT bitmap helpers (O(1) find-first-set)
* ===================================================================== */
/* Mark RT priority `prio` as having queued tasks; priority p lives at bit p-1. */
static inline void rt_bitmap_set(struct rt_prio_array *arr, int prio) {
    const int bit = prio - 1;
    const int word = bit / 64;

    arr->bitmap[word] = arr->bitmap[word] | (1ULL << (bit % 64));
}
/* Mark RT priority `prio` as empty; priority p lives at bit p-1. */
static inline void rt_bitmap_clear(struct rt_prio_array *arr, int prio) {
    const int bit = prio - 1;
    const int word = bit / 64;

    arr->bitmap[word] = arr->bitmap[word] & ~(1ULL << (bit % 64));
}
/*
 * Find the numerically lowest RT priority whose bit is set.
 * Scans the two bitmap words in order; returns the priority (bit index + 1),
 * or -1 when both words are zero.
 */
static inline int rt_bitmap_first(const struct rt_prio_array *arr) {
    for (int word = 0; word < 2; word++) {
        if (arr->bitmap[word])
            return __builtin_ctzll(arr->bitmap[word]) + 1 + 64 * word;
    }
    return -1;
}
/* =====================================================================
* RT FIFO queue operations
* ===================================================================== */
/*
 * Append `task` to the tail of the FIFO for its static_prio, mark that
 * priority in the bitmap, and bump both the RT and run-queue counters.
 * The caller is responsible for holding the run-queue lock.
 */
static void rt_enqueue(struct runqueue *rq, task_t *task) {
    struct rt_prio_array *arr = &rq->rt;
    const int prio = task->static_prio;
    task_t *old_tail = arr->tail[prio];

    task->rq_next = NULL;
    task->rq_prev = old_tail;

    if (old_tail == NULL)
        arr->head[prio] = task;      /* queue was empty */
    else
        old_tail->rq_next = task;
    arr->tail[prio] = task;

    rt_bitmap_set(arr, prio);
    arr->total += 1;
    rq->nr_running += 1;
}
/*
 * Pop the first task from the FIFO of priority `prio`.
 * Returns NULL (and changes nothing) if that FIFO is empty. When the last
 * task is removed the priority's bitmap bit is cleared.
 */
static task_t *rt_dequeue_head(struct runqueue *rq, int prio) {
    struct rt_prio_array *arr = &rq->rt;
    task_t *first = arr->head[prio];

    if (first == NULL)
        return NULL;

    task_t *second = first->rq_next;
    arr->head[prio] = second;
    if (second == NULL) {
        /* FIFO is now empty */
        arr->tail[prio] = NULL;
        rt_bitmap_clear(arr, prio);
    } else {
        second->rq_prev = NULL;
    }

    first->rq_prev = NULL;
    first->rq_next = NULL;
    arr->total--;
    rq->nr_running--;
    return first;
}
/*
 * Unlink `task` from the RT FIFO of its static_prio.
 * There is no check that the task is actually linked; the caller must
 * guarantee it is on that FIFO.
 */
static void rt_remove(struct runqueue *rq, task_t *task) {
    struct rt_prio_array *arr = &rq->rt;
    const int prio = task->static_prio;
    task_t *before = task->rq_prev;
    task_t *after = task->rq_next;

    if (before != NULL)
        before->rq_next = after;
    else
        arr->head[prio] = after;

    if (after != NULL)
        after->rq_prev = before;
    else
        arr->tail[prio] = before;

    if (arr->head[prio] == NULL)
        rt_bitmap_clear(arr, prio);

    task->rq_prev = NULL;
    task->rq_next = NULL;
    arr->total--;
    rq->nr_running--;
}
/* =====================================================================
* CFS (normal) queue operations — sorted ascending by vruntime
* ===================================================================== */
/*
 * Link `task` into the CFS list, which is kept in ascending vruntime order.
 *
 * A task whose vruntime is below rq->min_vruntime is first raised to that
 * floor. The task is placed after every queued task with an equal or
 * smaller vruntime, so ties keep arrival order. Linear in the queue length.
 */
static void cfs_enqueue(struct runqueue *rq, task_t *task) {
    if (task->vruntime < rq->min_vruntime)
        task->vruntime = rq->min_vruntime;

    /* Walk until `after` is the first entry with a larger vruntime. */
    task_t *before = NULL;
    task_t *after = rq->cfs_head;
    while (after != NULL && after->vruntime <= task->vruntime) {
        before = after;
        after = after->rq_next;
    }

    task->rq_prev = before;
    task->rq_next = after;
    if (after != NULL)
        after->rq_prev = task;
    if (before != NULL)
        before->rq_next = task;
    else
        rq->cfs_head = task;

    rq->cfs_count += 1;
    rq->nr_running += 1;
}
/* Remove the task with the smallest vruntime (head of list) */
static task_t *cfs_dequeue_min(struct runqueue *rq) {
task_t *task = rq->cfs_head;
if (!task) return NULL;
rq->cfs_head = task->rq_next;
if (rq->cfs_head) {
rq->cfs_head->rq_prev = NULL;
rq->min_vruntime = rq->cfs_head->vruntime;
}
task->rq_next = task->rq_prev = NULL;
rq->cfs_count--;
rq->nr_running--;
return task;
}
/*
 * Unlink `task` from the CFS list.
 * There is no check that the task is actually linked; calling this for a
 * task that is not on the list would reset cfs_head and skew the counters.
 */
static void cfs_remove(struct runqueue *rq, task_t *task) {
    task_t *before = task->rq_prev;
    task_t *after = task->rq_next;

    if (before != NULL)
        before->rq_next = after;
    else
        rq->cfs_head = after;

    if (after != NULL)
        after->rq_prev = before;

    task->rq_prev = NULL;
    task->rq_next = NULL;
    rq->cfs_count--;
    rq->nr_running--;
}
/* =====================================================================
* Timeslice calculation
* ===================================================================== */
/*
 * Compute the timeslice to give `task`, based on its policy:
 *   SCHED_FIFO  - UINT64_MAX (effectively never expires)
 *   SCHED_RR    - SCHED_RR_SLICE_MS
 *   SCHED_IDLE  - SCHED_BASE_SLICE_MS
 *   anything else - SCHED_BASE_SLICE_MS scaled by the nice weight,
 *                   clamped to [SCHED_MIN_SLICE_MS, SCHED_MAX_SLICE_MS]
 */
static uint64_t calc_timeslice(const task_t *task) {
    if (task->policy == SCHED_FIFO)
        return UINT64_MAX;
    if (task->policy == SCHED_RR)
        return SCHED_RR_SLICE_MS;
    if (task->policy == SCHED_IDLE)
        return SCHED_BASE_SLICE_MS;

    /* SCHED_NORMAL / SCHED_BATCH */
    const uint32_t weight = weight_for_nice(task->nice);
    uint64_t slice_ms = (uint64_t)SCHED_BASE_SLICE_MS * weight / NICE_0_LOAD;

    if (slice_ms < SCHED_MIN_SLICE_MS)
        slice_ms = SCHED_MIN_SLICE_MS;
    if (slice_ms > SCHED_MAX_SLICE_MS)
        slice_ms = SCHED_MAX_SLICE_MS;
    return slice_ms;
}
/*
 * Charge `elapsed_ticks` of CPU time to `task`.
 *
 * sum_exec_runtime always grows by the raw tick count. For SCHED_NORMAL and
 * SCHED_BATCH the vruntime grows by ticks * NICE_0_LOAD / weight, so a
 * heavier (lower nice) task ages more slowly. Every other policy adds the
 * raw tick count to vruntime.
 */
static void update_vruntime(task_t *task, uint64_t elapsed_ticks) {
    const bool weighted = (task->policy == SCHED_NORMAL) ||
                          (task->policy == SCHED_BATCH);
    uint64_t charge = elapsed_ticks;

    if (weighted) {
        const uint32_t weight = weight_for_nice(task->nice);
        charge = elapsed_ticks * NICE_0_LOAD / weight;
    }

    task->sum_exec_runtime += elapsed_ticks;
    task->vruntime += charge;
}
/* =====================================================================
* sched_enqueue / sched_dequeue (public, uses run-queue lock)
* ===================================================================== */
/*
 * Make `task` runnable: set its state to TASK_RUNNING and put it where its
 * policy says it belongs. RT policies go on the RT FIFO, SCHED_IDLE replaces
 * the run queue's single idle slot, and every other policy goes on the CFS
 * list. Takes the run-queue lock with interrupts saved and disabled.
 */
void sched_enqueue(task_t *task) {
    uint64_t irq_flags;

    spinlock_acquire_irqsave(&g_runqueue.lock, &irq_flags);
    task->state = TASK_RUNNING;

    switch (task->policy) {
    case SCHED_FIFO:
    case SCHED_RR:
        rt_enqueue(&g_runqueue, task);
        break;
    case SCHED_IDLE:
        /* Only one idle task exists; it is stored outside the queues. */
        g_runqueue.idle = task;
        break;
    default:
        cfs_enqueue(&g_runqueue, task);
        break;
    }

    spinlock_release_irqrestore(&g_runqueue.lock, irq_flags);
}
/*
 * Take `task` off the RT or CFS queue matching its policy. SCHED_IDLE tasks
 * are left alone. The task is assumed to be linked on that queue; nothing
 * here verifies it. Takes the run-queue lock with interrupts saved and
 * disabled.
 */
void sched_dequeue(task_t *task) {
    uint64_t irq_flags;

    spinlock_acquire_irqsave(&g_runqueue.lock, &irq_flags);

    switch (task->policy) {
    case SCHED_FIFO:
    case SCHED_RR:
        rt_remove(&g_runqueue, task);
        break;
    case SCHED_IDLE:
        break;
    default:
        cfs_remove(&g_runqueue, task);
        break;
    }

    spinlock_release_irqrestore(&g_runqueue.lock, irq_flags);
}
/* =====================================================================
 * pick_next_task (called with interrupts OFF and lock held)
 *
 * Selection order:
 *   1. Head of the numerically lowest non-empty RT priority FIFO
 *   2. CFS task with the smallest vruntime
 *   3. rq->idle (whatever is stored there, possibly NULL before init)
 *
 * The chosen RT/CFS task is REMOVED from its queue; the idle task is
 * returned without any queue manipulation.
 * ===================================================================== */
static task_t *pick_next_task(struct runqueue *rq) {
    const int best_rt = rt_bitmap_first(&rq->rt);

    if (best_rt > 0)
        return rt_dequeue_head(rq, best_rt);

    return (rq->cfs_count > 0) ? cfs_dequeue_min(rq) : rq->idle;
}
/* =====================================================================
* Trampolines
* These are the "return addresses" pushed onto a new task's kernel
* stack. When the task is scheduled for the first time, ret inside
* sched_context_switch() jumps here.
* ===================================================================== */
/*
 * First code executed by a new kernel thread. Control arrives here through
 * the `ret` in sched_context_switch() with interrupts still disabled.
 */
static void kthread_trampoline(void) {
    /* Turn interrupts back on before running the thread body. */
    x86_64_EnableInterrupts();

    task_t *me = g_current_task;
    me->kthread_entry(me->kthread_arg);

    /* The thread function returned: exit with status 0. */
    sched_exit(0);
}
/*
 * First code executed by a new user task; reached through the `ret` in
 * sched_context_switch(). Drops to user mode at self->user_entry with the
 * stack pointer set to self->user_stack_top. Never returns.
 */
static void user_task_trampoline(void) {
/* Interrupts are enabled here, i.e. before the iretq frame is built. */
x86_64_EnableInterrupts();
task_t *self = g_current_task;
/*
* Build an iretq frame on the current (kernel) stack and enter
* user mode. We reset the stack pointer to the very top of the
* kernel stack first, so the iretq frame doesn't sit below a
* pile of stale context-switch frames.
*
* Segment selectors (from your GDT / STAR setup):
* User CS = 0x23 (GDT index 4, RPL 3)
* User SS = 0x1B (GDT index 3, RPL 3)
*
* NOTE(review): no swapgs is issued on this path; confirm the GS base
* handling agrees with the syscall entry code.
*/
uint64_t kstack_top = (uint64_t)self->kernel_stack + self->kernel_stack_size;
uint64_t user_rip = self->user_entry;
uint64_t user_rsp = self->user_stack_top;
/* All three operands use "r" constraints, so they are in registers and
* stay valid after RSP is overwritten by the first instruction. */
asm volatile(
"movq %0, %%rsp\n\t" /* Reset kernel RSP to stack top */
"pushq $0x1B\n\t" /* SS: user data segment */
"pushq %1\n\t" /* RSP: user stack pointer */
"pushfq\n\t" /* RFLAGS: copy of the current flags */
"orq $0x200, (%%rsp)\n\t" /* Set IF so user code runs with */
/* interrupts enabled */
"pushq $0x23\n\t" /* CS: user code segment */
"pushq %2\n\t" /* RIP: user entry point */
"iretq\n\t"
:
: "r"(kstack_top), "r"(user_rsp), "r"(user_rip)
: "memory"
);
__builtin_unreachable();
}
/* =====================================================================
 * Kernel stack setup for a new task
 *
 * Layout, from high to low addresses (top is rounded down to 16 bytes):
 *
 *   [0]                 fake return address / alignment pad
 *   [trampoline_addr]   consumed by `ret` in sched_context_switch
 *   [rbx=0] [rbp=0] [r12=0] [r13=0] [r14=0]
 *   [r15=0]             <- ctx.rsp points here
 *
 * sched_context_switch pops in order: r15, r14, r13, r12, rbp, rbx,
 * then ret -> trampoline.
 *
 * The pad slot makes the trampoline start with RSP+8 16-byte aligned,
 * exactly as if it had been reached by a `call`. Without it the
 * trampoline (an ordinary C function) would run on a stack that is off
 * by 8 from what the System V ABI promises at function entry.
 * ===================================================================== */
static void setup_initial_kstack(task_t *task, void *trampoline) {
    uintptr_t top = (uintptr_t)task->kernel_stack + task->kernel_stack_size;

    top &= ~(uintptr_t)0xF;             /* do not rely on allocator alignment */
    uint64_t *sp = (uint64_t *)top;

    *--sp = 0;                          /* fake return address (never used) */
    *--sp = (uint64_t)trampoline;       /* "return address" for ret */
    *--sp = 0;                          /* rbx (popped last)  */
    *--sp = 0;                          /* rbp */
    *--sp = 0;                          /* r12 */
    *--sp = 0;                          /* r13 */
    *--sp = 0;                          /* r14 */
    *--sp = 0;                          /* r15 (popped first) */

    task->ctx.rsp = (uint64_t)sp;
}
/* =====================================================================
* Task allocation helper
* ===================================================================== */
static task_t *alloc_task(const char *name, bool is_user) {
task_t *task = kmalloc(sizeof(task_t));
if (!task) return NULL;
memset(task, 0, sizeof(task_t));
strncpy(task->name, name, sizeof(task->name) - 1);
task->is_user = is_user;
task->state = TASK_RUNNING;
task->policy = SCHED_NORMAL;
task->nice = NICE_DEFAULT;
task->static_prio = NICE_TO_PRIO(NICE_DEFAULT);
task->prio = task->static_prio;
task->pid = alloc_pid();
task->ppid = g_current_task ? g_current_task->pid : 0;
task->parent = g_current_task;
/* Default: all signals use SIG_DFL */
for (int i = 0; i < _NSIG; i++)
task->sigactions[i].sa_handler = SIG_DFL;
/* Allocate kernel stack */
task->kernel_stack_size = KSTACK_SIZE;
task->kernel_stack = kmalloc(KSTACK_SIZE);
if (!task->kernel_stack) {
kfree(task);
return NULL;
}
memset(task->kernel_stack, 0xCC, KSTACK_SIZE); /* poison */
return task;
}
/* =====================================================================
* Public task-creation API
* ===================================================================== */
/*
 * Create a kernel thread that will run entry(arg) and enqueue it.
 *
 * The thread gets ctx.cr3 = 0, which sched_context_switch treats as
 * "keep the current address space". Its vruntime starts at the run queue's
 * current min_vruntime.
 *
 * Returns the new task, or NULL if allocation failed.
 */
task_t *sched_create_kthread(const char *name,
                             void (*entry)(void *), void *arg)
{
    task_t *kt = alloc_task(name, false);
    if (kt == NULL)
        return NULL;

    kt->kthread_entry = entry;
    kt->kthread_arg   = arg;
    kt->ctx.cr3       = 0;
    setup_initial_kstack(kt, kthread_trampoline);

    kt->vruntime   = g_runqueue.min_vruntime;
    kt->time_slice = calc_timeslice(kt);

    sched_enqueue(kt);
    printf("[sched] kthread '%s' pid=%d created\n", kt->name, kt->pid);
    return kt;
}
/*
 * Create a user-mode task and enqueue it.
 *
 * name      - task name (copied, truncated to fit task->name)
 * entry_rip - user-space address where execution starts
 * user_rsp  - initial user stack pointer
 * pm        - address space for the task; must have a valid top_level
 *
 * ctx.cr3 is set to the physical address of pm->top_level (its virtual
 * address minus MEM_PHYS_OFFSET). The task first runs
 * user_task_trampoline, which performs the drop to user mode.
 *
 * Returns the new task, or NULL if `pm` is unusable or allocation failed.
 */
task_t *sched_create_user_task(const char *name,
                               uint64_t entry_rip, uint64_t user_rsp,
                               struct pagemap *pm)
{
    /* Reject a missing address space up front, before anything is
     * allocated, instead of faulting on pm->top_level below. */
    if (!pm || !pm->top_level)
        return NULL;

    task_t *task = alloc_task(name, true);
    if (!task)
        return NULL;

    task->pagemap        = pm;
    task->user_entry     = entry_rip;
    task->user_stack_top = user_rsp;

    /* CR3 = physical address of PML4 */
    task->ctx.cr3 = (uint64_t)pm->top_level - MEM_PHYS_OFFSET;
    setup_initial_kstack(task, user_task_trampoline);

    task->time_slice = calc_timeslice(task);
    task->vruntime   = g_runqueue.min_vruntime;

    sched_enqueue(task);
    printf("[sched] user task '%s' pid=%d created, entry=0x%lx\n",
           task->name, task->pid, entry_rip);
    return task;
}
/* =====================================================================
 * Idle task
 *
 * Body of the idle thread: enable interrupts, halt until one arrives,
 * disable interrupts again, repeat forever. Any rescheduling happens
 * inside the interrupt handler (sched_tick), not in this loop.
 * ===================================================================== */
static void idle_entry(void *arg) {
    (void)arg;

    while (1) {
        asm volatile("sti; hlt; cli" ::: "memory");
    }
}
/* =====================================================================
 * sched_init — set up the boot and idle tasks and initialise the run queue
 *
 * After this returns:
 *   - g_current_task / g_runqueue.current describe the code that is
 *     already running (the "boot" task, first PID handed out);
 *   - g_runqueue.idle holds a SCHED_IDLE kernel thread that is never put
 *     on the RT or CFS queues.
 *
 * Halts forever if either task cannot be allocated.
 * ===================================================================== */
void sched_init(void) {
    spinlock_init(&g_runqueue.lock);
    g_runqueue.min_vruntime = 0;

    /* Synthesise a "current" descriptor for the boot thread so that
     * the first schedule() call has something valid in g_current_task. */
    task_t *boot = kmalloc(sizeof(task_t));
    if (!boot) {
        printf("[sched] FATAL: cannot allocate boot task\n");
        while (1) asm volatile("hlt");
    }
    memset(boot, 0, sizeof(task_t));

    /* Size the copy from the field itself (as alloc_task does): strncpy
     * zero-pads up to the given length, so a hard-coded 63 would write past
     * the array if name[] were ever smaller than 64 bytes. */
    strncpy(boot->name, "boot", sizeof(boot->name) - 1);

    boot->pid         = alloc_pid();
    boot->state       = TASK_RUNNING;
    boot->policy      = SCHED_NORMAL;
    boot->nice        = NICE_DEFAULT;
    boot->static_prio = NICE_TO_PRIO(NICE_DEFAULT);
    boot->prio        = boot->static_prio;
    boot->time_slice  = calc_timeslice(boot);
    boot->slice_start = g_Ticks;
    /* kernel_stack stays NULL: we're already running on the boot stack;
     * RSP will be saved by the first sched_context_switch() call. */
    boot->is_user     = false;

    g_current_task     = boot;
    g_runqueue.current = boot;

    /* Create the idle task but do NOT go through sched_create_kthread
     * because we store it separately (not on the CFS/RT queues). */
    task_t *idle = alloc_task("idle", false);
    if (!idle) {
        printf("[sched] FATAL: cannot allocate idle task\n");
        while (1) asm volatile("hlt");
    }
    idle->policy        = SCHED_IDLE;
    idle->static_prio   = IDLE_PRIO;
    idle->prio          = IDLE_PRIO;
    idle->kthread_entry = idle_entry;
    idle->kthread_arg   = NULL;
    idle->ctx.cr3       = 0;
    setup_initial_kstack(idle, kthread_trampoline);
    g_runqueue.idle = idle;

    printf("[sched] initialised; boot pid=%d\n", boot->pid);
}
/* =====================================================================
* schedule() — the heart of the scheduler
*
* Must be called with interrupts disabled (IF=0). Restores IF when
* the scheduled-in task next runs (via its own stack context or via
* the kthread_trampoline which calls x86_64_EnableInterrupts).
* ===================================================================== */
/*
 * Pick the next task and switch to it.
 *
 * Sequence: take the run-queue lock, pick (and dequeue) the next task,
 * charge the outgoing task for the ticks it ran, put the outgoing task back
 * on its queue if it is still TASK_RUNNING, publish the new current task,
 * update TSS.rsp0 for user tasks, drop the lock, then switch stacks.
 * Returns without switching if there is no other task to run.
 */
void schedule(void) {
/*
* We deliberately do NOT use spinlock_acquire_irqsave here because
* we're already called with IF=0 (either from an ISR or from
* sched_yield/sched_block which do cli first).
* We use a plain spinlock_acquire_or_wait so that on SMP (future)
* another CPU spinning on the lock eventually gets it.
*/
spinlock_acquire_or_wait(&g_runqueue.lock);
task_t *prev = g_runqueue.current;
/* pick_next_task() removes its choice from the RT/CFS queue. `prev` is
* only re-enqueued further down, so it is not a candidate in this pick. */
task_t *next = pick_next_task(&g_runqueue);
if (next == prev || next == NULL) {
/* Nothing to switch to; keep running current task.
* Note: prev's time_slice is not refreshed on this path. */
spinlock_drop(&g_runqueue.lock);
return;
}
/* Account for time the current task actually ran */
uint64_t now = g_Ticks;
/* Guard against slice_start lying in the future; charge 0 in that case. */
uint64_t elapsed = (prev->slice_start <= now)
? (now - prev->slice_start)
: 0;
update_vruntime(prev, elapsed);
/* Re-enqueue the outgoing task if it is still runnable (preempted).
* If it blocked/exited, its state is no longer TASK_RUNNING.
* The idle task is never queued; it lives in g_runqueue.idle. */
if (prev->state == TASK_RUNNING && prev != g_runqueue.idle) {
prev->time_slice = calc_timeslice(prev); /* refresh slice */
if (prev->policy == SCHED_FIFO || prev->policy == SCHED_RR) {
rt_enqueue(&g_runqueue, prev);
} else if (prev->policy != SCHED_IDLE) {
cfs_enqueue(&g_runqueue, prev);
}
}
/* Set up the incoming task */
next->state = TASK_RUNNING;
next->need_reschedule = false;
next->slice_start = now;
g_runqueue.current = next;
g_current_task = next;
g_runqueue.nr_switches++;
/* Update TSS.RSP0 so that user-mode interrupts for this task use
* the correct kernel stack. */
if (next->is_user && next->kernel_stack) {
kernel_tss.rsp0 = (uint64_t)next->kernel_stack
+ next->kernel_stack_size;
}
/* The lock is released before the stack switch; interrupts are still
* off at this point per the caller contract. */
spinlock_drop(&g_runqueue.lock);
/* ---- Context switch -------------------------------------------- */
sched_context_switch(&prev->ctx, &next->ctx);
/*
* When we return here we are BACK in the context of `prev`
* (which has just been rescheduled). Process any pending signals
* before returning to user space.
*/
task_handle_pending_signals();
}
/* =====================================================================
 * sched_tick() — periodic timer hook (called from the PIT IRQ handler)
 *
 * Consumes one unit of the current task's timeslice and calls schedule()
 * when the slice has reached zero or the task's need_reschedule flag is
 * set. Does nothing if there is no current task yet.
 * ===================================================================== */
void sched_tick(void) {
    task_t *running = g_current_task;

    if (running == NULL)
        return;

    if (running->time_slice > 0)
        running->time_slice -= 1;

    const bool slice_expired = (running->time_slice == 0);

    /* schedule() requires IF=0; that holds here because we are inside an
     * IRQ handler and the CPU cleared IF on entry. */
    if (slice_expired || running->need_reschedule)
        schedule();
}
/* =====================================================================
* sched_yield() — voluntary CPU release
* ===================================================================== */
void sched_yield(void) {
x86_64_DisableInterrupts();
g_current_task->time_slice = 0; /* Force preemption */
schedule();
x86_64_EnableInterrupts();
}
/* =====================================================================
 * sched_block() — put current task to sleep
 *
 * Sets the current task's state to `reason` (overwriting whatever state
 * it had) and calls schedule() with interrupts disabled. Because the
 * state is no longer TASK_RUNNING, schedule() does not re-enqueue the
 * task. Execution continues after the schedule() call once something has
 * made the task runnable again; interrupts are then enabled
 * unconditionally.
 * ===================================================================== */
void sched_block(task_state_t reason) {
    x86_64_DisableInterrupts();

    task_t *self = g_current_task;
    self->state = reason;

    schedule();
    x86_64_EnableInterrupts();
}
/* =====================================================================
 * sched_wake() — wake a sleeping task
 *
 * Makes `task` runnable again: sets TASK_RUNNING, refreshes its
 * timeslice and puts it on the queue matching its policy. If the woken
 * task has a numerically lower `prio` than the current task, the current
 * task's need_reschedule flag is set so the next tick preempts it.
 *
 * No-op when `task` is NULL, already TASK_RUNNING, or TASK_ZOMBIE.
 * A zombie must never be re-queued: its kernel stack is parked inside
 * sched_exit(), and running it again would land in that function's
 * final halt loop with interrupts disabled.
 *
 * Takes the run-queue lock with irqsave, so it may be called from
 * interrupt context.
 * ===================================================================== */
void sched_wake(task_t *task) {
    if (!task)
        return;

    uint64_t flags;
    spinlock_acquire_irqsave(&g_runqueue.lock, &flags);

    if (task->state != TASK_RUNNING && task->state != TASK_ZOMBIE) {
        task->state = TASK_RUNNING;
        task->time_slice = calc_timeslice(task);

        if (task->policy == SCHED_FIFO || task->policy == SCHED_RR) {
            rt_enqueue(&g_runqueue, task);
        } else if (task->policy != SCHED_IDLE) {
            cfs_enqueue(&g_runqueue, task);
        }

        /*
         * Preempt the current task if the woken task has strictly
         * higher priority (lower numeric priority value).
         */
        task_t *cur = g_runqueue.current;
        if (cur && task->prio < cur->prio) {
            cur->need_reschedule = true;
        }
    }

    spinlock_release_irqrestore(&g_runqueue.lock, flags);
}
/* =====================================================================
 * sched_exit() — terminate the current task (does not return)
 *
 * Records the exit code, marks the task TASK_ZOMBIE, sends SIGCHLD to the
 * parent if there is one, logs the exit and calls schedule(). Since the
 * task is no longer TASK_RUNNING, schedule() will not re-enqueue it.
 * The task's memory is not released here.
 * ===================================================================== */
void sched_exit(int exit_code) {
    x86_64_DisableInterrupts();

    task_t *dying = g_current_task;
    dying->state     = TASK_ZOMBIE;
    dying->exit_code = exit_code;

    task_t *parent = dying->parent;
    if (parent != NULL)
        task_send_signal(parent, SIGCHLD);

    printf("[sched] task '%s' pid=%d exited with code %d\n",
           dying->name, dying->pid, exit_code);

    /* Give the CPU away for good. */
    schedule();

    /* Safety net in case schedule() found nothing else to run. */
    while (1)
        asm volatile("hlt");
    __builtin_unreachable();
}
/* =====================================================================
* Signal delivery
* ===================================================================== */
/* Default signal actions */
/*
 * What happens to a task when a signal arrives whose handler is SIG_DFL.
 * SIG_ACTION_TERM is deliberately the zero value: the lookup table in
 * default_action() relies on unlisted slots reading as "terminate".
 */
typedef enum {
    SIG_ACTION_TERM = 0,
    SIG_ACTION_CORE,
    SIG_ACTION_IGN,
    SIG_ACTION_STOP,
    SIG_ACTION_CONT
} sig_default_action_t;

/*
 * Map a signal number to its default action.
 *
 * Only the signals whose default is NOT "terminate" are listed in the
 * table; every other number — including out-of-range or unknown ones —
 * yields SIG_ACTION_TERM.
 */
static sig_default_action_t default_action(int signum) {
    static const sig_default_action_t special[] = {
        /* core-dumping signals (we treat CORE same as TERM for now) */
        [SIGQUIT] = SIG_ACTION_CORE,
        [SIGILL]  = SIG_ACTION_CORE,
        [SIGABRT] = SIG_ACTION_CORE,
        [SIGFPE]  = SIG_ACTION_CORE,
        [SIGSEGV] = SIG_ACTION_CORE,
        [SIGBUS]  = SIG_ACTION_CORE,
        [SIGSYS]  = SIG_ACTION_CORE,
        [SIGTRAP] = SIG_ACTION_CORE,
        [SIGXCPU] = SIG_ACTION_CORE,
        [SIGXFSZ] = SIG_ACTION_CORE,

        /* ignored by default */
        [SIGCHLD]  = SIG_ACTION_IGN,
        [SIGURG]   = SIG_ACTION_IGN,
        [SIGWINCH] = SIG_ACTION_IGN,
        [SIGIO]    = SIG_ACTION_IGN,
        [SIGPWR]   = SIG_ACTION_IGN,

        /* job-control stops */
        [SIGSTOP] = SIG_ACTION_STOP,
        [SIGTSTP] = SIG_ACTION_STOP,
        [SIGTTIN] = SIG_ACTION_STOP,
        [SIGTTOU] = SIG_ACTION_STOP,

        [SIGCONT] = SIG_ACTION_CONT,
    };
    const int slots = (int)(sizeof(special) / sizeof(special[0]));

    if (signum < 0 || signum >= slots)
        return SIG_ACTION_TERM;

    return special[signum];
}
/*
 * Mark signal `signum` pending on `task` and wake the task if the signal
 * is allowed to interrupt its current sleep.
 *
 * Returns -1 for a NULL task or a signal number outside 1 .. _NSIG-1,
 * 0 otherwise.
 *
 * Wake rules:
 *   - SIGKILL / SIGCONT wake a task that is INTERRUPTIBLE, STOPPED or
 *     UNINTERRUPTIBLE, regardless of its signal mask.
 *   - any other signal wakes only an INTERRUPTIBLE task, and only when
 *     the signal is not blocked by the task's signal_mask.
 */
int task_send_signal(task_t *task, int signum) {
    if (!task)
        return -1;
    if (signum <= 0 || signum >= _NSIG)
        return -1;

    const uint64_t sigbit = 1ULL << signum;
    uint64_t irq_flags;

    spinlock_acquire_irqsave(&g_runqueue.lock, &irq_flags);

    /* Record the signal as pending (bit N == signal N). */
    task->pending_signals |= sigbit;

    /* Decide whether this signal gets the target out of its sleep. */
    bool wake = false;
    if (signum == SIGKILL || signum == SIGCONT) {
        task_state_t st = task->state;
        wake = (st == TASK_INTERRUPTIBLE ||
                st == TASK_STOPPED ||
                st == TASK_UNINTERRUPTIBLE);
    } else if ((task->signal_mask & sigbit) == 0) {
        wake = (task->state == TASK_INTERRUPTIBLE);
    }

    if (wake) {
        task->state = TASK_RUNNING;

        /* Lock is already held, so go straight to the class queues. */
        switch (task->policy) {
        case SCHED_FIFO:
        case SCHED_RR:
            rt_enqueue(&g_runqueue, task);
            break;
        case SCHED_IDLE:
            break;
        default:
            cfs_enqueue(&g_runqueue, task);
            break;
        }
    }

    spinlock_release_irqrestore(&g_runqueue.lock, irq_flags);
    return 0;
}
/*
* Handle pending signals for the current task.
* Called just before returning to user space (end of schedule(), syscall
* return path, or end of IRQ handler for user-mode tasks).
*/
/*
 * Deliver every pending, unblocked signal of the current task, lowest
 * signal number first.
 *
 * Bit encoding: bit N of pending_signals / signal_mask stands for signal
 * N.  This is the encoding task_send_signal() uses when it sets
 * `1ULL << signum`, so the bit index found here IS the signal number
 * (no +1 / -1 adjustment).
 *
 * Per signal:
 *   - SIG_IGN handler: dropped.
 *   - user-defined handler: called directly for kernel threads; for user
 *     tasks delivery is still a TODO and only logged.
 *   - SIG_DFL: the default action from default_action() is applied
 *     (terminate via sched_exit(128 + signum), stop, continue, ignore).
 *
 * NOTE(review): pending_signals is modified here without taking
 * g_runqueue.lock, while task_send_signal() sets bits under that lock —
 * confirm callers run with interrupts disabled.
 */
void task_handle_pending_signals(void) {
    task_t *self = g_current_task;
    if (!self) return;

    for (;;) {
        uint64_t deliverable = self->pending_signals & ~self->signal_mask;
        if (!deliverable) break;

        /* Lowest set bit == lowest-numbered pending, unblocked signal. */
        int signum = __builtin_ctzll(deliverable);

        /* Clear the pending bit before acting on the signal. */
        self->pending_signals &= ~(1ULL << signum);

        /*
         * Bit 0 and bits >= _NSIG do not name a signal.  They have just
         * been cleared, so discarding them cannot make the loop spin.
         */
        if (signum <= 0 || signum >= _NSIG) continue;

        sighandler_t handler = self->sigactions[signum].sa_handler;

        if (handler == SIG_IGN) {
            /* Explicitly ignored */
            continue;
        }

        if (handler != SIG_DFL) {
            /*
             * User-defined handler.
             *
             * A full POSIX implementation would build a signal frame on
             * the user stack and set registers so that iretq delivers
             * the signal; that requires knowing the saved RFLAGS/RIP
             * from the ISR frame.  Until then the handler is only called
             * directly for kernel threads.
             */
            if (!self->is_user) {
                handler(signum);
            } else {
                /* TODO: build user-space signal frame */
                printf("[signal] TODO: deliver signal %d to user task '%s'\n",
                       signum, self->name);
            }
            continue;
        }

        /* SIG_DFL */
        switch (default_action(signum)) {
        case SIG_ACTION_TERM:
        case SIG_ACTION_CORE:
            printf("[signal] task '%s' pid=%d killed by signal %d\n",
                   self->name, self->pid, signum);
            sched_exit(128 + signum);
            break; /* unreachable */
        case SIG_ACTION_STOP:
            self->state = TASK_STOPPED;
            /* Notify parent */
            if (self->parent) task_send_signal(self->parent, SIGCHLD);
            sched_block(TASK_STOPPED);
            break;
        case SIG_ACTION_CONT:
            /* Already running (we were woken to handle this) */
            break;
        case SIG_ACTION_IGN:
            break;
        }
    }
}
/* =====================================================================
* sched_find_task (linear scan — O(n), suitable for small task counts)
* ===================================================================== */
/*
 * Look up a task by PID.
 *
 * Search order: the CFS list, then each RT priority list from
 * RT_PRIO_MIN to RT_PRIO_MAX, then the task currently on the CPU.
 * Returns NULL when none of them carries `pid`.
 *
 * Only these places are searched, and no lock is taken while walking
 * the lists.
 */
task_t *sched_find_task(pid_t pid) {
    task_t *it;

    /* Normal (CFS) tasks */
    for (it = g_runqueue.cfs_head; it != NULL; it = it->rq_next) {
        if (it->pid == pid)
            return it;
    }

    /* Real-time tasks, one FIFO list per priority level */
    for (int level = RT_PRIO_MIN; level <= RT_PRIO_MAX; level++) {
        for (it = g_runqueue.rt.head[level]; it != NULL; it = it->rq_next) {
            if (it->pid == pid)
                return it;
        }
    }

    /* Finally the task that is executing right now */
    task_t *running = g_runqueue.current;
    if (running != NULL && running->pid == pid)
        return running;

    return NULL;
}
/* =====================================================================
* Priority / scheduler controls
* ===================================================================== */
/*
 * Set the nice value of a task.
 *
 * task: task to modify (must not be NULL).
 * nice: requested value; clamped to NICE_MIN .. NICE_MAX.
 *
 * Returns the nice value the task had before the call.
 *
 * The nice value is always recorded.  For SCHED_FIFO / SCHED_RR tasks it
 * is only recorded: their static_prio / prio hold the real-time priority
 * (RT_PRIO_MIN .. RT_PRIO_MAX) and must not be overwritten with a
 * NICE_TO_PRIO() value of 100 or more, which would fall outside the
 * RT_QUEUE_LEVELS-sized RT arrays.  The stored nice takes effect when
 * task_set_scheduler() later moves the task to a non-RT policy.
 */
int task_set_nice(task_t *task, int nice) {
    if (nice < NICE_MIN) nice = NICE_MIN;
    if (nice > NICE_MAX) nice = NICE_MAX;

    int old_nice = task->nice;
    task->nice = nice;

    /* Real-time priority is independent of nice — leave it untouched. */
    if (task->policy == SCHED_FIFO || task->policy == SCHED_RR)
        return old_nice;

    task->static_prio = NICE_TO_PRIO(nice);
    task->prio = task->static_prio;

    /*
     * Recompute the timeslice.  A task that is already on a queue is not
     * re-sorted here; the new priority is picked up the next time it is
     * enqueued.
     */
    task->time_slice = calc_timeslice(task);
    return old_nice;
}
/*
 * Change a task's scheduling policy.
 *
 * policy:  one of SCHED_NORMAL, SCHED_FIFO, SCHED_RR, SCHED_BATCH,
 *          SCHED_IDLE; anything else is rejected.
 * rt_prio: only checked and used for SCHED_FIFO / SCHED_RR, where it
 *          must lie in RT_PRIO_MIN .. RT_PRIO_MAX.
 *
 * Returns 0 on success, -1 on an invalid policy or RT priority.
 *
 * A task that is runnable but not the one currently on the CPU is taken
 * off its queue first and put back afterwards, so it ends up on the
 * queue that matches the new policy.
 */
int task_set_scheduler(task_t *task, int policy, int rt_prio) {
    bool is_rt;

    switch (policy) {
    case SCHED_FIFO:
    case SCHED_RR:
        is_rt = true;
        break;
    case SCHED_NORMAL:
    case SCHED_BATCH:
    case SCHED_IDLE:
        is_rt = false;
        break;
    default:
        return -1;
    }

    if (is_rt && (rt_prio < RT_PRIO_MIN || rt_prio > RT_PRIO_MAX))
        return -1;

    /* Runnable and not executing => it is sitting on a run queue. */
    const bool requeue = (task->state == TASK_RUNNING &&
                          task != g_runqueue.current);

    if (requeue)
        sched_dequeue(task);

    task->policy = policy;
    if (is_rt)
        task->static_prio = rt_prio;
    else
        task->static_prio = NICE_TO_PRIO(task->nice);
    task->prio = task->static_prio;
    task->time_slice = calc_timeslice(task);

    if (requeue)
        sched_enqueue(task);

    return 0;
}
+312
View File
@@ -0,0 +1,312 @@
#pragma once
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include "mm/vmm.h"
#include "mp/spinlock.h"
/* =====================================================================
* POSIX signal numbers
* ===================================================================== */
#define SIGHUP 1
#define SIGINT 2
#define SIGQUIT 3
#define SIGILL 4
#define SIGTRAP 5
#define SIGABRT 6
#define SIGBUS 7
#define SIGFPE 8
#define SIGKILL 9 /* cannot be caught or ignored */
#define SIGUSR1 10
#define SIGSEGV 11
#define SIGUSR2 12
#define SIGPIPE 13
#define SIGALRM 14
#define SIGTERM 15
#define SIGSTKFLT 16
#define SIGCHLD 17
#define SIGCONT 18
#define SIGSTOP 19 /* cannot be caught or ignored */
#define SIGTSTP 20
#define SIGTTIN 21
#define SIGTTOU 22
#define SIGURG 23
#define SIGXCPU 24
#define SIGXFSZ 25
#define SIGVTALRM 26
#define SIGPROF 27
#define SIGWINCH 28
#define SIGIO 29
#define SIGPWR 30
#define SIGSYS 31
#define _NSIG 32
typedef void (*sighandler_t)(int signum);
#define SIG_DFL ((sighandler_t)0) /* default action */
#define SIG_IGN ((sighandler_t)1) /* ignore signal */
#define SIG_ERR ((sighandler_t)-1) /* error return */
#define SA_NOCLDSTOP 0x00000001
#define SA_NOCLDWAIT 0x00000002
#define SA_SIGINFO 0x00000004
#define SA_RESTORER 0x04000000
#define SA_ONSTACK 0x08000000
#define SA_RESTART 0x10000000
#define SA_NODEFER 0x40000000
#define SA_RESETHAND 0x80000000
/*
 * Per-signal disposition, stored in task.sigactions[] and indexed by
 * signal number.
 *
 * NOTE(review): sa_flags is a plain int, but SA_RESETHAND (0x80000000)
 * does not fit in a positive int — confirm whether this should be
 * unsigned.
 */
struct sigaction {
sighandler_t sa_handler; /* SIG_DFL, SIG_IGN or a handler function */
uint64_t sa_mask; /* signals blocked while handler runs */
int sa_flags; /* bitwise OR of the SA_* flags */
};
/* =====================================================================
* Scheduling policies (POSIX)
* ===================================================================== */
#define SCHED_NORMAL 0 /* Fair time-sharing (CFS-like, nice values) */
#define SCHED_FIFO 1 /* Real-time FIFO: runs until it yields or blocks */
#define SCHED_RR 2 /* Real-time round-robin with fixed timeslice */
#define SCHED_BATCH 3 /* CPU-bound variant of NORMAL, no preemption boost */
#define SCHED_IDLE 5 /* Only runs when nothing else is runnable (value 4 is not used) */
/* Priority ranges:
* RT tasks: static_prio 1 .. 99 (1 = highest)
* Normal tasks: static_prio 100 .. 139 (maps from nice -20 .. +19)
* Idle: static_prio 140
*/
#define MAX_RT_PRIO 100
#define MAX_PRIO 140
#define RT_PRIO_MIN 1
#define RT_PRIO_MAX 99
#define NICE_MIN (-20)
#define NICE_MAX 19
#define NICE_DEFAULT 0
#define IDLE_PRIO 140
/* nice ↔ static_prio conversions for SCHED_NORMAL */
#define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20)
#define PRIO_TO_NICE(p) ((p) - MAX_RT_PRIO - 20)
/* Timeslice constants (ticks, PIT at 1000 Hz → 1 tick = 1 ms) */
#define SCHED_BASE_SLICE_MS 10 /* base timeslice for NICE_DEFAULT */
#define SCHED_MIN_SLICE_MS 1 /* minimum timeslice (1 ms) */
#define SCHED_MAX_SLICE_MS 100 /* maximum timeslice (100 ms) */
#define SCHED_RR_SLICE_MS 10 /* fixed timeslice for SCHED_RR */
/* =====================================================================
* Task states
* ===================================================================== */
/*
 * Life-cycle state of a task.  TASK_RUNNING is 0; every other state is a
 * distinct power of two.
 */
typedef enum task_state {
TASK_RUNNING = 0, /* Runnable on a run queue or executing */
TASK_INTERRUPTIBLE = 1, /* Sleeping, can be woken by signal */
TASK_UNINTERRUPTIBLE = 2, /* Sleeping, ignores signals (D state) */
TASK_STOPPED = 4, /* Halted by SIGSTOP / SIGTSTP */
TASK_ZOMBIE = 8, /* Exited, waiting for parent to wait() */
TASK_DEAD = 16, /* Fully reaped, memory can be freed */
} task_state_t;
/* =====================================================================
* Minimal CPU context
*
* Only RSP and CR3 live here; all callee-saved GPRs are pushed onto
* the kernel stack by sched_context_switch() before RSP is saved.
* This keeps the struct tiny and the assembly dead simple.
* ===================================================================== */
struct cpu_context {
uint64_t rsp; /* Saved kernel stack pointer */
uint64_t cr3; /* Physical address of PML4 (0 = stay on kernel map) */
};
/* =====================================================================
* Task / Process descriptor
* ===================================================================== */
typedef int pid_t;
typedef struct task task_t;
/*
 * Descriptor for one schedulable entity (kernel thread or user process).
 * Field order matters at the top: `ctx` has to remain the first member.
 */
struct task {
/* ---- CPU context (must stay first: asm references it at offset 0) */
struct cpu_context ctx;
/* ---- Identity ---------------------------------------------------- */
pid_t pid;
pid_t ppid;
char name[64];
bool is_user; /* true = user process, false = kernel thread */
/* ---- Scheduling policy and priority ------------------------------ */
int policy; /* SCHED_NORMAL / SCHED_FIFO / SCHED_RR / … */
int static_prio; /* Base priority; rewritten by task_set_nice() and task_set_scheduler() */
int nice; /* -20 .. +19, SCHED_NORMAL only */
int prio; /* Effective priority (may be boosted) */
/* ---- State ------------------------------------------------------- */
volatile task_state_t state;
bool need_reschedule; /* Set when a higher-priority task wakes up */
/* ---- Time accounting (ticks, 1 tick = 1 ms at 1000 Hz PIT) ------- */
uint64_t vruntime; /* Virtual runtime (tick-equivalents, weighted) */
uint64_t sum_exec_runtime; /* Total CPU time consumed (raw ticks) */
uint64_t time_slice; /* Remaining timeslice (ticks) */
uint64_t slice_start; /* Tick when the current slice began */
/* ---- Memory ------------------------------------------------------ */
struct pagemap *pagemap; /* NULL → use kernel_pagemap */
void *kernel_stack; /* Pointer to bottom of kernel-stack alloc */
size_t kernel_stack_size;
/* ---- Entry points ------------------------------------------------ */
uint64_t user_entry; /* User-space RIP for user tasks */
uint64_t user_stack_top; /* User-space RSP for user tasks */
void (*kthread_entry)(void *arg); /* Kernel thread entry point */
void *kthread_arg;
/* ---- Signals ----------------------------------------------------- */
/* task_send_signal() records signal N by setting bit N (1ULL << N). */
uint64_t pending_signals; /* Bitmask of unhandled signals */
uint64_t signal_mask; /* Blocked (SIG_BLOCK) signals */
struct sigaction sigactions[_NSIG]; /* Indexed by signal number */
/* ---- Exit status ------------------------------------------------- */
int exit_code; /* Written by sched_exit() */
/* ---- Run-queue linkage (doubly-linked, intrusive) ---------------- */
task_t *rq_next;
task_t *rq_prev;
/* ---- Process tree ------------------------------------------------ */
task_t *parent; /* Receives SIGCHLD when this task exits or stops */
task_t *first_child;
task_t *next_sibling;
};
/* =====================================================================
* Run queue
*
* Two sub-queues per CPU (single CPU for now, MP-ready by design):
*
* 1. RT array — 99 FIFO lists indexed by RT priority. Finding the
*    highest priority with a runnable task is O(1) via a bitmap.
*
* 2. CFS list — tasks sorted by vruntime (ascending). Pick-next
*    is O(1) (front of list); insert is O(n), which is good
*    enough for now; swap in an rb-tree later.
*
* Idle task is stored separately and is returned only when both
* sub-queues are empty.
* ===================================================================== */
#define RT_QUEUE_LEVELS MAX_RT_PRIO /* 100 levels (index 0 unused, 1..99 used) */
/*
 * Per-priority FIFO queues for SCHED_FIFO / SCHED_RR tasks, indexed by
 * RT priority (RT_PRIO_MIN .. RT_PRIO_MAX).
 */
struct rt_prio_array {
/*
 * Bitmap: bit N is set ↔ the queue at level N (head[N] / tail[N]) is
 * non-empty.
 * Two 64-bit words cover 128 bits, enough for 100 levels.
 */
uint64_t bitmap[2];
task_t *head[RT_QUEUE_LEVELS]; /* FIFO queue heads */
task_t *tail[RT_QUEUE_LEVELS]; /* FIFO queue tails */
int total; /* Total RT tasks enqueued */
};
/*
 * Scheduler state for one CPU: the RT queues, the CFS list, the idle
 * task and the task currently executing.
 */
struct runqueue {
/* Taken (with IRQ state saved) by sched_wake() and task_send_signal() */
spinlock_t lock;
/* Real-time (SCHED_FIFO / SCHED_RR) */
struct rt_prio_array rt;
/* Normal (SCHED_NORMAL / SCHED_BATCH) — sorted ascending by vruntime */
task_t *cfs_head;
int cfs_count;
uint64_t min_vruntime; /* Lower bound; new tasks start from here */
/* Idle fallback (SCHED_IDLE) */
task_t *idle;
/* Currently executing task on this CPU */
task_t *current;
/* Statistics */
uint64_t nr_switches;
uint64_t nr_running; /* Total runnable tasks (all classes) */
};
/* =====================================================================
* Globals (single-CPU; extend to per-cpu array for SMP)
* ===================================================================== */
extern struct runqueue g_runqueue;
extern task_t *g_current_task; /* Pointer to currently-running task */
/* =====================================================================
* Public scheduler API
* ===================================================================== */
/* Initialise the scheduler (call after PMM + VMM + PIT are ready) */
void sched_init(void);
/* Create a kernel thread and enqueue it immediately */
task_t *sched_create_kthread(const char *name,
void (*entry)(void *), void *arg);
/* Create a user-space task and enqueue it immediately */
task_t *sched_create_user_task(const char *name,
uint64_t entry_rip, uint64_t user_rsp,
struct pagemap *pm);
/* Add a task to the appropriate run queue */
void sched_enqueue(task_t *task);
/* Remove a task from its run queue (does NOT free it) */
void sched_dequeue(task_t *task);
/* Pick next task and perform context switch (call with IF=0) */
void schedule(void);
/* Called from the PIT/LAPIC timer IRQ every tick (1 ms) */
void sched_tick(void);
/* Voluntarily give up the CPU */
void sched_yield(void);
/* Block current task (IF must be 0 on entry; IF restored by schedule) */
void sched_block(task_state_t reason);
/* Wake a sleeping task (safe to call from IRQ context) */
void sched_wake(task_t *task);
/* Terminate the current task (noreturn) */
void sched_exit(int exit_code) __attribute__((noreturn));
/* ---- Signal API ------------------------------------------------------- */
/* Send signal signum to task (safe from any context) */
int task_send_signal(task_t *task, int signum);
/* Find a task by PID (NULL if not found) */
task_t *sched_find_task(pid_t pid);
/* Process pending signals for the current task (call before returning to user) */
void task_handle_pending_signals(void);
/* ---- Priority / policy control --------------------------------------- */
/* Set nice value [-20, +19] for a SCHED_NORMAL task; returns old nice */
int task_set_nice(task_t *task, int nice);
/* Change scheduling policy + RT priority; returns 0 on success */
int task_set_scheduler(task_t *task, int policy, int rt_prio);
/* ---- Convenience ----------------------------------------------------- */
static inline task_t *sched_current(void) { return g_current_task; }
/* ---- Assembly context switch (defined in sched_switch.S) ------------- */
/*
* Save callee-saved registers of the current context onto its kernel
* stack and record RSP in *from. Then switch to *to's stack, restore
* its callee-saved registers, and return — which resumes wherever *to
* last called schedule(). For first-time tasks the "return" jumps to
* the appropriate trampoline.
*/
void sched_context_switch(struct cpu_context *from,
struct cpu_context *to);
/* Kernel stack size for each task */
#define KSTACK_SIZE (32 * 1024) /* 32 KiB — comfortable headroom */
+95 -1
View File
@@ -3,6 +3,7 @@
#include "mp/percpu.h"
#include "fs/vfs.h"
#include "syscall.h"
#include "sched/scheduler.h"
#define MSR_EFER 0xC0000080
#define MSR_STAR 0xC0000081
@@ -41,7 +42,7 @@ uint64_t syscall_handler(uint64_t num,
uint8_t* buf = (uint8_t*)arg2;
size_t len = (size_t)arg3;
return (uint64_t)VFS_Read(fd, buf, len);
return VFS_Read(fd, buf, len);
}
case SYS_WRITE:
@@ -65,6 +66,99 @@ uint64_t syscall_handler(uint64_t num,
return (uint64_t)VFS_Close(fd);
}
case SYS_GETPID:
return (uint64_t)sched_current()->pid;
case SYS_GETPPID:
return (uint64_t)sched_current()->ppid;
case SYS_EXIT:
case SYS_EXIT_GROUP:
sched_exit((int)arg1);
//noreturn
case SYS_SCHED_YIELD:
sched_yield();
return 0;
case SYS_NICE:
{
int increment = (int)arg1;
int old_nice = sched_current()->nice;
int new_nice = old_nice + increment;
return (uint64_t)task_set_nice(sched_current(), new_nice);
}
case SYS_KILL:
{
pid_t target = (pid_t)arg1;
int sig = (int)arg2;
task_t *t = sched_find_task(target);
if (!t) return (uint64_t)-1;
return (uint64_t)task_send_signal(t, sig);
}
case SYS_SIGACTION:
{
int signum = (int)arg1;
const struct sigaction *act = (const struct sigaction *)arg2;
struct sigaction *oact = (struct sigaction *)arg3;
if (signum <= 0 || signum >= _NSIG)
return (uint64_t)-1;
if (signum == SIGKILL || signum == SIGSTOP)
return (uint64_t)-1; // cannot override
task_t *cur = sched_current();
if (oact)
*oact = cur->sigactions[signum];
if (act)
cur->sigactions[signum] = *act;
return 0;
}
case SYS_SIGPROCMASK:
{
// how: 0=SIG_BLOCK, 1=SIG_UNBLOCK, 2=SIG_SETMASK
int how = (int)arg1;
uint64_t new_set = arg2;
uint64_t *old = (uint64_t *)arg3;
task_t *cur = sched_current();
if (old) *old = cur->signal_mask;
// SIGKILL and SIGSTOP can never be blocked
new_set &= ~((1ULL << SIGKILL) | (1ULL << SIGSTOP));
switch (how) {
case 0: cur->signal_mask |= new_set; break; // SIG_BLOCK
case 1: cur->signal_mask &= ~new_set; break; // SIG_UNBLOCK
case 2: cur->signal_mask = new_set; break; // SIG_SETMASK
default: return (uint64_t)-1;
}
return 0;
}
case SYS_SCHED_GETSCHEDULER:
{
pid_t target = (pid_t)arg1;
task_t *t = target ? sched_find_task(target)
: sched_current();
if (!t) return (uint64_t)-1;
return (uint64_t)t->policy;
}
case SYS_SCHED_SETSCHEDULER:
{
pid_t target = (pid_t)arg1;
int policy = (int)arg2;
int rt_prio = (int)arg3;
task_t *t = target ? sched_find_task(target)
: sched_current();
if (!t) return (uint64_t)-1;
return (uint64_t)task_set_scheduler(t, policy, rt_prio);
}
default:
return (uint64_t)-1;
}
+12
View File
@@ -6,5 +6,17 @@
#define SYS_OPEN 2
#define SYS_CLOSE 3
#define SYS_SCHED_YIELD 24
#define SYS_GETPID 39
#define SYS_GETPPID 110
#define SYS_NICE 34
#define SYS_KILL 62
#define SYS_SIGACTION 13 /* rt_sigaction on Linux */
#define SYS_SIGPROCMASK 14 /* rt_sigprocmask on Linux */
#define SYS_EXIT 60
#define SYS_EXIT_GROUP 231
#define SYS_SCHED_GETSCHEDULER 138
#define SYS_SCHED_SETSCHEDULER 139
void syscall_init(void);
+2
View File
@@ -27,7 +27,9 @@ syscall_entry:
pop %rsi # rsi = a1 (2nd param)
mov %rax, %rdi # rdi = num (1st param)
sub $8, %rsp
call syscall_handler
add $8, %rsp
# Restore user context
pop %r11 # user RFLAGS
Binary file not shown.
BIN
View File
Binary file not shown.
+14 -4
View File
@@ -4,16 +4,26 @@
#define SYS_WRITE 1
#define SYS_OPEN 2
#define SYS_CLOSE 3
#define SYS_SCHED_YIELD 24
static inline long syscall(long num, long a1, long a2, long a3)
static inline long syscall(long num,
unsigned long a1,
unsigned long a2,
unsigned long a3,
unsigned long a4,
unsigned long a5,
unsigned long a6)
{
long ret;
asm volatile (
"mov %4, %%r10\n"
"mov %5, %%r8\n"
"mov %6, %%r9\n"
"syscall"
: "=a"(ret)
: "a"(num), "D"(a1), "S"(a2), "d"(a3)
: "a"(num), "D"(a1), "S"(a2), "d"(a3),
"r"(a4), "r"(a5), "r"(a6)
: "r10", "r8", "r9", "rcx", "r11", "memory"
);
return ret;
}
+10 -10
View File
@@ -15,31 +15,31 @@ unsigned strlen(const char* str)
void main()
{
const char* path = "/qwerty.txt";
const char* msg = "Suki Suki Daisuki Kekkon Shiyo, my honey!";
const unsigned char* path = "/qwerty.txt";
const unsigned char* msg = "Suki Suki Daisuki Kekkon Shiyo, my honey!";
char buf[128];
unsigned char buf[128];
// ── open file ─────────────────────────────
long fd = syscall(SYS_OPEN, (long)path, 0, 0);
unsigned long fd = syscall(SYS_OPEN, (long)path, 0, 0, 0, 0, 0);
// ── write message ─────────────────────────
syscall(SYS_WRITE, fd, (long)msg, strlen(msg));
syscall(SYS_WRITE, fd, (long)msg, strlen(msg), 0, 0, 0);
// ── close ────────────────────────────────
syscall(SYS_CLOSE, fd, 0, 0);
syscall(SYS_CLOSE, fd, 0, 0, 0, 0, 0);
// ── reopen ───────────────────────────────
fd = syscall(SYS_OPEN, (long)path, 0, 0);
fd = syscall(SYS_OPEN, (long)path, 0, 0, 0, 0, 0);
// ── read into buffer ─────────────────────
long n = syscall(SYS_READ, fd, (long)buf, sizeof(buf));
unsigned long n = syscall(SYS_READ, fd, (unsigned long)buf, sizeof(buf), 0, 0, 0);
// ── close ────────────────────────────────
syscall(SYS_CLOSE, fd, 0, 0);
syscall(SYS_CLOSE, fd, 0, 0, 0, 0, 0);
// ── print buffer to stdout ───────────────
syscall(SYS_WRITE, STDOUT, (long)buf, n);
syscall(SYS_WRITE, STDOUT, (unsigned long)buf, n, 0, 0, 0);
// ── done ────────────────────────────────
while (1);