OpenBSD-4.6/sys/kern/kern_sched.c
/* $OpenBSD: kern_sched.c,v 1.13 2009/04/22 08:35:54 art Exp $ */
/*
* Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <sys/param.h>
#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/mutex.h>
#include <machine/atomic.h>
#include <uvm/uvm_extern.h>
#include <sys/malloc.h>
/* Deferred kthread callback: forks the idle proc for the cpu passed as arg. */
void sched_kthreads_create(void *);
/* Body of the per-cpu idle proc; the argument is that cpu's cpu_info. */
void sched_idle(void *);
/* Estimated cost of running proc p on cpu ci; lower means more attractive. */
int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
/* Take a runnable, non-pegged proc off another cpu's run queue, or NULL. */
struct proc *sched_steal_proc(struct cpu_info *);
/*
* To help choose which cpu should run which process, we keep track
* of which cpus are currently idle and which cpus have processes
* queued.
*/
/* cpus whose idle proc is in (or entering) its idle loop; see sched_idle(). */
struct cpuset sched_idle_cpus;
/* cpus with a non-empty run queue; maintained by setrunqueue()/remrunqueue(). */
struct cpuset sched_queued_cpus;
/*
* A few notes about cpu_switchto that is implemented in MD code.
*
* cpu_switchto takes two arguments, the old proc and the proc
* it should switch to. The new proc will never be NULL, so we always have
* a saved state that we need to switch to. The old proc however can
* be NULL if the process is exiting. NULL for the old proc simply
* means "don't bother saving old state".
*
* cpu_switchto is supposed to atomically load the new state of the process,
* including the pcb and pmap, and to set curproc, the p_cpu pointer in the
* proc, and p_stat to SONPROC. "Atomically" here means with respect to
* interrupts only; other cpus in the system must not depend on this state
* being consistent. Therefore no locking is necessary in cpu_switchto other
* than blocking interrupts during the context switch.
*/
/*
 * Set up the per-cpu scheduler state of `ci'.
 *
 * main() calls this for the boot cpu; MD code is responsible for calling
 * it for every other cpu.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
	struct schedstate_percpu *state = &ci->ci_schedstate;
	int q = 0;

	/* Every run queue starts out empty. */
	while (q < SCHED_NQS) {
		TAILQ_INIT(&state->spc_qs[q]);
		q++;
	}

	/*
	 * There is no idle proc yet; sched_kthreads_create() is registered
	 * as a deferred kthread callback and forks it later.
	 */
	state->spc_idleproc = NULL;
	kthread_create_deferred(sched_kthreads_create, ci);

	/* Exited procs waiting to be handed on by the idle proc. */
	LIST_INIT(&state->spc_deadproc);

	/*
	 * Slight hack: register the cpu with the cpuset code from here
	 * until cpusets deal with cpu_info structures themselves.
	 */
	cpuset_init_cpu(ci);
}
void
sched_kthreads_create(void *v)
{
struct cpu_info *ci = v;
struct schedstate_percpu *spc = &ci->ci_schedstate;
static int num;
if (kthread_create(sched_idle, ci, &spc->spc_idleproc, "idle%d", num))
panic("fork idle");
num++;
}
/*
* Body of the per-cpu idle proc; `v' is the cpu_info of the cpu this
* idle proc belongs to. Never returns.
*
* After a one-time setup pass the proc loops forever: whenever the cpu
* has work it switches away and, each time it gets to run again, hands
* the procs on the per-cpu dead list to exit2(); when the cpu has no
* work it advertises itself in sched_idle_cpus and spins in
* cpu_idle_cycle() until a run queue becomes non-empty.
*/
void
sched_idle(void *v)
{
struct schedstate_percpu *spc;
struct proc *p = curproc;
struct cpu_info *ci = v;
int s;
KERNEL_PROC_UNLOCK(p);
spc = &ci->ci_schedstate;
/*
* First time we enter here, we're not supposed to idle,
* just go away for a while.
*
* Before switching away, bind this proc to its cpu: record the cpu
* in p_cpu and set P_CPUPEG so sched_choosecpu() and
* sched_steal_proc() never move it.
*/
SCHED_LOCK(s);
cpuset_add(&sched_idle_cpus, ci);
p->p_stat = SSLEEP;
p->p_cpu = ci;
atomic_setbits_int(&p->p_flag, P_CPUPEG);
mi_switch();
cpuset_del(&sched_idle_cpus, ci);
SCHED_UNLOCK(s);
/* From here on we must be running on our own cpu as its idle proc. */
KASSERT(ci == curcpu());
KASSERT(curproc == spc->spc_idleproc);
while (1) {
/* There is work for this cpu: get out of the way. */
while (!curcpu_is_idle()) {
struct proc *dead;
SCHED_LOCK(s);
p->p_stat = SSLEEP;
mi_switch();
SCHED_UNLOCK(s);
/*
* Back on the cpu: drain the list that sched_exit() fills.
* The dead procs are linked through p_hash.
*/
while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
LIST_REMOVE(dead, p_hash);
exit2(dead);
}
}
splassert(IPL_NONE);
/*
* Nothing to run: mark the cpu idle and cycle until some run queue
* bit shows up in spc_whichqs (set by setrunqueue()).
*/
cpuset_add(&sched_idle_cpus, ci);
cpu_idle_enter();
while (spc->spc_whichqs == 0)
cpu_idle_cycle();
cpu_idle_leave();
cpuset_del(&sched_idle_cpus, ci);
}
}
/*
* To free our address space we have to jump through a few hoops.
* The freeing is done by the reaper, but until we have one reaper
* per cpu, we have no way of putting this proc on the deadproc list
* and waking up the reaper without risking having our address space and
* stack torn from under us before we manage to switch to another proc.
* Therefore we have a per-cpu list of dead processes where we put this
* proc and have idle clean up that list and move it to the reaper list.
* All this will be unnecessary once we can bind the reaper to this cpu
* and not risk having it switch to another in case it sleeps.
*/
void
sched_exit(struct proc *p)
{
struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
struct timeval tv;
struct proc *idle;
int s;
/*
* Final run-time accounting: add the time elapsed since spc_runtime
* to the proc's accumulated p_rtime.
*/
microuptime(&tv);
timersub(&tv, &spc->spc_runtime, &tv);
timeradd(&p->p_rtime, &tv, &p->p_rtime);
/*
* Queue ourselves on this cpu's dead list (linked through p_hash);
* sched_idle() drains the list and calls exit2() on each entry.
*/
LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);
#ifdef MULTIPROCESSOR
/* The kernel lock must already have been released by the caller. */
KASSERT(__mp_lock_held(&kernel_lock) == 0);
#endif
/*
* Switch straight to the idle proc. Passing NULL as the old proc
* tells cpu_switchto not to save our state, so we never come back.
* The scheduler lock taken here is not released in this function;
* the idle proc resumes after its own mi_switch() call in
* sched_idle(), which is followed by SCHED_UNLOCK().
*/
SCHED_LOCK(s);
idle = spc->spc_idleproc;
idle->p_stat = SRUN;
cpu_switchto(NULL, idle);
panic("cpu_switchto returned");
}
/*
* Run queue management.
*/
/*
 * One-time initialization of the global run queue state. The only global
 * piece is the scheduler lock, which exists on MULTIPROCESSOR kernels
 * only; the per-cpu queues are set up by sched_init_cpu().
 */
void
sched_init_runqueues(void)
{
#if defined(MULTIPROCESSOR)
	__mp_lock_init(&sched_lock);
#endif /* MULTIPROCESSOR */
}
/*
 * Make `p' runnable on the cpu recorded in p->p_cpu.
 *
 * The proc is appended to the run queue selected by its priority (four
 * consecutive priority values share one queue), the matching bit in
 * spc_whichqs is set and the cpu is marked as having procs queued. If
 * the target cpu is currently in sched_idle_cpus, cpu_unidle() is called
 * on it so it notices the new work.
 *
 * The scheduler lock must be held by the caller.
 */
void
setrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();

	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun++;

	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
	/*
	 * Use an unsigned 1: left-shifting a signed 1 into the sign bit
	 * (queue 31) is undefined behavior.
	 */
	spc->spc_whichqs |= (1U << queue);
	cpuset_add(&sched_queued_cpus, p->p_cpu);

	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
		cpu_unidle(p->p_cpu);
}
/*
 * Take `p' off the run queue of the cpu recorded in p->p_cpu.
 *
 * The queue is derived from the proc's priority exactly as in
 * setrunqueue(), so p_priority and p_cpu must not have changed since
 * the proc was queued. When the queue becomes empty its bit in
 * spc_whichqs is cleared, and when no queue is left the cpu is removed
 * from sched_queued_cpus.
 *
 * The scheduler lock must be held by the caller.
 */
void
remrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();

	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun--;

	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
		/*
		 * Use an unsigned 1: left-shifting a signed 1 into the
		 * sign bit (queue 31) is undefined behavior.
		 */
		spc->spc_whichqs &= ~(1U << queue);
		if (spc->spc_whichqs == 0)
			cpuset_del(&sched_queued_cpus, p->p_cpu);
	}
}
/*
* Pick the next proc to run on the current cpu. The scheduler lock must
* be held by the caller.
*
* Selection order:
*  1. the head of the lowest-numbered non-empty local run queue;
*  2. otherwise a proc stolen from another cpu by sched_steal_proc();
*  3. otherwise this cpu's idle proc, which is marked SRUN.
*
* The returned proc is never NULL and has already been removed from any
* run queue it was on.
*/
struct proc *
sched_chooseproc(void)
{
struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
struct proc *p;
int queue;
SCHED_ASSERT_LOCKED();
again:
if (spc->spc_whichqs) {
/* ffs() finds the lowest set bit, i.e. the lowest-numbered queue. */
queue = ffs(spc->spc_whichqs) - 1;
p = TAILQ_FIRST(&spc->spc_qs[queue]);
remrunqueue(p);
} else if ((p = sched_steal_proc(curcpu())) == NULL) {
p = spc->spc_idleproc;
if (p == NULL) {
int s;
/*
* We get here if someone decides to switch during
* boot before forking kthreads, bleh.
* This is kind of like a stupid idle loop.
*
* Drop the scheduler lock and lower the spl so that
* whatever will eventually queue a proc can make progress,
* wait briefly, then retake the lock and retry.
* NOTE(review): the value SCHED_LOCK() stores in `s' is
* never used afterwards in this function.
*/
#ifdef MULTIPROCESSOR
__mp_unlock(&sched_lock);
#endif
spl0();
delay(10);
SCHED_LOCK(s);
goto again;
}
KASSERT(p);
p->p_stat = SRUN;
}
return (p);
}
/*
* Scheduler statistics counters, incremented with plain ++ at the places
* noted below.
*/
/* sched_choosecpu(): the full search picked a cpu other than p->p_cpu. */
uint64_t sched_nmigrations;
/* Not referenced by the code visible in this file. */
uint64_t sched_noidle;
/* sched_steal_proc(): procs taken from another cpu's run queue. */
uint64_t sched_stolen;
/* sched_choosecpu(): calls made for procs that are not pegged. */
uint64_t sched_choose;
/* sched_choosecpu(): the proc stayed on its cpu via the fast path. */
uint64_t sched_wasidle;
/* sched_choosecpu(): the full search picked the proc's current cpu. */
uint64_t sched_nomigrations;
struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
struct cpu_info *choice = NULL;
fixpt_t load, best_load = ~0;
int run, best_run = INT_MAX;
struct cpu_info *ci;
struct cpuset set;
#if 0
/*
* XXX
* Don't do this until we have a painless way to move the cpu in exec.
* Preferably when nuking the old pmap and getting a new one on a
* new cpu.
*/
/*
* PPWAIT forks are simple. We know that the parent will not
* run until we exec and choose another cpu, so we just steal its
* cpu.
*/
if (flags & FORK_PPWAIT)
return (parent->p_cpu);
#endif
/*
* Look at all cpus that are currently idle and have nothing queued.
* If there are none, pick the one with least queued procs first,
* then the one with lowest load average.
*/
cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
if (cpuset_first(&set) == NULL)
cpuset_add_all(&set);
while ((ci = cpuset_first(&set)) != NULL) {
cpuset_del(&set, ci);
load = ci->ci_schedstate.spc_ldavg;
run = ci->ci_schedstate.spc_nrun;
if (choice == NULL || run < best_run ||
(run == best_run &&load < best_load)) {
choice = ci;
best_load = load;
best_run = run;
}
}
return (choice);
}
struct cpu_info *
sched_choosecpu(struct proc *p)
{
struct cpu_info *choice = NULL;
int last_cost = INT_MAX;
struct cpu_info *ci;
struct cpuset set;
/*
* If pegged to a cpu, don't allow it to move.
*/
if (p->p_flag & P_CPUPEG)
return (p->p_cpu);
sched_choose++;
/*
* Look at all cpus that are currently idle and have nothing queued.
* If there are none, pick the cheapest of those.
* (idle + queued could mean that the cpu is handling an interrupt
* at this moment and haven't had time to leave idle yet).
*/
cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
/*
* First, just check if our current cpu is in that set, if it is,
* this is simple.
* Also, our cpu might not be idle, but if it's the current cpu
* and it has nothing else queued and we're curproc, take it.
*/
if (cpuset_isset(&set, p->p_cpu) ||
(p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
curproc == p)) {
sched_wasidle++;
return (p->p_cpu);
}
if (cpuset_first(&set) == NULL)
cpuset_add_all(&set);
while ((ci = cpuset_first(&set)) != NULL) {
int cost = sched_proc_to_cpu_cost(ci, p);
if (choice == NULL || cost < last_cost) {
choice = ci;
last_cost = cost;
}
cpuset_del(&set, ci);
}
if (p->p_cpu != choice)
sched_nmigrations++;
else
sched_nomigrations++;
return (choice);
}
/*
 * Attempt to steal a proc from some cpu.
 *
 * Every cpu that currently has procs queued is inspected, but only its
 * lowest-numbered non-empty run queue is looked at. Of the procs found
 * there that are not pegged (P_CPUPEG), the one that is cheapest to run
 * on `self' according to sched_proc_to_cpu_cost() is taken off its run
 * queue and reassigned to `self'.
 *
 * Returns the stolen proc, with p_cpu already pointing at `self', or
 * NULL if nothing could be stolen. remrunqueue() asserts the scheduler
 * lock, so the caller must hold it.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct proc *best = NULL;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/* Walk a private copy so the global set is left untouched. */
	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		/* ffs() yields the lowest set bit: the lowest-numbered queue. */
		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);
			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}

	if (best == NULL)
		return (NULL);

	/*
	 * remrunqueue() locates the queue through best->p_cpu, so the proc
	 * must be dequeued before it is reassigned to us.
	 */
	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;

	return (best);
}
/*
 * Integer base 2 logarithm, rounded down: the position of the highest
 * set bit of `i'. By convention 0 is returned for an input of 0.
 */
static int
log2(unsigned int i)
{
	int bits;

	/* Count how many times i can be halved before it drops to 1 or 0. */
	for (bits = 0; i > 1; i >>= 1)
		bits++;

	return (bits);
}
/*
* Calculate the cost of moving the proc to this cpu.
*
* What we want is some guesstimate of how much "performance" it will
* cost us to move the proc here. Not just for caches and TLBs and NUMA
* memory, but also for the proc itself. A highly loaded cpu might not
* be the best candidate for this proc since it won't get run.
*
* Just total guesstimates for now.
*/
/*
* Weights used by sched_proc_to_cpu_cost().
*/
/* Multiplier for the destination's load average (spc_ldavg >> FSHIFT). */
int sched_cost_load = 1;
/* Multiplier for (p_priority - spc_curpriority) when the cpu is not idle. */
int sched_cost_priority = 1;
/* Added once if the cpu is not idle, plus once per proc queued on it. */
int sched_cost_runnable = 3;
/* Discount per log2(resident count) when the proc is already on the cpu. */
int sched_cost_resident = 1;
/*
 * Return the estimated cost of running proc `p' on cpu `ci'; the lower
 * the value, the more attractive the cpu. The result may be negative.
 */
int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	struct schedstate_percpu *dest = &ci->ci_schedstate;
	int total = 0;

	/*
	 * A busy destination costs a flat "runnable" charge plus the
	 * priority difference between the proc and whatever the cpu is
	 * running now. The difference is negative when p_priority is
	 * numerically lower than the cpu's spc_curpriority.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		total += (p->p_priority - dest->spc_curpriority) *
		    sched_cost_priority;
		total += sched_cost_runnable;
	}

	/* Each proc already queued on the destination adds to the cost. */
	if (cpuset_isset(&sched_queued_cpus, ci))
		total += dest->spc_nrun * sched_cost_runnable;

	/* Higher load on the destination means we don't want to go there. */
	total += ((sched_cost_load * dest->spc_ldavg) >> FSHIFT);

	/*
	 * A proc that is already on this cpu and has not been sleeping
	 * gets a discount that grows with the log2 of its pmap's resident
	 * count, as an estimate of its footprint there.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		int footprint;

		footprint =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		total -= footprint * sched_cost_resident;
	}

	return (total);
}
/*
* Peg a proc to a cpu.
*
* Binds curproc to `ci': the proc is flagged P_CPUPEG, which makes
* sched_choosecpu() and sched_steal_proc() leave it alone, put on ci's
* run queue and switched away from. NOTE(review): presumably the proc is
* running on `ci' by the time this function returns - confirm against
* mi_switch().
*/
void
sched_peg_curproc(struct cpu_info *ci)
{
struct proc *p = curproc;
int s;
SCHED_LOCK(s);
/*
* setrunqueue() derives both the queue and the target cpu from the
* proc, so p_priority and p_cpu have to be set before it is called.
*/
p->p_priority = p->p_usrpri;
p->p_stat = SRUN;
p->p_cpu = ci;
atomic_setbits_int(&p->p_flag, P_CPUPEG);
setrunqueue(p);
/* Count this as a voluntary context switch, then give up the cpu. */
p->p_stats->p_ru.ru_nvcsw++;
mi_switch();
SCHED_UNLOCK(s);
}
/*
* Functions to manipulate cpu sets.
*/
/* Maps a cpu unit number (CPU_INFO_UNIT) to its cpu_info; see cpuset_init_cpu(). */
struct cpu_info *cpuset_infos[MAXCPUS];
/* Set of every cpu registered through cpuset_init_cpu(). */
static struct cpuset cpuset_all;
/*
 * Register `ci' with the cpuset code: add it to the set of all cpus and
 * record it in cpuset_infos[] under its unit number, so that
 * cpuset_first() can map a bit position back to the cpu_info.
 */
void
cpuset_init_cpu(struct cpu_info *ci)
{
	struct cpu_info **slot;

	cpuset_add(&cpuset_all, ci);

	slot = &cpuset_infos[CPU_INFO_UNIT(ci)];
	*slot = ci;
}
/*
 * Empty the set `cs' by zeroing the whole structure.
 */
void
cpuset_clear(struct cpuset *cs)
{
	memset(cs, 0, sizeof(struct cpuset));
}
/*
 * XXX - implement it on SP architectures too
 *
 * Fallback for ports whose MD headers do not provide CPU_INFO_UNIT():
 * treat every cpu_info as unit 0. This has to be a function-like macro
 * because every user in this file invokes it as CPU_INFO_UNIT(ci); an
 * object-like "0" would expand to the invalid expression "0(ci)".
 *
 * NOTE(review): cpuset_init_cpu() above already uses CPU_INFO_UNIT()
 * before this fallback is defined; moving the fallback ahead of it
 * would make it cover that use as well.
 */
#ifndef CPU_INFO_UNIT
#define CPU_INFO_UNIT(ci) 0
#endif
/*
 * Add cpu `ci' to the set `cs'.
 *
 * The cpu's unit number selects one bit: word num/32, bit num%32. The
 * bit is set with atomic_setbits_int().
 */
void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);

	/* 1U: shifting a signed 1 into bit 31 would be undefined behavior. */
	atomic_setbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
}
/*
 * Remove cpu `ci' from the set `cs'.
 *
 * The cpu's unit number selects one bit: word num/32, bit num%32. The
 * bit is cleared with atomic_clearbits_int().
 */
void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);

	/* 1U: shifting a signed 1 into bit 31 would be undefined behavior. */
	atomic_clearbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
}
/*
 * Test whether cpu `ci' is a member of the set `cs'.
 *
 * Returns non-zero if it is and 0 if it is not. The non-zero value is
 * the cpu's raw bit mask, not necessarily 1, so callers should only
 * test it for truth.
 */
int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);

	/* 1U: shifting a signed 1 into bit 31 would be undefined behavior. */
	return (cs->cs_set[num/32] & (1U << (num % 32)));
}
void
cpuset_add_all(struct cpuset *cs)
{
cpuset_copy(cs, &cpuset_all);
}
/*
 * Copy the set `from' into `to' with a plain memcpy() of the whole
 * structure.
 */
void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
	memcpy(to, from, sizeof(struct cpuset));
}
/*
 * Return the cpu_info of the lowest-numbered cpu in `cs', or NULL if no
 * bit is set in the words covering ncpus. The bit position is mapped
 * back to a cpu_info through cpuset_infos[].
 */
struct cpu_info *
cpuset_first(struct cpuset *cs)
{
	int word = 0;

	while (word < CPUSET_ASIZE(ncpus)) {
		if (cs->cs_set[word] != 0) {
			/* ffs() is 1-based, hence the -1. */
			return (cpuset_infos[word * 32 +
			    ffs(cs->cs_set[word]) - 1]);
		}
		word++;
	}

	return (NULL);
}
/*
 * Store in `to' the union of `a' and `b': every cpu that is in either
 * set. Only the words needed to cover ncpus are written.
 */
void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int word = 0;

	while (word < CPUSET_ASIZE(ncpus)) {
		to->cs_set[word] = a->cs_set[word] | b->cs_set[word];
		word++;
	}
}
/*
 * Store in `to' the intersection of `a' and `b': every cpu that is in
 * both sets. Only the words needed to cover ncpus are written.
 */
void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int word = 0;

	while (word < CPUSET_ASIZE(ncpus)) {
		to->cs_set[word] = a->cs_set[word] & b->cs_set[word];
		word++;
	}
}
/*
 * Store in `to' every cpu that is in `b' but not in `a' (b & ~a). Note
 * the argument order: the set being subtracted comes first. Only the
 * words needed to cover ncpus are written.
 */
void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int word = 0;

	while (word < CPUSET_ASIZE(ncpus)) {
		to->cs_set[word] = b->cs_set[word] & ~a->cs_set[word];
		word++;
	}
}