有了上述的基本策略和基本调度结构体,我们可以形成大致的骨架,下面就是需要核心的调度流程将其拼凑成一个整体,实现调度系统。调度分为两种,主动调度和抢占式调度。
- 主动调度即任务执行一定时间以后主动让出CPU,通过调度策略选择合适的下一个任务执行。
- 抢占式调度即任务执行中收到了其他任务的中断,由此停止执行并切换至下一个任务。
主动调度
说到调用,逃不过核心函数schedule()。其中sched_submit_work()函数完成当前任务的收尾工作,以避免出现如死锁或者IO中断等情况。之后首先禁止抢占式调度的发生,然后调用__schedule()函数完成调度,之后重新打开抢占式调度,如果需要重新调度则会一直重复该过程,否则结束函数。
asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current;
sched_submit_work(tsk);
do {
preempt_disable();
__schedule(false);
sched_preempt_enable_no_resched();
} while (need_resched());
}
EXPORT_SYMBOL(schedule);
而__schedule()函数则是实际的核心调度函数,该函数主要操作包括选取下一进程和进行上下文切换,而上下文切换又包括用户态空间切换和内核态的切换。具体的解释可以参照英文源码注释以及中文对各个步骤的注释。
/*
* __schedule() is the main scheduler function.
* The main means of driving the scheduler and thus entering this function are:
* 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
* 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
* paths. For example, see arch/x86/entry_64.S.
* To drive preemption between tasks, the scheduler sets the flag in timer
* interrupt handler scheduler_tick().
* 3. Wakeups don't really cause entry into schedule(). They add a
* task to the run-queue and that's it.
* Now, if the new task added to the run-queue preempts the current
* task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
* called on the nearest possible occasion:
* - If the kernel is preemptible (CONFIG_PREEMPT=y):
* - in syscall or exception context, at the next outmost
* preempt_enable(). (this might be as soon as the wake_up()'s
* spin_unlock()!)
* - in IRQ context, return from interrupt-handler to
* preemptible context
* - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
* then at the next:
* - cond_resched() call
* - explicit schedule() call
* - return from syscall or exception to user-space
* - return from interrupt-handler to user-space
* WARNING: must be called with preemption disabled!
*/
static void __sched notrace __schedule(bool preempt)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
struct rq_flags rf;
struct rq *rq;
int cpu;
//从当前的CPU中取出任务队列rq,prev赋值为当前任务
cpu = smp_processor_id();
rq = cpu_rq(cpu);
prev = rq->curr;
//检测当前任务是否可以调度
schedule_debug(prev);
if (sched_feat(HRTICK))
hrtick_clear(rq);
//禁止中断,RCU抢占关闭,队列加锁,SMP加锁
local_irq_disable();
rcu_note_context_switch(preempt);
/*
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
*
* The membarrier system call requires a full memory barrier
* after coming from user-space, before storing to rq->curr.
*/
rq_lock(rq, &rf);
smp_mb__after_spinlock();
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
update_rq_clock(rq);
switch_count = &prev->nivcsw;
if (!preempt && prev->state) {
//不可中断的任务则继续执行
if (signal_pending_state(prev->state, prev)) {
prev->state = TASK_RUNNING;
} else {
//当前任务从队列rq中出队,on_rq设置为0,如果存在I/O未完成则延时完成
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
prev->on_rq = 0;
if (prev->in_iowait) {
atomic_inc(&rq->nr_iowait);
delayacct_blkio_start();
}
/* 唤醒睡眠进程
* If a worker went to sleep, notify and ask workqueue
* whether it wants to wake up a task to maintain
* concurrency.
*/
if (prev->flags & PF_WQ_WORKER) {
struct task_struct *to_wakeup;
to_wakeup = wq_worker_sleeping(prev);
if (to_wakeup)
try_to_wake_up_local(to_wakeup, &rf);
}
}
switch_count = &prev->nvcsw;
}
// 调用pick_next_task获取下一个任务,赋值给next
next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
// 如果产生了任务切换,则需要切换上下文
if (likely(prev != next)) {
rq->nr_switches++;
rq->curr = next;
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
* rq->curr, before returning to user-space.
*
* Here are the schemes providing that barrier on the
* various architectures:
* - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
* switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
* - finish_lock_switch() for weakly-ordered
* architectures where spin_unlock is a full barrier,
* - switch_to() for arm64 (weakly-ordered, spin_unlock
* is a RELEASE barrier),
*/
++*switch_count;
trace_sched_switch(preempt, prev, next);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
// 清除标记位,重开中断
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
rq_unlock_irq(rq, &rf);
}
//队列自平衡:红黑树平衡操作
balance_callback(rq);
}
其中核心函数是获取下一个任务的pick_next_task()以及上下文切换的context_switch(),下面详细展开剖析。首先看看pick_next_task(),该函数会根据调度策略分类,调用该类对应的调度函数选择下一个任务实体。根据前文分析我们知道,最终是在不同的红黑树上选择最左节点作为下一个任务实体并返回。
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
/* 这里做了一个优化:如果是普通调度策略则直接调用fair_sched_class中的pick_next_task
* Optimization: we know that if all tasks are in the fair class we can
* call that function directly, but only if the @prev task wasn't of a
* higher scheduling class, because otherwise those loose the
* opportunity to pull in more work from other CPUs.
*/
if (likely((prev->sched_class == &idle_sched_class ||
prev->sched_class == &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = fair_sched_class.pick_next_task(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto again;
/* Assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
p = idle_sched_class.pick_next_task(rq, prev, rf);
return p;
}
again:
//依次调用类中的选择函数,如果正确选择到下一个任务则返回
for_each_class(class) {
p = class->pick_next_task(rq, prev, rf);
if (p) {
if (unlikely(p == RETRY_TASK))
goto again;
return p;
}
}
/* The idle class should always have a runnable task: */
BUG();
}
下面来看看上下文切换。上下文切换主要干两件事情,一是切换任务空间,也即虚拟内存;二是切换寄存器和 CPU 上下文。关于任务空间的切换放在内存部分的文章中详细介绍,这里先按下不表,通过任务空间切换实际完成了用户态的上下文切换工作。下面我们重点看一下内核态切换,即寄存器和CPU上下文的切换。
/*
* context_switch - switch to the new MM and the new thread's register state.
*/
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
struct mm_struct *mm, *oldmm;
prepare_task_switch(rq, prev, next);
mm = next->mm;
oldmm = prev->active_mm;
/*
* For paravirt, this is coupled with an exit in switch_to to
* combine the page table reload and the switch backend into
* one hypercall.
*/
arch_start_context_switch(prev);
/*
* If mm is non-NULL, we pass through switch_mm(). If mm is
* NULL, we will pass through mmdrop() in finish_task_switch().
* Both of these contain the full memory barrier required by
* membarrier after storing to rq->curr, before returning to
* user-space.
*/
if (!mm) {
next->active_mm = oldmm;
mmgrab(oldmm);
enter_lazy_tlb(oldmm, next);
} else
switch_mm_irqs_off(oldmm, mm, next);
if (!prev->mm) {
prev->active_mm = NULL;
rq->prev_mm = oldmm;
}
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
prepare_lock_switch(rq, next, rf);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
//barrier 语句是一个编译器指令,用于保证 switch_to 和 finish_task_switch 的执行顺序不会因为编译阶段优化而改变
barrier();
return finish_task_switch(prev);
}
switch_to()就是寄存器和栈的切换,它调用到了 __switch_to_asm。这是一段汇编代码,主要用于栈的切换, 其中32位使用esp作为栈顶指针,64位使用rsp,其他部分代码一致。通过该段汇编代码我们完成了栈顶指针的切换,并调用__switch_to完成最终TSS的切换。注意switch_to中其实是有三个变量,分别是prev, next, last,而实际在使用时,我们会对last也赋值为prev。这里的设计意图需要结合一个例子来说明。假设有ABC三个任务,从A调度到B,B到C,最后C回到A,我们假设仅保存prev和next,则流程如下
- A保存内核栈和寄存器,切换至B,此时prev = A, next = B,该状态会保存在栈里,等下次调用A的时候再恢复。然后调用B的finish_task_switch()继续执行下去,返回B的队列rq,
- B保存内核栈和寄存器,切换至C
- C保存内核栈和寄存器,切换至A。A从barrier()开始运行,而A从步骤1中保存的prev = A, next = B则完美的避开了C,丢失了C的信息。因此last指针的重要性就出现了。在执行完__switch_to_asm后,A的内核栈和寄存器重新覆盖了prev和next,但是我们通过返回值提供了C的内存地址,保存在last中,在finish_task_switch中完成清理工作。
#define switch_to(prev, next, last) \
do { \
prepare_switch_to(next); \
\
((last) = __switch_to_asm((prev), (next))); \
} while (0)
/*
* %eax: prev task
* %edx: next task
*/
ENTRY(__switch_to_asm)
......
/* switch stack */
movl %esp, TASK_threadsp(%eax)
movl TASK_threadsp(%edx), %esp
......
jmp __switch_to
END(__switch_to_asm)
最终调用__switch_to()函数。该函数中涉及到一个结构体TSS(Task State Segment),该结构体存放了所有的寄存器。另外还有一个特殊的寄存器TR(Task Register)会指向TSS,我们通过更改TR的值,会触发硬件保存CPU所有寄存器在当前TSS,并从新的TSS读取寄存器的值加载入CPU,从而完成一次硬中断带来的上下文切换工作。系统初始化的时候,会调用 cpu_init()给每一个 CPU 关联一个 TSS,然后将 TR 指向这个 TSS,然后在操作系统的运行过程中,TR 就不切换了,永远指向这个 TSS。当修改TR的值得时候,则为任务调度。
/*
* switch_to(x,y) should switch tasks from x to y.
*
* We fsave/fwait so that an exception goes off at the right time
* (as a call from the fsave or fwait in effect) rather than to
* the wrong process. Lazy FP saving no longer makes any sense
* with modern CPU's, and this simplifies a lot of things (SMP
* and UP become the same).
*
* NOTE! We used to use the x86 hardware context switching. The
* reason for not using it any more becomes apparent when you
* try to recover gracefully from saved state that is no longer
* valid (stale segment register values in particular). With the
* hardware task-switch, there is no way to fix up bad state in
* a reasonable manner.
*
* The fact that Intel documents the hardware task-switching to
* be slow is a fairly red herring - this code is not noticeably
* faster. However, there _is_ some room for improvement here,
* so the performance issues may eventually be a valid point.
* More important, however, is the fact that this allows us much
* more flexibility.
*
* The return value (in %ax) will be the "prev" task after
* the task-switch, and shows up in ret_from_fork in entry.S,
* for example.
*/
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
switch_fpu_prepare(prev_fpu, cpu);
/*
* Save away %gs. No need to save %fs, as it was saved on the
* stack on entry. No need to save %es and %ds, as those are
* always kernel segments while inside the kernel. Doing this
* before setting the new TLS descriptors avoids the situation
* where we temporarily have non-reloadable segments in %fs
* and %gs. This could be an issue if the NMI handler ever
* used %fs or %gs (it does not today), or if the kernel is
* running inside of a hypervisor layer.
*/
lazy_save_gs(prev->gs);
/*
* Load the per-thread Thread-Local Storage descriptor.
*/
load_TLS(next, cpu);
/*
* Restore IOPL if needed. In normal use, the flags restore
* in the switch assembly will handle this. But if the kernel
* is running virtualized at a non-zero CPL, the popf will
* not restore flags, so it must be done in a separate step.
*/
if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
set_iopl_mask(next->iopl);
switch_to_extra(prev_p, next_p);
/*
* Leave lazy mode, flushing any hypercalls made here.
* This must be done before restoring TLS segments so
* the GDT and LDT are properly updated, and must be
* done before fpu__restore(), so the TS bit is up
* to date.
*/
arch_end_context_switch(next_p);
/*
* Reload esp0 and cpu_current_top_of_stack. This changes
* current_thread_info(). Refresh the SYSENTER configuration in
* case prev or next is vm86.
*/
update_task_stack(next_p);
refresh_sysenter_cs(next);
this_cpu_write(cpu_current_top_of_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE);
/*
* Restore %gs if needed (which is common)
*/
if (prev->gs | next->gs)
lazy_load_gs(next->gs);
switch_fpu_finish(next_fpu, cpu);
this_cpu_write(current_task, next_p);
/* Load the Intel cache allocation PQR MSR. */
resctrl_sched_in();
return prev_p;
}
在完成了switch_to()的内核态切换后,还有一个重要的函数finish_task_switch()负责善后清理工作。在前面介绍switch_to三个参数的时候我们已经说明了使用last的重要性。而这里为何让prev和last均赋值为prev,是因为prev在后面没有需要用到,所以节省了一个指针空间来存储last。
/**
* finish_task_switch - clean up after a task-switch
* @prev: the thread we just switched away from.
*
* finish_task_switch must be called after the context switch, paired
* with a prepare_task_switch call before the context switch.
* finish_task_switch will reconcile locking set up by prepare_task_switch,
* and do any other architecture-specific cleanup actions.
*
* Note that we may have delayed dropping an mm in context_switch(). If
* so, we finish that here outside of the runqueue lock. (Doing it
* with the lock held can cause deadlocks; see schedule() for
* details.)
*
* The context switch have flipped the stack from under us and restored the
* local variables which were saved when this task called schedule() in the
* past. prev == current is still correct but we need to recalculate this_rq
* because prev may have moved to another CPU.
*/
static struct rq *finish_task_switch(struct task_struct *prev)
__releases(rq->lock)
{
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
long prev_state;
/*
* The previous task will have left us with a preempt_count of 2
* because it left us after:
*
* schedule()
* preempt_disable(); // 1
* __schedule()
* raw_spin_lock_irq(&rq->lock) // 2
*
* Also, see FORK_PREEMPT_COUNT.
*/
if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
"corrupted preempt_count: %s/%d/0x%x\n",
current->comm, current->pid, preempt_count()))
preempt_count_set(FORK_PREEMPT_COUNT);
rq->prev_mm = NULL;
/*
* A task struct has one reference for the use as "current".
* If a task dies, then it sets TASK_DEAD in tsk->state and calls
* schedule one last time. The schedule call will never return, and
* the scheduled task must drop that reference.
*
* We must observe prev->state before clearing prev->on_cpu (in
* finish_task), otherwise a concurrent wakeup can get prev
* running on another CPU and we could rave with its RUNNING -> DEAD
* transition, resulting in a double drop.
*/
prev_state = prev->state;
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
finish_task(prev);
finish_lock_switch(rq);
finish_arch_post_lock_switch();
kcov_finish_switch(current);
fire_sched_in_preempt_notifiers(current);
/*
* When switching through a kernel thread, the loop in
* membarrier_{private,global}_expedited() may have observed that
* kernel thread and not issued an IPI. It is therefore possible to
* schedule between user->kernel->user threads without passing though
* switch_mm(). Membarrier requires a barrier after storing to
* rq->curr, before returning to userspace, so provide them here:
*
* - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
* provided by mmdrop(),
* - a sync_core for SYNC_CORE.
*/
if (mm) {
membarrier_mm_sync_core_before_usermode(mm);
mmdrop(mm);
}
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
*/
kprobe_flush_task(prev);
/* Task is done with its stack. */
put_task_stack(prev);
put_task_struct(prev);
}
tick_nohz_task_switch();
return rq;
}
至此,我们完成了内核态的切换工作,也完成了整个主动调度的过程。
抢占式调度
抢占式调度通常发生在两种情况下。一种是某任务执行时间过长,另一种是当某任务被唤醒的时候。首先看看任务执行时间过长的情况。
任务运行时间检测
该情况需要衡量一个任务的执行时间长短,执行时间过长则发起抢占。在计算机里面有一个时钟,会过一段时间触发一次时钟中断,通知操作系统时间又过去一个时钟周期,通过这种方式可以查看是否是需要抢占的时间点。
时钟中断处理函数会调用scheduler_tick()。该函数首先取出当前CPU,并由此获取对应的运行队列rq和当前任务curr。接着调用该任务的调度类sched_class对应的task_tick()函数进行时间事件处理。
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*/
void scheduler_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
struct rq_flags rf;
sched_clock_tick();
rq_lock(rq, &rf);
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
cpu_load_update_active(rq);
calc_global_load_tick(rq);
psi_task_tick(rq);
rq_unlock(rq, &rf);
perf_event_task_tick();
......
}
以普通任务队列为例,对应的调度类为fair_sched_class,对应的时钟处理函数为task_tick_fair(),该函数会获取当前的调度实体和运行队列,并调用entity_tick()函数更新时间。
/*
* scheduler tick hitting a task of our scheduling class.
* NOTE: This function can be called remotely by the tick offload that
* goes along full dynticks. Therefore no local assumption can be made
* and everything must be accessed through the @rq and @curr passed in
* parameters.
*/
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
update_misfit_status(curr, rq);
update_overutilized_status(task_rq(curr));
}
在entity_tick()中,首先会调用update_curr()更新当前任务的vruntime,然后调用check_preempt_tick()检测现在是否可以发起抢占。
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
/*
* Ensure that runnable average is periodically updated.
*/
update_load_avg(cfs_rq, curr, UPDATE_TG);
update_cfs_group(curr);
......
if (cfs_rq->nr_running > 1)
check_preempt_tick(cfs_rq, curr);
}
check_preempt_tick() 先是调用 sched_slice() 函数计算出一个调度周期中该任务运行的实际时间 ideal_runtime。sum_exec_runtime 指任务总共执行的实际时间,prev_sum_exec_runtime 指上次该进程被调度时已经占用的实际时间,所以 sum_exec_runtime – prev_sum_exec_runtime 就是这次调度占用实际时间。如果这个时间大于 ideal_runtime,则应该被抢占了。除了这个条件之外,还会通过 __pick_first_entity 取出红黑树中最小的进程。如果当前进程的 vruntime 大于红黑树中最小的进程的 vruntime,且差值大于 ideal_runtime,也应该被抢占了。
/*
* Preempt the current task with a newly woken task if needed:
*/
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
unsigned long ideal_runtime, delta_exec;
struct sched_entity *se;
s64 delta;
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
resched_curr(rq_of(cfs_rq));
/*
* The current task ran long enough, ensure it doesn't get
* re-elected due to buddy favours.
*/
clear_buddies(cfs_rq, curr);
return;
}
/*
* Ensure that a task that missed wakeup preemption by a
* narrow margin doesn't have to wait for a full slice.
* This also mitigates buddy induced latencies under load.
*/
if (delta_exec < sysctl_sched_min_granularity)
return;
se = __pick_first_entity(cfs_rq);
delta = curr->vruntime - se->vruntime;
if (delta < 0)
return;
if (delta > ideal_runtime)
resched_curr(rq_of(cfs_rq));
}
如果确认需要被抢占,则会调用resched_curr()函数,该函数会调用set_tsk_need_resched()标记该任务为_TIF_NEED_RESCHED,即该任务应该被抢占。
/*
* resched_curr - mark rq's current task 'to be rescheduled now'.
*
* On UP this means the setting of the need_resched flag, on SMP it
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
void resched_curr(struct rq *rq)
{
struct task_struct *curr = rq->curr;
int cpu;
.......
cpu = cpu_of(rq);
if (cpu == smp_processor_id()) {
set_tsk_need_resched(curr);
set_preempt_need_resched();
return;
}
if (set_nr_and_not_polling(curr))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
}
任务唤醒情况
某些任务会因为中断而唤醒,如当 I/O 到来的时候,I/O进程往往会被唤醒。在这种时候,如果被唤醒的任务优先级高于 CPU 上的当前任务,就会触发抢占。try_to_wake_up() 调用 ttwu_queue() 将这个唤醒的任务添加到队列当中。ttwu_queue() 再调用 ttwu_do_activate() 激活这个任务。ttwu_do_activate() 调用 ttwu_do_wakeup()。这里面调用了 check_preempt_curr() 检查是否应该发生抢占。如果应该发生抢占,也不是直接踢走当前进程,而是将当前进程标记为应该被抢占。
static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
struct rq_flags *rf)
{
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING;
trace_sched_wakeup(p);
抢占的发生
由前面的分析,我们知道了不论是是当前任务执行时间过长还是新任务唤醒,我们均会对现在的任务标记位_TIF_NEED_RESCUED,下面分析实际抢占的发生。真正的抢占还需要一个特定的时机让正在运行中的进程有机会调用一下 __schedule()函数,发起真正的调度。
实际上会调用__schedule()函数共有以下几个时机
从系统调用返回用户态:以64位为例,系统调用的链路为do_syscall_64->syscall_return_slowpath->prepare_exit_to_usermode->exit_to_usermode_loop。在exit_to_usermode_loop中,会检测是否为_TIF_NEED_RESCHED,如果是则调用__schedule()
static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
{
while (true) {
/* We have work to do. */
local_irq_enable();
if (cached_flags & _TIF_NEED_RESCHED)
schedule();
......
}
内核态启动:内核态的执行中,被抢占的时机一般发生在 preempt_enable() 中。在内核态的执行中,有的操作是不能被中断的,所以在进行这些操作之前,总是先调用 preempt_disable() 关闭抢占,当再次打开的时候,就是一次内核态代码被抢占的机会。preempt_enable() 会调用 preempt_count_dec_and_test(),判断 preempt_count 和 TIF_NEED_RESCHED 是否可以被抢占。如果可以,就调用 preempt_schedule->preempt_schedule_common->__schedule 进行调度。
#define preempt_enable() \
do { \
if (unlikely(preempt_count_dec_and_test())) \
__preempt_schedule(); \
} while (0)
#define preempt_count_dec_and_test() \
({ preempt_count_sub(1); should_resched(0); })
static __always_inline bool should_resched(int preempt_offset)
{
return unlikely(preempt_count() == preempt_offset &&
tif_need_resched());
}
#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
static void __sched notrace preempt_schedule_common(void)
{
do {
......
__schedule(true);
......
} while (need_resched())
- 从中断返回内核态/用户态:中断处理调用的是 do_IRQ 函数,中断完毕后分为两种情况,一个是返回用户态,一个是返回内核态。
- 返回用户态会调用 prepare_exit_to_usermode(),最终调用 exit_to_usermode_loop()
- 返回内核态会调用preempt_schedule_irq(),最终调用__schedule()
common_interrupt:
ASM_CLAC
addq $-0x80, (%rsp)
interrupt do_IRQ
ret_from_intr:
popq %rsp
testb $3, CS(%rsp)
jz retint_kernel
/* Interrupt came from user space */
GLOBAL(retint_user)
mov %rsp,%rdi
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
SWAPGS
jmp restore_regs_and_iret
/* Returning to kernel space */
retint_kernel:
#ifdef CONFIG_PREEMPT
bt $9, EFLAGS(%rsp)
jnc 1f
0: cmpl $0, PER_CPU_VAR(__preempt_count)
jnz 1f
call preempt_schedule_irq
jmp 0b
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
......
do {
preempt_disable();
local_irq_enable();
__schedule(true);
local_irq_disable();
sched_preempt_enable_no_resched();
} while (need_resched());
......
}
酷客网相关文章:
评论前必须登录!
注册