/*
 * context_switch - switch to the new MM and the new thread's register state.
 */
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next, struct rq_flags *rf)
{
	prepare_task_switch(rq, prev, next);

	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_start_context_switch(prev);

	/*
	 * kernel -> kernel   lazy + transfer active
	 *   user -> kernel   lazy + mmgrab() active
	 *
	 * kernel ->   user   switch + mmdrop() active
	 *   user ->   user   switch
	 */
	if (!next->mm) {                                // to kernel
		enter_lazy_tlb(prev->active_mm, next);

		next->active_mm = prev->active_mm;
		if (prev->mm)                           // from user
			mmgrab(prev->active_mm);
		else
			prev->active_mm = NULL;
	} else {                                        // to user
		membarrier_switch_mm(rq, prev->active_mm, next->mm);
		/*
		 * sys_membarrier() requires an smp_mb() between setting
		 * rq->curr / membarrier_switch_mm() and returning to userspace.
		 *
		 * The below provides this either through switch_mm(), or in
		 * case 'prev->active_mm == next->mm' through
		 * finish_task_switch()'s mmdrop().
		 */
		switch_mm_irqs_off(prev->active_mm, next->mm, next);

		if (!prev->mm) {                        // from kernel
			/* will mmdrop() in finish_task_switch(). */
			rq->prev_mm = prev->active_mm;
			prev->active_mm = NULL;
		}
	}

	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);
	barrier();

	return finish_task_switch(prev);
}
The overall flow is roughly:

1. Switch preparation
2. Switch the address space
3. Switch register state
4. Switch finalization
Switch preparation
The comment on the prepare_task_switch function is as follows:
/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */
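Concretely, part of what this pair brackets is the on_cpu handoff. Below is a simplified sketch of the prepare_task()/finish_task() helpers that prepare_task_switch()/finish_task_switch() call in kernels of this era (taken from my reading of kernel/sched/core.c; details vary across versions):

/* Simplified sketch, SMP case only; not a verbatim quote. */
static inline void prepare_task(struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * Claim @next for this CPU; anyone trying to wake or migrate it
	 * must wait until on_cpu drops back to 0.
	 */
	WRITE_ONCE(next->on_cpu, 1);
#endif
}

static inline void finish_task(struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * Release @prev with store-release semantics: once on_cpu == 0
	 * is observed, another CPU is free to run this task.
	 */
	smp_store_release(&prev->on_cpu, 0);
#endif
}

This is why the comment insists on the pairing: the release in finish_task() is what lets another CPU safely pick prev up again.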
Switch the address space

Kernel threads have no mm of their own (next->mm == NULL) and borrow the previous task's active_mm, so the handover splits into the four cases enumerated in the comment:

/*
 * kernel -> kernel   lazy + transfer active
 *   user -> kernel   lazy + mmgrab() active
 *
 * kernel ->   user   switch + mmdrop() active
 *   user ->   user   switch
 */
if (!next->mm) {                                // to kernel
	enter_lazy_tlb(prev->active_mm, next);

	next->active_mm = prev->active_mm;
	if (prev->mm)                           // from user
		mmgrab(prev->active_mm);
	else
		prev->active_mm = NULL;
} else {                                        // to user
	membarrier_switch_mm(rq, prev->active_mm, next->mm);
	/*
	 * sys_membarrier() requires an smp_mb() between setting
	 * rq->curr / membarrier_switch_mm() and returning to userspace.
	 *
	 * The below provides this either through switch_mm(), or in
	 * case 'prev->active_mm == next->mm' through
	 * finish_task_switch()'s mmdrop().
	 */
	switch_mm_irqs_off(prev->active_mm, next->mm, next);

	if (!prev->mm) {                        // from kernel
		/* will mmdrop() in finish_task_switch(). */
		rq->prev_mm = prev->active_mm;
		prev->active_mm = NULL;
	}
}
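To make the reference counting in those four cases concrete, here is a small userspace toy model (my illustration, not kernel code; the struct names are stand-ins) that tracks only the active_mm handover and the mmgrab()/mmdrop() pairing:

#include <assert.h>
#include <stdio.h>

struct mm   { int mm_count; };               /* stand-in for struct mm_struct  */
struct task { struct mm *mm, *active_mm; };  /* stand-in for struct task_struct */

static void mmgrab(struct mm *mm) { mm->mm_count++; }
static void mmdrop(struct mm *mm) { mm->mm_count--; }

/* Mirrors the if (!next->mm) / else logic quoted above; returns the mm
 * whose mmdrop() the kernel defers to finish_task_switch(). */
static struct mm *model_switch(struct task *prev, struct task *next)
{
	struct mm *defer_drop = NULL;

	if (!next->mm) {                     /* to kernel: borrow active_mm */
		next->active_mm = prev->active_mm;
		if (prev->mm)                /* from user: pin the borrowed mm */
			mmgrab(prev->active_mm);
		else                         /* from kernel: hand over the borrow */
			prev->active_mm = NULL;
	} else {                             /* to user: real switch */
		if (!prev->mm) {             /* from kernel: unpin, but later */
			defer_drop = prev->active_mm;
			prev->active_mm = NULL;
		}
	}
	return defer_drop;
}

int main(void)
{
	struct mm   user_mm = { .mm_count = 1 };
	struct task user    = { .mm = &user_mm, .active_mm = &user_mm };
	struct task kthread = { .mm = NULL,     .active_mm = NULL };

	/* user -> kernel: lazy, borrowed mm is pinned */
	assert(model_switch(&user, &kthread) == NULL);
	assert(kthread.active_mm == &user_mm && user_mm.mm_count == 2);

	/* kernel -> user: real switch, mmdrop() is deferred */
	struct mm *drop = model_switch(&kthread, &user);
	assert(drop == &user_mm);
	mmdrop(drop);                        /* what finish_task_switch() does */
	assert(user_mm.mm_count == 1);

	puts("mm transition model OK");
	return 0;
}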
Switch register state
switch_to is where each architecture implements the register-level part of the task switch. For x86:
#define switch_to(prev, next, last)				\
do {								\
	prepare_switch_to(next);				\
								\
	((last) = __switch_to_asm((prev), (next)));		\
} while (0)
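A note on the third parameter, since the macro is easy to misread: when a task that went to sleep inside switch_to() is eventually scheduled back in, it resumes in its own old stack frame, where the local prev/next values are stale. __switch_to_asm() returns (in %rax) the task that was actually running just before us, and the macro writes that into last so the subsequent finish_task_switch(prev) in context_switch() cleans up the right task. An illustrative timeline (task names A, B, C are hypothetical):

/*
 * Illustrative timeline, not kernel code. On one CPU:
 *
 *   A runs, calls switch_to(A, B, prev)  -> CPU now executes B;
 *       A's kernel stack is frozen mid-switch_to.
 *   ... B, C, ... run for a while ...
 *   C runs, calls switch_to(C, A, prev)  -> CPU resumes A's frozen
 *       frame, i.e. A "returns" from its old switch_to(A, B, ...).
 *
 * Inside A's resumed frame the compiler still sees prev == A and
 * next == B, but the task that really ran before A is C. The value
 * returned by __switch_to_asm() is C; storing it into last repairs
 * prev before finish_task_switch(prev) runs.
 */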
/*
 * %rdi: prev task
 * %rsi: next task
 */
ENTRY(__switch_to_asm)
	UNWIND_HINT_FUNC
	/*
	 * Save callee-saved registers
	 * This must match the order in inactive_task_frame
	 */
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
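	/*
	 * (Elided in this excerpt: the actual stack switch. In the same
	 * __switch_to_asm in arch/x86/entry/entry_64.S of this era it is
	 * the two moves below; quoted from memory, verify against your
	 * tree.)
	 */
	movq	%rsp, TASK_threadsp(%rdi)
	movq	TASK_threadsp(%rsi), %rsp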
#ifdef CONFIG_RETPOLINE
	/*
	 * When switching from a shallower to a deeper call stack
	 * the RSB may either underflow or use entries populated
	 * with userspace addresses. On CPUs where those concerns
	 * exist, overwrite the RSB with entries which capture
	 * speculative execution to prevent attack.
	 */
	FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
#endif
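The excerpt stops at the RSB refill. What follows in the same routine (a sketch from my reading of arch/x86/entry/entry_64.S of this era; verify against your tree) restores next's callee-saved registers from its own stack, now that %rsp points at it, and tail-calls the C-level __switch_to(), whose return value in %rax becomes last:

	/* restore callee-saved registers of 'next' from its own stack */
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp

	jmp	__switch_to
END(__switch_to_asm)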
Switch finalization

finish_task_switch completes the switch. Its comment explains both the pairing with prepare_task_switch and the deferred mmdrop():

/**
 * finish_task_switch - clean up after a task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 *
 * The context switch have flipped the stack from under us and restored the
 * local variables which were saved when this task called schedule() in the
 * past. prev == current is still correct but we need to recalculate this_rq
 * because prev may have moved to another CPU.
 */
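The deferred mmdrop() it mentions is the counterpart of the rq->prev_mm assignment seen earlier in context_switch(). A heavily trimmed sketch of that tail end of finish_task_switch() (perf, vtime and dead-task reaping omitted; helper names as in kernels of this era, not a verbatim quote):

static struct rq *finish_task_switch_sketch(struct task_struct *prev)
{
	struct rq *rq = this_rq();
	struct mm_struct *mm = rq->prev_mm;  /* stashed by context_switch() */

	rq->prev_mm = NULL;

	finish_task(prev);                   /* prev->on_cpu = 0, release */
	finish_lock_switch(rq);              /* unlock rq->lock, irqs back on */

	if (mm)
		mmdrop(mm);                  /* release the mm a kernel thread
					      * borrowed; done here, outside the
					      * runqueue lock, to avoid the
					      * deadlock the comment warns about */
	return rq;
}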