Linux内核调度结构体

首先,我们需要一个结构体去执行调度策略,即sched_class。该类有几种实现

  • stop_sched_class 优先级最高的任务会使用这种策略,会中断所有其他线程,且不会被其他任务打断;
  • dl_sched_class 就对应上面的 deadline 调度策略;
  • rt_sched_class 就对应 RR 算法或者 FIFO 算法的调度策略,具体调度策略由进程的 task_struct->policy 指定;
  • fair_sched_class 就是普通进程的调度策略;
  • idle_sched_class 就是空闲进程的调度策略。

其次,我们需要一个调度结构体来集合调度信息,用于调度,即sched_entity,主要有

  • struct sched_entity se:普通任务调度实体
  • struct sched_rt_entity rt:实时调度实体
  • struct sched_dl_entity dl:DEADLINE调度实体

普通任务调度实体源码如下,这里面包含了 vruntime 和权重 load_weight,以及对于运行时间的统计。

struct sched_entity {
    /* For load-balancing: */
    struct load_weight      load;
    unsigned long           runnable_weight;
    struct rb_node          run_node;
    struct list_head        group_node;
    unsigned int            on_rq;
    u64             exec_start;
    u64             sum_exec_runtime;
    u64             vruntime;
    u64             prev_sum_exec_runtime;
    u64             nr_migrations;
    struct sched_statistics     statistics;
#ifdef CONFIG_FAIR_GROUP_SCHED
    int             depth;
    struct sched_entity     *parent;
    /* rq on which this entity is (to be) queued: */
    struct cfs_rq           *cfs_rq;
    /* rq "owned" by this entity/group: */
    struct cfs_rq           *my_q;
#endif
#ifdef CONFIG_SMP
    /*
     * Per entity load average tracking.
     *
     * Put into separate cache line so it does not
     * collide with read-mostly values above.
     */
    struct sched_avg        avg;
#endif
};

在调度时,多个任务调度实体会首先区分是实时任务还是普通任务,然后通过以时间为顺序的红黑树结构组合起来,vruntime 最小的在树的左侧,vruntime最多的在树的右侧。以CFS策略为例,则会选择红黑树最左边的叶子节点作为下一个将获得 CPU 的任务。而这颗红黑树,我们称之为运行时队列(run queue),即struct rq。

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
    /* runqueue lock: */
    raw_spinlock_t      lock;
    /*
     * nr_running and cpu_load should be in the same cacheline because
     * remote CPUs use both these fields when doing load calculation.
     */
    unsigned int        nr_running;
......
    #define CPU_LOAD_IDX_MAX 5
    unsigned long       cpu_load[CPU_LOAD_IDX_MAX];
......
    /* capture load from *all* tasks on this CPU: */
    struct load_weight  load;
    unsigned long       nr_load_updates;
    u64         nr_switches;
    struct cfs_rq       cfs;
    struct rt_rq        rt;
    struct dl_rq        dl;
......
    /*
     * This is part of a global counter where only the total sum
     * over all CPUs matters. A task can increase this counter on
     * one CPU and if it got migrated afterwards it may decrease
     * it on another CPU. Always updated under the runqueue lock:
     */
    unsigned long       nr_uninterruptible;
    struct task_struct  *curr;
    struct task_struct  *idle;
    struct task_struct  *stop;
    unsigned long       next_balance;
    struct mm_struct    *prev_mm;
    unsigned int        clock_update_flags;
    u64         clock;
    /* Ensure that all clocks are in the same cache line */
    u64         clock_task ____cacheline_aligned;
    u64         clock_pelt;
    unsigned long       lost_idle_time;
    atomic_t        nr_iowait;
......
    /* calc_load related fields */
    unsigned long       calc_load_update;
    long            calc_load_active;
......
};

其中包含结构体cfs_rq,其定义如下,主要是CFS调度相关的结构体,主要有权值相关变量、vruntime相关变量以及红黑树指针,其中结构体rb_root_cached即为红黑树的节点

/* CFS-related fields in a runqueue */
struct cfs_rq {
    struct load_weight  load;
    unsigned long       runnable_weight;
    unsigned int        nr_running;
    unsigned int        h_nr_running;
    u64         exec_clock;
    u64         min_vruntime;
#ifndef CONFIG_64BIT
    u64         min_vruntime_copy;
#endif
    struct rb_root_cached   tasks_timeline;
    /*
     * 'curr' points to currently running entity on this cfs_rq.
     * It is set to NULL otherwise (i.e when none are currently running).
     */
    struct sched_entity *curr;
    struct sched_entity *next;
    struct sched_entity *last;
    struct sched_entity *skip;
......
};

对结构体dl_rq有类似的定义,运行队列由红黑树结构体构成,并按照deadline策略进行管理

/* Deadline class' related fields in a runqueue */
struct dl_rq {
    /* runqueue is an rbtree, ordered by deadline */
    struct rb_root_cached   root;
    unsigned long       dl_nr_running;
#ifdef CONFIG_SMP
    /*
     * Deadline values of the currently executing and the
     * earliest ready task on this rq. Caching these facilitates
     * the decision whether or not a ready but not running task
     * should migrate somewhere else.
     */
    struct {
        u64     curr;
        u64     next;
    } earliest_dl;
    unsigned long       dl_nr_migratory;
    int         overloaded;
    /*
     * Tasks on this rq that can be pushed away. They are kept in
     * an rb-tree, ordered by tasks' deadlines, with caching
     * of the leftmost (earliest deadline) element.
     */
    struct rb_root_cached   pushable_dl_tasks_root;
#else
    struct dl_bw        dl_bw;
#endif
    /*
     * "Active utilization" for this runqueue: increased when a
     * task wakes up (becomes TASK_RUNNING) and decreased when a
     * task blocks
     */
    u64         running_bw;
    /*
     * Utilization of the tasks "assigned" to this runqueue (including
     * the tasks that are in runqueue and the tasks that executed on this
     * CPU and blocked). Increased when a task moves to this runqueue, and
     * decreased when the task moves away (migrates, changes scheduling
     * policy, or terminates).
     * This is needed to compute the "inactive utilization" for the
     * runqueue (inactive utilization = this_bw - running_bw).
     */
    u64         this_bw;
    u64         extra_bw;
    /*
     * Inverse of the fraction of CPU utilization that can be reclaimed
     * by the GRUB algorithm.
     */
    u64         bw_ratio;
};

  对于实施队列相应的rt_rq则有所不同,并没有用红黑树实现。

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
    struct rt_prio_array    active;
    unsigned int        rt_nr_running;
    unsigned int        rr_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
    struct {
        int     curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
        int     next; /* next highest */
#endif
    } highest_prio;
#endif
#ifdef CONFIG_SMP
    unsigned long       rt_nr_migratory;
    unsigned long       rt_nr_total;
    int         overloaded;
    struct plist_head   pushable_tasks;
#endif /* CONFIG_SMP */
    int         rt_queued;
    int         rt_throttled;
    u64         rt_time;
    u64         rt_runtime;
    /* Nests inside the rq lock: */
    raw_spinlock_t      rt_runtime_lock;
#ifdef CONFIG_RT_GROUP_SCHED
    unsigned long       rt_nr_boosted;
    struct rq       *rq;
    struct task_group   *tg;
#endif
};

下面再看看调度类sched_class,该类以函数指针的形式定义了诸多队列操作,如

  • enqueue_task 向就绪队列中添加一个任务,当某个任务进入可运行状态时,调用这个函数;
  • dequeue_task 将一个任务从就绪队列中删除;
  • yield_task将主动放弃CPU;
  • yield_to_task主动放弃CPU并执行指定的task_struct;
  • check_preempt_curr检查当前任务是否可被强占;
  • pick_next_task 选择接下来要运行的任务;
  • put_prev_task 用另一个进程代替当前运行的任务;
  • set_curr_task 用于修改调度策略;
  • task_tick 每次周期性时钟到的时候,这个函数被调用,可能触发调度。
  • task_dead:进程结束时调用
  • switched_from、switched_to:进程改变调度器时使用
  • prio_changed:改变进程优先级
struct sched_class {
    const struct sched_class *next;
    void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
    void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
    void (*yield_task)   (struct rq *rq);
    bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
    void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
    /*
     * It is the responsibility of the pick_next_task() method that will
     * return the next task to call put_prev_task() on the @prev task or
     * something equivalent.
     *
     * May return RETRY_TASK when it finds a higher prio class has runnable
     * tasks.
     */
    struct task_struct * (*pick_next_task)(struct rq *rq,
                           struct task_struct *prev,
                    *        struct rq_flags *rf);
    void (*put_prev_task)(struct rq *rq, struct task_struct *p);
......
    void (*set_curr_task)(struct rq *rq);
    void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
    void (*task_fork)(struct task_struct *p);
    void (*task_dead)(struct task_struct *p);
    /*
     * The switched_from() call is allowed to drop rq->lock, therefore we
     * cannot assume the switched_from/switched_to pair is serliazed by
     * rq->lock. They are however serialized by p->pi_lock.
     */
    void (*switched_from)(struct rq *this_rq, struct task_struct *task);
    void (*switched_to)  (struct rq *this_rq, struct task_struct *task);
    void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
                  int oldprio);
    unsigned int (*get_rr_interval)(struct rq *rq,
                    struct task_struct *task);
    void (*update_curr)(struct rq *rq);
#define TASK_SET_GROUP     0
#define TASK_MOVE_GROUP        1
......
};

调度类分为下面几种:

extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;

  队列操作中函数指针指向不同策略队列的实际执行函数函数,在linux/kernel/sched/目录下,fair.c、idle.c、rt.c等文件对不同类型的策略实现了不同的函数,如fair.c中定义了

/*
 * All the scheduling class methods:
 */
const struct sched_class fair_sched_class = {
    .next               = &idle_sched_class,
    .enqueue_task       = enqueue_task_fair,
    .dequeue_task       = dequeue_task_fair,
    .yield_task         = yield_task_fair,
    .yield_to_task      = yield_to_task_fair,
    .check_preempt_curr  = check_preempt_wakeup,
    .pick_next_task      = pick_next_task_fair,
    .put_prev_task       = put_prev_task_fair,
......
    .set_curr_task       = set_curr_task_fair,
    .task_tick          = task_tick_fair,
    .task_fork          = task_fork_fair,
    .prio_changed       = prio_changed_fair,
    .switched_from      = switched_from_fair,
    .switched_to        = switched_to_fair,
    .get_rr_interval    = get_rr_interval_fair,
    .update_curr        = update_curr_fair,
......
};

以选择下一个任务为例,CFS对应的是pick_next_task_fair,而rt_rq对应的则是pick_next_task_rt,等等。

由此,我们来总结一下:

  • 每个CPU都有一个struct rq结构体,里面会有着cfs_rq, rt_rq等一系列队列
  • 每个队列由一个红黑树组织,红黑树里每一个节点为一个任务实体sched_entity
  • 每一个任务实体sched_entity对应于一个任务task_struct
  • 在task_struct中对应的sched_class会根据不同策略申明不同的对应处理函数,处理实际的调度工作

酷客网相关文章:

赞(0)

评论 抢沙发

评论前必须登录!