interactive是比较常用的一个cpufreq调频策略(governor),可调的参数比较多,各governor的整体框架大致相同。它为每个possible的cpu初始化两个timer(cpu_timer和cpu_slack_timer),由cpu_timer定时根据active和idle时间计算load并得出target_freq;然后线程cfinteractive汇总同一policy下多个cpu计算的target_freq,取最大值来设置policy的freq。
/*
 * Per-CPU state of the interactive governor: the two sampling timers, the
 * load-accounting snapshot, and the frequency-decision bookkeeping.
 */
struct cpufreq_interactive_cpuinfo {
struct timer_list cpu_timer; // sampling timer; its handler is cpufreq_interactive_timer
struct timer_list cpu_slack_timer; // armed timer_slack_val after cpu_timer while target_freq is above policy->min
spinlock_t load_lock; /* protects the next 4 fields */
u64 time_in_idle; // idle-time counter value captured at the last sample
u64 time_in_idle_timestamp; // timestamp at which time_in_idle was captured
u64 cputime_speedadj;// running sum of (active time * policy->cur) since the last reset
u64 cputime_speedadj_timestamp;// timestamp at which cputime_speedadj was last reset to 0 (timer start/resched)
struct cpufreq_policy *policy;// policy this CPU belongs to; assigned in the GOV_START handler
struct cpufreq_frequency_table *freq_table;// frequency table of the policy; assigned in the GOV_START handler
spinlock_t target_freq_lock; /*protects target freq */
unsigned int target_freq; // frequency this CPU currently asks for; updated by the sampling timer
unsigned int floor_freq; // the timer will not pick a lower frequency until min_sample_time has passed since the floor was validated
u64 pol_floor_val_time; /* policy floor_validate_time */
u64 loc_floor_val_time; /* per-cpu floor_validate_time */
u64 pol_hispeed_val_time; /* policy hispeed_validate_time */
u64 loc_hispeed_val_time; /* per-cpu hispeed_validate_time */
struct rw_semaphore enable_sem; // taken for write around governor start/stop, for read by timer, notifier and speed-change paths
int governor_enabled;// set to 1 by GOV_START and back to 0 by GOV_STOP
};
一、init
/* Guards speedchange_cpumask, which the sampling timer sets and the speed-change thread drains. */
static spinlock_t speedchange_cpumask_lock;
/* Held across the GOV_START and GOV_STOP handlers of the governor callback. */
static struct mutex gov_lock;
/*
 * Governor initcall.
 *
 * Sets up the per-CPU timers and locks for every possible CPU, creates the
 * "cfinteractive" speed-change kernel thread with SCHED_FIFO priority, and
 * registers the governor with the cpufreq core.
 *
 * Returns 0 on success, the kthread_create() error if the thread cannot be
 * created, or whatever cpufreq_register_governor() returns.
 */
static int __init cpufreq_interactive_init(void)
{
	unsigned int i;
	struct cpufreq_interactive_cpuinfo *pcpu;
	struct sched_param param = {
		.sched_priority = MAX_RT_PRIO-1
	};

	/* Initialize per-cpu timers */
	for_each_possible_cpu(i) {
		pcpu = &per_cpu(cpuinfo, i);

		/* Sampling timer: deferrable, handler receives the CPU number. */
		init_timer_deferrable(&pcpu->cpu_timer);
		pcpu->cpu_timer.function = cpufreq_interactive_timer;
		pcpu->cpu_timer.data = i;

		/* Slack timer: regular timer with a no-op handler. */
		init_timer(&pcpu->cpu_slack_timer);
		pcpu->cpu_slack_timer.function = cpufreq_interactive_nop_timer;

		/* The three locks embedded in the per-CPU structure. */
		spin_lock_init(&pcpu->load_lock);
		spin_lock_init(&pcpu->target_freq_lock);
		init_rwsem(&pcpu->enable_sem);
	}

	spin_lock_init(&speedchange_cpumask_lock);
	mutex_init(&gov_lock);

	/* Thread that applies the frequency requests produced by the timers. */
	speedchange_task = kthread_create(cpufreq_interactive_speedchange_task,
					  NULL, "cfinteractive");
	if (IS_ERR(speedchange_task))
		return PTR_ERR(speedchange_task);

	/* Run the thread as a real-time FIFO task at the configured priority. */
	sched_setscheduler_nocheck(speedchange_task, SCHED_FIFO, &param);
	/* Hold an extra reference on the task structure. */
	get_task_struct(speedchange_task);

	/* NB: wake up so the thread does not look hung to the freezer */
	wake_up_process(speedchange_task);

	return cpufreq_register_governor(&cpufreq_gov_interactive);
}
/*
 * Pick the initcall used to run cpufreq_interactive_init(): fs_initcall when
 * interactive is configured as the default governor, module_init otherwise.
 * NOTE(review): fs_initcall is expected to run earlier in boot than the
 * initcall level module_init maps to for built-in code -- confirm against
 * include/linux/init.h for the target kernel.
 */
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE
fs_initcall(cpufreq_interactive_init);
#else
module_init(cpufreq_interactive_init);
#endif
二、cpufreq_governor_interactive
governor往核心注册之后,cpufreq core会向具体的governor发送governor事件。绑定相应的回调函数可以响应具体的事件。这里先解析INIT和START事件,最后留一个LIMITS事件,是将freq限制在min和max之间,同时也需要将target_freq限制在min和max之间
// One cpufreq_interactive_cpuinfo instance per CPU, accessed through per_cpu(cpuinfo, cpu).
static DEFINE_PER_CPU(struct cpufreq_interactive_cpuinfo, cpuinfo);
/*
 * Governor event callback invoked by the cpufreq core.
 *
 * @policy: policy the event applies to.
 * @event:  one of CPUFREQ_GOV_POLICY_INIT, CPUFREQ_GOV_POLICY_EXIT,
 *          CPUFREQ_GOV_START, CPUFREQ_GOV_STOP or CPUFREQ_GOV_LIMITS.
 *
 * POLICY_INIT allocates (or shares) the tunables, creates the sysfs group and
 * registers the idle and frequency-transition notifiers. POLICY_EXIT undoes
 * that when the last user goes away. START/STOP arm and tear down the per-CPU
 * timers. LIMITS clamps the current and target frequencies to
 * [policy->min, policy->max].
 *
 * Returns 0 on success, -ENOMEM if the tunables cannot be allocated, or the
 * sysfs_create_group() error code.
 */
static int cpufreq_governor_interactive(struct cpufreq_policy *policy, unsigned int event)
{
	int rc;
	unsigned int j;
	struct cpufreq_interactive_cpuinfo *pcpu;
	struct cpufreq_frequency_table *freq_table;
	struct cpufreq_interactive_tunables *tunables;
	unsigned long flags;

	if (have_governor_per_policy())
		tunables = policy->governor_data;
	else
		tunables = common_tunables;

	WARN_ON(!tunables && (event != CPUFREQ_GOV_POLICY_INIT));

	switch (event) {
	case CPUFREQ_GOV_POLICY_INIT:
		if (have_governor_per_policy()) {
			WARN_ON(tunables);
		} else if (tunables) {
			/* Shared tunables already exist: just take a reference. */
			tunables->usage_count++;
			policy->governor_data = tunables;
			return 0;
		}

		tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
		if (!tunables) {
			pr_err("%s: POLICY_INIT: kzalloc failed\n", __func__);
			return -ENOMEM;
		}

		/* Populate the tunables with their compile-time defaults. */
		tunables->usage_count = 1;
		tunables->above_hispeed_delay = default_above_hispeed_delay;
		tunables->nabove_hispeed_delay = ARRAY_SIZE(default_above_hispeed_delay);
		tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD;
		tunables->target_loads = default_target_loads;
		tunables->ntarget_loads = ARRAY_SIZE(default_target_loads);
		tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME;
		tunables->timer_rate = DEFAULT_TIMER_RATE;
		tunables->boostpulse_duration_val = DEFAULT_MIN_SAMPLE_TIME;
		tunables->timer_slack_val = DEFAULT_TIMER_SLACK;

		spin_lock_init(&tunables->target_loads_lock);
		spin_lock_init(&tunables->above_hispeed_delay_lock);

		policy->governor_data = tunables;
		if (!have_governor_per_policy()) {
			common_tunables = tunables;
		}

		rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr());
		if (rc) {
			/* Roll back everything published above. */
			kfree(tunables);
			policy->governor_data = NULL;
			if (!have_governor_per_policy()) {
				common_tunables = NULL;
			}
			return rc;
		}

		/*
		 * Register the notifiers only while the governor is not yet
		 * marked initialized; that flag is maintained outside this
		 * function.
		 */
		if (!policy->governor->initialized) {
			/* Idle enter/exit notifications. */
			idle_notifier_register(&cpufreq_interactive_idle_nb);
			/* Frequency-transition notifications. */
			cpufreq_register_notifier(&cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER);
		}
		break;

	case CPUFREQ_GOV_POLICY_EXIT:
		if (!--tunables->usage_count) {
			if (policy->governor->initialized == 1) {
				/* Mirror of the registrations done in POLICY_INIT. */
				cpufreq_unregister_notifier(&cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER);
				idle_notifier_unregister(&cpufreq_interactive_idle_nb);
			}

			sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr());
			kfree(tunables);
			common_tunables = NULL;
		}

		policy->governor_data = NULL;
		break;

	case CPUFREQ_GOV_START:
		mutex_lock(&gov_lock);

		freq_table = cpufreq_frequency_get_table(policy->cpu);
		/* Default hispeed_freq to the policy maximum when unset. */
		if (!tunables->hispeed_freq)
			tunables->hispeed_freq = policy->max;

		/* Seed the per-CPU state of every CPU covered by this policy. */
		for_each_cpu(j, policy->cpus) {
			pcpu = &per_cpu(cpuinfo, j);
			pcpu->policy = policy;
			pcpu->target_freq = policy->cur;
			pcpu->freq_table = freq_table;
			pcpu->floor_freq = pcpu->target_freq;
			pcpu->pol_floor_val_time = ktime_to_us(ktime_get()) - tunables->min_sample_time;
			pcpu->loc_floor_val_time = pcpu->pol_floor_val_time;
			pcpu->pol_hispeed_val_time = pcpu->pol_floor_val_time;
			pcpu->loc_hispeed_val_time = pcpu->pol_floor_val_time;

			down_write(&pcpu->enable_sem);
			/* Both timers must be inactive before timer_start re-arms them. */
			del_timer_sync(&pcpu->cpu_timer);
			del_timer_sync(&pcpu->cpu_slack_timer);
			cpufreq_interactive_timer_start(tunables, j);
			pcpu->governor_enabled = 1;
			up_write(&pcpu->enable_sem);
		}

		mutex_unlock(&gov_lock);
		break;

	case CPUFREQ_GOV_STOP:
		mutex_lock(&gov_lock);
		for_each_cpu(j, policy->cpus) {
			pcpu = &per_cpu(cpuinfo, j);
			down_write(&pcpu->enable_sem);
			pcpu->governor_enabled = 0;
			del_timer_sync(&pcpu->cpu_timer);
			del_timer_sync(&pcpu->cpu_slack_timer);
			up_write(&pcpu->enable_sem);
		}
		mutex_unlock(&gov_lock);
		break;

	case CPUFREQ_GOV_LIMITS:
		/* Pull the running frequency back inside [min, max]. */
		if (policy->max < policy->cur)
			__cpufreq_driver_target(policy,
					policy->max, CPUFREQ_RELATION_H);
		else if (policy->min > policy->cur)
			__cpufreq_driver_target(policy,
					policy->min, CPUFREQ_RELATION_L);

		/* Clamp each enabled CPU's pending target the same way. */
		for_each_cpu(j, policy->cpus) {
			pcpu = &per_cpu(cpuinfo, j);

			down_read(&pcpu->enable_sem);
			if (pcpu->governor_enabled == 0) {
				up_read(&pcpu->enable_sem);
				continue;
			}

			spin_lock_irqsave(&pcpu->target_freq_lock, flags);
			if (policy->max < pcpu->target_freq)
				pcpu->target_freq = policy->max;
			else if (policy->min > pcpu->target_freq)
				pcpu->target_freq = policy->min;
			spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);

			up_read(&pcpu->enable_sem);
		}
		break;
	}
	return 0;
}
在INIT事件中注册了idle通知事件和transition(频率变化)通知事件。
在START事件中启动了每个cpu的timer
后面的故事就此展开了。
启动timer实际上是调用内核API接口,将timer加入到时钟里面去
// 在notify的回调函数里面实现start
/* The caller shall take enable_sem write semaphore to avoid any timer race.
* The cpu_timer and cpu_slack_timer must be deactivated when calling this
* function.
*/
static void cpufreq_interactive_timer_start( struct cpufreq_interactive_tunables *tunables, int cpu)
{
struct cpufreq_interactive_cpuinfo *pcpu = &per_cpu(cpuinfo, cpu);
unsigned long expires = jiffies + usecs_to_jiffies(tunables->timer_rate);
unsigned long flags;
pcpu->cpu_timer.expires = expires;//CPU core 统计load的定时器
add_timer_on(&pcpu->cpu_timer, cpu);//加入到定时器链表中去
if (tunables->timer_slack_val >= 0 && pcpu->target_freq > pcpu->policy->min) {
expires += usecs_to_jiffies(tunables->timer_slack_val);
pcpu->cpu_slack_timer.expires = expires;
add_timer_on(&pcpu->cpu_slack_timer, cpu);//加入到定时器链表中去
}
spin_lock_irqsave(&pcpu->load_lock, flags);
//计算CPU启动到现在的idle时间
pcpu->time_in_idle = get_cpu_idle_time(cpu, &pcpu->time_in_idle_timestamp, tunables->io_is_busy);
pcpu->cputime_speedadj = 0;
//计算启动启动到现在的时间
pcpu->cputime_speedadj_timestamp = pcpu->time_in_idle_timestamp;
spin_unlock_irqrestore(&pcpu->load_lock, flags);
}
三、timer
timer会更新target_freq然后wake_up,speedchange_task开始更改freq,遗憾的是timer的回调函数中不能出现printk。
因为在一个timer时间内,可能运行多个freq,所以在freq发生改变时,响应freq改变的回调函数,更新load
static int cpufreq_interactive_notifier( struct notifier_block *nb, unsigned long val, void *data)
{
struct cpufreq_freqs *freq = data;
struct cpufreq_interactive_cpuinfo *pcpu;
int cpu;
unsigned long flags;
if (val == CPUFREQ_POSTCHANGE) {
pcpu = &per_cpu(cpuinfo, freq->cpu);
if (!down_read_trylock(&pcpu->enable_sem))
return 0;
if (!pcpu->governor_enabled) {
up_read(&pcpu->enable_sem);
return 0;
}
for_each_cpu(cpu, pcpu->policy->cpus) {
struct cpufreq_interactive_cpuinfo *pjcpu =
&per_cpu(cpuinfo, cpu);
if (cpu != freq->cpu) {
if (!down_read_trylock(&pjcpu->enable_sem))
continue;
if (!pjcpu->governor_enabled) {
up_read(&pjcpu->enable_sem);
continue;
}
}
spin_lock_irqsave(&pjcpu->load_lock, flags);
update_load(cpu);//更新load,因为在一个timer内可能运行多个freq
spin_unlock_irqrestore(&pjcpu->load_lock, flags);
if (cpu != freq->cpu)
up_read(&pjcpu->enable_sem);
}
up_read(&pcpu->enable_sem);
}
return 0;
}
/* Notifier block routing cpufreq transition events to cpufreq_interactive_notifier. */
static struct notifier_block cpufreq_notifier_block = {
.notifier_call = cpufreq_interactive_notifier,
};
// 更新load
static u64 update_load(int cpu)
{
struct cpufreq_interactive_cpuinfo *pcpu = &per_cpu(cpuinfo, cpu);
struct cpufreq_interactive_tunables *tunables = pcpu->policy->governor_data;
u64 now;//本次的update time,应该是本次统计idle时的时间戳
u64 now_idle; //系统启动以后运行的idle的总时间
u64 delta_idle;
u64 delta_time;
u64 active_time;
now_idle = get_cpu_idle_time(cpu, &now, tunables->io_is_busy);
//pcpu->time_in_idle:上次统计时的idle的总时间
delta_idle = (now_idle - pcpu->time_in_idle);//delta_idle:两次统计之间的idle总时间
//pcpu->time_in_idle_timestamp,上次统计idle时的时间戳
delta_time = (now - pcpu->time_in_idle_timestamp);//delta_time:两次统计之间系统运行的总时间
if (delta_time <= delta_idle)
active_time = 0;
else
active_time = delta_time - delta_idle;
// 这个并不是因为有多个cpu,是单个cpu在time_rate里面实际上会跑多个freq
// 实际上一个timer_rate里面跑多个freq也是统计不出来啊
pcpu->cputime_speedadj += active_time * pcpu->policy->cur;
pcpu->time_in_idle = now_idle;
pcpu->time_in_idle_timestamp = now;
return now;
}
/*
 * Sampling timer handler: evaluate the load of one CPU and decide its next
 * target frequency.
 *
 * @data: number of the CPU this timer belongs to.
 *
 * The handler computes the average load over the window since the timer was
 * last (re)armed, picks a new frequency subject to the hispeed and floor
 * hold-off rules, publishes it in pcpu->target_freq and wakes the
 * speed-change thread. Whatever the outcome, the timer is re-armed unless it
 * is already pending or the governor is disabled.
 */
static void cpufreq_interactive_timer(unsigned long data)
{
	u64 now;
	unsigned int delta_time;
	u64 cputime_speedadj;
	int cpu_load;
	struct cpufreq_interactive_cpuinfo *pcpu = &per_cpu(cpuinfo, data);
	struct cpufreq_interactive_tunables *tunables = pcpu->policy->governor_data;
	unsigned int new_freq;
	unsigned int loadadjfreq;
	unsigned int index;
	unsigned long flags;
	u64 max_fvtime;

	if (!down_read_trylock(&pcpu->enable_sem))
		return;
	if (!pcpu->governor_enabled)
		goto exit;

	/* Bring the accumulator up to date and take a consistent copy. */
	spin_lock_irqsave(&pcpu->load_lock, flags);
	now = update_load(data);
	delta_time = (unsigned int)(now - pcpu->cputime_speedadj_timestamp);
	cputime_speedadj = pcpu->cputime_speedadj;
	spin_unlock_irqrestore(&pcpu->load_lock, flags);

	if (WARN_ON_ONCE(!delta_time))
		goto rearm;

	spin_lock_irqsave(&pcpu->target_freq_lock, flags);

	/*
	 * cputime_speedadj / delta_time is the frequency-weighted busy
	 * fraction of the window. Scaling by 100 (integer math only) and
	 * dividing by the current frequency expresses it as a percentage of
	 * what the CPU can deliver at policy->cur.
	 */
	do_div(cputime_speedadj, delta_time);
	loadadjfreq = (unsigned int)cputime_speedadj * 100;
	cpu_load = loadadjfreq / pcpu->policy->cur;
	tunables->boosted = tunables->boost_val || now < tunables->boostpulse_endtime;

	if (cpu_load >= tunables->go_hispeed_load || tunables->boosted) {
		/* High load or boost: never settle below hispeed_freq. */
		if (pcpu->policy->cur < tunables->hispeed_freq) {
			new_freq = tunables->hispeed_freq;
		} else {
			new_freq = choose_freq(pcpu, loadadjfreq);

			if (new_freq < tunables->hispeed_freq)
				new_freq = tunables->hispeed_freq;
		}
	} else {
		new_freq = choose_freq(pcpu, loadadjfreq);
		/* Coming from below hispeed_freq, step up to hispeed_freq first. */
		if (new_freq > tunables->hispeed_freq &&
		    pcpu->policy->cur < tunables->hispeed_freq)
			new_freq = tunables->hispeed_freq;
	}

	/*
	 * Raising the speed further while already at or above hispeed_freq
	 * requires the above_hispeed_delay for the current frequency to have
	 * elapsed since the policy's hispeed validation time.
	 */
	if (pcpu->policy->cur >= tunables->hispeed_freq &&
	    new_freq > pcpu->policy->cur &&
	    now - pcpu->pol_hispeed_val_time <
	    freq_to_above_hispeed_delay(tunables, pcpu->policy->cur)) {
		trace_cpufreq_interactive_notyet(data, cpu_load, pcpu->target_freq,
						 pcpu->policy->cur, new_freq);
		spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
		goto rearm;
	}

	pcpu->loc_hispeed_val_time = now;

	/* Map the request onto a frequency from the table (RELATION_L lookup). */
	if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table,
					   new_freq, CPUFREQ_RELATION_L,
					   &index)) {
		spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
		goto rearm;
	}

	new_freq = pcpu->freq_table[index].frequency;

	/*
	 * Do not scale below floor_freq unless we have been at or above the
	 * floor frequency for the minimum sample time since last validated.
	 */
	max_fvtime = max(pcpu->pol_floor_val_time, pcpu->loc_floor_val_time);
	if (new_freq < pcpu->floor_freq) {
		if (now - max_fvtime < tunables->min_sample_time) {
			trace_cpufreq_interactive_notyet(data, cpu_load, pcpu->target_freq,
							 pcpu->policy->cur, new_freq);
			/*
			 * Leave target_freq untouched: the floor still holds,
			 * and the speed-change thread takes the maximum
			 * target_freq across the policy, so lowering it here
			 * would let another CPU's wake-up drop the speed
			 * before min_sample_time has elapsed.
			 */
			spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
			goto rearm;
		}
	}

	/*
	 * Update the timestamp for checking whether speed has been held at
	 * or above the selected frequency for a minimum of min_sample_time,
	 * if not boosted to hispeed_freq. If boosted to hispeed_freq then we
	 * allow the speed to drop as soon as the boostpulse duration expires
	 * (or the indefinite boost is turned off).
	 */
	if (!tunables->boosted || new_freq > tunables->hispeed_freq) {
		pcpu->floor_freq = new_freq;
		if (pcpu->target_freq >= pcpu->policy->cur ||
		    new_freq >= pcpu->policy->cur)
			pcpu->loc_floor_val_time = now;
	}

	/* Nothing to do when already running at the chosen, in-range speed. */
	if (pcpu->policy->cur <= pcpu->policy->max &&
	    pcpu->policy->cur >= pcpu->policy->min &&
	    pcpu->target_freq == new_freq &&
	    pcpu->target_freq == pcpu->policy->cur) {
		trace_cpufreq_interactive_already(data, cpu_load, pcpu->target_freq,
						  pcpu->policy->cur, new_freq);
		spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
		goto rearm;
	}

	trace_cpufreq_interactive_target(data, cpu_load, pcpu->target_freq,
					 pcpu->policy->cur, new_freq);

	/* Publish the request and hand it to the speed-change thread. */
	pcpu->target_freq = new_freq;
	spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
	spin_lock_irqsave(&speedchange_cpumask_lock, flags);
	cpumask_set_cpu(data, &speedchange_cpumask);
	spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
	wake_up_process(speedchange_task);

rearm:
	if (!timer_pending(&pcpu->cpu_timer))
		cpufreq_interactive_timer_resched(pcpu);

exit:
	up_read(&pcpu->enable_sem);
	return;
}
为下一次计算load做准备,提前应该重启定时器:
/*
 * Start a fresh sampling window for @pcpu and push its timers out by one
 * timer_rate (plus timer_slack_val for the slack timer, when it applies).
 *
 * The idle snapshot is taken for the CPU executing this function, and the
 * whole operation runs under the entry's load_lock.
 */
static void cpufreq_interactive_timer_resched(struct cpufreq_interactive_cpuinfo *pcpu)
{
	struct cpufreq_interactive_tunables *tun = pcpu->policy->governor_data;
	unsigned long deadline;
	unsigned long irq_flags;

	spin_lock_irqsave(&pcpu->load_lock, irq_flags);

	/* Zero the accumulator so the next sample covers only the new window. */
	pcpu->time_in_idle = get_cpu_idle_time(smp_processor_id(),
					       &pcpu->time_in_idle_timestamp,
					       tun->io_is_busy);
	pcpu->cputime_speedadj = 0;
	pcpu->cputime_speedadj_timestamp = pcpu->time_in_idle_timestamp;

	deadline = jiffies + usecs_to_jiffies(tun->timer_rate);
	mod_timer_pinned(&pcpu->cpu_timer, deadline);

	/* Slack timer only while above the policy minimum and slack is enabled. */
	if (tun->timer_slack_val >= 0 &&
	    pcpu->target_freq > pcpu->policy->min) {
		deadline += usecs_to_jiffies(tun->timer_slack_val);
		mod_timer_pinned(&pcpu->cpu_slack_timer, deadline);
	}

	spin_unlock_irqrestore(&pcpu->load_lock, irq_flags);
}
至于何时重启,后面继续分析。还有就是这里常用的以数组的形式传入参数进行解析
/*
 * Look up the above-hispeed delay that applies at frequency @freq.
 *
 * The table is walked in steps of two: even slots hold delay values and the
 * odd slot after each one holds the frequency from which the next delay takes
 * over. The walk stops at the last delay whose threshold @freq has reached.
 * The lookup runs under above_hispeed_delay_lock.
 */
static unsigned int freq_to_above_hispeed_delay(struct cpufreq_interactive_tunables *tunables, unsigned int freq)
{
	unsigned long irq_flags;
	unsigned int delay;
	int slot = 0;

	spin_lock_irqsave(&tunables->above_hispeed_delay_lock, irq_flags);

	while (slot < tunables->nabove_hispeed_delay - 1 &&
	       freq >= tunables->above_hispeed_delay[slot + 1])
		slot += 2;

	delay = tunables->above_hispeed_delay[slot];

	spin_unlock_irqrestore(&tunables->above_hispeed_delay_lock, irq_flags);

	return delay;
}
四、speedchange_task
speedchange_task被唤醒之后,会遍历policy下每个cpu的target_freq,取其中的最大值作为policy的目标freq,再调用具体cpufreq_driver的target或者target_index接口来实现调频
/*
 * Body of the "cfinteractive" kernel thread.
 *
 * Sleeps until at least one CPU is flagged in speedchange_cpumask, takes a
 * private copy of the mask, clears the shared one, and then applies the
 * pending frequency request for each flagged CPU through
 * cpufreq_interactive_adjust_cpu(). Returns 0 once kthread_should_stop()
 * is observed after a sleep.
 */
static int cpufreq_interactive_speedchange_task(void *data)
{
unsigned int cpu;
cpumask_t tmp_mask;
unsigned long flags;
struct cpufreq_interactive_cpuinfo *pcpu;
// Loop forever; the CPU is given up through schedule() while there is no work.
while (1) {
// The task state is set before the mask is examined, so the check and the
// sleep below happen with the thread already marked interruptible.
set_current_state(TASK_INTERRUPTIBLE);
spin_lock_irqsave(&speedchange_cpumask_lock, flags);
// Any pending frequency-change requests?
if (cpumask_empty(&speedchange_cpumask)) {
spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
schedule();// nothing to do: sleep until woken
if (kthread_should_stop())// asked to terminate
break;
spin_lock_irqsave(&speedchange_cpumask_lock, flags);
}
set_current_state(TASK_RUNNING);/* back to runnable before doing the work */
tmp_mask = speedchange_cpumask;// private snapshot of the requesting CPUs
cpumask_clear(&speedchange_cpumask);/* cleared under the lock so later requests are not lost */
spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
/* Apply the requests, one flagged CPU at a time. */
for_each_cpu(cpu, &tmp_mask) {
pcpu = &per_cpu(cpuinfo, cpu);
down_write(&pcpu->policy->rwsem);
if (likely(down_read_trylock(&pcpu->enable_sem))) {// skip CPUs whose enable_sem is currently unavailable
if (likely(pcpu->governor_enabled))
cpufreq_interactive_adjust_cpu(cpu, pcpu->policy);
up_read(&pcpu->enable_sem);
}
up_write(&pcpu->policy->rwsem);
}
}
return 0;
}
static void cpufreq_interactive_adjust_cpu(unsigned int cpu, struct cpufreq_policy *policy)
{
struct cpufreq_interactive_cpuinfo *pcpu;
u64 hvt, fvt;
unsigned int max_freq;
int i;
cpufreq_interactive_get_policy_info(policy, &max_freq, &hvt, &fvt);
for_each_cpu(i, policy->cpus) {
pcpu = &per_cpu(cpuinfo, i);
pcpu->pol_floor_val_time = fvt;
}
if (max_freq != policy->cur || policy->cur > policy->max || policy->cur < policy->min) {
// 核型层函数,实现调用最高频
__cpufreq_driver_target(policy, max_freq, CPUFREQ_RELATION_H);
for_each_cpu(i, policy->cpus) {
pcpu = &per_cpu(cpuinfo, i);
pcpu->pol_hispeed_val_time = hvt;
}
}
// 外部提供的函数
trace_cpufreq_interactive_setspeed(cpu, max_freq, policy->cur);
}
/*
 * Summarize the per-CPU decisions of every CPU in @policy.
 *
 * @pmax_freq: receives the highest target_freq among the CPUs.
 * @phvt:      receives the earliest loc_hispeed_val_time among the CPUs that
 *             request that highest frequency.
 * @pfvt:      receives the latest loc_floor_val_time among all CPUs.
 */
static void cpufreq_interactive_get_policy_info(struct cpufreq_policy *policy,unsigned int *pmax_freq, u64 *phvt, u64 *pfvt)
{
	struct cpufreq_interactive_cpuinfo *member;
	unsigned int best = 0;
	u64 hispeed_stamp = ~0ULL;
	u64 floor_stamp = 0;
	unsigned int sibling;

	for_each_cpu(sibling, policy->cpus) {
		member = &per_cpu(cpuinfo, sibling);

		/* Latest floor validation time across the policy. */
		if (member->loc_floor_val_time > floor_stamp)
			floor_stamp = member->loc_floor_val_time;

		if (member->target_freq < best)
			continue;

		if (member->target_freq > best) {
			/* New leader: restart the hispeed-time tracking from it. */
			best = member->target_freq;
			hispeed_stamp = member->loc_hispeed_val_time;
		} else if (member->loc_hispeed_val_time < hispeed_stamp) {
			/* Tie on frequency: keep the earliest hispeed time. */
			hispeed_stamp = member->loc_hispeed_val_time;
		}
	}

	*pmax_freq = best;
	*phvt = hispeed_stamp;
	*pfvt = floor_stamp;
}
五、idle处理
进入idle,cpu将尝试timer pending并将freq调至最低。idle结束的时候,将重启timer
/*
 * Idle notifier callback: only the IDLE_END event is acted upon, by running
 * the idle-exit handling for the current CPU. Always returns 0.
 */
static int cpufreq_interactive_idle_notifier(struct notifier_block *nb, unsigned long val, void *data)
{
	if (val != IDLE_END)
		return 0;

	cpufreq_interactive_idle_end();
	return 0;
}
/* Notifier block routing idle events to cpufreq_interactive_idle_notifier. */
static struct notifier_block cpufreq_interactive_idle_nb = {
.notifier_call = cpufreq_interactive_idle_notifier,
};
// 在idle状态会将freq调到最低,在idle结束的时候就应该重启timer
static void cpufreq_interactive_idle_end(void)
{
struct cpufreq_interactive_cpuinfo *pcpu =
&per_cpu(cpuinfo, smp_processor_id());
if (!down_read_trylock(&pcpu->enable_sem))
return;
if (!pcpu->governor_enabled) {
up_read(&pcpu->enable_sem);
return;
}
/* Arm the timer for 1-2 ticks later if not already. */
if (!timer_pending(&pcpu->cpu_timer)) {
cpufreq_interactive_timer_resched(pcpu);// 重启timer
} else if (time_after_eq(jiffies, pcpu->cpu_timer.expires)) {// timer过期 (idle太久)
del_timer(&pcpu->cpu_timer);
del_timer(&pcpu->cpu_slack_timer);
cpufreq_interactive_timer(smp_processor_id());// 删除timer, 再启动timer
}
up_read(&pcpu->enable_sem);
}