【cpufreq】【governor】【interactive】

最新推荐文章于 2022-01-13 21:57:20 发布

money_yuan

最新推荐文章于 2022-01-13 21:57:20 发布

阅读量1.3k

点赞数

分类专栏： linux内核

linux内核专栏收录该内容

57 篇文章 12 订阅

订阅专栏

interactive是比较常用的一个cpu调频策略（governor），可调的参数比较多，各个governor的整体框架大致相同。它为每个cpu初始化两个timer（cpu_timer和cpu_slack_timer），并在governor启动时为policy内的cpu启动它们，定时根据active和idle时间计算load并得出target_freq；然后线程cfinteractive获取多个cpu计算的target_freq，取其中的最大值来设置policy的freq。

/*
 * Per-CPU state of the interactive governor: the two timers, the load
 * accounting snapshot, and the frequency target computed for this CPU.
 */
struct cpufreq_interactive_cpuinfo {
	struct timer_list cpu_timer; // sampling timer; its handler computes this CPU's load
	struct timer_list cpu_slack_timer; // armed timer_slack_val after cpu_timer; its handler is cpufreq_interactive_nop_timer

	// load_lock guards the four fields below
	spinlock_t load_lock; /* protects the next 4 fields */
	u64 time_in_idle; // cumulative idle time of this CPU as of the last snapshot
	u64 time_in_idle_timestamp; // timestamp at which time_in_idle was sampled
	u64 cputime_speedadj;// running sum of (frequency * active_time) since the last timer (re)start
	u64 cputime_speedadj_timestamp;// start of the current accumulation window; set when the timer is (re)started and cputime_speedadj is zeroed
	
	struct cpufreq_policy *policy;// policy this CPU belongs to; assigned in CPUFREQ_GOV_START
	struct cpufreq_frequency_table *freq_table;// frequency table of this CPU; assigned in CPUFREQ_GOV_START from cpufreq_frequency_get_table()

	spinlock_t target_freq_lock; /*protects target freq */
	unsigned int target_freq; // current target frequency of this CPU, updated by the sampling timer handler
	
	unsigned int floor_freq; // floor recorded by the timer handler; a new frequency below it does not wake the speed-change thread until min_sample_time has passed since the last floor validation
	u64 pol_floor_val_time; /* policy floor_validate_time */
	u64 loc_floor_val_time; /* per-cpu floor_validate_time */
	u64 pol_hispeed_val_time; /* policy hispeed_validate_time */
	u64 loc_hispeed_val_time; /* per-cpu hispeed_validate_time */
	struct rw_semaphore enable_sem; // taken for write around governor start/stop, for read by users of governor_enabled
	
	int governor_enabled;// set to 1 in CPUFREQ_GOV_START, back to 0 in CPUFREQ_GOV_STOP
};

一、init

/* Guards speedchange_cpumask, shared by the timer handlers and the speed-change thread. */
static spinlock_t speedchange_cpumask_lock;
/* Serializes CPUFREQ_GOV_START and CPUFREQ_GOV_STOP handling. */
static struct mutex gov_lock;

/*
 * Governor initialisation.
 *
 * Sets up the per-CPU timers and locks for every possible CPU, creates the
 * "cfinteractive" speed-change kernel thread as a SCHED_FIFO task, and
 * finally registers the governor with the cpufreq core.
 *
 * Returns 0 on success, the PTR_ERR() of the failed kthread_create(), or
 * whatever cpufreq_register_governor() returns.
 */
static int __init cpufreq_interactive_init(void)
{
	unsigned int i;
	struct cpufreq_interactive_cpuinfo *pcpu;
	struct sched_param param = {
		.sched_priority = MAX_RT_PRIO-1
	};

	/* Initalize per-cpu timers */
	for_each_possible_cpu(i) {
		pcpu = &per_cpu(cpuinfo, i);

		/* Sampling timer: deferrable, handler receives the CPU number. */
		init_timer_deferrable(&pcpu->cpu_timer);
		pcpu->cpu_timer.function = cpufreq_interactive_timer;
		pcpu->cpu_timer.data = i;

		/* Slack timer: regular timer whose handler does nothing. */
		init_timer(&pcpu->cpu_slack_timer);
		pcpu->cpu_slack_timer.function = cpufreq_interactive_nop_timer;

		/* The three locks embedded in the per-CPU structure. */
		spin_lock_init(&pcpu->load_lock);
		spin_lock_init(&pcpu->target_freq_lock);
		init_rwsem(&pcpu->enable_sem);
	}

	spin_lock_init(&speedchange_cpumask_lock);
	mutex_init(&gov_lock);

	/* Create the kernel thread that performs the actual frequency changes. */
	speedchange_task =kthread_create(cpufreq_interactive_speedchange_task, NULL, "cfinteractive");
	if (IS_ERR(speedchange_task))
		return PTR_ERR(speedchange_task);

	/*
	 * Run the thread as SCHED_FIFO at the highest RT priority.
	 * (The article text had "&param" mangled into "¶m" here.)
	 */
	sched_setscheduler_nocheck(speedchange_task, SCHED_FIFO, &param);
	get_task_struct(speedchange_task);	/* hold a reference on the thread */

	/* NB: wake up so the thread does not look hung to the freezer */
	wake_up_process(speedchange_task);

	return cpufreq_register_governor(&cpufreq_gov_interactive);
}

// Author's note: interactive is commonly configured as the default governor.
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE
// When interactive is the default governor, register it early via
// fs_initcall (the original note gives this as initcall level 5).
fs_initcall(cpufreq_interactive_init);
#else
// Otherwise register via module_init. The original note gives this as level 7.
// NOTE(review): a built-in module_init normally maps to device_initcall,
// which is level 6 in include/linux/init.h - confirm.
module_init(cpufreq_interactive_init);
#endif

二、cpufreq_governor_interactive

governor向cpufreq核心注册之后，cpufreq core会向具体的governor发送governor事件，governor通过绑定的回调函数来响应具体的事件。这里先解析INIT和START事件，最后留一个LIMITS事件：它将实际freq限制在min和max之间，同时也需要将每个cpu的target_freq限制在min和max之间。

/* One cpufreq_interactive_cpuinfo instance per CPU. */
static DEFINE_PER_CPU(struct cpufreq_interactive_cpuinfo, cpuinfo);

/*
 * Governor event callback invoked by the cpufreq core.
 *
 * @policy: policy the event applies to.
 * @event:  one of the CPUFREQ_GOV_* events handled below.
 *
 * POLICY_INIT  allocates (or takes a reference on) the tunables, creates the
 *              sysfs group and registers the idle/transition notifiers.
 * POLICY_EXIT  drops the reference and undoes POLICY_INIT on the last user.
 * START        initialises per-CPU state and starts the sampling timers.
 * STOP         disables the governor on each CPU and deletes its timers.
 * LIMITS       clamps the running frequency and each CPU's target_freq to
 *              [policy->min, policy->max].
 *
 * Returns 0 on success, -ENOMEM if the tunables cannot be allocated, or the
 * error returned by sysfs_create_group().
 */
static int cpufreq_governor_interactive(struct cpufreq_policy *policy, unsigned int event)
{
	int rc;
	unsigned int j;
	struct cpufreq_interactive_cpuinfo *pcpu;
	struct cpufreq_frequency_table *freq_table;
	struct cpufreq_interactive_tunables *tunables;
	unsigned long flags;

	if (have_governor_per_policy())
		tunables = policy->governor_data;
	else
		tunables = common_tunables;

	WARN_ON(!tunables && (event != CPUFREQ_GOV_POLICY_INIT));

	switch (event) {
	/* POLICY_INIT is expected before START (author's note). */
	case CPUFREQ_GOV_POLICY_INIT:
		if (have_governor_per_policy()) {
			WARN_ON(tunables);
		} else if (tunables) {
			/* Shared tunables already allocated: just take a reference. */
			tunables->usage_count++;
			policy->governor_data = tunables;
			return 0;
		}

		tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
		if (!tunables) {
			pr_err("%s: POLICY_INIT: kzalloc failed\n", __func__);
			return -ENOMEM;
		}

		/* Populate the tunables with the compiled-in defaults. */
		tunables->usage_count = 1;
		tunables->above_hispeed_delay = default_above_hispeed_delay;
		tunables->nabove_hispeed_delay = ARRAY_SIZE(default_above_hispeed_delay);

		tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD;

		tunables->target_loads = default_target_loads;
		tunables->ntarget_loads = ARRAY_SIZE(default_target_loads);

		tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME;
		tunables->timer_rate = DEFAULT_TIMER_RATE;
		tunables->boostpulse_duration_val = DEFAULT_MIN_SAMPLE_TIME;
		tunables->timer_slack_val = DEFAULT_TIMER_SLACK;

		spin_lock_init(&tunables->target_loads_lock);
		spin_lock_init(&tunables->above_hispeed_delay_lock);

		policy->governor_data = tunables;
		if (!have_governor_per_policy()) {
			common_tunables = tunables;
		}

		rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr());
		if (rc) {
			/* Roll back everything done above. */
			kfree(tunables);
			policy->governor_data = NULL;
			if (!have_governor_per_policy()) {
				common_tunables = NULL;
			}
			return rc;
		}

		/*
		 * Register the notifiers only while the governor is not yet
		 * marked initialized. The flag is not written in this file
		 * (presumably maintained by the cpufreq core - confirm).
		 */
		if (!policy->governor->initialized) {
			/* Idle enter/exit notifications. */
			idle_notifier_register(&cpufreq_interactive_idle_nb);
			/* Frequency transition notifications. */
			cpufreq_register_notifier(&cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER);
		}

		break;

	case CPUFREQ_GOV_POLICY_EXIT:
		if (!--tunables->usage_count) {
			if (policy->governor->initialized == 1) {
				/* Mirror of the registrations in POLICY_INIT. */
				cpufreq_unregister_notifier(&cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER);
				idle_notifier_unregister(&cpufreq_interactive_idle_nb);
			}

			sysfs_remove_group(get_governor_parent_kobj(policy),get_sysfs_attr());

			kfree(tunables);
			common_tunables = NULL;
		}

		policy->governor_data = NULL;
		break;

	case CPUFREQ_GOV_START:
		mutex_lock(&gov_lock);

		/* Frequency table lookup is provided by the cpufreq core. */
		freq_table = cpufreq_frequency_get_table(policy->cpu);

		/* Default hispeed_freq to the policy maximum if unset. */
		if (!tunables->hispeed_freq)
			tunables->hispeed_freq = policy->max;

		/* Initialise the per-CPU state of every CPU in this policy. */
		for_each_cpu(j, policy->cpus) {
			pcpu = &per_cpu(cpuinfo, j);
			pcpu->policy = policy;
			pcpu->target_freq = policy->cur;
			pcpu->freq_table = freq_table;
			pcpu->floor_freq = pcpu->target_freq;
			/* Back-date the validation times by min_sample_time. */
			pcpu->pol_floor_val_time = ktime_to_us(ktime_get()) - tunables->min_sample_time;
			pcpu->loc_floor_val_time = pcpu->pol_floor_val_time;
			pcpu->pol_hispeed_val_time = pcpu->pol_floor_val_time;
			pcpu->loc_hispeed_val_time = pcpu->pol_floor_val_time;

			down_write(&pcpu->enable_sem);

			/*
			 * cpufreq_interactive_timer_start() requires both
			 * timers to be inactive, so delete them first.
			 */
			del_timer_sync(&pcpu->cpu_timer);
			del_timer_sync(&pcpu->cpu_slack_timer);

			cpufreq_interactive_timer_start(tunables, j);	/* start sampling */
			pcpu->governor_enabled = 1;
			up_write(&pcpu->enable_sem);
		}

		mutex_unlock(&gov_lock);
		break;

	case CPUFREQ_GOV_STOP:
		mutex_lock(&gov_lock);
		for_each_cpu(j, policy->cpus) {
			pcpu = &per_cpu(cpuinfo, j);
			down_write(&pcpu->enable_sem);
			pcpu->governor_enabled = 0;
			del_timer_sync(&pcpu->cpu_timer);
			del_timer_sync(&pcpu->cpu_slack_timer);
			up_write(&pcpu->enable_sem);
		}

		mutex_unlock(&gov_lock);
		break;

	case CPUFREQ_GOV_LIMITS:
		/* Bring the running frequency back inside [min, max]. */
		if (policy->max < policy->cur)
			__cpufreq_driver_target(policy,
					policy->max, CPUFREQ_RELATION_H);
		else if (policy->min > policy->cur)
			__cpufreq_driver_target(policy,
					policy->min, CPUFREQ_RELATION_L);
		for_each_cpu(j, policy->cpus) {
			pcpu = &per_cpu(cpuinfo, j);

			down_read(&pcpu->enable_sem);
			if (pcpu->governor_enabled == 0) {
				up_read(&pcpu->enable_sem);
				continue;
			}

			/* Clamp the pending per-CPU target to the new limits too. */
			spin_lock_irqsave(&pcpu->target_freq_lock, flags);
			if (policy->max < pcpu->target_freq)
				pcpu->target_freq = policy->max;
			else if (policy->min > pcpu->target_freq)
				pcpu->target_freq = policy->min;

			spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
			up_read(&pcpu->enable_sem);
		}
		break;

	default:
		/* Events not listed above need no handling here. */
		break;
	}
	return 0;
}

在INIT事件中注册了idle通知事件和transition（频率变化）通知事件。

在START事件中启动了每个cpu的timer

后面的故事就此展开了。

启动timer实际上是调用内核API接口，将timer加入到时钟里面去

/*
 * Arm the sampling timer (and, when applicable, the slack timer) on @cpu and
 * reset that CPU's load-accounting baseline.
 *
 * The caller shall take enable_sem write semaphore to avoid any timer race.
 * The cpu_timer and cpu_slack_timer must be deactivated when calling this
 * function.
 */
static void cpufreq_interactive_timer_start( struct cpufreq_interactive_tunables *tunables, int cpu)
{
	struct cpufreq_interactive_cpuinfo *ci = &per_cpu(cpuinfo, cpu);
	unsigned long deadline = jiffies + usecs_to_jiffies(tunables->timer_rate);
	unsigned long irqflags;

	/* Sampling timer fires one timer_rate from now, on the given CPU. */
	ci->cpu_timer.expires = deadline;
	add_timer_on(&ci->cpu_timer, cpu);

	/* Slack timer is only armed while the target is above the policy minimum. */
	if (tunables->timer_slack_val >= 0 && ci->target_freq > ci->policy->min) {
		deadline += usecs_to_jiffies(tunables->timer_slack_val);
		ci->cpu_slack_timer.expires = deadline;
		add_timer_on(&ci->cpu_slack_timer, cpu);
	}

	/* Take a fresh idle-time snapshot and restart the accumulation window. */
	spin_lock_irqsave(&ci->load_lock, irqflags);
	ci->time_in_idle = get_cpu_idle_time(cpu, &ci->time_in_idle_timestamp, tunables->io_is_busy);
	ci->cputime_speedadj = 0;
	ci->cputime_speedadj_timestamp = ci->time_in_idle_timestamp;
	spin_unlock_irqrestore(&ci->load_lock, irqflags);
}

三、timer

timer会更新target_freq，然后通过wake_up_process唤醒speedchange_task，由speedchange_task开始更改freq。遗憾的是timer的回调函数中不能出现printk。

因为在一个timer时间内，可能运行多个freq，所以在freq发生改变时，响应freq改变的回调函数，更新load

static int cpufreq_interactive_notifier( struct notifier_block *nb, unsigned long val, void *data)
{
	struct cpufreq_freqs *freq = data;
	struct cpufreq_interactive_cpuinfo *pcpu;
	int cpu;
	unsigned long flags;

	if (val == CPUFREQ_POSTCHANGE) {
		pcpu = &per_cpu(cpuinfo, freq->cpu);
		if (!down_read_trylock(&pcpu->enable_sem))
			return 0;
		if (!pcpu->governor_enabled) {
			up_read(&pcpu->enable_sem);
			return 0;
		}

		for_each_cpu(cpu, pcpu->policy->cpus) {
			struct cpufreq_interactive_cpuinfo *pjcpu =
				&per_cpu(cpuinfo, cpu);
			if (cpu != freq->cpu) {
				if (!down_read_trylock(&pjcpu->enable_sem))
					continue;
				if (!pjcpu->governor_enabled) {
					up_read(&pjcpu->enable_sem);
					continue;
				}
			}
			spin_lock_irqsave(&pjcpu->load_lock, flags);
			update_load(cpu);//更新load，因为在一个timer内可能运行多个freq
			spin_unlock_irqrestore(&pjcpu->load_lock, flags);
			if (cpu != freq->cpu)
				up_read(&pjcpu->enable_sem);
		}

		up_read(&pcpu->enable_sem);
	}
	return 0;
}

static struct notifier_block cpufreq_notifier_block = {
	.notifier_call = cpufreq_interactive_notifier,
};

/*
 * Fold the time elapsed since the previous snapshot into the load
 * accounting of @cpu.
 *
 * The active time (elapsed time minus idle time, floored at 0) is weighted
 * by the policy's current frequency and added to cputime_speedadj; the idle
 * snapshot is then advanced. Callers in this file hold pcpu->load_lock.
 *
 * Returns the timestamp of the new snapshot.
 */
static u64 update_load(int cpu)
{
	struct cpufreq_interactive_cpuinfo *ci = &per_cpu(cpuinfo, cpu);
	struct cpufreq_interactive_tunables *tun = ci->policy->governor_data;
	u64 stamp;		/* timestamp of this snapshot */
	u64 idle_total;		/* cumulative idle time reported for this CPU */
	u64 idle_delta;		/* idle time since the previous snapshot */
	u64 wall_delta;		/* elapsed time since the previous snapshot */
	u64 busy;		/* non-idle part of wall_delta */

	idle_total = get_cpu_idle_time(cpu, &stamp, tun->io_is_busy);
	idle_delta = idle_total - ci->time_in_idle;
	wall_delta = stamp - ci->time_in_idle_timestamp;

	busy = (wall_delta <= idle_delta) ? 0 : wall_delta - idle_delta;

	/* Weight by frequency: one sampling period may span several frequencies. */
	ci->cputime_speedadj += busy * ci->policy->cur;

	ci->time_in_idle_timestamp = stamp;
	ci->time_in_idle = idle_total;
	return stamp;
}

// Sampling timer handler: recomputes the load of CPU `data`, derives a new
// target frequency from it and, when the target changes, flags the CPU in
// speedchange_cpumask and wakes the speed-change thread. Re-arms itself at
// the `rearm` label unless the governor is disabled for this CPU.
static void cpufreq_interactive_timer(unsigned long data)
{
	u64 now;
	unsigned int delta_time;
	u64 cputime_speedadj;
	int cpu_load;
	struct cpufreq_interactive_cpuinfo *pcpu = &per_cpu(cpuinfo, data);
	
	// policy->governor_data was pointed at the tunables in CPUFREQ_GOV_POLICY_INIT
	struct cpufreq_interactive_tunables *tunables = pcpu->policy->governor_data;
	unsigned int new_freq;
	unsigned int loadadjfreq;
	unsigned int index;
	unsigned long flags;
	u64 max_fvtime;

	if (!down_read_trylock(&pcpu->enable_sem))
		return;
	if (!pcpu->governor_enabled)// set to 1 at the end of CPUFREQ_GOV_START
		goto exit;

	spin_lock_irqsave(&pcpu->load_lock, flags);
	now = update_load(data); // brings cputime_speedadj up to date
	
	delta_time = (unsigned int)(now - pcpu->cputime_speedadj_timestamp);// length of the current accumulation window
	cputime_speedadj = pcpu->cputime_speedadj;// local copy of the sum just refreshed by update_load()
	spin_unlock_irqrestore(&pcpu->load_lock, flags);

	if (WARN_ON_ONCE(!delta_time))
		goto rearm;

	spin_lock_irqsave(&pcpu->target_freq_lock, flags);
	
	/* sum(active_time * freq) / window length; do_div stores the quotient back into cputime_speedadj */
	do_div(cputime_speedadj, delta_time);

	loadadjfreq = (unsigned int)cputime_speedadj * 100;// scaled by 100 to work in percent (no floating point in the kernel)
	cpu_load = loadadjfreq / pcpu->policy->cur;// load in percent relative to the current frequency: (time-averaged active fraction * freq) * 100 / cur
	
	tunables->boosted = tunables->boost_val || now < tunables->boostpulse_endtime;

	// High load, or boost active: the target must be at least hispeed_freq
	if (cpu_load >= tunables->go_hispeed_load || tunables->boosted) {
		// currently below hispeed_freq: jump straight to hispeed_freq
		if (pcpu->policy->cur < tunables->hispeed_freq) {
			new_freq = tunables->hispeed_freq;
		} else {
			new_freq = choose_freq(pcpu, loadadjfreq);// let choose_freq pick the target

			if (new_freq < tunables->hispeed_freq)
				new_freq = tunables->hispeed_freq;
		}
	} else {// load below go_hispeed_load and not boosted
		new_freq = choose_freq(pcpu, loadadjfreq);// let choose_freq pick the target
		// when currently below hispeed_freq, do not jump past it in one step: cap at hispeed_freq
		if (new_freq > tunables->hispeed_freq && pcpu->policy->cur < tunables->hispeed_freq)
			new_freq = tunables->hispeed_freq;
	}

	// Still trying to go up while already at or above hispeed_freq:
	// if less than the above_hispeed_delay for the current frequency has passed
	// since pol_hispeed_val_time, skip this round and only re-arm the timer.
	if (pcpu->policy->cur >= tunables->hispeed_freq &&new_freq > pcpu->policy->cur &&
	    now - pcpu->pol_hispeed_val_time < freq_to_above_hispeed_delay(tunables, pcpu->policy->cur)) {
		trace_cpufreq_interactive_notyet(data, cpu_load, pcpu->target_freq, pcpu->policy->cur, new_freq);
		spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
		goto rearm;
	}

	// record this sample as the per-CPU hispeed validation time
	pcpu->loc_hispeed_val_time = now;

	// CPUFREQ_RELATION_L: pick the lowest table frequency at or above new_freq
	if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table,new_freq, CPUFREQ_RELATION_L, &index)) {
		spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
		goto rearm;
	}
	// replace new_freq with the frequency-table entry that was selected
	new_freq = pcpu->freq_table[index].frequency;

	/*
	 * Do not scale below floor_freq unless we have been at or above the
	 * floor frequency for the minimum sample time since last validated.
	 */
	// the most recent floor validation, policy-wide or per-CPU, counts
	max_fvtime = max(pcpu->pol_floor_val_time, pcpu->loc_floor_val_time);
	// new_freq below the floor and less than min_sample_time since the last floor
	// validation: do not wake the speed-change thread yet.
	// NOTE(review): target_freq is still lowered to new_freq in this branch even
	// though no speed change is requested, which seems at odds with the comment
	// above - confirm this is intended.
	if (new_freq < pcpu->floor_freq) {
		if (now - max_fvtime < tunables->min_sample_time) {
			trace_cpufreq_interactive_notyet(data, cpu_load, pcpu->target_freq, pcpu->policy->cur, new_freq);
			pcpu->target_freq = new_freq;
			spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
			goto rearm;
		}
	}

	/*
	 * Update the timestamp for checking whether speed has been held at
	 * or above the selected frequency for a minimum of min_sample_time,
	 * if not boosted to hispeed_freq.  If boosted to hispeed_freq then we
	 * allow the speed to drop as soon as the boostpulse duration expires
	 * (or the indefinite boost is turned off).
	 */
	// skipped only when boosted and new_freq <= hispeed_freq
	if (!tunables->boosted || new_freq > tunables->hispeed_freq) {
		pcpu->floor_freq = new_freq;// new_freq becomes the floor (baseline) frequency
		if (pcpu->target_freq >= pcpu->policy->cur || new_freq >= pcpu->policy->cur)
			pcpu->loc_floor_val_time = now;
	}
	
	// already running at the target and inside the policy limits: nothing to change
	if (pcpu->policy->cur <= pcpu->policy->max &&pcpu->policy->cur >= pcpu->policy->min &&pcpu->target_freq == new_freq &&pcpu->target_freq == pcpu->policy->cur) {
		trace_cpufreq_interactive_already(data, cpu_load, pcpu->target_freq, pcpu->policy->cur, new_freq);
		spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
		goto rearm;
	}

	trace_cpufreq_interactive_target(data, cpu_load, pcpu->target_freq, pcpu->policy->cur, new_freq);

	pcpu->target_freq = new_freq; // publish the new target
	spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
	spin_lock_irqsave(&speedchange_cpumask_lock, flags);
	cpumask_set_cpu(data, &speedchange_cpumask);
	spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
	wake_up_process(speedchange_task); // wake the speed-change thread to apply it

rearm:
	if (!timer_pending(&pcpu->cpu_timer))// only if the sampling timer is not already queued
		cpufreq_interactive_timer_resched(pcpu);// schedule the next sample

exit:
	up_read(&pcpu->enable_sem);
	return;
}

为下一次计算load做准备，提前应该重启定时器：

/*
 * Re-arm the timers of @pcpu for the next sampling period.
 *
 * Under load_lock: takes a fresh idle-time snapshot for the CPU this runs on,
 * zeroes cputime_speedadj so the next period starts from scratch, then pushes
 * cpu_timer out by timer_rate and, while the target is above the policy
 * minimum, cpu_slack_timer by a further timer_slack_val.
 */
static void cpufreq_interactive_timer_resched(struct cpufreq_interactive_cpuinfo *pcpu)
{
	struct cpufreq_interactive_tunables *tun = pcpu->policy->governor_data;
	unsigned long deadline;
	unsigned long irqflags;

	spin_lock_irqsave(&pcpu->load_lock, irqflags);

	/* New accounting baseline. */
	pcpu->time_in_idle =get_cpu_idle_time(smp_processor_id(),&pcpu->time_in_idle_timestamp, tun->io_is_busy);
	pcpu->cputime_speedadj = 0;
	pcpu->cputime_speedadj_timestamp = pcpu->time_in_idle_timestamp;

	/* Next sample. */
	deadline = jiffies + usecs_to_jiffies(tun->timer_rate);
	mod_timer_pinned(&pcpu->cpu_timer, deadline);

	/* Slack timer, only while running above the policy minimum. */
	if (tun->timer_slack_val >= 0 &&pcpu->target_freq > pcpu->policy->min) {
		deadline += usecs_to_jiffies(tun->timer_slack_val);
		mod_timer_pinned(&pcpu->cpu_slack_timer, deadline);
	}

	spin_unlock_irqrestore(&pcpu->load_lock, irqflags);
}

至于何时重启，后面继续分析。还有就是这里常用的以数组的形式传入参数进行解析

/*
 * Look up the above_hispeed_delay that applies to @freq.
 *
 * The loop reads above_hispeed_delay[] two entries at a time: the value at an
 * even index is a candidate result and the following odd index is compared
 * against @freq. The walk advances while @freq is at or above that odd-index
 * entry; the even-index value where it stops is returned. The array is read
 * under above_hispeed_delay_lock.
 */
static unsigned int freq_to_above_hispeed_delay(struct cpufreq_interactive_tunables *tunables, unsigned int freq)
{
	unsigned long irqflags;
	unsigned int delay;
	int idx = 0;

	spin_lock_irqsave(&tunables->above_hispeed_delay_lock, irqflags);

	while (idx < tunables->nabove_hispeed_delay - 1 &&
			freq >= tunables->above_hispeed_delay[idx+1])
		idx += 2;

	delay = tunables->above_hispeed_delay[idx];

	spin_unlock_irqrestore(&tunables->above_hispeed_delay_lock, irqflags);
	return delay;
}

四、speedchange_task

speedchange_task被唤醒之后，会查看policy内每个cpu的target_freq，然后取其中的最大值作为policy的目标freq，再调用具体cpufreq_driver的target或者是target_index接口来实现调频。

/*
 * Body of the "cfinteractive" kernel thread.
 *
 * Sleeps until a timer handler flags at least one CPU in speedchange_cpumask,
 * then takes a private copy of the mask, clears the shared one, and calls
 * cpufreq_interactive_adjust_cpu() for every flagged CPU whose governor is
 * enabled. Leaves the loop (returning 0) when kthread_should_stop() is true
 * after a wake-up; that check is made only on the path that actually slept.
 */
static int cpufreq_interactive_speedchange_task(void *data)
{
	unsigned int cpu;
	cpumask_t tmp_mask;
	unsigned long flags;
	struct cpufreq_interactive_cpuinfo *pcpu;
	
	// loop forever; the CPU is given up through schedule() when there is no work
	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);// mark as sleeping before checking for work (the usual pattern to avoid losing a wake-up between the check and schedule())
		
		spin_lock_irqsave(&speedchange_cpumask_lock, flags);

		// any pending speed-change request?
		if (cpumask_empty(&speedchange_cpumask)) {
			spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
			schedule();// no CPU needs a frequency change: sleep until woken

			if (kthread_should_stop())// asked to stop?
				break;

			spin_lock_irqsave(&speedchange_cpumask_lock, flags);
		}

		set_current_state(TASK_RUNNING);/* back to runnable state */
		tmp_mask = speedchange_cpumask;// private copy of the CPUs that requested a change
		cpumask_clear(&speedchange_cpumask);/* clear the shared mask; timers may set bits again at any time */
		spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
		
		/* the actual frequency adjustment starts here */
		for_each_cpu(cpu, &tmp_mask) {// each CPU that requested a speed change
			pcpu = &per_cpu(cpuinfo, cpu);

			down_write(&pcpu->policy->rwsem);

			if (likely(down_read_trylock(&pcpu->enable_sem))) {// skip the CPU if enable_sem is contended
				if (likely(pcpu->governor_enabled))
					cpufreq_interactive_adjust_cpu(cpu, pcpu->policy);
				up_read(&pcpu->enable_sem);
			}

			up_write(&pcpu->policy->rwsem);
		}
	}

	return 0;
}

/*
 * Apply the policy-wide frequency decision for @policy.
 *
 * Collects the highest per-CPU target and the policy-level validation times,
 * stores the floor validation time into every CPU of the policy, and asks the
 * driver for the new frequency when the chosen one differs from policy->cur or
 * policy->cur lies outside [min, max]. In that case the hispeed validation
 * time is stored as well. @cpu is used for tracing only.
 */
static void cpufreq_interactive_adjust_cpu(unsigned int cpu, struct cpufreq_policy *policy)
{
	struct cpufreq_interactive_cpuinfo *ci;
	unsigned int want;
	u64 hispeed_vt, floor_vt;
	int j;

	cpufreq_interactive_get_policy_info(policy, &want, &hispeed_vt, &floor_vt);

	for_each_cpu(j, policy->cpus) {
		ci = &per_cpu(cpuinfo, j);
		ci->pol_floor_val_time = floor_vt;
	}

	/* Skip the driver call only when already at the target and within limits. */
	if (!(want == policy->cur && policy->cur <= policy->max && policy->cur >= policy->min)) {
		/* cpufreq core helper that performs the frequency switch. */
		__cpufreq_driver_target(policy, want, CPUFREQ_RELATION_H);

		for_each_cpu(j, policy->cpus) {
			ci = &per_cpu(cpuinfo, j);
			ci->pol_hispeed_val_time = hispeed_vt;
		}
	}

	/* Tracepoint defined outside this file. */
	trace_cpufreq_interactive_setspeed(cpu, want, policy->cur);
}


/*
 * Summarise the per-CPU state of every CPU in @policy.
 *
 * @pmax_freq: receives the highest target_freq among the policy's CPUs.
 * @phvt:      receives the smallest loc_hispeed_val_time among the CPUs that
 *             share that highest target.
 * @pfvt:      receives the largest loc_floor_val_time among all the CPUs.
 */
static void cpufreq_interactive_get_policy_info(struct cpufreq_policy *policy,unsigned int *pmax_freq, u64 *phvt, u64 *pfvt)
{
	struct cpufreq_interactive_cpuinfo *ci;
	unsigned int top_freq = 0;
	u64 floor_vt = 0;
	u64 hispeed_vt = ~0ULL;
	unsigned int j;

	for_each_cpu(j, policy->cpus) {
		ci = &per_cpu(cpuinfo, j);

		floor_vt = max(floor_vt, ci->loc_floor_val_time);

		/* CPUs below the current best target do not affect freq or hvt. */
		if (ci->target_freq < top_freq)
			continue;

		if (ci->target_freq > top_freq) {
			/* New highest target: restart the hvt tracking from this CPU. */
			top_freq = ci->target_freq;
			hispeed_vt = ci->loc_hispeed_val_time;
		} else {
			/* Tie on the target: keep the earliest validation time. */
			hispeed_vt = min(hispeed_vt, ci->loc_hispeed_val_time);
		}
	}

	*pfvt = floor_vt;
	*phvt = hispeed_vt;
	*pmax_freq = top_freq;
}

五、idle处理

进入idle，cpu将尝试timer pending并将freq调至最低。idle结束的时候，将重启timer。

/*
 * Idle notifier callback: only the IDLE_END event is acted upon, by handing
 * over to cpufreq_interactive_idle_end(). Always returns 0.
 */
static int cpufreq_interactive_idle_notifier(struct notifier_block *nb, unsigned long val, void *data)
{
	if (val != IDLE_END)
		return 0;

	cpufreq_interactive_idle_end();
	return 0;
}

/* Notifier block passed to idle_notifier_register()/idle_notifier_unregister(). */
static struct notifier_block cpufreq_interactive_idle_nb = {
	.notifier_call = cpufreq_interactive_idle_notifier,
};

/*
 * Called on the current CPU when it leaves idle.
 *
 * If the sampling timer is not queued, it is re-armed. If it is queued but
 * its expiry has already passed, both timers are deleted and the sampling
 * handler is run immediately for this CPU. Does nothing when enable_sem
 * cannot be taken or the governor is disabled for this CPU.
 */
static void cpufreq_interactive_idle_end(void)
{
	struct cpufreq_interactive_cpuinfo *ci =
		&per_cpu(cpuinfo, smp_processor_id());

	if (!down_read_trylock(&ci->enable_sem))
		return;
	if (!ci->governor_enabled)
		goto unlock;

	/* Arm the timer for 1-2 ticks later if not already. */
	if (!timer_pending(&ci->cpu_timer)) {
		cpufreq_interactive_timer_resched(ci);
		goto unlock;
	}

	/* Timer is queued but overdue: sample right now instead. */
	if (time_after_eq(jiffies, ci->cpu_timer.expires)) {
		del_timer(&ci->cpu_timer);
		del_timer(&ci->cpu_slack_timer);
		cpufreq_interactive_timer(smp_processor_id());
	}

unlock:
	up_read(&ci->enable_sem);
}

money_yuan

关注

0
点赞
踩
5

收藏

觉得还不错? 一键收藏
0
评论
【cpufreq】【governor】【interactive】

interactive是比较常用的一个cpu调度策略，可调的参数比较多。governor大致相同。他为每个online的cpu创建两个timer来定时计算load（active和idle）计算出target_freq，然后线程cfinteractive获取多个cpu计算的target_freq,然后取最大值来设置policy的freq。struct cpufreq_interactive...
复制链接

扫一扫