softlockup(watchdog)用于检测系统调度是否正常,即软锁的状况。当发生softlockup时,内核不能调度,但还能响应中断,对用户的表现可能为:能ping通,但无法登录系统,无法进行正常操作。
其基本原理为:为每一个CPU启动一个内核线程(watchdog/x),此线程为优先级最高的实时线程。该线程获得调度时,会更新相应的计数(时间戳);同时会启动定时器,定时器到期时检查相应的时间戳,如果超过指定时间都没有更新,则说明这段时间内都没有发生调度(因为此线程优先级最高),此时打印相应告警,或根据配置进入panic流程。
基本代码分析(2.6.32)
rest_init->kernel_init->lockup_detector_init->cpu_callback->watchdog_prepare_cpu(初始化watchdog定时器):
点击(此处)折叠或打开
static int watchdog_prepare_cpu(int cpu)
函数
{
this
struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
atom
WARN_ON(per_cpu(softlockup_watchdog, cpu));
spa
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);//初始化高精度定时器
线程
hrtimer->function = watchdog_timer_fn;//设置定时器处理函数
debug
return 0;
rest
}get
看门狗定时器处理函数:
点击(此处)折叠或打开
/*
 * Expiry handler of the per-CPU watchdog hrtimer.
 *
 * Each time it runs it:
 *   1. bumps the interrupt counter used by the hard-lockup detector,
 *   2. wakes this CPU's watchdog thread (softlockup_watchdog),
 *   3. pushes the timer forward by one sample period,
 *   4. checks the timestamp that was sampled on entry and, if
 *      is_softlockup() reports a non-zero duration, prints a one-shot
 *      soft-lockup report (and panics when softlockup_panic is set).
 *
 * @hrtimer: the timer that expired.
 *
 * Always returns HRTIMER_RESTART.
 */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
	/*
	 * Snapshot of watchdog_touch_ts taken before anything else; the
	 * watchdog thread refreshes that value whenever it gets to run.
	 */
	unsigned long last_touch = __get_cpu_var(watchdog_touch_ts);
	struct pt_regs *irq_regs = get_irq_regs();
	int stuck_for;

	/* kick the hardlockup detector */
	watchdog_interrupt_count();

	/* kick the softlockup detector: wake the watchdog thread */
	wake_up_process(__get_cpu_var(softlockup_watchdog));

	/* .. and repeat: re-arm the timer one sample period from now */
	hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));

	/* A zero timestamp is not checked; it is simply re-stamped. */
	if (last_touch == 0) {
		if (unlikely(__get_cpu_var(softlockup_touch_sync))) {
			/*
			 * If the time stamp was touched atomically
			 * make sure the scheduler tick is up to date.
			 */
			__get_cpu_var(softlockup_touch_sync) = false;
			sched_clock_tick();
		}
		__touch_watchdog();
		return HRTIMER_RESTART;
	}

	/*
	 * check for a softlockup
	 * This is done by making sure a high priority task is
	 * being scheduled.  The task touches the watchdog to
	 * indicate it is getting cpu time.  If it hasn't then
	 * this is a good indication some task is hogging the cpu
	 */
	stuck_for = is_softlockup(last_touch);

	/* Timestamp is fresh enough: clear the "already warned" latch. */
	if (!unlikely(stuck_for)) {
		__get_cpu_var(soft_watchdog_warn) = false;
		return HRTIMER_RESTART;
	}

	/* only warn once */
	if (__get_cpu_var(soft_watchdog_warn) == true)
		return HRTIMER_RESTART;

	/* Soft lockup detected: log who was running and where. */
	printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
		smp_processor_id(), stuck_for,
		current->comm, task_pid_nr(current));
	print_modules();
	print_irqtrace_events(current);
	if (!irq_regs)
		dump_stack();
	else
		show_regs(irq_regs);

	/* Escalate to panic when softlockup_panic is set. */
	if (softlockup_panic)
		panic("softlockup: hung tasks");

	__get_cpu_var(soft_watchdog_warn) = true;
	return HRTIMER_RESTART;
}
启动看门狗,即创建watchdog内核线程。
点击(此处)折叠或打开
/*
 * Enable the lockup detectors on @cpu.
 *
 * First calls watchdog_nmi_enable(), then, regardless of its result,
 * makes sure the "watchdog/<cpu>" kernel thread exists: if the per-CPU
 * softlockup_watchdog slot is empty, a thread running watchdog() is
 * created, bound to @cpu, recorded in the slot and woken up.
 *
 * @cpu: CPU to enable the watchdog on.
 *
 * Returns the watchdog_nmi_enable() result; if that was 0 and thread
 * creation failed, returns PTR_ERR() of the failed kthread_create().
 */
static int watchdog_enable(int cpu)
{
	struct task_struct *thread = per_cpu(softlockup_watchdog, cpu);
	int ret = 0;

	/* enable the perf event */
	ret = watchdog_nmi_enable(cpu);

	/* Regardless of ret above, carry on and start softlockup */

	/* Thread already exists for this CPU: nothing more to do. */
	if (thread)
		return ret;

	/* create the watchdog thread */
	thread = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
	if (IS_ERR(thread)) {
		printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
		/* keep an earlier error if hardlockup already set one */
		if (ret)
			return ret;
		return PTR_ERR(thread);
	}

	kthread_bind(thread, cpu);
	/* Start with a zero timestamp before publishing the thread. */
	per_cpu(watchdog_touch_ts, cpu) = 0;
	per_cpu(softlockup_watchdog, cpu) = thread;
	wake_up_process(thread);

	return ret;
}
watchdog内核线程执行主函数,主要是要更新计数(时间戳)
点击(此处)折叠或打开
/*
 * Main function of the per-CPU watchdog kernel thread.
 *
 * Switches itself to SCHED_FIFO at priority MAX_RT_PRIO-1, stamps the
 * watchdog timestamp, starts this CPU's watchdog hrtimer, and then loops:
 * refresh the timestamp, sleep in schedule(), repeat, until
 * kthread_should_stop() becomes true.
 *
 * @unused: thread argument; not read by this function.
 *
 * Returns 0 once the thread has been asked to stop.
 */
static int watchdog(void *unused)
{
	/* Requested real-time priority: MAX_RT_PRIO-1. */
	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);

	/* Become a SCHED_FIFO real-time thread. */
	sched_setscheduler(current, SCHED_FIFO, &param);

	/* initialize timestamp */
	__touch_watchdog();

	/* kick off the timer for the hardlockup detector */
	/* done here because hrtimer_start can only pin to smp_processor_id() */
	hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
		      HRTIMER_MODE_REL_PINNED);

	/* Mark ourselves sleeping before entering the loop. */
	set_current_state(TASK_INTERRUPTIBLE);
	/*
	 * Run briefly once per second to reset the softlockup timestamp.
	 * If this gets delayed for more than 60 seconds then the
	 * debug-printout triggers in watchdog_timer_fn().
	 */
	while (!kthread_should_stop()) {
		/* Refresh the timestamp, then give up the CPU. */
		__touch_watchdog();
		schedule();

		if (kthread_should_stop())
			break;

		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}
判断是否发生软锁:is_softlockup
点击(此处)折叠或打开
/*
 * Check whether the watchdog timestamp is stale enough to be treated as
 * a soft lockup.
 *
 * @touch_ts: last recorded watchdog timestamp.
 *
 * Returns 0 while the current timestamp is not after
 * touch_ts + softlockup_thresh; otherwise returns the elapsed amount
 * (now - touch_ts), which the caller reports as the stuck duration
 * (presumably in seconds -- confirm against get_timestamp()).
 */
static int is_softlockup(unsigned long touch_ts)
{
	unsigned long now = get_timestamp(smp_processor_id());
	unsigned long deadline = touch_ts + softlockup_thresh;

	/* Still within the allowed window: no lockup. */
	if (!time_after(now, deadline))
		return 0;

	/* Warn about unreasonable delays: report how long it has been. */
	return now - touch_ts;
}