From: jbeulich@novell.com Subject: reduce contention on xtime_lock Patch-mainline: n/a References: bnc#569014, bnc#571041, bnc#571769, bnc#572146 Especially on large systems the number of CPUs queueing up on xtime_lock may become significant, and (as reported in the bugs above) may even prevent proper operation of the system when Xen is using deep C-states. There is, however, no need for all CPUs in the system to update global time - it is sufficient to have a single (at any given point in time) CPU being responsible for this. Also, while touching that code, avoid calling printk() with xtime_lock held. --- sle11sp1-2010-02-17.orig/arch/x86/kernel/time-xen.c 2010-02-18 17:30:18.000000000 +0100 +++ sle11sp1-2010-02-17/arch/x86/kernel/time-xen.c 2010-02-18 17:33:07.000000000 +0100 @@ -58,6 +58,7 @@ static u32 shadow_tv_version; /* Keep track of last time we did processing/updating of jiffies and xtime. */ static u64 processed_system_time; /* System time (ns) at last processing. */ static DEFINE_PER_CPU(u64, processed_system_time); +static DEFINE_PER_CPU(u64, accounted_system_time); /* How much CPU time was spent blocked and how much was 'stolen'? */ static DEFINE_PER_CPU(u64, processed_stolen_time); @@ -123,6 +124,19 @@ static int __init __permitted_clock_jitt __setup("permitted_clock_jitter=", __permitted_clock_jitter); /* + * Limit on the number of CPUs that may concurrently attempt to acquire + * xtime_lock in timer_interrupt() (reducing contention potentially leading + * to a live lock on systems with many CPUs). + */ +static unsigned int __read_mostly duty_limit = -2; +static int __init set_duty_limit(char *str) +{ + duty_limit = simple_strtoul(str, NULL, 0) - 1; + return 1; +} +__setup("timer_duty_limit=", set_duty_limit); + +/* * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, * yielding a 64-bit result. 
*/ @@ -422,9 +436,11 @@ EXPORT_SYMBOL(profile_pc); */ static irqreturn_t timer_interrupt(int irq, void *dev_id) { + static unsigned int contention_count; s64 delta, delta_cpu, stolen, blocked; unsigned int i, cpu = smp_processor_id(); struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); + bool duty = false; struct vcpu_runstate_info runstate; /* Keep nmi watchdog up to date */ @@ -437,7 +453,13 @@ static irqreturn_t timer_interrupt(int i * the irq version of write_lock because as just said we have irq * locally disabled. -arca */ - write_seqlock(&xtime_lock); + asm (LOCK_PREFIX "xaddl %1, %0" + : "+m" (contention_count), "=r" (i) : "1" (1)); + if (i <= duty_limit) { + write_seqlock(&xtime_lock); + duty = true; + } + asm (LOCK_PREFIX "decl %0" : "+m" (contention_count)); do { get_time_values_from_xen(cpu); @@ -451,40 +473,63 @@ static irqreturn_t timer_interrupt(int i get_runstate_snapshot(&runstate); } while (!time_values_up_to_date()); - if ((unlikely(delta < -(s64)permitted_clock_jitter) || - unlikely(delta_cpu < -(s64)permitted_clock_jitter)) - && printk_ratelimit()) { - printk("Timer ISR/%u: Time went backwards: " - "delta=%lld delta_cpu=%lld shadow=%lld " - "off=%lld processed=%lld cpu_processed=%lld\n", - cpu, delta, delta_cpu, shadow->system_timestamp, - (s64)get_nsec_offset(shadow), - processed_system_time, - per_cpu(processed_system_time, cpu)); - for (i = 0; i < num_online_cpus(); i++) - printk(" %d: %lld\n", i, - per_cpu(processed_system_time, i)); - } + if (duty && unlikely(delta < -(s64)permitted_clock_jitter)) { + blocked = processed_system_time; + write_sequnlock(&xtime_lock); + if (printk_ratelimit()) { + printk("Timer ISR/%u: Time went backwards: " + "delta=%Ld/%Ld shadow=%Lx off=%Lx " + "processed=%Lx/%Lx\n", + cpu, delta, delta_cpu, shadow->system_timestamp, + get_nsec_offset(shadow), blocked, + per_cpu(processed_system_time, cpu)); + for_each_cpu_and(i, cpu_online_mask, cpumask_of(cpu)) + printk(" %u: %Lx\n", i, + 
per_cpu(processed_system_time, i)); + } + } else if (unlikely(delta_cpu < -(s64)permitted_clock_jitter)) { + blocked = processed_system_time; + if (duty) + write_sequnlock(&xtime_lock); + if (printk_ratelimit()) { + printk("Timer ISR/%u: Time went backwards: delta=%Ld" + " shadow=%Lx off=%Lx processed=%Lx/%Lx\n", + cpu, delta_cpu, shadow->system_timestamp, + get_nsec_offset(shadow), blocked, + per_cpu(processed_system_time, cpu)); + for_each_cpu_and(i, cpu_online_mask, cpumask_of(cpu)) + printk(" %u: %Lx\n", i, + per_cpu(processed_system_time, i)); + } + } else if (duty) { + /* System-wide jiffy work. */ + if (delta >= NS_PER_TICK) { + do_div(delta, NS_PER_TICK); + processed_system_time += delta * NS_PER_TICK; + while (delta > HZ) { + clobber_induction_variable(delta); + do_timer(HZ); + delta -= HZ; + } + do_timer(delta); + } - /* System-wide jiffy work. */ - if (delta >= NS_PER_TICK) { - do_div(delta, NS_PER_TICK); - processed_system_time += delta * NS_PER_TICK; - while (delta > HZ) { - clobber_induction_variable(delta); - do_timer(HZ); - delta -= HZ; + if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) { + update_wallclock(); + if (keventd_up()) + schedule_work(&clock_was_set_work); } - do_timer(delta); - } - if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) { - update_wallclock(); - if (keventd_up()) - schedule_work(&clock_was_set_work); + write_sequnlock(&xtime_lock); } - write_sequnlock(&xtime_lock); + delta = delta_cpu; + delta_cpu += per_cpu(processed_system_time, cpu) + - per_cpu(accounted_system_time, cpu); + if (delta >= NS_PER_TICK) { + do_div(delta, NS_PER_TICK); + per_cpu(processed_system_time, cpu) += delta * NS_PER_TICK; + } /* * Account stolen ticks. 
@@ -499,7 +544,7 @@ static irqreturn_t timer_interrupt(int i stolen += delta_cpu; /* clamp local-time progress */ do_div(stolen, NS_PER_TICK); per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK; - per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK; + per_cpu(accounted_system_time, cpu) += stolen * NS_PER_TICK; account_steal_ticks(stolen); } @@ -515,14 +560,14 @@ static irqreturn_t timer_interrupt(int i blocked += delta_cpu; /* clamp local-time progress */ do_div(blocked, NS_PER_TICK); per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK; - per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK; + per_cpu(accounted_system_time, cpu) += blocked * NS_PER_TICK; account_idle_ticks(blocked); } /* Account user/system ticks. */ if (delta_cpu > 0) { do_div(delta_cpu, NS_PER_TICK); - per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK; + per_cpu(accounted_system_time, cpu) += delta_cpu * NS_PER_TICK; if (user_mode_vm(get_irq_regs())) account_user_time(current, (cputime_t)delta_cpu, (cputime_t)delta_cpu); @@ -623,6 +668,7 @@ static void xen_clocksource_resume(void) BUG(); } get_time_values_from_xen(cpu); + per_cpu(accounted_system_time, cpu) = per_cpu(processed_system_time, cpu) = per_cpu(shadow_time, 0).system_timestamp; init_missing_ticks_accounting(cpu); @@ -725,6 +771,7 @@ void __init time_init(void) processed_system_time = per_cpu(shadow_time, 0).system_timestamp; per_cpu(processed_system_time, 0) = processed_system_time; + per_cpu(accounted_system_time, 0) = processed_system_time; init_missing_ticks_accounting(0); clocksource_register(&clocksource_xen); @@ -735,6 +782,9 @@ void __init time_init(void) /* Cannot request_irq() until kmem is initialised. */ late_time_init = setup_cpu0_timer_irq; + + if (!(duty_limit + 2)) + duty_limit = __fls(nr_cpu_ids); } /* Convert jiffies to system time. 
*/ @@ -773,6 +823,7 @@ static void stop_hz_timer(void) struct vcpu_set_singleshot_timer singleshot; unsigned int cpu = smp_processor_id(); unsigned long j; + u64 local; int rc; cpumask_set_cpu(cpu, nohz_cpu_mask); @@ -798,6 +849,11 @@ static void stop_hz_timer(void) singleshot.timeout_abs_ns = jiffies_to_st(j); if (!singleshot.timeout_abs_ns) return; + local = per_cpu(processed_system_time, cpu); + if ((s64)(singleshot.timeout_abs_ns - local) <= NS_PER_TICK) { + cpumask_clear_cpu(cpu, nohz_cpu_mask); + singleshot.timeout_abs_ns = local + NS_PER_TICK; + } singleshot.timeout_abs_ns += NS_PER_TICK / 2; singleshot.flags = 0; rc = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &singleshot); @@ -862,6 +918,7 @@ int __cpuinit local_setup_timer(unsigned do { seq = read_seqbegin(&xtime_lock); /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */ + per_cpu(accounted_system_time, cpu) = per_cpu(processed_system_time, cpu) = per_cpu(shadow_time, 0).system_timestamp; init_missing_ticks_accounting(cpu);