kernel-xen/xen-x86-xtime-lock

From: jbeulich@novell.com
Subject: reduce contention on xtime_lock
Patch-mainline: n/a
References: bnc#569014, bnc#571041, bnc#571769, bnc#572146
Especially on large systems, the number of CPUs queueing up on
xtime_lock may become significant, and (as reported in the bugs above)
may even prevent proper operation of the system when Xen is using
deep C-states. There is, however, no need for all CPUs in the system
to update global time - it is sufficient to have a single CPU (at any
given point in time) be responsible for this.
Also, while touching that code, avoid calling printk() with xtime_lock
held.
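
To illustrate the approach (the sketch below is not part of the patch):
each CPU atomically bumps a shared contention counter and takes
xtime_lock - and with it the duty of updating global time - only if
fewer than duty_limit CPUs are already contending. A minimal user-space
sketch of that gating pattern, with GCC __atomic builtins standing in
for the patch's LOCK xaddl/decl inline assembly and a plain mutex
standing in for the seqlock (both substitutions are illustrative only):

	#include <pthread.h>
	#include <stdbool.h>

	static unsigned int contention_count;	/* CPUs currently in the gate */
	static unsigned int duty_limit = 1;	/* max concurrent lock takers */
	static pthread_mutex_t time_lock = PTHREAD_MUTEX_INITIALIZER;

	/*
	 * Returns true if the caller won the duty of doing global time
	 * work; in that case it returns with time_lock held.
	 */
	static bool try_take_duty(void)
	{
		/* LOCK xaddl equivalent: fetch the old count, increment. */
		unsigned int waiters = __atomic_fetch_add(&contention_count,
							  1, __ATOMIC_ACQ_REL);
		bool duty = false;

		if (waiters < duty_limit) {
			/* write_seqlock(&xtime_lock) stand-in */
			pthread_mutex_lock(&time_lock);
			duty = true;
		}
		/* LOCK decl equivalent: no longer contending. */
		__atomic_fetch_sub(&contention_count, 1, __ATOMIC_ACQ_REL);
		return duty;
	}

Because the counter is decremented whether or not the lock was taken,
at most a handful of CPUs ever serialize on the lock per interrupt
burst; the rest skip the global update and fall through to their
per-CPU time accounting.
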
--- sle11sp1-2010-02-17.orig/arch/x86/kernel/time-xen.c 2010-02-18 17:30:18.000000000 +0100
+++ sle11sp1-2010-02-17/arch/x86/kernel/time-xen.c 2010-02-18 17:33:07.000000000 +0100
@@ -58,6 +58,7 @@ static u32 shadow_tv_version;
/* Keep track of last time we did processing/updating of jiffies and xtime. */
static u64 processed_system_time; /* System time (ns) at last processing. */
static DEFINE_PER_CPU(u64, processed_system_time);
+static DEFINE_PER_CPU(u64, accounted_system_time);
/* How much CPU time was spent blocked and how much was 'stolen'? */
static DEFINE_PER_CPU(u64, processed_stolen_time);
@@ -123,6 +124,19 @@ static int __init __permitted_clock_jitt
__setup("permitted_clock_jitter=", __permitted_clock_jitter);
/*
+ * Limit on the number of CPUs that may concurrently attempt to acquire
+ * xtime_lock in timer_interrupt() (excessive contention could otherwise
+ * lead to a live lock on systems with many CPUs).
+ */
+static unsigned int __read_mostly duty_limit = -2;
+static int __init set_duty_limit(char *str)
+{
+ duty_limit = simple_strtoul(str, NULL, 0) - 1;
+ return 1;
+}
+__setup("timer_duty_limit=", set_duty_limit);
+
+/*
* Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
* yielding a 64-bit result.
*/
@@ -422,9 +436,11 @@ EXPORT_SYMBOL(profile_pc);
*/
static irqreturn_t timer_interrupt(int irq, void *dev_id)
{
+ static unsigned int contention_count;
s64 delta, delta_cpu, stolen, blocked;
unsigned int i, cpu = smp_processor_id();
struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+ bool duty = false;
struct vcpu_runstate_info runstate;
/* Keep nmi watchdog up to date */
@@ -437,7 +453,13 @@ static irqreturn_t timer_interrupt(int i
* the irq version of write_lock because as just said we have irq
* locally disabled. -arca
*/
- write_seqlock(&xtime_lock);
+ asm (LOCK_PREFIX "xaddl %1, %0"
+ : "+m" (contention_count), "=r" (i) : "1" (1));
+ if (i <= duty_limit) {
+ write_seqlock(&xtime_lock);
+ duty = true;
+ }
+ asm (LOCK_PREFIX "decl %0" : "+m" (contention_count));
do {
get_time_values_from_xen(cpu);
@@ -451,40 +473,63 @@ static irqreturn_t timer_interrupt(int i
get_runstate_snapshot(&runstate);
} while (!time_values_up_to_date());
- if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
- unlikely(delta_cpu < -(s64)permitted_clock_jitter))
- && printk_ratelimit()) {
- printk("Timer ISR/%u: Time went backwards: "
- "delta=%lld delta_cpu=%lld shadow=%lld "
- "off=%lld processed=%lld cpu_processed=%lld\n",
- cpu, delta, delta_cpu, shadow->system_timestamp,
- (s64)get_nsec_offset(shadow),
- processed_system_time,
- per_cpu(processed_system_time, cpu));
- for (i = 0; i < num_online_cpus(); i++)
- printk(" %d: %lld\n", i,
- per_cpu(processed_system_time, i));
- }
+ if (duty && unlikely(delta < -(s64)permitted_clock_jitter)) {
+ blocked = processed_system_time;
+ write_sequnlock(&xtime_lock);
+ if (printk_ratelimit()) {
+ printk("Timer ISR/%u: Time went backwards: "
+ "delta=%Ld/%Ld shadow=%Lx off=%Lx "
+ "processed=%Lx/%Lx\n",
+ cpu, delta, delta_cpu, shadow->system_timestamp,
+ get_nsec_offset(shadow), blocked,
+ per_cpu(processed_system_time, cpu));
+ for_each_cpu_and(i, cpu_online_mask, cpumask_of(cpu))
+ printk(" %u: %Lx\n", i,
+ per_cpu(processed_system_time, i));
+ }
+ } else if (unlikely(delta_cpu < -(s64)permitted_clock_jitter)) {
+ blocked = processed_system_time;
+ if (duty)
+ write_sequnlock(&xtime_lock);
+ if (printk_ratelimit()) {
+ printk("Timer ISR/%u: Time went backwards: delta=%Ld"
+ " shadow=%Lx off=%Lx processed=%Lx/%Lx\n",
+ cpu, delta_cpu, shadow->system_timestamp,
+ get_nsec_offset(shadow), blocked,
+ per_cpu(processed_system_time, cpu));
+ for_each_cpu_and(i, cpu_online_mask, cpumask_of(cpu))
+ printk(" %u: %Lx\n", i,
+ per_cpu(processed_system_time, i));
+ }
+ } else if (duty) {
+ /* System-wide jiffy work. */
+ if (delta >= NS_PER_TICK) {
+ do_div(delta, NS_PER_TICK);
+ processed_system_time += delta * NS_PER_TICK;
+ while (delta > HZ) {
+ clobber_induction_variable(delta);
+ do_timer(HZ);
+ delta -= HZ;
+ }
+ do_timer(delta);
+ }
- /* System-wide jiffy work. */
- if (delta >= NS_PER_TICK) {
- do_div(delta, NS_PER_TICK);
- processed_system_time += delta * NS_PER_TICK;
- while (delta > HZ) {
- clobber_induction_variable(delta);
- do_timer(HZ);
- delta -= HZ;
+ if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
+ update_wallclock();
+ if (keventd_up())
+ schedule_work(&clock_was_set_work);
}
- do_timer(delta);
- }
- if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
- update_wallclock();
- if (keventd_up())
- schedule_work(&clock_was_set_work);
+ write_sequnlock(&xtime_lock);
}
- write_sequnlock(&xtime_lock);
+ delta = delta_cpu;
+ delta_cpu += per_cpu(processed_system_time, cpu)
+ - per_cpu(accounted_system_time, cpu);
+ if (delta >= NS_PER_TICK) {
+ do_div(delta, NS_PER_TICK);
+ per_cpu(processed_system_time, cpu) += delta * NS_PER_TICK;
+ }
/*
* Account stolen ticks.
@@ -499,7 +544,7 @@ static irqreturn_t timer_interrupt(int i
stolen += delta_cpu; /* clamp local-time progress */
do_div(stolen, NS_PER_TICK);
per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
- per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
+ per_cpu(accounted_system_time, cpu) += stolen * NS_PER_TICK;
account_steal_ticks(stolen);
}
@@ -515,14 +560,14 @@ static irqreturn_t timer_interrupt(int i
blocked += delta_cpu; /* clamp local-time progress */
do_div(blocked, NS_PER_TICK);
per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
- per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
+ per_cpu(accounted_system_time, cpu) += blocked * NS_PER_TICK;
account_idle_ticks(blocked);
}
/* Account user/system ticks. */
if (delta_cpu > 0) {
do_div(delta_cpu, NS_PER_TICK);
- per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
+ per_cpu(accounted_system_time, cpu) += delta_cpu * NS_PER_TICK;
if (user_mode_vm(get_irq_regs()))
account_user_time(current, (cputime_t)delta_cpu,
(cputime_t)delta_cpu);
@@ -623,6 +668,7 @@ static void xen_clocksource_resume(void)
BUG();
}
get_time_values_from_xen(cpu);
+ per_cpu(accounted_system_time, cpu) =
per_cpu(processed_system_time, cpu) =
per_cpu(shadow_time, 0).system_timestamp;
init_missing_ticks_accounting(cpu);
@@ -725,6 +771,7 @@ void __init time_init(void)
processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
per_cpu(processed_system_time, 0) = processed_system_time;
+ per_cpu(accounted_system_time, 0) = processed_system_time;
init_missing_ticks_accounting(0);
clocksource_register(&clocksource_xen);
@@ -735,6 +782,9 @@ void __init time_init(void)
/* Cannot request_irq() until kmem is initialised. */
late_time_init = setup_cpu0_timer_irq;
+
+ if (!(duty_limit + 2))
+ duty_limit = __fls(nr_cpu_ids);
}
/* Convert jiffies to system time. */
@@ -773,6 +823,7 @@ static void stop_hz_timer(void)
struct vcpu_set_singleshot_timer singleshot;
unsigned int cpu = smp_processor_id();
unsigned long j;
+ u64 local;
int rc;
cpumask_set_cpu(cpu, nohz_cpu_mask);
@@ -798,6 +849,11 @@ static void stop_hz_timer(void)
singleshot.timeout_abs_ns = jiffies_to_st(j);
if (!singleshot.timeout_abs_ns)
return;
+ local = per_cpu(processed_system_time, cpu);
+ if ((s64)(singleshot.timeout_abs_ns - local) <= NS_PER_TICK) {
+ cpumask_clear_cpu(cpu, nohz_cpu_mask);
+ singleshot.timeout_abs_ns = local + NS_PER_TICK;
+ }
singleshot.timeout_abs_ns += NS_PER_TICK / 2;
singleshot.flags = 0;
rc = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &singleshot);
@@ -862,6 +918,7 @@ int __cpuinit local_setup_timer(unsigned
do {
seq = read_seqbegin(&xtime_lock);
/* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
+ per_cpu(accounted_system_time, cpu) =
per_cpu(processed_system_time, cpu) =
per_cpu(shadow_time, 0).system_timestamp;
init_missing_ticks_accounting(cpu);