x86/delay: add support for Zhaoxin ZXPAUSE instruction
zhaoxin inclusion
category: feature

-------------------

ZXPAUSE instructs the processor to enter an implementation-dependent
optimized state. Execution resumes when the time-stamp counter reaches
or exceeds the implicit EDX:EAX 64-bit input value. Execution also
resumes when the operating-system time limit expires or an external
interrupt arrives.

ZXPAUSE is available on processors with X86_FEATURE_ZXPAUSE.
ZXPAUSE allows the processor to enter a light-weight power/performance
optimized state (C0.1 state) for a period specified by the instruction
or until the system time limit expires.

The MSR_ZX_PAUSE_CONTROL MSR allows the OS to enable/disable C0.2 on the
processor and to set the maximum time the processor can reside in C0.1
or C0.2. By default C0.2 is disabled.

A sysfs interface to adjust the time and the C0.2 enablement is provided in
a follow-up change.

Signed-off-by: leoliu-oc <[email protected]>
leoliu-oc committed Jun 14, 2024
1 parent 5086b5c commit 730ba28
Showing 17 changed files with 341 additions and 8 deletions.
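
Before the per-file diff, a minimal usage sketch (not part of this commit) of the semantics described in the commit message: the 64-bit wake-up deadline is an absolute TSC value passed implicitly in EDX:EAX, ECX selects the requested state, and a caller that needs a guaranteed wait re-arms after early wake-ups. zxpause_until() is a hypothetical name; __zxpause() and ZXPAUSE_C01_STATE are the helpers this commit adds to <asm/mwait.h>.

#include <linux/kernel.h>	/* lower_32_bits(), upper_32_bits() */
#include <asm/msr.h>		/* rdtsc() */
#include <asm/mwait.h>		/* __zxpause(), ZXPAUSE_C01_STATE */

/* Hypothetical helper: wait until the TSC reaches 'tsc_deadline'. */
static inline void zxpause_until(u64 tsc_deadline)
{
	/*
	 * The wait can end early (external interrupt, or the OS limit in
	 * MSR_ZX_PAUSE_CONTROL), so re-arm until the deadline has passed.
	 */
	while (rdtsc() < tsc_deadline)
		__zxpause(ZXPAUSE_C01_STATE, upper_32_bits(tsc_deadline),
			  lower_32_bits(tsc_deadline));
}
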
7 changes: 5 additions & 2 deletions arch/x86/include/asm/cpufeature.h
@@ -34,6 +34,7 @@ enum cpuid_leafs
CPUID_8000_001F_EAX,
CPUID_8000_0021_EAX,
CPUID_LNX_5,
CPUID_C000_0006_EAX,
NR_CPUID_WORDS,
};

@@ -94,8 +95,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 22, feature_bit) || \
REQUIRED_MASK_CHECK || \
BUILD_BUG_ON_ZERO(NCAPINTS != 22))
BUILD_BUG_ON_ZERO(NCAPINTS != 23))

#define DISABLED_MASK_BIT_SET(feature_bit) \
( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
@@ -120,8 +122,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 22, feature_bit) || \
DISABLED_MASK_CHECK || \
BUILD_BUG_ON_ZERO(NCAPINTS != 22))
BUILD_BUG_ON_ZERO(NCAPINTS != 23))

#define cpu_has(c, bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
5 changes: 4 additions & 1 deletion arch/x86/include/asm/cpufeatures.h
@@ -13,7 +13,7 @@
/*
* Defines x86 CPU feature bits
*/
#define NCAPINTS 22 /* N 32-bit words worth of info */
#define NCAPINTS 23 /* N 32-bit words worth of info */
#define NBUGINTS 2 /* N 32-bit bug flags */

/*
@@ -468,6 +468,9 @@
#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */
#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */

/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000006, word 22 */
#define X86_FEATURE_ZXPAUSE (22*32+ 0) /* ZHAOXIN ZXPAUSE */

/*
* BUG word(s)
*/
1 change: 1 addition & 0 deletions arch/x86/include/asm/delay.h
@@ -7,6 +7,7 @@

void __init use_tsc_delay(void);
void __init use_tpause_delay(void);
void __init use_zxpause_delay(void);
void use_mwaitx_delay(void);

#endif /* _ASM_X86_DELAY_H */
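
The hunk above only declares use_zxpause_delay(); the arch/x86/lib/delay.c side is among the 17 changed files but is not shown in this excerpt. Below is a plausible sketch, modeled on the existing TPAUSE delay path (delay_fn, delay_halt_fn and delay_halt() already exist in delay.c); the ZXPAUSE-specific function is an assumption, not the actual patch. delay_halt() already loops on early wake-ups, so a one-shot request per call is enough here.

/* Sketch of a possible delay_halt_fn implementation for ZXPAUSE. */
static void delay_halt_zxpause(u64 start, u64 cycles)
{
	u64 until = start + cycles;

	/* Request only the light-weight C0.1 state. */
	__zxpause(ZXPAUSE_C01_STATE, upper_32_bits(until),
		  lower_32_bits(until));
}

void __init use_zxpause_delay(void)
{
	delay_halt_fn = delay_halt_zxpause;
	delay_fn = delay_halt;
}
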
3 changes: 2 additions & 1 deletion arch/x86/include/asm/disabled-features.h
@@ -144,6 +144,7 @@
#define DISABLED_MASK19 0
#define DISABLED_MASK20 0
#define DISABLED_MASK21 0
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22)
#define DISABLED_MASK22 0
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 23)

#endif /* _ASM_X86_DISABLED_FEATURES_H */
11 changes: 11 additions & 0 deletions arch/x86/include/asm/msr-index.h
@@ -75,12 +75,23 @@
#define MSR_IA32_UMWAIT_CONTROL 0xe1
#define MSR_IA32_UMWAIT_CONTROL_C02_DISABLE BIT(0)
#define MSR_IA32_UMWAIT_CONTROL_RESERVED BIT(1)

#define MSR_ZX_PAUSE_CONTROL 0x187f
#define MSR_ZX_PAUSE_CONTROL_C02_DISABLE BIT(0)
#define MSR_ZX_PAUSE_CONTROL_RESERVED BIT(1)

/*
* The time field is bit[31:2], but representing a 32bit value with
* bit[1:0] zero.
*/
#define MSR_IA32_UMWAIT_CONTROL_TIME_MASK (~0x03U)

/*
* The time field is bit[31:2], but representing a 32bit value with
* bit[1:0] zero.
*/
#define MSR_ZX_PAUSE_CONTROL_TIME_MASK (~0x03U)

/* Abbreviated from Intel SDM name IA32_CORE_CAPABILITIES */
#define MSR_IA32_CORE_CAPS 0x000000cf
#define MSR_IA32_CORE_CAPS_INTEGRITY_CAPS_BIT 2
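
A minimal sketch (not part of the diff) showing how the masks added above decode the control register; zx_pause_report() is a hypothetical name used purely for illustration.

#include <linux/printk.h>
#include <asm/msr.h>	/* rdmsrl(); MSR_ZX_PAUSE_CONTROL comes from the hunk above */

/* Hypothetical: report the current OS time limit and C0.2 policy. */
static void zx_pause_report(void)
{
	u64 ctrl;

	rdmsrl(MSR_ZX_PAUSE_CONTROL, ctrl);
	pr_info("zxpause: max residency %llu TSC-quanta, C0.2 %s\n",
		ctrl & MSR_ZX_PAUSE_CONTROL_TIME_MASK,
		(ctrl & MSR_ZX_PAUSE_CONTROL_C02_DISABLE) ? "disabled" : "enabled");
}
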
21 changes: 21 additions & 0 deletions arch/x86/include/asm/mwait.h
@@ -26,6 +26,8 @@
#define TPAUSE_C01_STATE 1
#define TPAUSE_C02_STATE 0

#define ZXPAUSE_C01_STATE 1

static __always_inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
{
@@ -148,4 +150,23 @@ static inline void __tpause(u32 ecx, u32 edx, u32 eax)
#endif
}

/*
* Caller can specify whether to enter C0.1 (low latency, less
* power saving) or C0.2 state (saves more power, but longer wakeup
* latency). This may be overridden by the ZX_PAUSE_CONTROL MSR
* which can force requests for C0.2 to be downgraded to C0.1.
*/
static inline void __zxpause(u32 ecx, u32 edx, u32 eax)
{
/* "zxpause %ecx, %edx, %eax;" */
#ifdef CONFIG_AS_ZXPAUSE
asm volatile("zxpause %%ecx\n"
:
: "c"(ecx), "d"(edx), "a"(eax));
#else
asm volatile(".byte 0xf2, 0x0f, 0xa6, 0xd0\t\n"
:
: "c"(ecx), "d"(edx), "a"(eax));
#endif
}
#endif /* _ASM_X86_MWAIT_H */
3 changes: 2 additions & 1 deletion arch/x86/include/asm/required-features.h
@@ -100,6 +100,7 @@
#define REQUIRED_MASK19 0
#define REQUIRED_MASK20 0
#define REQUIRED_MASK21 0
#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22)
#define REQUIRED_MASK22 0
#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 23)

#endif /* _ASM_X86_REQUIRED_FEATURES_H */
1 change: 1 addition & 0 deletions arch/x86/kernel/cpu/Makefile
@@ -25,6 +25,7 @@ obj-y += bugs.o
obj-y += aperfmperf.o
obj-y += cpuid-deps.o
obj-y += umwait.o
obj-y += zxpause.o

obj-$(CONFIG_PROC_FS) += proc.o
obj-y += capflags.o powerflags.o
3 changes: 3 additions & 0 deletions arch/x86/kernel/cpu/centaur.c
@@ -109,6 +109,9 @@ static void early_init_centaur(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
}

if (cpuid_eax(0xC0000000) >= 0xC0000006)
c->x86_capability[CPUID_C000_0006_EAX] = cpuid_eax(0xC0000006);
}

static void init_centaur(struct cpuinfo_x86 *c)
2 changes: 2 additions & 0 deletions arch/x86/kernel/cpu/zhaoxin.c
@@ -79,6 +79,8 @@ static void early_init_zhaoxin(struct cpuinfo_x86 *c)
c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
}

if (cpuid_eax(0xC0000000) >= 0xC0000006)
c->x86_capability[CPUID_C000_0006_EAX] = cpuid_eax(0xC0000006);
}

static void init_zhaoxin(struct cpuinfo_x86 *c)
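
The centaur.c and zhaoxin.c hunks above fill the new capability word from CPUID.0xC0000006:EAX, so X86_FEATURE_ZXPAUSE (word 22, bit 0) becomes testable like any other feature flag. A sketch of a hypothetical consumer follows; the actual selection site (presumably during TSC/delay init) is not part of this excerpt.

#include <asm/cpufeature.h>	/* boot_cpu_has(), X86_FEATURE_ZXPAUSE */
#include <asm/delay.h>		/* use_zxpause_delay() */

/* Hypothetical caller: switch the delay implementation once the bit is set. */
static void __init zxpause_select_delay(void)
{
	if (boot_cpu_has(X86_FEATURE_ZXPAUSE))
		use_zxpause_delay();
}
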
238 changes: 238 additions & 0 deletions arch/x86/kernel/cpu/zxpause.c
@@ -0,0 +1,238 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/syscore_ops.h>
#include <linux/suspend.h>
#include <linux/cpu.h>

#include <asm/msr.h>
#include <asm/mwait.h>

#define ZXPAUSE_C02_ENABLE 0

#define ZXPAUSE_CTRL_VAL(max_time, c02_disable) \
(((max_time) & MSR_ZX_PAUSE_CONTROL_TIME_MASK) | \
((c02_disable) & MSR_ZX_PAUSE_CONTROL_C02_DISABLE))

/*
* Cache ZX_PAUSE_CONTROL MSR. This is a systemwide control. By default,
* zxpause max time is 100000 in TSC-quanta and C0.2 is enabled
*/
static u32 zxpause_control_cached = ZXPAUSE_CTRL_VAL(100000, ZXPAUSE_C02_ENABLE);

/*
* Cache the original ZX_PAUSE_CONTROL MSR value which is configured by
* hardware or BIOS before kernel boot.
*/
static u32 orig_zxpause_control_cached __ro_after_init;

/*
* Serialize access to zxpause_control_cached and ZX_PAUSE_CONTROL MSR in
* the sysfs write functions.
*/
static DEFINE_MUTEX(zxpause_lock);

static void zxpause_update_control_msr(void * unused)
{
lockdep_assert_irqs_disabled();
wrmsr(MSR_ZX_PAUSE_CONTROL, READ_ONCE(zxpause_control_cached), 0);
}

/*
* The CPU hotplug callback sets the control MSR to the global control
* value.
*
* Disable interrupts so the read of zxpause_control_cached and the WRMSR
* are protected against a concurrent sysfs write. Otherwise the sysfs
* write could update the cached value after it had been read on this CPU
* and issue the IPI before the old value had been written. The IPI would
* interrupt, write the new value and after return from IPI the previous
* value would be written by this CPU.
*
* With interrupts disabled the upcoming CPU either sees the new control
* value or the IPI is updating this CPU to the new control value after
* interrupts have been reenabled.
*/
static int zxpause_cpu_online(unsigned int cpu)
{
local_irq_disable();
zxpause_update_control_msr(NULL);
local_irq_enable();
return 0;
}

/*
* The CPU hotplug callback sets the control MSR to the original control
* value.
*/
static int zxpause_cpu_offline(unsigned int cpu)
{
/*
* This code is protected by the CPU hotplug already and
* orig_zxpause_control_cached is never changed after it caches
* the original control MSR value in zxpause_init(). So there
* is no race condition here.
*/
wrmsr(MSR_ZX_PAUSE_CONTROL, orig_zxpause_control_cached, 0);

return 0;
}

/*
* On resume, restore ZX_PAUSE_CONTROL MSR on the boot processor which
* is the only active CPU at this time. The MSR is set up on the APs via the
* CPU hotplug callback.
*
* This function is invoked on resume from suspend and hibernation. On
* resume from suspend the restore should be not required, but we neither
* trust the firmware nor does it matter if the same value is written
* again.
*/
static void zxpause_syscore_resume(void)
{
zxpause_update_control_msr(NULL);
}

static struct syscore_ops zxpause_syscore_ops = {
.resume = zxpause_syscore_resume,
};

/* sysfs interface */

/*
* When bit 0 in ZX_PAUSE_CONTROL MSR is 1, C0.2 is disabled.
* Otherwise, C0.2 is enabled.
*/
static inline bool zxpause_ctrl_c02_enabled(u32 ctrl)
{
return !(ctrl & MSR_ZX_PAUSE_CONTROL_C02_DISABLE);
}

static inline u32 zxpause_ctrl_max_time(u32 ctrl)
{
return ctrl & MSR_ZX_PAUSE_CONTROL_TIME_MASK;
}

static inline void zxpause_update_control(u32 maxtime, bool c02_enable)
{
u32 ctrl = maxtime & MSR_ZX_PAUSE_CONTROL_TIME_MASK;

if (!c02_enable)
ctrl |= MSR_ZX_PAUSE_CONTROL_C02_DISABLE;

WRITE_ONCE(zxpause_control_cached, ctrl);
/* Propagate to all CPUs */
on_each_cpu(zxpause_update_control_msr, NULL, 1);
}

static ssize_t
enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf)
{
u32 ctrl = READ_ONCE(zxpause_control_cached);

return sprintf(buf, "%d\n", zxpause_ctrl_c02_enabled(ctrl));
}

static ssize_t enable_c02_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
bool c02_enable;
u32 ctrl;
int ret;

ret = kstrtobool(buf, &c02_enable);
if (ret)
return ret;

mutex_lock(&zxpause_lock);

ctrl = READ_ONCE(zxpause_control_cached);
if (c02_enable != zxpause_ctrl_c02_enabled(ctrl))
zxpause_update_control(ctrl, c02_enable);

mutex_unlock(&zxpause_lock);

return count;
}
static DEVICE_ATTR_RW(enable_c02);

static ssize_t
max_time_show(struct device *kobj, struct device_attribute *attr, char *buf)
{
u32 ctrl = READ_ONCE(zxpause_control_cached);

return sprintf(buf, "%u\n", zxpause_ctrl_max_time(ctrl));
}

static ssize_t max_time_store(struct device *kobj,
struct device_attribute *attr,
const char *buf, size_t count)
{
u32 max_time, ctrl;
int ret;

ret = kstrtou32(buf, 0, &max_time);
if (ret)
return ret;

/* bits[1:0] must be zero */
if (max_time & ~MSR_ZX_PAUSE_CONTROL_TIME_MASK)
return -EINVAL;

mutex_lock(&zxpause_lock);

ctrl = READ_ONCE(zxpause_control_cached);
if (max_time != zxpause_ctrl_max_time(ctrl))
zxpause_update_control(max_time, zxpause_ctrl_c02_enabled(ctrl));

mutex_unlock(&zxpause_lock);

return count;
}
static DEVICE_ATTR_RW(max_time);

static struct attribute *zxpause_attrs[] = {
&dev_attr_enable_c02.attr,
&dev_attr_max_time.attr,
NULL
};

static struct attribute_group zxpause_attr_group = {
.attrs = zxpause_attrs,
.name = "zxpause_control",
};

static int __init zxpause_init(void)
{
struct device *dev;
int ret;

if (!boot_cpu_has(X86_FEATURE_ZXPAUSE))
return -ENODEV;

/*
* Cache the original control MSR value before the control MSR is
* changed. This is the only place where orig_zxpause_control_cached
* is modified.
*/
rdmsrl(MSR_ZX_PAUSE_CONTROL, orig_zxpause_control_cached);

ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "zxpause:online",
zxpause_cpu_online, zxpause_cpu_offline);
if (ret < 0) {
/*
* On failure, the control MSR on all CPUs has the
* original control value.
*/
return ret;
}

register_syscore_ops(&zxpause_syscore_ops);

/*
* Add zxpause control interface. Ignore failure, so at least the
* default values are set up in case the machine manages to boot.
*/
dev = bus_get_dev_root(&cpu_subsys);
return sysfs_create_group(&dev->kobj, &zxpause_attr_group);
}
device_initcall(zxpause_init);
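
Two illustrative follow-ups on the file above, neither part of the commit. First, the default cached control value works out to 0x186a0: 100000 already has bits [1:0] clear and ZXPAUSE_C02_ENABLE is 0, so the C0.2-disable bit stays clear in the cached value. Second, because the attribute group is named "zxpause_control" and is created under the cpu subsystem root device, the knobs should surface as /sys/devices/system/cpu/zxpause_control/enable_c02 and /sys/devices/system/cpu/zxpause_control/max_time (path inferred from the code, not stated in the commit).

#include <linux/build_bug.h>	/* static_assert() */

/* Illustrative compile-time check of the default value computed above. */
static_assert(ZXPAUSE_CTRL_VAL(100000, ZXPAUSE_C02_ENABLE) == 0x186a0);
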