diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 3508f3fc928d4..368c5460ab1c1 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -34,6 +34,7 @@ enum cpuid_leafs
 	CPUID_8000_001F_EAX,
 	CPUID_8000_0021_EAX,
 	CPUID_LNX_5,
+	CPUID_C000_0006_EAX,
 	NR_CPUID_WORDS,
 };
 
@@ -94,8 +95,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
 	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) ||	\
 	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) ||	\
 	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) ||	\
+	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 22, feature_bit) ||	\
 	   REQUIRED_MASK_CHECK					  ||	\
-	   BUILD_BUG_ON_ZERO(NCAPINTS != 22))
+	   BUILD_BUG_ON_ZERO(NCAPINTS != 23))
 
 #define DISABLED_MASK_BIT_SET(feature_bit)				\
 	( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  0, feature_bit) ||	\
@@ -120,8 +122,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
 	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) ||	\
 	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) ||	\
 	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) ||	\
+	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 22, feature_bit) ||	\
 	   DISABLED_MASK_CHECK					  ||	\
-	   BUILD_BUG_ON_ZERO(NCAPINTS != 22))
+	   BUILD_BUG_ON_ZERO(NCAPINTS != 23))
 
 #define cpu_has(c, bit)							\
 	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :	\
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 64fa10b17aa05..0f56250ea51c9 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -13,7 +13,7 @@
 /*
  * Defines x86 CPU feature bits
  */
-#define NCAPINTS			22	   /* N 32-bit words worth of info */
+#define NCAPINTS			23	   /* N 32-bit words worth of info */
 #define NBUGINTS			2	   /* N 32-bit bug flags */
 
 /*
@@ -470,6 +470,9 @@
 #define X86_FEATURE_CLEAR_BHB_HW	(21*32+ 3) /* "" BHI_DIS_S HW control enabled */
 #define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */
 
+/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000006, word 22 */
+#define X86_FEATURE_ZXPAUSE		(22*32+ 0) /* ZHAOXIN ZXPAUSE */
+
 /*
  * BUG word(s)
  */
diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
index 630891d258198..4dbb3fea67fb5 100644
--- a/arch/x86/include/asm/delay.h
+++ b/arch/x86/include/asm/delay.h
@@ -7,6 +7,7 @@
 
 void __init use_tsc_delay(void);
 void __init use_tpause_delay(void);
+void __init use_zxpause_delay(void);
 void use_mwaitx_delay(void);
 
 #endif /* _ASM_X86_DELAY_H */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 88fcf08458d9c..b108e656fa5b8 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -144,6 +144,7 @@
 #define DISABLED_MASK19	0
 #define DISABLED_MASK20	0
 #define DISABLED_MASK21	0
-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22)
+#define DISABLED_MASK22	0
+#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 23)
 
 #endif /* _ASM_X86_DISABLED_FEATURES_H */
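(Illustration only; not part of the patch. With NCAPINTS grown to 23, X86_FEATURE_ZXPAUSE is bit 0 of capability word 22 (22*32 + 0 = 704), which early_init_centaur() later in this series fills from CPUID 0xC0000006 EAX. A minimal sketch of what the generated feature test amounts to; the helper name has_zxpause() is hypothetical:

	static inline bool has_zxpause(struct cpuinfo_x86 *c)
	{
		/* Word 22, bit 0 of the x86_capability bitmap. */
		return test_bit(X86_FEATURE_ZXPAUSE,
				(unsigned long *)c->x86_capability);
	}

In-tree code simply uses boot_cpu_has(X86_FEATURE_ZXPAUSE) or static_cpu_has(), as the later hunks do.)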
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 621bac6b74011..5a72d758b1133 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -75,12 +75,23 @@
 #define MSR_IA32_UMWAIT_CONTROL	0xe1
 #define MSR_IA32_UMWAIT_CONTROL_C02_DISABLE	BIT(0)
 #define MSR_IA32_UMWAIT_CONTROL_RESERVED	BIT(1)
+
+#define MSR_ZX_PAUSE_CONTROL	0x187f
+#define MSR_ZX_PAUSE_CONTROL_C02_DISABLE	BIT(0)
+#define MSR_ZX_PAUSE_CONTROL_RESERVED	BIT(1)
+
 /*
  * The time field is bit[31:2], but representing a 32bit value with
  * bit[1:0] zero.
  */
 #define MSR_IA32_UMWAIT_CONTROL_TIME_MASK	(~0x03U)
 
+/*
+ * The time field is bit[31:2], but representing a 32bit value with
+ * bit[1:0] zero.
+ */
+#define MSR_ZX_PAUSE_CONTROL_TIME_MASK	(~0x03U)
+
 /* Abbreviated from Intel SDM name IA32_CORE_CAPABILITIES */
 #define MSR_IA32_CORE_CAPS			  0x000000cf
 #define MSR_IA32_CORE_CAPS_INTEGRITY_CAPS_BIT	  2
@@ -768,6 +779,13 @@
 #define MSR_VIA_RNG			0x0000110b
 #define MSR_VIA_BCR2			0x00001147
 
+/*
+ * Zhaoxin extended VMCS capabilities:
+ * bit 0: exec-cntl3 VMCS field.
+ */
+#define MSR_ZX_EXT_VMCS_CAPS	0x1675
+#define MSR_ZX_VMCS_EXEC_CTL3	BIT(0)
+
 /* Transmeta defined MSRs */
 #define MSR_TMTA_LONGRUN_CTRL		0x80868010
 #define MSR_TMTA_LONGRUN_FLAGS		0x80868011
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index bae83810505bf..b1836840cf83a 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -26,6 +26,8 @@
 #define TPAUSE_C01_STATE		1
 #define TPAUSE_C02_STATE		0
 
+#define ZXPAUSE_C01_STATE		1
+
 static __always_inline void __monitor(const void *eax, unsigned long ecx,
 				      unsigned long edx)
 {
@@ -148,4 +150,23 @@ static inline void __tpause(u32 ecx, u32 edx, u32 eax)
 #endif
 }
 
+/*
+ * Caller can specify whether to enter C0.1 (low latency, less
+ * power saving) or C0.2 state (saves more power, but longer wakeup
+ * latency). This may be overridden by the ZX_PAUSE_CONTROL MSR,
+ * which can force requests for C0.2 to be downgraded to C0.1.
+ */
+static inline void __zxpause(u32 ecx, u32 edx, u32 eax)
+{
+	/* "zxpause %ecx, %edx, %eax;" */
+#ifdef CONFIG_AS_ZXPAUSE
+	asm volatile("zxpause %%ecx\n"
+		     :
+		     : "c"(ecx), "d"(edx), "a"(eax));
+#else
+	asm volatile(".byte 0xf2, 0x0f, 0xa6, 0xd0\t\n"
+		     :
+		     : "c"(ecx), "d"(edx), "a"(eax));
+#endif
+}
+
 #endif /* _ASM_X86_MWAIT_H */
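(Illustration only; not part of the patch. Like TPAUSE, ZXPAUSE takes the C-state hint in ECX and a wait bound in EDX:EAX; per the delay_halt_zxpause() comment later in this series, the bound is a TSC delta rather than an absolute deadline. A minimal usage sketch; the helper name zxpause_wait_cycles() is hypothetical:

	/* Wait in C0.1 for at most `cycles` TSC ticks. */
	static inline void zxpause_wait_cycles(u64 cycles)
	{
		__zxpause(ZXPAUSE_C01_STATE, upper_32_bits(cycles),
			  lower_32_bits(cycles));
	}
)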
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index e9187ddd3d1fd..76953f757f3c3 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -100,6 +100,7 @@
 #define REQUIRED_MASK19	0
 #define REQUIRED_MASK20	0
 #define REQUIRED_MASK21	0
-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22)
+#define REQUIRED_MASK22	0
+#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 23)
 
 #endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 0e73616b82f34..32dc7414b83be 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -84,6 +84,11 @@
  */
 #define TERTIARY_EXEC_IPI_VIRT			VMCS_CONTROL_BIT(IPI_VIRT)
 
+/*
+ * Definitions of Zhaoxin Tertiary Processor-Based VM-Execution Controls.
+ */
+#define ZX_TERTIARY_EXEC_GUEST_ZXPAUSE		VMCS_CONTROL_BIT(GUEST_ZXPAUSE)
+
 #define PIN_BASED_EXT_INTR_MASK			VMCS_CONTROL_BIT(INTR_EXITING)
 #define PIN_BASED_NMI_EXITING			VMCS_CONTROL_BIT(NMI_EXITING)
 #define PIN_BASED_VIRTUAL_NMIS			VMCS_CONTROL_BIT(VIRTUAL_NMIS)
@@ -235,6 +241,7 @@ enum vmcs_field {
 	TERTIARY_VM_EXEC_CONTROL_HIGH	= 0x00002035,
 	PID_POINTER_TABLE		= 0x00002042,
 	PID_POINTER_TABLE_HIGH		= 0x00002043,
+	ZXPAUSE_VMEXIT_TSC		= 0x00002200,
 	GUEST_PHYSICAL_ADDRESS		= 0x00002400,
 	GUEST_PHYSICAL_ADDRESS_HIGH	= 0x00002401,
 	VMCS_LINK_POINTER		= 0x00002800,
@@ -284,6 +291,7 @@ enum vmcs_field {
 	PLE_GAP				= 0x00004020,
 	PLE_WINDOW			= 0x00004022,
 	NOTIFY_WINDOW			= 0x00004024,
+	ZX_TERTIARY_VM_EXEC_CONTROL	= 0x00004200,
 	VM_INSTRUCTION_ERROR		= 0x00004400,
 	VM_EXIT_REASON			= 0x00004402,
 	VM_EXIT_INTR_INFO		= 0x00004404,
diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h
index c6a7eed039145..837f7431d4f25 100644
--- a/arch/x86/include/asm/vmxfeatures.h
+++ b/arch/x86/include/asm/vmxfeatures.h
@@ -5,7 +5,7 @@
 /*
  * Defines VMX CPU feature bits
  */
-#define NVMXINTS			5 /* N 32-bit words worth of info */
+#define NVMXINTS			6 /* N 32-bit words worth of info */
 
 /*
  * Note: If the comment begins with a quoted string, that string is used
@@ -89,4 +89,7 @@
 
 /* Tertiary Processor-Based VM-Execution Controls, word 3 */
 #define VMX_FEATURE_IPI_VIRT		( 3*32+  4) /* Enable IPI virtualization */
+
+/* Zhaoxin Tertiary Processor-Based VM-Execution Controls, word 5 */
+#define VMX_FEATURE_GUEST_ZXPAUSE	( 5*32+  0) /* zxpause instruction in guest mode */
 #endif /* _ASM_X86_VMXFEATURES_H */
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4350f6bfc0641..e10099f4a0afc 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -25,6 +25,7 @@ obj-y			+= bugs.o
 obj-y			+= aperfmperf.o
 obj-y			+= cpuid-deps.o
 obj-y			+= umwait.o
+obj-y			+= zxpause.o
 
 obj-$(CONFIG_PROC_FS)	+= proc.o
 
 obj-y += capflags.o powerflags.o
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 345f7d905db67..6c1e1606f40c6 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -109,6 +109,9 @@ static void early_init_centaur(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
 	}
+
+	if (cpuid_eax(0xC0000000) >= 0xC0000006)
+		c->x86_capability[CPUID_C000_0006_EAX] = cpuid_eax(0xC0000006);
 }
 
 static void init_centaur(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 03851240c3e36..c97f161705037 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -17,6 +17,7 @@ enum vmx_feature_leafs {
 	SECONDARY_CTLS,
 	TERTIARY_CTLS_LOW,
 	TERTIARY_CTLS_HIGH,
+	ZX_TERTIARY_CTLS,
 	NR_VMX_FEATURE_WORDS,
 };
 
@@ -97,6 +98,13 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_EPT_AD);
 	if (c->vmx_capability[MISC_FEATURES] & VMX_F(VPID))
 		set_cpu_cap(c, X86_FEATURE_VPID);
+
+	/*
+	 * Initialize Zhaoxin Tertiary Exec Control feature flags.
+	 */
+	rdmsr_safe(MSR_ZX_EXT_VMCS_CAPS, &supported, &ign);
+	if (supported & MSR_ZX_VMCS_EXEC_CTL3)
+		c->vmx_capability[ZX_TERTIARY_CTLS] |= VMX_F(GUEST_ZXPAUSE);
 }
 #endif /* CONFIG_X86_VMX_FEATURE_NAMES */
+ */ + rdmsr_safe(MSR_ZX_EXT_VMCS_CAPS, &supported, &ign); + if (supported & MSR_ZX_VMCS_EXEC_CTL3) + c->vmx_capability[ZX_TERTIARY_CTLS] |= VMX_F(GUEST_ZXPAUSE); + } #endif /* CONFIG_X86_VMX_FEATURE_NAMES */ diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index 415564a6523b6..17a5cdafc2658 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -65,6 +65,7 @@ static void early_init_zhaoxin(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); } + } static void init_zhaoxin(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/zxpause.c b/arch/x86/kernel/cpu/zxpause.c new file mode 100644 index 0000000000000..f3f236168871e --- /dev/null +++ b/arch/x86/kernel/cpu/zxpause.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include +#include + +#define ZXPAUSE_C02_ENABLE 0 + +#define ZXPAUSE_CTRL_VAL(max_time, c02_disable) \ + (((max_time) & MSR_ZX_PAUSE_CONTROL_TIME_MASK) | \ + ((c02_disable) & MSR_ZX_PAUSE_CONTROL_C02_DISABLE)) + +/* + * Cache ZX_PAUSE_CONTROL MSR. This is a systemwide control. By default, + * zxpause max time is 100000 in TSC-quanta and C0.2 is enabled + */ +static u32 zxpause_control_cached = ZXPAUSE_CTRL_VAL(100000, ZXPAUSE_C02_ENABLE); + +/* + * Cache the original ZX_PAUSE_CONTROL MSR value which is configured by + * hardware or BIOS before kernel boot. + */ +static u32 orig_zxpause_control_cached __ro_after_init; + +/* + * Serialize access to zxpause_control_cached and ZX_PAUSE_CONTROL MSR in + * the sysfs write functions. + */ +static DEFINE_MUTEX(zxpause_lock); + +static void zxpause_update_control_msr(void * unused) +{ + lockdep_assert_irqs_disabled(); + wrmsr(MSR_ZX_PAUSE_CONTROL, READ_ONCE(zxpause_control_cached), 0); +} + +/* + * The CPU hotplug callback sets the control MSR to the global control + * value. + * + * Disable interrupts so the read of zxpause_control_cached and the WRMSR + * are protected against a concurrent sysfs write. Otherwise the sysfs + * write could update the cached value after it had been read on this CPU + * and issue the IPI before the old value had been written. The IPI would + * interrupt, write the new value and after return from IPI the previous + * value would be written by this CPU. + * + * With interrupts disabled the upcoming CPU either sees the new control + * value or the IPI is updating this CPU to the new control value after + * interrupts have been reenabled. + */ +static int zxpause_cpu_online(unsigned int cpu) +{ + local_irq_disable(); + zxpause_update_control_msr(NULL); + local_irq_enable(); + return 0; +} + +/* + * The CPU hotplug callback sets the control MSR to the original control + * value. + */ +static int zxpause_cpu_offline(unsigned int cpu) +{ + /* + * This code is protected by the CPU hotplug already and + * orig_zxpause_control_cached is never changed after it caches + * the original control MSR value in zxpause_init(). So there + * is no race condition here. + */ + wrmsr(MSR_ZX_PAUSE_CONTROL, orig_zxpause_control_cached, 0); + + return 0; +} + +/* + * On resume, restore ZX_PAUSE_CONTROL MSR on the boot processor which + * is the only active CPU at this time. The MSR is set up on the APs via the + * CPU hotplug callback. + * + * This function is invoked on resume from suspend and hibernation. On + * resume from suspend the restore should be not required, but we neither + * trust the firmware nor does it matter if the same value is written + * again. 
+ */ +static void zxpause_syscore_resume(void) +{ + zxpause_update_control_msr(NULL); +} + +static struct syscore_ops zxpause_syscore_ops = { + .resume = zxpause_syscore_resume, +}; + +/* sysfs interface */ + +/* + * When bit 0 in ZX_PAUSE_CONTROL MSR is 1, C0.2 is disabled. + * Otherwise, C0.2 is enabled. + */ +static inline bool zxpause_ctrl_c02_enabled(u32 ctrl) +{ + return !(ctrl & MSR_ZX_PAUSE_CONTROL_C02_DISABLE); +} + +static inline u32 zxpause_ctrl_max_time(u32 ctrl) +{ + return ctrl & MSR_ZX_PAUSE_CONTROL_TIME_MASK; +} + +static inline void zxpause_update_control(u32 maxtime, bool c02_enable) +{ + u32 ctrl = maxtime & MSR_ZX_PAUSE_CONTROL_TIME_MASK; + + if (!c02_enable) + ctrl |= MSR_ZX_PAUSE_CONTROL_C02_DISABLE; + + WRITE_ONCE(zxpause_control_cached, ctrl); + /* Propagate to all CPUs */ + on_each_cpu(zxpause_update_control_msr, NULL, 1); +} + +static ssize_t +enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + u32 ctrl = READ_ONCE(zxpause_control_cached); + + return sprintf(buf, "%d\n", zxpause_ctrl_c02_enabled(ctrl)); +} + +static ssize_t enable_c02_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + bool c02_enable; + u32 ctrl; + int ret; + + ret = kstrtobool(buf, &c02_enable); + if (ret) + return ret; + + mutex_lock(&zxpause_lock); + + ctrl = READ_ONCE(zxpause_control_cached); + if (c02_enable != zxpause_ctrl_c02_enabled(ctrl)) + zxpause_update_control(ctrl, c02_enable); + + mutex_unlock(&zxpause_lock); + + return count; +} +static DEVICE_ATTR_RW(enable_c02); + +static ssize_t +max_time_show(struct device *kobj, struct device_attribute *attr, char *buf) +{ + u32 ctrl = READ_ONCE(zxpause_control_cached); + + return sprintf(buf, "%u\n", zxpause_ctrl_max_time(ctrl)); +} + +static ssize_t max_time_store(struct device *kobj, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u32 max_time, ctrl; + int ret; + + ret = kstrtou32(buf, 0, &max_time); + if (ret) + return ret; + + /* bits[1:0] must be zero */ + if (max_time & ~MSR_ZX_PAUSE_CONTROL_TIME_MASK) + return -EINVAL; + + mutex_lock(&zxpause_lock); + + ctrl = READ_ONCE(zxpause_control_cached); + if (max_time != zxpause_ctrl_max_time(ctrl)) + zxpause_update_control(max_time, zxpause_ctrl_c02_enabled(ctrl)); + + mutex_unlock(&zxpause_lock); + + return count; +} +static DEVICE_ATTR_RW(max_time); + +static struct attribute *zxpause_attrs[] = { + &dev_attr_enable_c02.attr, + &dev_attr_max_time.attr, + NULL +}; + +static struct attribute_group zxpause_attr_group = { + .attrs = zxpause_attrs, + .name = "zxpause_control", +}; + +static int __init zxpause_init(void) +{ + struct device *dev; + int ret; + + if (!boot_cpu_has(X86_FEATURE_ZXPAUSE)) + return -ENODEV; + + /* + * Cache the original control MSR value before the control MSR is + * changed. This is the only place where orig_zxpause_control_cached + * is modified. + */ + rdmsrl(MSR_ZX_PAUSE_CONTROL, orig_zxpause_control_cached); + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "zxpause:online", + zxpause_cpu_online, zxpause_cpu_offline); + if (ret < 0) { + /* + * On failure, the control MSR on all CPUs has the + * original control value. + */ + return ret; + } + + register_syscore_ops(&zxpause_syscore_ops); + + /* + * Add zxpause control interface. Ignore failure, so at least the + * default values are set up in case the machine manages to boot. 
+ */ + dev = bus_get_dev_root(&cpu_subsys); + return sysfs_create_group(&dev->kobj, &zxpause_attr_group); +} +device_initcall(zxpause_init); diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index e42faa792c079..0b5aa39f0db6c 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -102,6 +102,9 @@ static __init void x86_late_time_init(void) if (static_cpu_has(X86_FEATURE_WAITPKG)) use_tpause_delay(); + + if (static_cpu_has(X86_FEATURE_ZXPAUSE)) + use_zxpause_delay(); } /* diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ac042a9a61f57..62ec29e16cc4e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -791,6 +791,9 @@ void kvm_set_cpu_caps(void) F(PMM) | F(PMM_EN) ); + /* Zhaoxin 0xC0000006 leaf */ + kvm_cpu_cap_mask(CPUID_C000_0006_EAX, 0 /* bit0: zxpause */ | 0 /* bit1 HMAC */); + /* * Hide RDTSCP and RDPID if either feature is reported as supported but * probing MSR_TSC_AUX failed. This is purely a sanity check and @@ -1304,17 +1307,22 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: - /*Just support up to 0xC0000004 now*/ - entry->eax = min(entry->eax, 0xC0000004); + /* Extended to 0xC0000006 */ + entry->eax = min(entry->eax, 0xC0000006); break; case 0xC0000001: cpuid_entry_override(entry, CPUID_C000_0001_EDX); break; + case 0xC0000006: + cpuid_entry_override(entry, CPUID_C000_0006_EAX); + break; + case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ case 0xC0000002: case 0xC0000003: case 0xC0000004: + case 0xC0000005: default: entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 2f4e155080bad..cd2d6abe4762a 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -90,6 +90,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX}, [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX}, + [CPUID_C000_0006_EAX] = {0xc0000006, 0, CPUID_EAX}, }; /* diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 41a4533f99897..95745c391d929 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -60,6 +60,7 @@ struct vmcs_config { u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; u32 cpu_based_2nd_exec_ctrl; + u32 zx_cpu_based_3rd_exec_ctrl; u64 cpu_based_3rd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; @@ -255,6 +256,11 @@ static inline bool cpu_has_vmx_xsaves(void) SECONDARY_EXEC_ENABLE_XSAVES; } +static inline bool cpu_has_vmx_zxpause(void) +{ + return vmcs_config.zx_cpu_based_3rd_exec_ctrl & ZX_TERTIARY_EXEC_GUEST_ZXPAUSE; +} + static inline bool cpu_has_vmx_waitpkg(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 7c1996b433e26..4eabed8e5813a 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -50,7 +50,9 @@ struct vmcs_controls_shadow { u32 pin; u32 exec; u32 secondary_exec; + u32 zx_tertiary_exec; u64 tertiary_exec; + u64 zx_vmexit_tsc; }; /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index dae499e2da84e..e1cbf23c82027 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -214,6 +214,8 @@ module_param(ple_window_max, uint, 0444); int __read_mostly pt_mode = PT_MODE_SYSTEM; module_param(pt_mode, int, S_IRUGO); +static u32 zx_ext_vmcs_cap; + static 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index dae499e2da84e..e1cbf23c82027 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -214,6 +214,8 @@ module_param(ple_window_max, uint, 0444);
 int __read_mostly pt_mode = PT_MODE_SYSTEM;
 module_param(pt_mode, int, S_IRUGO);
 
+static u32 zx_ext_vmcs_cap;
+
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
 static DEFINE_MUTEX(vmx_l1d_flush_mutex);
@@ -2015,7 +2017,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_UMWAIT_CONTROL:
 		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
 			return 1;
-
+		msr_info->data = vmx->msr_ia32_umwait_control;
+		break;
+	case MSR_ZX_PAUSE_CONTROL:
+		if (!msr_info->host_initiated && !vmx_guest_zxpause_enabled(vmx))
+			return 1;
 		msr_info->data = vmx->msr_ia32_umwait_control;
 		break;
 	case MSR_IA32_SPEC_CTRL:
@@ -2275,7 +2281,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		/* The reserved bit 1 and non-32 bit [63:32] should be zero */
 		if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
 			return 1;
+		vmx->msr_ia32_umwait_control = data;
+		break;
+	case MSR_ZX_PAUSE_CONTROL:
+		if (!msr_info->host_initiated && !vmx_guest_zxpause_enabled(vmx))
+			return 1;
 
+		/* The reserved bit 1 and non-32 bit [63:32] should be zero */
+		if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
+			return 1;
 		vmx->msr_ia32_umwait_control = data;
 		break;
 	case MSR_IA32_SPEC_CTRL:
@@ -2733,6 +2747,10 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 	vmcs_conf->vmentry_ctrl        = _vmentry_control;
 	vmcs_conf->misc	= misc_msr;
 
+	/* Set up the Zhaoxin exec-cntl3 VMCS field. */
+	if (zx_ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3)
+		vmcs_conf->zx_cpu_based_3rd_exec_ctrl |= ZX_TERTIARY_EXEC_GUEST_ZXPAUSE;
+
 #if IS_ENABLED(CONFIG_HYPERV)
 	if (enlightened_vmcs)
 		evmcs_sanitize_exec_ctrls(vmcs_conf);
@@ -4525,6 +4543,29 @@ static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
 	return exec_control;
 }
 
+static u32 vmx_zx_tertiary_exec_control(struct vcpu_vmx *vmx)
+{
+	struct kvm_vcpu *vcpu = &vmx->vcpu;
+	u32 exec_control = vmcs_config.zx_cpu_based_3rd_exec_ctrl;
+
+	/*
+	 * Print an error if QEMU wants to enable guest_zxpause while
+	 * VMX does not support it.
+	 */
+	if (guest_cpuid_has(vcpu, X86_FEATURE_ZXPAUSE)) {
+		if (!cpu_has_vmx_zxpause())
+			pr_err("VMX doesn't support guest_zxpause!\n");
+		else
+			exec_control |= ZX_TERTIARY_EXEC_GUEST_ZXPAUSE;
+	} else {
+		exec_control &= ~ZX_TERTIARY_EXEC_GUEST_ZXPAUSE;
+	}
+
+	/* Enable other features here. */
+
+	return exec_control;
+}
+
 /*
  * Adjust a single secondary execution control bit to intercept/allow an
 * instruction in the guest. This is usually done based on whether or not a
@@ -4731,6 +4772,11 @@ static void init_vmcs(struct vcpu_vmx *vmx)
 	if (cpu_has_secondary_exec_ctrls())
 		secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
 
+	if (zx_ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3) {
+		zx_tertiary_exec_controls_set(vmx, vmx_zx_tertiary_exec_control(vmx));
+		zx_vmexit_tsc_controls_set(vmx, 0);
+	}
+
 	if (cpu_has_tertiary_exec_ctrls())
 		tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
 
@@ -6260,6 +6306,13 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
 	else
 		tertiary_exec_control = 0;
 
+	if (zx_ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3) {
+		pr_err("*** Zhaoxin Specific Fields ***\n");
+		pr_err("Zhaoxin TertiaryExec Cntl = 0x%016x\n",
+		       vmcs_read32(ZX_TERTIARY_VM_EXEC_CONTROL));
+		pr_err("ZXPAUSE Saved TSC = 0x%016llx\n", vmcs_read64(ZXPAUSE_VMEXIT_TSC));
+	}
+
 	pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
 	       vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
 	pr_err("*** Guest State ***\n");
@@ -7782,6 +7835,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 		vmcs_set_secondary_exec_control(vmx, vmx_secondary_exec_control(vmx));
 
+	if (zx_ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3) {
+		zx_tertiary_exec_controls_set(vmx, vmx_zx_tertiary_exec_control(vmx));
+		zx_vmexit_tsc_controls_set(vmx, 0);
+	}
+
 	if (guest_can_use(vcpu, X86_FEATURE_VMX))
 		vmx->msr_ia32_feature_control_valid_bits |=
 			FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
@@ -7932,6 +7990,9 @@ static __init void vmx_set_cpu_caps(void)
 
 	if (cpu_has_vmx_waitpkg())
 		kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
+
+	if (cpu_has_vmx_zxpause())
+		kvm_cpu_cap_check_and_set(X86_FEATURE_ZXPAUSE);
 }
 
 static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
@@ -8455,6 +8516,10 @@ static __init int hardware_setup(void)
 	unsigned long host_bndcfgs;
 	struct desc_ptr dt;
 	int r;
+	u32 ign;
+
+	/* Cache the Zhaoxin extended VMCS capabilities. */
+	rdmsr_safe(MSR_ZX_EXT_VMCS_CAPS, &zx_ext_vmcs_cap, &ign);
 
 	store_idt(&dt);
 	host_idt_base = dt.address;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index c2130d2c8e24b..b5cebb6e08670 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -575,8 +575,17 @@ static inline u8 vmx_get_rvi(void)
 	 SECONDARY_EXEC_ENCLS_EXITING)
 
 #define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0
-#define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL	\
-	(TERTIARY_EXEC_IPI_VIRT)
+#define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL (TERTIARY_EXEC_IPI_VIRT)
+
+#define KVM_REQUIRED_VMX_ZX_TERTIARY_VM_EXEC_CONTROL 0
+#define KVM_OPTIONAL_VMX_ZX_TERTIARY_VM_EXEC_CONTROL (ZX_TERTIARY_EXEC_GUEST_ZXPAUSE)
+
+/*
+ * We shouldn't read/write the zxpause_vmexit_tsc VMCS field this
+ * way; switch to a better mechanism in the future.
+ */
+#define KVM_REQUIRED_VMX_ZXPAUSE_VMEXIT_TSC 0
+#define KVM_OPTIONAL_VMX_ZXPAUSE_VMEXIT_TSC 1
 
 #define BUILD_CONTROLS_SHADOW(lname, uname, bits)				\
 static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val)	\
@@ -610,6 +619,8 @@ BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32)
 BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32)
 BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32)
 BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64)
+BUILD_CONTROLS_SHADOW(zx_tertiary_exec, ZX_TERTIARY_VM_EXEC_CONTROL, 32)
+BUILD_CONTROLS_SHADOW(zx_vmexit_tsc, ZXPAUSE_VMEXIT_TSC, 64)
 
 /*
  * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the
@@ -712,6 +723,11 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
 		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
 }
 
+static inline bool vmx_guest_zxpause_enabled(struct vcpu_vmx *vmx)
+{
+	return zx_tertiary_exec_controls_get(vmx) & ZX_TERTIARY_EXEC_GUEST_ZXPAUSE;
+}
+
 static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
 {
 	if (!enable_ept)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9dd4624bdef2d..ff750e8797ad5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1462,6 +1462,7 @@ static const u32 msrs_to_save_base[] = {
 	MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
 	MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
 	MSR_IA32_UMWAIT_CONTROL,
+	MSR_ZX_PAUSE_CONTROL,
 
 	MSR_IA32_XFD, MSR_IA32_XFD_ERR,
 };
@@ -7149,6 +7150,10 @@ static void kvm_probe_msr_to_save(u32 msr_index)
 		if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
 			return;
 		break;
+	case MSR_ZX_PAUSE_CONTROL:
+		if (!kvm_cpu_cap_has(X86_FEATURE_ZXPAUSE))
+			return;
+		break;
 	case MSR_IA32_RTIT_CTL:
 	case MSR_IA32_RTIT_STATUS:
 		if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 0e65d00e2339f..84076b82fb892 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -117,6 +117,27 @@ static void delay_halt_tpause(u64 start, u64 cycles)
 	__tpause(TPAUSE_C02_STATE, edx, eax);
 }
 
+/*
+ * On ZHAOXIN the ZXPAUSE instruction waits until any of:
+ * 1) the delta of the TSC counter exceeds the value provided in EDX:EAX
+ * 2) the global timeout in ZX_PAUSE_CONTROL is exceeded
+ * 3) an external interrupt occurs
+ */
+static void delay_halt_zxpause(u64 unused, u64 cycles)
+{
+	u64 until = cycles;
+	u32 eax, edx;
+
+	eax = lower_32_bits(until);
+	edx = upper_32_bits(until);
+
+	/*
+	 * Hard code the C0.1 sleep state because its exit latency is
+	 * small compared to the "microseconds" that usleep() will delay.
+	 */
+	__zxpause(ZXPAUSE_C01_STATE, edx, eax);
+}
+
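(Illustration only; not part of the patch. delay_halt_zxpause() can ignore its `start` argument because the common delay_halt() wrapper, installed by use_zxpause_delay() below, re-reads the TSC around every call and retries with the remaining delta, so an early wakeup (for example the ZX_PAUSE_CONTROL global timeout firing first) only costs another loop iteration. Paraphrasing the existing loop in delay.c:

	start = rdtsc_ordered();
	for (;;) {
		delay_halt_fn(start, cycles);	/* -> delay_halt_zxpause() */
		end = rdtsc_ordered();
		if (cycles <= end - start)
			break;
		cycles -= end - start;
		start = end;
	}
)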
 /*
  * On some AMD platforms, MWAITX has a configurable 32-bit timer, that
  * counts with TSC frequency. The input value is the number of TSC cycles
@@ -183,6 +204,12 @@ void __init use_tpause_delay(void)
 	delay_fn = delay_halt;
 }
 
+void __init use_zxpause_delay(void)
+{
+	delay_halt_fn = delay_halt_zxpause;
+	delay_fn = delay_halt;
+}
+
 void use_mwaitx_delay(void)
 {
 	delay_halt_fn = delay_halt_mwaitx;
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 845a4023ba44e..ef45425f83165 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -13,7 +13,7 @@
 /*
  * Defines x86 CPU feature bits
  */
-#define NCAPINTS			21	   /* N 32-bit words worth of info */
+#define NCAPINTS			23	   /* N 32-bit words worth of info */
 #define NBUGINTS			2	   /* N 32-bit bug flags */
 
 /*
@@ -443,6 +443,9 @@
 #define X86_FEATURE_AUTOIBRS		(20*32+ 8) /* "" Automatic IBRS */
 #define X86_FEATURE_NO_SMM_CTL_MSR	(20*32+ 9) /* "" SMM_CTL MSR is not present */
 
+/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000006, word 22 */
+#define X86_FEATURE_ZXPAUSE		(22*32+ 0) /* ZHAOXIN ZXPAUSE */
+
 /*
  * BUG word(s)
  */
diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h
index fafe9be7a6f4f..cbf8c3ea37fe3 100644
--- a/tools/arch/x86/include/asm/disabled-features.h
+++ b/tools/arch/x86/include/asm/disabled-features.h
@@ -131,6 +131,8 @@
 #define DISABLED_MASK18	0
 #define DISABLED_MASK19	0
 #define DISABLED_MASK20	0
-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
+#define DISABLED_MASK21	0
+#define DISABLED_MASK22	0
+#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 23)
 
 #endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h
index 1d111350197f3..96486e0436421 100644
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -72,12 +72,23 @@
 #define MSR_IA32_UMWAIT_CONTROL	0xe1
 #define MSR_IA32_UMWAIT_CONTROL_C02_DISABLE	BIT(0)
 #define MSR_IA32_UMWAIT_CONTROL_RESERVED	BIT(1)
+
+#define MSR_ZX_PAUSE_CONTROL	0x187f
+#define MSR_ZX_PAUSE_CONTROL_C02_DISABLE	BIT(0)
+#define MSR_ZX_PAUSE_CONTROL_RESERVED	BIT(1)
+
 /*
  * The time field is bit[31:2], but representing a 32bit value with
  * bit[1:0] zero.
  */
 #define MSR_IA32_UMWAIT_CONTROL_TIME_MASK	(~0x03U)
 
+/*
+ * The time field is bit[31:2], but representing a 32bit value with
+ * bit[1:0] zero.
+ */
+#define MSR_ZX_PAUSE_CONTROL_TIME_MASK	(~0x03U)
+
 /* Abbreviated from Intel SDM name IA32_CORE_CAPABILITIES */
 #define MSR_IA32_CORE_CAPS			  0x000000cf
 #define MSR_IA32_CORE_CAPS_INTEGRITY_CAPS_BIT	  2
@@ -753,6 +764,13 @@
 #define MSR_TMTA_LRTI_READOUT		0x80868018
 #define MSR_TMTA_LRTI_VOLT_MHZ		0x8086801a
 
+/*
+ * Zhaoxin extended VMCS capabilities:
+ * bit 0: exec-cntl3 VMCS field.
+ */
+#define MSR_ZX_EXT_VMCS_CAPS	0x1675
+#define MSR_ZX_VMCS_EXEC_CTL3	BIT(0)
+
 /* Intel defined MSRs. */
 #define MSR_IA32_P5_MC_ADDR		0x00000000
 #define MSR_IA32_P5_MC_TYPE		0x00000001
diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h
index 7ba1726b71c7b..76953f757f3c3 100644
--- a/tools/arch/x86/include/asm/required-features.h
+++ b/tools/arch/x86/include/asm/required-features.h
@@ -99,6 +99,8 @@
 #define REQUIRED_MASK18	0
 #define REQUIRED_MASK19	0
 #define REQUIRED_MASK20	0
-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
+#define REQUIRED_MASK21	0
+#define REQUIRED_MASK22	0
+#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 23)
 
 #endif /* _ASM_X86_REQUIRED_FEATURES_H */