From 2e958a8a510d956ec8528f0bd20e309b5bb5156c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 17 May 2021 16:38:09 +0900 Subject: [PATCH 01/16] x86/entry/x32: Rename __x32_compat_sys_* to __x64_compat_sys_* The SYSCALL macros are mapped to symbols as follows: __SYSCALL_COMMON(nr, sym) --> __x64_ __SYSCALL_X32(nr, sym) --> __x32_ Originally, the syscalls in the x32 special range (512-547) were all compat. This assumption is now broken after the following commits: 55db9c0e8534 ("net: remove compat_sys_{get,set}sockopt") 5f764d624a89 ("fs: remove the compat readv/writev syscalls") 598b3cec831f ("fs: remove compat_sys_vmsplice") c3973b401ef2 ("mm: remove compat_process_vm_{readv,writev}") Those commits redefined __x32_sys_* to __x64_sys_* because there is no stub like __x32_sys_*. Defining them as follows is more sensible and cleaner. __SYSCALL_COMMON(nr, sym) --> __x64_ __SYSCALL_X32(nr, sym) --> __x64_ This works because both x86_64 and x32 use the same ABI (RDI, RSI, RDX, R10, R8, R9) The ugly #define __x32_sys_* will go away. Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210517073815.97426-2-masahiroy@kernel.org --- arch/x86/entry/syscall_x32.c | 16 ++-------------- arch/x86/include/asm/syscall_wrapper.h | 10 +++++----- 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index f2fe0a33bcfdd..3fea8fb9cd6a6 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -8,27 +8,15 @@ #include #include -/* - * Reuse the 64-bit entry points for the x32 versions that occupy different - * slots in the syscall table. - */ -#define __x32_sys_readv __x64_sys_readv -#define __x32_sys_writev __x64_sys_writev -#define __x32_sys_getsockopt __x64_sys_getsockopt -#define __x32_sys_setsockopt __x64_sys_setsockopt -#define __x32_sys_vmsplice __x64_sys_vmsplice -#define __x32_sys_process_vm_readv __x64_sys_process_vm_readv -#define __x32_sys_process_vm_writev __x64_sys_process_vm_writev - #define __SYSCALL_64(nr, sym) -#define __SYSCALL_X32(nr, sym) extern long __x32_##sym(const struct pt_regs *); +#define __SYSCALL_X32(nr, sym) extern long __x64_##sym(const struct pt_regs *); #define __SYSCALL_COMMON(nr, sym) extern long __x64_##sym(const struct pt_regs *); #include #undef __SYSCALL_X32 #undef __SYSCALL_COMMON -#define __SYSCALL_X32(nr, sym) [nr] = __x32_##sym, +#define __SYSCALL_X32(nr, sym) [nr] = __x64_##sym, #define __SYSCALL_COMMON(nr, sym) [nr] = __x64_##sym, asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_x32_syscall_max+1] = { diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index 80c08c7d5e72e..6a2827d0681fc 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -17,7 +17,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * __x64_sys_*() - 64-bit native syscall * __ia32_sys_*() - 32-bit native syscall or common compat syscall * __ia32_compat_sys_*() - 32-bit compat syscall - * __x32_compat_sys_*() - 64-bit X32 compat syscall + * __x64_compat_sys_*() - 64-bit X32 compat syscall * * The registers are decoded according to the ABI: * 64-bit: RDI, RSI, RDX, R10, R8, R9 @@ -166,17 +166,17 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * with x86_64 obviously do not need such care. */ #define __X32_COMPAT_SYS_STUB0(name) \ - __SYS_STUB0(x32, compat_sys_##name) + __SYS_STUB0(x64, compat_sys_##name) #define __X32_COMPAT_SYS_STUBx(x, name, ...) \ - __SYS_STUBx(x32, compat_sys##name, \ + __SYS_STUBx(x64, compat_sys##name, \ SC_X86_64_REGS_TO_ARGS(x, __VA_ARGS__)) #define __X32_COMPAT_COND_SYSCALL(name) \ - __COND_SYSCALL(x32, compat_sys_##name) + __COND_SYSCALL(x64, compat_sys_##name) #define __X32_COMPAT_SYS_NI(name) \ - __SYS_NI(x32, compat_sys_##name) + __SYS_NI(x64, compat_sys_##name) #else /* CONFIG_X86_X32 */ #define __X32_COMPAT_SYS_STUB0(name) #define __X32_COMPAT_SYS_STUBx(x, name, ...) From 6218d0f6b8dece1f2e82f0a47a0e6b8ecb631ef6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 17 May 2021 16:38:10 +0900 Subject: [PATCH 02/16] x86/syscalls: Switch to generic syscalltbl.sh Many architectures duplicate similar shell scripts. Convert x86 and UML to use scripts/syscalltbl.sh. The generic script generates seperate headers for x86/64 and x86/x32 syscalls, while the x86 specific script coalesced them into one. Adjust the code accordingly. Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210517073815.97426-3-masahiroy@kernel.org --- arch/x86/entry/syscall_32.c | 12 +++++-- arch/x86/entry/syscall_64.c | 9 ++---- arch/x86/entry/syscall_x32.c | 15 +++------ arch/x86/entry/syscalls/Makefile | 10 ++++-- arch/x86/entry/syscalls/syscalltbl.sh | 46 --------------------------- arch/x86/include/asm/Kbuild | 1 + arch/x86/um/sys_call_table_32.c | 8 +++-- arch/x86/um/sys_call_table_64.c | 9 ++---- 8 files changed, 34 insertions(+), 76 deletions(-) delete mode 100644 arch/x86/entry/syscalls/syscalltbl.sh diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 86eb0d89d46fa..70bf46e73b1cf 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -8,12 +8,18 @@ #include #include -#define __SYSCALL_I386(nr, sym) extern long __ia32_##sym(const struct pt_regs *); +#ifdef CONFIG_IA32_EMULATION +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat) +#else +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) +#endif + +#define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *); #include -#undef __SYSCALL_I386 +#undef __SYSCALL -#define __SYSCALL_I386(nr, sym) [nr] = __ia32_##sym, +#define __SYSCALL(nr, sym) [nr] = __ia32_##sym, __visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { /* diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 1594ec72bcbb7..82670bb109319 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -8,14 +8,11 @@ #include #include -#define __SYSCALL_X32(nr, sym) -#define __SYSCALL_COMMON(nr, sym) __SYSCALL_64(nr, sym) - -#define __SYSCALL_64(nr, sym) extern long __x64_##sym(const struct pt_regs *); +#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); #include -#undef __SYSCALL_64 +#undef __SYSCALL -#define __SYSCALL_64(nr, sym) [nr] = __x64_##sym, +#define __SYSCALL(nr, sym) [nr] = __x64_##sym, asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index 3fea8fb9cd6a6..6d2ef887d7b60 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -8,16 +8,11 @@ #include #include -#define __SYSCALL_64(nr, sym) +#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); +#include +#undef __SYSCALL -#define __SYSCALL_X32(nr, sym) extern long __x64_##sym(const struct pt_regs *); -#define __SYSCALL_COMMON(nr, sym) extern long __x64_##sym(const struct pt_regs *); -#include -#undef __SYSCALL_X32 -#undef __SYSCALL_COMMON - -#define __SYSCALL_X32(nr, sym) [nr] = __x64_##sym, -#define __SYSCALL_COMMON(nr, sym) [nr] = __x64_##sym, +#define __SYSCALL(nr, sym) [nr] = __x64_##sym, asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_x32_syscall_max+1] = { /* @@ -25,5 +20,5 @@ asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_x32_syscall_max+1] = { * when the & below is removed. */ [0 ... __NR_x32_syscall_max] = &__x64_sys_ni_syscall, -#include +#include }; diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile index d8c4f6c9eadc0..c4bd8dd82bb16 100644 --- a/arch/x86/entry/syscalls/Makefile +++ b/arch/x86/entry/syscalls/Makefile @@ -10,7 +10,7 @@ syscall32 := $(src)/syscall_32.tbl syscall64 := $(src)/syscall_64.tbl syshdr := $(srctree)/$(src)/syscallhdr.sh -systbl := $(srctree)/$(src)/syscalltbl.sh +systbl := $(srctree)/scripts/syscalltbl.sh quiet_cmd_syshdr = SYSHDR $@ cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ @@ -18,7 +18,7 @@ quiet_cmd_syshdr = SYSHDR $@ '$(syshdr_pfx_$(basetarget))' \ '$(syshdr_offset_$(basetarget))' quiet_cmd_systbl = SYSTBL $@ - cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ + cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@ quiet_cmd_hypercalls = HYPERCALLS $@ cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<, $(real-prereqs)) @@ -46,10 +46,15 @@ syshdr_pfx_unistd_64_x32 := x32_ $(out)/unistd_64_x32.h: $(syscall64) $(syshdr) FORCE $(call if_changed,syshdr) +$(out)/syscalls_32.h: abis := i386 $(out)/syscalls_32.h: $(syscall32) $(systbl) FORCE $(call if_changed,systbl) +$(out)/syscalls_64.h: abis := common,64 $(out)/syscalls_64.h: $(syscall64) $(systbl) FORCE $(call if_changed,systbl) +$(out)/syscalls_x32.h: abis := common,x32 +$(out)/syscalls_x32.h: $(syscall64) $(systbl) FORCE + $(call if_changed,systbl) $(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh FORCE $(call if_changed,hypercalls) @@ -60,6 +65,7 @@ uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h syshdr-y += syscalls_32.h syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h syshdr-$(CONFIG_X86_64) += syscalls_64.h +syshdr-$(CONFIG_X86_X32) += syscalls_x32.h syshdr-$(CONFIG_XEN) += xen-hypercalls.h uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh deleted file mode 100644 index 929bde120d6bb..0000000000000 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" - -syscall_macro() { - local abi="$1" - local nr="$2" - local entry="$3" - - echo "__SYSCALL_${abi}($nr, $entry)" -} - -emit() { - local abi="$1" - local nr="$2" - local entry="$3" - local compat="$4" - - if [ "$abi" != "I386" -a -n "$compat" ]; then - echo "a compat entry ($abi: $compat) for a 64-bit syscall makes no sense" >&2 - exit 1 - fi - - if [ -z "$compat" ]; then - if [ -n "$entry" ]; then - syscall_macro "$abi" "$nr" "$entry" - fi - else - echo "#ifdef CONFIG_X86_32" - if [ -n "$entry" ]; then - syscall_macro "$abi" "$nr" "$entry" - fi - echo "#else" - syscall_macro "$abi" "$nr" "$compat" - echo "#endif" - fi -} - -grep '^[0-9]' "$in" | sort -n | ( - while read nr abi name entry compat; do - abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` - emit "$abi" "$nr" "$entry" "$compat" - done -) > "$out" diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index b19ec8282d507..1e51650b79d7c 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -3,6 +3,7 @@ generated-y += syscalls_32.h generated-y += syscalls_64.h +generated-y += syscalls_x32.h generated-y += unistd_32_ia32.h generated-y += unistd_64_x32.h generated-y += xen-hypercalls.h diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 2ed81e581755b..e83619c365dcc 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -26,11 +26,13 @@ #define old_mmap sys_old_mmap -#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) + +#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); #include -#undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym) [ nr ] = sym, +#undef __SYSCALL +#define __SYSCALL(nr, sym) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 2e8544dafbb01..6fb75af7cf54b 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -36,14 +36,11 @@ #define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL_X32(nr, sym) -#define __SYSCALL_COMMON(nr, sym) __SYSCALL_64(nr, sym) - -#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); #include -#undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym) [ nr ] = sym, +#undef __SYSCALL +#define __SYSCALL(nr, sym) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); From 44fe4895f47cbe9f4692e1d3cdc2ef8352f4d88e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 17 May 2021 16:38:11 +0900 Subject: [PATCH 03/16] x86/syscalls: Stop filling syscall arrays with *_sys_ni_syscall This is a follow-up cleanup after switching to the generic syscalltbl.sh. The old x86 specific script skipped non-existing syscalls. So, the generated syscalls_64.h, for example, had a big hole in the syscall numbers 335-423 range. That is why there exists [0 ... __NR_*_syscall_max] = &__*_sys_ni_cyscall. The new script, scripts/syscalltbl.sh automatically fills holes with __SYSCALL(, sys_ni_syscall), hence such ugly code can go away. The designated initializers, '[nr] =' are also unneeded. Also, there is no need to give __NR_*_syscall_max+1 because the array size is implied by the number of syscalls in the generated headers. Hence, there is no need to include , either. Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210517073815.97426-4-masahiroy@kernel.org --- arch/x86/entry/syscall_32.c | 10 ++-------- arch/x86/entry/syscall_64.c | 10 ++-------- arch/x86/entry/syscall_x32.c | 10 ++-------- arch/x86/um/sys_call_table_32.c | 6 ------ arch/x86/um/sys_call_table_64.c | 6 ------ 5 files changed, 6 insertions(+), 36 deletions(-) diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 70bf46e73b1cf..8cfc9bc73e7f8 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #ifdef CONFIG_IA32_EMULATION @@ -19,13 +18,8 @@ #include #undef __SYSCALL -#define __SYSCALL(nr, sym) [nr] = __ia32_##sym, +#define __SYSCALL(nr, sym) __ia32_##sym, -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_ia32_syscall_max] = &__ia32_sys_ni_syscall, +__visible const sys_call_ptr_t ia32_sys_call_table[] = { #include }; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 82670bb109319..be120eec1fc9f 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -5,20 +5,14 @@ #include #include #include -#include #include #define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); #include #undef __SYSCALL -#define __SYSCALL(nr, sym) [nr] = __x64_##sym, +#define __SYSCALL(nr, sym) __x64_##sym, -asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_syscall_max] = &__x64_sys_ni_syscall, +asmlinkage const sys_call_ptr_t sys_call_table[] = { #include }; diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index 6d2ef887d7b60..bdd0e03a1265d 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -5,20 +5,14 @@ #include #include #include -#include #include #define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); #include #undef __SYSCALL -#define __SYSCALL(nr, sym) [nr] = __x64_##sym, +#define __SYSCALL(nr, sym) __x64_##sym, -asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_x32_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_x32_syscall_max] = &__x64_sys_ni_syscall, +asmlinkage const sys_call_ptr_t x32_sys_call_table[] = { #include }; diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index e83619c365dcc..f8323104e3536 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #define __NO_STUBS @@ -37,11 +36,6 @@ extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, #include }; diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 6fb75af7cf54b..5ed665dc785fb 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #define __NO_STUBS @@ -45,11 +44,6 @@ extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, #include }; From f63815eb1d909a4121806e60928108ff040bf291 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 17 May 2021 16:38:12 +0900 Subject: [PATCH 04/16] x86/unistd: Define X32_NR_syscalls only for 64-bit kernel X32_NR_syscalls is needed only when building a 64bit kernel. Move it to proper #ifdef guard. Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210517073815.97426-5-masahiroy@kernel.org --- arch/x86/include/asm/unistd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index c1c3d31b15c06..1bc6020bc58db 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -26,11 +26,11 @@ # define __ARCH_WANT_COMPAT_SYS_PWRITEV64 # define __ARCH_WANT_COMPAT_SYS_PREADV64V2 # define __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 +# define X32_NR_syscalls (__NR_x32_syscall_max + 1) # endif # define NR_syscalls (__NR_syscall_max + 1) -# define X32_NR_syscalls (__NR_x32_syscall_max + 1) # define IA32_NR_syscalls (__NR_ia32_syscall_max + 1) # define __ARCH_WANT_NEW_STAT From 49f731f1972e6e44d8a5c3982a72902b3944bc34 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 17 May 2021 16:38:13 +0900 Subject: [PATCH 05/16] x86/syscalls: Use __NR_syscalls instead of __NR_syscall_max __NR_syscall_max is only used by x86 and UML. In contrast, __NR_syscalls is widely used by all the architectures. Convert __NR_syscall_max to __NR_syscalls and adjust the usage sites. This prepares x86 to switch to the generic syscallhdr.sh script. Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210517073815.97426-6-masahiroy@kernel.org --- arch/um/kernel/skas/syscall.c | 2 +- arch/x86/entry/syscalls/syscallhdr.sh | 2 +- arch/x86/include/asm/unistd.h | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index 3d91f89fd8527..9ee19e566da3b 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -41,7 +41,7 @@ void handle_syscall(struct uml_pt_regs *r) goto out; syscall = UPT_SYSCALL_NR(r); - if (syscall >= 0 && syscall <= __NR_syscall_max) + if (syscall >= 0 && syscall < __NR_syscalls) PT_REGS_SET_SYSCALL_RETURN(regs, EXECUTE_SYSCALL(syscall, regs)); diff --git a/arch/x86/entry/syscalls/syscallhdr.sh b/arch/x86/entry/syscalls/syscallhdr.sh index cc1e638574270..75e66af067732 100644 --- a/arch/x86/entry/syscalls/syscallhdr.sh +++ b/arch/x86/entry/syscalls/syscallhdr.sh @@ -28,7 +28,7 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( echo "" echo "#ifdef __KERNEL__" - echo "#define __NR_${prefix}syscall_max $max" + echo "#define __NR_${prefix}syscalls $(($max + 1))" echo "#endif" echo "" echo "#endif /* ${fileguard} */" diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 1bc6020bc58db..80e9d5206a715 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -13,7 +13,7 @@ # define __ARCH_WANT_SYS_OLD_MMAP # define __ARCH_WANT_SYS_OLD_SELECT -# define __NR_ia32_syscall_max __NR_syscall_max +# define IA32_NR_syscalls (__NR_syscalls) # else @@ -26,12 +26,12 @@ # define __ARCH_WANT_COMPAT_SYS_PWRITEV64 # define __ARCH_WANT_COMPAT_SYS_PREADV64V2 # define __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 -# define X32_NR_syscalls (__NR_x32_syscall_max + 1) +# define X32_NR_syscalls (__NR_x32_syscalls) +# define IA32_NR_syscalls (__NR_ia32_syscalls) # endif -# define NR_syscalls (__NR_syscall_max + 1) -# define IA32_NR_syscalls (__NR_ia32_syscall_max + 1) +# define NR_syscalls (__NR_syscalls) # define __ARCH_WANT_NEW_STAT # define __ARCH_WANT_OLD_READDIR From 3cba325b358f86357b5ce50eb9e6633183927eee Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 17 May 2021 16:38:14 +0900 Subject: [PATCH 06/16] x86/syscalls: Switch to generic syscallhdr.sh Many architectures duplicate similar shell scripts. Converts x86 to use scripts/syscallhdr.sh. Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210517073815.97426-7-masahiroy@kernel.org --- arch/x86/entry/syscalls/Makefile | 26 ++++++++++---------- arch/x86/entry/syscalls/syscallhdr.sh | 35 --------------------------- 2 files changed, 13 insertions(+), 48 deletions(-) delete mode 100644 arch/x86/entry/syscalls/syscallhdr.sh diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile index c4bd8dd82bb16..8eb014bca8c94 100644 --- a/arch/x86/entry/syscalls/Makefile +++ b/arch/x86/entry/syscalls/Makefile @@ -9,40 +9,40 @@ _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ syscall32 := $(src)/syscall_32.tbl syscall64 := $(src)/syscall_64.tbl -syshdr := $(srctree)/$(src)/syscallhdr.sh +syshdr := $(srctree)/scripts/syscallhdr.sh systbl := $(srctree)/scripts/syscalltbl.sh quiet_cmd_syshdr = SYSHDR $@ - cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ - '$(syshdr_abi_$(basetarget))' \ - '$(syshdr_pfx_$(basetarget))' \ - '$(syshdr_offset_$(basetarget))' + cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --abis $(abis) --emit-nr \ + $(if $(offset),--offset $(offset)) \ + $(if $(prefix),--prefix $(prefix)) \ + $< $@ quiet_cmd_systbl = SYSTBL $@ cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@ quiet_cmd_hypercalls = HYPERCALLS $@ cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<, $(real-prereqs)) -syshdr_abi_unistd_32 := i386 +$(uapi)/unistd_32.h: abis := i386 $(uapi)/unistd_32.h: $(syscall32) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abi_unistd_32_ia32 := i386 -syshdr_pfx_unistd_32_ia32 := ia32_ +$(out)/unistd_32_ia32.h: abis := i386 +$(out)/unistd_32_ia32.h: prefix := ia32_ $(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abi_unistd_x32 := common,x32 -syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT +$(uapi)/unistd_x32.h: abis := common,x32 +$(uapi)/unistd_x32.h: offset := __X32_SYSCALL_BIT $(uapi)/unistd_x32.h: $(syscall64) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abi_unistd_64 := common,64 +$(uapi)/unistd_64.h: abis := common,64 $(uapi)/unistd_64.h: $(syscall64) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abi_unistd_64_x32 := x32 -syshdr_pfx_unistd_64_x32 := x32_ +$(out)/unistd_64_x32.h: abis := x32 +$(out)/unistd_64_x32.h: prefix := x32_ $(out)/unistd_64_x32.h: $(syscall64) $(syshdr) FORCE $(call if_changed,syshdr) diff --git a/arch/x86/entry/syscalls/syscallhdr.sh b/arch/x86/entry/syscalls/syscallhdr.sh deleted file mode 100644 index 75e66af067732..0000000000000 --- a/arch/x86/entry/syscalls/syscallhdr.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -prefix="$4" -offset="$5" - -fileguard=_ASM_X86_`basename "$out" | sed \ - -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ - -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - echo "#ifndef ${fileguard}" - echo "#define ${fileguard} 1" - echo "" - - max=0 - while read nr abi name entry ; do - if [ -z "$offset" ]; then - echo "#define __NR_${prefix}${name} $nr" - else - echo "#define __NR_${prefix}${name} ($offset + $nr)" - fi - - max=$nr - done - - echo "" - echo "#ifdef __KERNEL__" - echo "#define __NR_${prefix}syscalls $(($max + 1))" - echo "#endif" - echo "" - echo "#endif /* ${fileguard} */" -) > "$out" From 15c82d98a0f783bd4b2715ea910f7bb526367f54 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 18 May 2021 12:12:58 -0700 Subject: [PATCH 07/16] selftests/x86/syscall: Update and extend syscall_numbering_64 Update the syscall_numbering_64 selftest to reflect that a system call is to be extended from 32 bits. Add a mix of tests for valid and invalid system calls in 64-bit and x32 space. Use an explicit system call instruction, because the glibc syscall() wrapper might intercept instructions, extend the system call number independently, or anything similar. Use long long instead of long to make it possible to compile this test on x32 as well as 64 bits. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210518191303.4135296-2-hpa@zytor.com --- .../testing/selftests/x86/syscall_numbering.c | 274 ++++++++++++++---- 1 file changed, 222 insertions(+), 52 deletions(-) diff --git a/tools/testing/selftests/x86/syscall_numbering.c b/tools/testing/selftests/x86/syscall_numbering.c index d6b09cb1aa2c1..7dd86bcbee251 100644 --- a/tools/testing/selftests/x86/syscall_numbering.c +++ b/tools/testing/selftests/x86/syscall_numbering.c @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * syscall_arg_fault.c - tests faults 32-bit fast syscall stack args + * syscall_numbering.c - test calling the x86-64 kernel with various + * valid and invalid system call numbers. + * * Copyright (c) 2018 Andrew Lutomirski */ @@ -11,79 +13,247 @@ #include #include #include -#include +#include +#include +#include -static int nerrs; +/* Common system call numbers */ +#define SYS_READ 0 +#define SYS_WRITE 1 +#define SYS_GETPID 39 +/* x64-only system call numbers */ +#define X64_IOCTL 16 +#define X64_READV 19 +#define X64_WRITEV 20 +/* x32-only system call numbers (without X32_BIT) */ +#define X32_IOCTL 514 +#define X32_READV 515 +#define X32_WRITEV 516 -#define X32_BIT 0x40000000UL +#define X32_BIT 0x40000000 -static void check_enosys(unsigned long nr, bool *ok) +static unsigned int nerr = 0; /* Cumulative error count */ +static int nullfd = -1; /* File descriptor for /dev/null */ + +/* + * Directly invokes the given syscall with nullfd as the first argument + * and the rest zero. Avoids involving glibc wrappers in case they ever + * end up intercepting some system calls for some reason, or modify + * the system call number itself. + */ +static inline long long probe_syscall(int msb, int lsb) { - /* If this fails, a segfault is reasonably likely. */ - fflush(stdout); - - long ret = syscall(nr, 0, 0, 0, 0, 0, 0); - if (ret == 0) { - printf("[FAIL]\tsyscall %lu succeeded, but it should have failed\n", nr); - *ok = false; - } else if (errno != ENOSYS) { - printf("[FAIL]\tsyscall %lu had error code %d, but it should have reported ENOSYS\n", nr, errno); - *ok = false; - } + register long long arg1 asm("rdi") = nullfd; + register long long arg2 asm("rsi") = 0; + register long long arg3 asm("rdx") = 0; + register long long arg4 asm("r10") = 0; + register long long arg5 asm("r8") = 0; + register long long arg6 asm("r9") = 0; + long long nr = ((long long)msb << 32) | (unsigned int)lsb; + long long ret; + + asm volatile("syscall" + : "=a" (ret) + : "a" (nr), "r" (arg1), "r" (arg2), "r" (arg3), + "r" (arg4), "r" (arg5), "r" (arg6) + : "rcx", "r11", "memory", "cc"); + + return ret; } -static void test_x32_without_x32_bit(void) +static const char *syscall_str(int msb, int start, int end) { - bool ok = true; + static char buf[64]; + const char * const type = (start & X32_BIT) ? "x32" : "x64"; + int lsb = start; /* - * Syscalls 512-547 are "x32" syscalls. They are intended to be - * called with the x32 (0x40000000) bit set. Calling them without - * the x32 bit set is nonsense and should not work. + * Improve readability by stripping the x32 bit, but round + * toward zero so we don't display -1 as -1073741825. */ - printf("[RUN]\tChecking syscalls 512-547\n"); - for (int i = 512; i <= 547; i++) - check_enosys(i, &ok); + if (lsb < 0) + lsb |= X32_BIT; + else + lsb &= ~X32_BIT; + + if (start == end) + snprintf(buf, sizeof buf, "%s syscall %d:%d", + type, msb, lsb); + else + snprintf(buf, sizeof buf, "%s syscalls %d:%d..%d", + type, msb, lsb, lsb + (end-start)); + + return buf; +} + +static unsigned int _check_for(int msb, int start, int end, long long expect, + const char *expect_str) +{ + unsigned int err = 0; + + for (int nr = start; nr <= end; nr++) { + long long ret = probe_syscall(msb, nr); + + if (ret != expect) { + printf("[FAIL]\t %s returned %lld, but it should have returned %s\n", + syscall_str(msb, nr, nr), + ret, expect_str); + err++; + } + } + if (err) { + nerr += err; + if (start != end) + printf("[FAIL]\t %s had %u failure%s\n", + syscall_str(msb, start, end), + err, (err == 1) ? "s" : ""); + } else { + printf("[OK]\t %s returned %s as expected\n", + syscall_str(msb, start, end), expect_str); + } + + return err; +} + +#define check_for(msb,start,end,expect) \ + _check_for(msb,start,end,expect,#expect) + +static bool check_zero(int msb, int nr) +{ + return check_for(msb, nr, nr, 0); +} + +static bool check_enosys(int msb, int nr) +{ + return check_for(msb, nr, nr, -ENOSYS); +} + +/* + * Anyone diagnosing a failure will want to know whether the kernel + * supports x32. Tell them. This can also be used to conditionalize + * tests based on existence or nonexistence of x32. + */ +static bool test_x32(void) +{ + long long ret; + long long mypid = getpid(); + + printf("[RUN]\tChecking for x32 by calling x32 getpid()\n"); + ret = probe_syscall(0, SYS_GETPID | X32_BIT); + + if (ret == mypid) { + printf("[INFO]\t x32 is supported\n"); + return true; + } else if (ret == -ENOSYS) { + printf("[INFO]\t x32 is not supported\n"); + return false; + } else { + printf("[FAIL]\t x32 getpid() returned %lld, but it should have returned either %lld or -ENOSYS\n", ret, mypid); + nerr++; + return true; /* Proceed as if... */ + } +} + +static void test_syscalls_common(int msb) +{ + printf("[RUN]\t Checking some common syscalls as 64 bit\n"); + check_zero(msb, SYS_READ); + check_zero(msb, SYS_WRITE); + + printf("[RUN]\t Checking some 64-bit only syscalls as 64 bit\n"); + check_zero(msb, X64_READV); + check_zero(msb, X64_WRITEV); + + printf("[RUN]\t Checking out of range system calls\n"); + check_for(msb, -64, -1, -ENOSYS); + check_for(msb, X32_BIT-64, X32_BIT-1, -ENOSYS); + check_for(msb, -64-X32_BIT, -1-X32_BIT, -ENOSYS); + check_for(msb, INT_MAX-64, INT_MAX-1, -ENOSYS); +} + +static void test_syscalls_with_x32(int msb) +{ /* - * Check that a handful of 64-bit-only syscalls are rejected if the x32 - * bit is set. + * Syscalls 512-547 are "x32" syscalls. They are + * intended to be called with the x32 (0x40000000) bit + * set. Calling them without the x32 bit set is + * nonsense and should not work. */ - printf("[RUN]\tChecking some 64-bit syscalls in x32 range\n"); - check_enosys(16 | X32_BIT, &ok); /* ioctl */ - check_enosys(19 | X32_BIT, &ok); /* readv */ - check_enosys(20 | X32_BIT, &ok); /* writev */ + printf("[RUN]\t Checking x32 syscalls as 64 bit\n"); + check_for(msb, 512, 547, -ENOSYS); + + printf("[RUN]\t Checking some common syscalls as x32\n"); + check_zero(msb, SYS_READ | X32_BIT); + check_zero(msb, SYS_WRITE | X32_BIT); + + printf("[RUN]\t Checking some x32 syscalls as x32\n"); + check_zero(msb, X32_READV | X32_BIT); + check_zero(msb, X32_WRITEV | X32_BIT); + + printf("[RUN]\t Checking some 64-bit syscalls as x32\n"); + check_enosys(msb, X64_IOCTL | X32_BIT); + check_enosys(msb, X64_READV | X32_BIT); + check_enosys(msb, X64_WRITEV | X32_BIT); +} + +static void test_syscalls_without_x32(int msb) +{ + printf("[RUN]\t Checking for absence of x32 system calls\n"); + check_for(msb, 0 | X32_BIT, 999 | X32_BIT, -ENOSYS); +} + +static void test_syscall_numbering(void) +{ + static const int msbs[] = { + 0, 1, -1, X32_BIT-1, X32_BIT, X32_BIT-1, -X32_BIT, INT_MAX, + INT_MIN, INT_MIN+1 + }; + bool with_x32 = test_x32(); /* - * Check some syscalls with high bits set. + * The MSB is supposed to be ignored, so we loop over a few + * to test that out. */ - printf("[RUN]\tChecking numbers above 2^32-1\n"); - check_enosys((1UL << 32), &ok); - check_enosys(X32_BIT | (1UL << 32), &ok); + for (size_t i = 0; i < sizeof(msbs)/sizeof(msbs[0]); i++) { + int msb = msbs[i]; + printf("[RUN]\tChecking system calls with msb = %d (0x%x)\n", + msb, msb); - if (!ok) - nerrs++; - else - printf("[OK]\tThey all returned -ENOSYS\n"); + test_syscalls_common(msb); + if (with_x32) + test_syscalls_with_x32(msb); + else + test_syscalls_without_x32(msb); + } } -int main() +int main(void) { /* - * Anyone diagnosing a failure will want to know whether the kernel - * supports x32. Tell them. + * It is quite likely to get a segfault on a failure, so make + * sure the message gets out by setting stdout to nonbuffered. */ - printf("\tChecking for x32..."); - fflush(stdout); - if (syscall(39 | X32_BIT, 0, 0, 0, 0, 0, 0) >= 0) { - printf(" supported\n"); - } else if (errno == ENOSYS) { - printf(" not supported\n"); - } else { - printf(" confused\n"); - } + setvbuf(stdout, NULL, _IONBF, 0); - test_x32_without_x32_bit(); + /* + * Harmless file descriptor to work on... + */ + nullfd = open("/dev/null", O_RDWR); + if (nullfd < 0) { + printf("[FAIL]\tUnable to open /dev/null: %s\n", + strerror(errno)); + printf("[SKIP]\tCannot execute test\n"); + return 71; /* EX_OSERR */ + } - return nerrs ? 1 : 0; + test_syscall_numbering(); + if (!nerr) { + printf("[OK]\tAll system calls succeeded or failed as expected\n"); + return 0; + } else { + printf("[FAIL]\tA total of %u system call%s had incorrect behavior\n", + nerr, nerr != 1 ? "s" : ""); + return 1; + } } From c5c39488dcb5f818bb07f856a349262d667ef147 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 18 May 2021 12:12:59 -0700 Subject: [PATCH 08/16] selftests/x86/syscall: Simplify message reporting in syscall_numbering Reduce some boiler plate in printing and indenting messages. This makes it easier to produce clean status output. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210518191303.4135296-3-hpa@zytor.com --- .../testing/selftests/x86/syscall_numbering.c | 103 ++++++++++++------ 1 file changed, 72 insertions(+), 31 deletions(-) diff --git a/tools/testing/selftests/x86/syscall_numbering.c b/tools/testing/selftests/x86/syscall_numbering.c index 7dd86bcbee251..434fe0efafa0c 100644 --- a/tools/testing/selftests/x86/syscall_numbering.c +++ b/tools/testing/selftests/x86/syscall_numbering.c @@ -16,6 +16,7 @@ #include #include #include +#include /* Common system call numbers */ #define SYS_READ 0 @@ -34,6 +35,33 @@ static unsigned int nerr = 0; /* Cumulative error count */ static int nullfd = -1; /* File descriptor for /dev/null */ +static int indent = 0; + +static inline unsigned int offset(void) +{ + return 8 + indent * 4; +} + +#define msg(lvl, fmt, ...) printf("%-*s" fmt, offset(), "[" #lvl "]", \ + ## __VA_ARGS__) + +#define run(fmt, ...) msg(RUN, fmt, ## __VA_ARGS__) +#define info(fmt, ...) msg(INFO, fmt, ## __VA_ARGS__) +#define ok(fmt, ...) msg(OK, fmt, ## __VA_ARGS__) + +#define fail(fmt, ...) \ + do { \ + msg(FAIL, fmt, ## __VA_ARGS__); \ + nerr++; \ + } while (0) + +#define crit(fmt, ...) \ + do { \ + indent = 0; \ + msg(FAIL, fmt, ## __VA_ARGS__); \ + msg(SKIP, "Unable to run test\n"); \ + exit(EX_OSERR); + } while (0) /* * Directly invokes the given syscall with nullfd as the first argument @@ -91,28 +119,37 @@ static unsigned int _check_for(int msb, int start, int end, long long expect, { unsigned int err = 0; + indent++; + if (start != end) + indent++; + for (int nr = start; nr <= end; nr++) { long long ret = probe_syscall(msb, nr); if (ret != expect) { - printf("[FAIL]\t %s returned %lld, but it should have returned %s\n", + fail("%s returned %lld, but it should have returned %s\n", syscall_str(msb, nr, nr), ret, expect_str); err++; } } + if (start != end) + indent--; + if (err) { nerr += err; if (start != end) - printf("[FAIL]\t %s had %u failure%s\n", + fail("%s had %u failure%s\n", syscall_str(msb, start, end), - err, (err == 1) ? "s" : ""); + err, err == 1 ? "s" : ""); } else { - printf("[OK]\t %s returned %s as expected\n", - syscall_str(msb, start, end), expect_str); + ok("%s returned %s as expected\n", + syscall_str(msb, start, end), expect_str); } + indent--; + return err; } @@ -137,35 +174,38 @@ static bool check_enosys(int msb, int nr) static bool test_x32(void) { long long ret; - long long mypid = getpid(); + pid_t mypid = getpid(); + bool with_x32; - printf("[RUN]\tChecking for x32 by calling x32 getpid()\n"); + run("Checking for x32 by calling x32 getpid()\n"); ret = probe_syscall(0, SYS_GETPID | X32_BIT); + indent++; if (ret == mypid) { - printf("[INFO]\t x32 is supported\n"); - return true; + info("x32 is supported\n"); + with_x32 = true; } else if (ret == -ENOSYS) { - printf("[INFO]\t x32 is not supported\n"); - return false; + info("x32 is not supported\n"); + with_x32 = false; } else { - printf("[FAIL]\t x32 getpid() returned %lld, but it should have returned either %lld or -ENOSYS\n", ret, mypid); - nerr++; - return true; /* Proceed as if... */ + fail("x32 getpid() returned %lld, but it should have returned either %lld or -ENOSYS\n", ret, mypid); + with_x32 = false; } + indent--; + return with_x32; } static void test_syscalls_common(int msb) { - printf("[RUN]\t Checking some common syscalls as 64 bit\n"); + run("Checking some common syscalls as 64 bit\n"); check_zero(msb, SYS_READ); check_zero(msb, SYS_WRITE); - printf("[RUN]\t Checking some 64-bit only syscalls as 64 bit\n"); + run("Checking some 64-bit only syscalls as 64 bit\n"); check_zero(msb, X64_READV); check_zero(msb, X64_WRITEV); - printf("[RUN]\t Checking out of range system calls\n"); + run("Checking out of range system calls\n"); check_for(msb, -64, -1, -ENOSYS); check_for(msb, X32_BIT-64, X32_BIT-1, -ENOSYS); check_for(msb, -64-X32_BIT, -1-X32_BIT, -ENOSYS); @@ -180,18 +220,18 @@ static void test_syscalls_with_x32(int msb) * set. Calling them without the x32 bit set is * nonsense and should not work. */ - printf("[RUN]\t Checking x32 syscalls as 64 bit\n"); + run("Checking x32 syscalls as 64 bit\n"); check_for(msb, 512, 547, -ENOSYS); - printf("[RUN]\t Checking some common syscalls as x32\n"); + run("Checking some common syscalls as x32\n"); check_zero(msb, SYS_READ | X32_BIT); check_zero(msb, SYS_WRITE | X32_BIT); - printf("[RUN]\t Checking some x32 syscalls as x32\n"); + run("Checking some x32 syscalls as x32\n"); check_zero(msb, X32_READV | X32_BIT); check_zero(msb, X32_WRITEV | X32_BIT); - printf("[RUN]\t Checking some 64-bit syscalls as x32\n"); + run("Checking some 64-bit syscalls as x32\n"); check_enosys(msb, X64_IOCTL | X32_BIT); check_enosys(msb, X64_READV | X32_BIT); check_enosys(msb, X64_WRITEV | X32_BIT); @@ -199,7 +239,7 @@ static void test_syscalls_with_x32(int msb) static void test_syscalls_without_x32(int msb) { - printf("[RUN]\t Checking for absence of x32 system calls\n"); + run("Checking for absence of x32 system calls\n"); check_for(msb, 0 | X32_BIT, 999 | X32_BIT, -ENOSYS); } @@ -217,14 +257,18 @@ static void test_syscall_numbering(void) */ for (size_t i = 0; i < sizeof(msbs)/sizeof(msbs[0]); i++) { int msb = msbs[i]; - printf("[RUN]\tChecking system calls with msb = %d (0x%x)\n", - msb, msb); + run("Checking system calls with msb = %d (0x%x)\n", + msb, msb); + + indent++; test_syscalls_common(msb); if (with_x32) test_syscalls_with_x32(msb); else test_syscalls_without_x32(msb); + + indent--; } } @@ -241,19 +285,16 @@ int main(void) */ nullfd = open("/dev/null", O_RDWR); if (nullfd < 0) { - printf("[FAIL]\tUnable to open /dev/null: %s\n", - strerror(errno)); - printf("[SKIP]\tCannot execute test\n"); - return 71; /* EX_OSERR */ + crit("Unable to open /dev/null: %s\n", strerror(errno)); } test_syscall_numbering(); if (!nerr) { - printf("[OK]\tAll system calls succeeded or failed as expected\n"); + ok("All system calls succeeded or failed as expected\n"); return 0; } else { - printf("[FAIL]\tA total of %u system call%s had incorrect behavior\n", - nerr, nerr != 1 ? "s" : ""); + fail("A total of %u system call%s had incorrect behavior\n", + nerr, nerr != 1 ? "s" : ""); return 1; } } From 795e2a023b8080b95442811f26f0762184116caa Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 18 May 2021 12:13:00 -0700 Subject: [PATCH 09/16] selftests/x86/syscall: Add tests under ptrace to syscall_numbering_64 Add tests running under ptrace for syscall_numbering_64. ptrace stopping on syscall entry and possibly modifying the syscall number (regs.orig_rax) or the default return value (regs.rax) can have different results than the normal system call path. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210518191303.4135296-4-hpa@zytor.com --- .../testing/selftests/x86/syscall_numbering.c | 232 ++++++++++++++++-- 1 file changed, 207 insertions(+), 25 deletions(-) diff --git a/tools/testing/selftests/x86/syscall_numbering.c b/tools/testing/selftests/x86/syscall_numbering.c index 434fe0efafa0c..991591718bb0a 100644 --- a/tools/testing/selftests/x86/syscall_numbering.c +++ b/tools/testing/selftests/x86/syscall_numbering.c @@ -16,8 +16,16 @@ #include #include #include +#include #include +#include +#include +#include +#include + +#include + /* Common system call numbers */ #define SYS_READ 0 #define SYS_WRITE 1 @@ -33,13 +41,45 @@ #define X32_BIT 0x40000000 -static unsigned int nerr = 0; /* Cumulative error count */ static int nullfd = -1; /* File descriptor for /dev/null */ -static int indent = 0; +static bool with_x32; /* x32 supported on this kernel? */ + +enum ptrace_pass { + PTP_NOTHING, + PTP_GETREGS, + PTP_WRITEBACK, + PTP_FUZZRET, + PTP_FUZZHIGH, + PTP_INTNUM, + PTP_DONE +}; + +static const char * const ptrace_pass_name[] = +{ + [PTP_NOTHING] = "just stop, no data read", + [PTP_GETREGS] = "only getregs", + [PTP_WRITEBACK] = "getregs, unmodified setregs", + [PTP_FUZZRET] = "modifying the default return", + [PTP_FUZZHIGH] = "clobbering the top 32 bits", + [PTP_INTNUM] = "sign-extending the syscall number", +}; + +/* + * Shared memory block between tracer and test + */ +struct shared { + unsigned int nerr; /* Total error count */ + unsigned int indent; /* Message indentation level */ + enum ptrace_pass ptrace_pass; + bool probing_syscall; /* In probe_syscall() */ +}; +static volatile struct shared *sh; static inline unsigned int offset(void) { - return 8 + indent * 4; + unsigned int level = sh ? sh->indent : 0; + + return 8 + level * 4; } #define msg(lvl, fmt, ...) printf("%-*s" fmt, offset(), "[" #lvl "]", \ @@ -52,16 +92,19 @@ static inline unsigned int offset(void) #define fail(fmt, ...) \ do { \ msg(FAIL, fmt, ## __VA_ARGS__); \ - nerr++; \ - } while (0) + sh->nerr++; \ + } while (0) #define crit(fmt, ...) \ do { \ - indent = 0; \ + sh->indent = 0; \ msg(FAIL, fmt, ## __VA_ARGS__); \ msg(SKIP, "Unable to run test\n"); \ - exit(EX_OSERR); - } while (0) + exit(EX_OSERR); \ + } while (0) + +/* Sentinel for ptrace-modified return value */ +#define MODIFIED_BY_PTRACE -9999 /* * Directly invokes the given syscall with nullfd as the first argument @@ -69,7 +112,7 @@ static inline unsigned int offset(void) * end up intercepting some system calls for some reason, or modify * the system call number itself. */ -static inline long long probe_syscall(int msb, int lsb) +static long long probe_syscall(int msb, int lsb) { register long long arg1 asm("rdi") = nullfd; register long long arg2 asm("rsi") = 0; @@ -80,11 +123,21 @@ static inline long long probe_syscall(int msb, int lsb) long long nr = ((long long)msb << 32) | (unsigned int)lsb; long long ret; + /* + * We pass in an extra copy of the extended system call number + * in %rbx, so we can examine it from the ptrace handler without + * worrying about it being possibly modified. This is to test + * the validity of struct user regs.orig_rax a.k.a. + * struct pt_regs.orig_ax. + */ + sh->probing_syscall = true; asm volatile("syscall" : "=a" (ret) - : "a" (nr), "r" (arg1), "r" (arg2), "r" (arg3), + : "a" (nr), "b" (nr), + "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4), "r" (arg5), "r" (arg6) : "rcx", "r11", "memory", "cc"); + sh->probing_syscall = false; return ret; } @@ -119,9 +172,9 @@ static unsigned int _check_for(int msb, int start, int end, long long expect, { unsigned int err = 0; - indent++; + sh->indent++; if (start != end) - indent++; + sh->indent++; for (int nr = start; nr <= end; nr++) { long long ret = probe_syscall(msb, nr); @@ -135,20 +188,19 @@ static unsigned int _check_for(int msb, int start, int end, long long expect, } if (start != end) - indent--; + sh->indent--; if (err) { - nerr += err; if (start != end) fail("%s had %u failure%s\n", - syscall_str(msb, start, end), - err, err == 1 ? "s" : ""); + syscall_str(msb, start, end), + err, err == 1 ? "s" : ""); } else { ok("%s returned %s as expected\n", syscall_str(msb, start, end), expect_str); } - indent--; + sh->indent--; return err; } @@ -175,12 +227,11 @@ static bool test_x32(void) { long long ret; pid_t mypid = getpid(); - bool with_x32; run("Checking for x32 by calling x32 getpid()\n"); ret = probe_syscall(0, SYS_GETPID | X32_BIT); - indent++; + sh->indent++; if (ret == mypid) { info("x32 is supported\n"); with_x32 = true; @@ -188,15 +239,17 @@ static bool test_x32(void) info("x32 is not supported\n"); with_x32 = false; } else { - fail("x32 getpid() returned %lld, but it should have returned either %lld or -ENOSYS\n", ret, mypid); + fail("x32 getpid() returned %lld, but it should have returned either %lld or -ENOSYS\n", ret, (long long)mypid); with_x32 = false; } - indent--; + sh->indent--; return with_x32; } static void test_syscalls_common(int msb) { + enum ptrace_pass pass = sh->ptrace_pass; + run("Checking some common syscalls as 64 bit\n"); check_zero(msb, SYS_READ); check_zero(msb, SYS_WRITE); @@ -206,7 +259,11 @@ static void test_syscalls_common(int msb) check_zero(msb, X64_WRITEV); run("Checking out of range system calls\n"); - check_for(msb, -64, -1, -ENOSYS); + check_for(msb, -64, -2, -ENOSYS); + if (pass >= PTP_FUZZRET) + check_for(msb, -1, -1, MODIFIED_BY_PTRACE); + else + check_for(msb, -1, -1, -ENOSYS); check_for(msb, X32_BIT-64, X32_BIT-1, -ENOSYS); check_for(msb, -64-X32_BIT, -1-X32_BIT, -ENOSYS); check_for(msb, INT_MAX-64, INT_MAX-1, -ENOSYS); @@ -249,7 +306,8 @@ static void test_syscall_numbering(void) 0, 1, -1, X32_BIT-1, X32_BIT, X32_BIT-1, -X32_BIT, INT_MAX, INT_MIN, INT_MIN+1 }; - bool with_x32 = test_x32(); + + sh->indent++; /* * The MSB is supposed to be ignored, so we loop over a few @@ -260,7 +318,7 @@ static void test_syscall_numbering(void) run("Checking system calls with msb = %d (0x%x)\n", msb, msb); - indent++; + sh->indent++; test_syscalls_common(msb); if (with_x32) @@ -268,12 +326,119 @@ static void test_syscall_numbering(void) else test_syscalls_without_x32(msb); - indent--; + sh->indent--; + } + + sh->indent--; +} + +static void syscall_numbering_tracee(void) +{ + enum ptrace_pass pass; + + if (ptrace(PTRACE_TRACEME, 0, 0, 0)) { + crit("Failed to request tracing\n"); + return; + } + raise(SIGSTOP); + + for (sh->ptrace_pass = pass = PTP_NOTHING; pass < PTP_DONE; + sh->ptrace_pass = ++pass) { + run("Running tests under ptrace: %s\n", ptrace_pass_name[pass]); + test_syscall_numbering(); + } +} + +static void mess_with_syscall(pid_t testpid, enum ptrace_pass pass) +{ + struct user_regs_struct regs; + + sh->probing_syscall = false; /* Do this on entry only */ + + /* For these, don't even getregs */ + if (pass == PTP_NOTHING || pass == PTP_DONE) + return; + + ptrace(PTRACE_GETREGS, testpid, NULL, ®s); + + if (regs.orig_rax != regs.rbx) { + fail("orig_rax %#llx doesn't match syscall number %#llx\n", + (unsigned long long)regs.orig_rax, + (unsigned long long)regs.rbx); + } + + switch (pass) { + case PTP_GETREGS: + /* Just read, no writeback */ + return; + case PTP_WRITEBACK: + /* Write back the same register state verbatim */ + break; + case PTP_FUZZRET: + regs.rax = MODIFIED_BY_PTRACE; + break; + case PTP_FUZZHIGH: + regs.rax = MODIFIED_BY_PTRACE; + regs.orig_rax = regs.orig_rax | 0xffffffff00000000ULL; + break; + case PTP_INTNUM: + regs.rax = MODIFIED_BY_PTRACE; + regs.orig_rax = (int)regs.orig_rax; + break; + default: + crit("invalid ptrace_pass\n"); + break; + } + + ptrace(PTRACE_SETREGS, testpid, NULL, ®s); +} + +static void syscall_numbering_tracer(pid_t testpid) +{ + int wstatus; + + do { + pid_t wpid = waitpid(testpid, &wstatus, 0); + if (wpid < 0 && errno != EINTR) + break; + if (wpid != testpid) + continue; + if (!WIFSTOPPED(wstatus)) + break; /* Thread exited? */ + + if (sh->probing_syscall && WSTOPSIG(wstatus) == SIGTRAP) + mess_with_syscall(testpid, sh->ptrace_pass); + } while (sh->ptrace_pass != PTP_DONE && + !ptrace(PTRACE_SYSCALL, testpid, NULL, NULL)); + + ptrace(PTRACE_DETACH, testpid, NULL, NULL); + + /* Wait for the child process to terminate */ + while (waitpid(testpid, &wstatus, 0) != testpid || !WIFEXITED(wstatus)) + /* wait some more */; +} + +static void test_traced_syscall_numbering(void) +{ + pid_t testpid; + + /* Launch the test thread; this thread continues as the tracer thread */ + testpid = fork(); + + if (testpid < 0) { + crit("Unable to launch tracer process\n"); + } else if (testpid == 0) { + syscall_numbering_tracee(); + _exit(0); + } else { + syscall_numbering_tracer(testpid); } } int main(void) { + unsigned int nerr; + /* * It is quite likely to get a segfault on a failure, so make * sure the message gets out by setting stdout to nonbuffered. @@ -288,7 +453,24 @@ int main(void) crit("Unable to open /dev/null: %s\n", strerror(errno)); } + /* + * Set up a block of shared memory... + */ + sh = mmap(NULL, sysconf(_SC_PAGE_SIZE), PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_SHARED, 0, 0); + if (sh == MAP_FAILED) { + crit("Unable to allocated shared memory block: %s\n", + strerror(errno)); + } + + with_x32 = test_x32(); + + run("Running tests without ptrace...\n"); test_syscall_numbering(); + + test_traced_syscall_numbering(); + + nerr = sh->nerr; if (!nerr) { ok("All system calls succeeded or failed as expected\n"); return 0; From 0595494891723a1dcca5eaa8eeca8ab54ad953b9 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 18 May 2021 12:13:01 -0700 Subject: [PATCH 10/16] x86/entry/64: Sign-extend system calls on entry to int Right now, *some* code will treat e.g. 0x0000000100000001 as a system call and some will not. Some of the code, notably in ptrace, will treat 0x000000018000000 as a system call and some will not. Finally, right now, e.g. 335 for x86-64 will force the exit code to be set to -ENOSYS even if poked by ptrace, but 548 will not, because there is an observable difference between an out of range system call and a system call number that falls outside the range of the table. This is visible to the user: for example, the syscall_numbering_64 test fails if run under strace, because as strace uses ptrace, it ends up clobbering the upper half of the 64-bit system call number. The architecture independent code all assumes that a system call is "int" that the value -1 specifically and not just any negative value is used for a non-system call. This is the case on x86 as well when arch-independent code is involved. The arch-independent API is defined/documented (but not *implemented*!) in . This is an ABI change, but is in fact a revert to the original x86-64 ABI. The original assembly entry code would zero-extend the system call number; Use sign extend to be explicit that this is treated as a signed number (although in practice it makes no difference, of course) and to avoid people getting the idea of "optimizing" it, as has happened on at least two(!) separate occasions. Do not store the extended value into regs->orig_ax, however: on x86-64, the ABI is that the callee is responsible for extending parameters, so only examining the lower 32 bits is fully consistent with any "int" argument to any system call, e.g. regs->di for write(2). The full value of %rax on entry to the kernel is thus still available. [ tglx: Add a comment to the ASM code ] Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210518191303.4135296-5-hpa@zytor.com --- arch/x86/entry/entry_64.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1d9db15fdc692..a5f02d03c585a 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -108,7 +108,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) /* IRQs are off. */ movq %rsp, %rdi - movq %rax, %rsi + /* Sign extend the lower 32bit as syscall numbers are treated as int */ + movslq %eax, %rsi call do_syscall_64 /* returns with IRQs disabled */ /* From b337b4965e3a3e567f11828a9e3fe3fb3faefa47 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 18 May 2021 12:13:02 -0700 Subject: [PATCH 11/16] x86/entry: Treat out of range and gap system calls the same The current 64-bit system call entry code treats out-of-range system calls differently than system calls that map to a hole in the system call table. This is visible to the user if system calls are intercepted via ptrace or seccomp and the return value (regs->ax) is modified: in the former case, the return value is preserved, and in the latter case, sys_ni_syscall() is called and the return value is forced to -ENOSYS. The API spec in is very clear that only (int)-1 is the non-system-call sentinel value, so make the system call behavior consistent by calling sys_ni_syscall() for all invalid system call numbers except for -1. Although currently sys_ni_syscall() simply returns -ENOSYS, calling it explicitly is friendly for tracing and future possible extensions, and as this is an error path there is no reason to optimize it. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210518191303.4135296-6-hpa@zytor.com --- arch/x86/entry/common.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 00da0f5420de8..f51bc17262db1 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -52,6 +52,8 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, unsigned long nr) X32_NR_syscalls); regs->ax = x32_sys_call_table[nr](regs); #endif + } else if (unlikely((int)nr != -1)) { + regs->ax = __x64_sys_ni_syscall(regs); } instrumentation_end(); syscall_exit_to_user_mode(regs); @@ -76,6 +78,8 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, if (likely(nr < IA32_NR_syscalls)) { nr = array_index_nospec(nr, IA32_NR_syscalls); regs->ax = ia32_sys_call_table[nr](regs); + } else if (unlikely((int)nr != -1)) { + regs->ax = __ia32_sys_ni_syscall(regs); } } From 2978996f620001f4e748c79af0fe89be729ef58d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 18 May 2021 12:13:03 -0700 Subject: [PATCH 12/16] x86/entry: Use int everywhere for system call numbers System call numbers are defined as int, so use int everywhere for system call numbers. This is strictly a cleanup; it should not change anything user visible; all ABI changes have been done in the preceeding patches. [ tglx: Replaced the unsigned long cast ] Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210518191303.4135296-7-hpa@zytor.com --- arch/x86/entry/common.c | 87 +++++++++++++++++++++++----------- arch/x86/include/asm/syscall.h | 2 +- 2 files changed, 60 insertions(+), 29 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index f51bc17262db1..ee95fe3f15185 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -36,49 +36,81 @@ #include #ifdef CONFIG_X86_64 -__visible noinstr void do_syscall_64(struct pt_regs *regs, unsigned long nr) + +static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) +{ + /* + * Convert negative numbers to very high and thus out of range + * numbers for comparisons. + */ + unsigned int unr = nr; + + if (likely(unr < NR_syscalls)) { + unr = array_index_nospec(unr, NR_syscalls); + regs->ax = sys_call_table[unr](regs); + return true; + } + return false; +} + +static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) +{ + /* + * Adjust the starting offset of the table, and convert numbers + * < __X32_SYSCALL_BIT to very high and thus out of range + * numbers for comparisons. + */ + unsigned int xnr = nr - __X32_SYSCALL_BIT; + + if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { + xnr = array_index_nospec(xnr, X32_NR_syscalls); + regs->ax = x32_sys_call_table[xnr](regs); + return true; + } + return false; +} + +__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) { add_random_kstack_offset(); nr = syscall_enter_from_user_mode(regs, nr); instrumentation_begin(); - if (likely(nr < NR_syscalls)) { - nr = array_index_nospec(nr, NR_syscalls); - regs->ax = sys_call_table[nr](regs); -#ifdef CONFIG_X86_X32_ABI - } else if (likely((nr & __X32_SYSCALL_BIT) && - (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) { - nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT, - X32_NR_syscalls); - regs->ax = x32_sys_call_table[nr](regs); -#endif - } else if (unlikely((int)nr != -1)) { + + if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) { + /* Invalid system call, but still a system call. */ regs->ax = __x64_sys_ni_syscall(regs); } + instrumentation_end(); syscall_exit_to_user_mode(regs); } #endif #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) -static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs) +static __always_inline int syscall_32_enter(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_IA32_EMULATION)) current_thread_info()->status |= TS_COMPAT; - return (unsigned int)regs->orig_ax; + return (int)regs->orig_ax; } /* * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. */ -static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, - unsigned int nr) +static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) { - if (likely(nr < IA32_NR_syscalls)) { - nr = array_index_nospec(nr, IA32_NR_syscalls); - regs->ax = ia32_sys_call_table[nr](regs); - } else if (unlikely((int)nr != -1)) { + /* + * Convert negative numbers to very high and thus out of range + * numbers for comparisons. + */ + unsigned int unr = nr; + + if (likely(unr < IA32_NR_syscalls)) { + unr = array_index_nospec(unr, IA32_NR_syscalls); + regs->ax = ia32_sys_call_table[unr](regs); + } else if (nr != -1) { regs->ax = __ia32_sys_ni_syscall(regs); } } @@ -86,15 +118,15 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, /* Handles int $0x80 */ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { - unsigned int nr = syscall_32_enter(regs); + int nr = syscall_32_enter(regs); add_random_kstack_offset(); /* - * Subtlety here: if ptrace pokes something larger than 2^32-1 into - * orig_ax, the unsigned int return value truncates it. This may - * or may not be necessary, but it matches the old asm behavior. + * Subtlety here: if ptrace pokes something larger than 2^31-1 into + * orig_ax, the int return value truncates it. This matches + * the semantics of syscall_get_nr(). */ - nr = (unsigned int)syscall_enter_from_user_mode(regs, nr); + nr = syscall_enter_from_user_mode(regs, nr); instrumentation_begin(); do_syscall_32_irqs_on(regs, nr); @@ -105,7 +137,7 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) { - unsigned int nr = syscall_32_enter(regs); + int nr = syscall_32_enter(regs); int res; add_random_kstack_offset(); @@ -140,8 +172,7 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) return false; } - /* The case truncates any ptrace induced syscall nr > 2^32 -1 */ - nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr); + nr = syscall_enter_from_user_mode_work(regs, nr); /* Now this is just like a normal syscall. */ do_syscall_32_irqs_on(regs, nr); diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index f6593cafdbd93..f7e2d82d24fb1 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -159,7 +159,7 @@ static inline int syscall_get_arch(struct task_struct *task) ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } -void do_syscall_64(struct pt_regs *regs, unsigned long nr); +void do_syscall_64(struct pt_regs *regs, int nr); void do_int80_syscall_32(struct pt_regs *regs); long do_fast_syscall_32(struct pt_regs *regs); From 1eb8a49836949a77c4f7d738786719e7fde0c333 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 25 May 2021 20:54:20 +0900 Subject: [PATCH 13/16] x86/syscalls: Clear 'offset' and 'prefix' in case they are set in env If the environment variable 'prefix' is set on the build host, it is wrongly used as syscall macro prefixes. $ export prefix=/usr $ make -s defconfig all In file included from ./arch/x86/include/asm/unistd.h:20, from :2: ./arch/x86/include/generated/uapi/asm/unistd_64.h:4:9: warning: missing whitespace after the macro name 4 | #define __NR_/usrread 0 | ^~~~~ arch/x86/entry/syscalls/Makefile should clear 'offset' and 'prefix'. Fixes: 3cba325b358f ("x86/syscalls: Switch to generic syscallhdr.sh") Reported-by: Naresh Kamboju Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210525115420.679416-1-masahiroy@kernel.org --- arch/x86/entry/syscalls/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile index 8eb014bca8c94..5b3efed0e4e86 100644 --- a/arch/x86/entry/syscalls/Makefile +++ b/arch/x86/entry/syscalls/Makefile @@ -11,6 +11,8 @@ syscall64 := $(src)/syscall_64.tbl syshdr := $(srctree)/scripts/syscallhdr.sh systbl := $(srctree)/scripts/syscalltbl.sh +offset := +prefix := quiet_cmd_syshdr = SYSHDR $@ cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --abis $(abis) --emit-nr \ From d48ca5b98fa5d21444e04bb17373d339200b679a Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 24 May 2021 14:17:05 -0400 Subject: [PATCH 14/16] x86/uml/syscalls: Remove array index from syscall initializers The recent syscall table generator rework removed the index from the initializers for native x86 syscall tables, but missed the UML syscall tables. Fixes: 44fe4895f47c ("Stop filling syscall arrays with *_sys_ni_syscall") Signed-off-by: Brian Gerst Signed-off-by: Thomas Gleixner Reviewed-by: Masahiro Yamada Link: https://lore.kernel.org/r/20210524181707.132844-2-brgerst@gmail.com --- arch/x86/um/sys_call_table_32.c | 2 +- arch/x86/um/sys_call_table_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index f8323104e3536..0575decb5e544 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -31,7 +31,7 @@ #include #undef __SYSCALL -#define __SYSCALL(nr, sym) [ nr ] = sym, +#define __SYSCALL(nr, sym) sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 5ed665dc785fb..95725b5a41ac8 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -39,7 +39,7 @@ #include #undef __SYSCALL -#define __SYSCALL(nr, sym) [ nr ] = sym, +#define __SYSCALL(nr, sym) sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); From fd9e8691f38712892fa2ac73132dcc8b85b07a8f Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 24 May 2021 14:17:06 -0400 Subject: [PATCH 15/16] x86/syscalls: Remove -Wno-override-init for syscall tables Commit 44fe4895f47c ("Stop filling syscall arrays with *_sys_ni_syscall") removes the need for -Wno-override-init, since the table is now filled sequentially instead of overriding a default value. Signed-off-by: Brian Gerst Signed-off-by: Thomas Gleixner Reviewed-by: Masahiro Yamada Link: https://lore.kernel.org/r/20210524181707.132844-3-brgerst@gmail.com --- arch/x86/entry/Makefile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 08bf95dbc9112..94d2843ce80c6 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -17,10 +17,6 @@ CFLAGS_syscall_64.o += -fno-stack-protector CFLAGS_syscall_32.o += -fno-stack-protector CFLAGS_syscall_x32.o += -fno-stack-protector -CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) -CFLAGS_syscall_32.o += $(call cc-option,-Wno-override-init,) -CFLAGS_syscall_x32.o += $(call cc-option,-Wno-override-init,) - obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += common.o From 48f7eee81cd53a94699d28959566b41a9dcac1d9 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 24 May 2021 14:17:07 -0400 Subject: [PATCH 16/16] x86/syscalls: Don't adjust CFLAGS for syscall tables The syscall_*.c files only contain data (the syscall tables). There is no need to adjust CFLAGS for tracing and stack protector since they contain no code. Signed-off-by: Brian Gerst Signed-off-by: Thomas Gleixner Reviewed-by: Masahiro Yamada Link: https://lore.kernel.org/r/20210524181707.132844-4-brgerst@gmail.com --- arch/x86/entry/Makefile | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 94d2843ce80c6..7fec5dcf64386 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -8,14 +8,8 @@ UBSAN_SANITIZE := n KCOV_INSTRUMENT := n CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_syscall_x32.o = $(CC_FLAGS_FTRACE) CFLAGS_common.o += -fno-stack-protector -CFLAGS_syscall_64.o += -fno-stack-protector -CFLAGS_syscall_32.o += -fno-stack-protector -CFLAGS_syscall_x32.o += -fno-stack-protector obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += common.o