[llvm] [ARM64EC] Fix thunks for vector args (PR #96003)
Daniel Paoliello via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 18 15:32:13 PDT 2024
https://github.com/dpaoliello created https://github.com/llvm/llvm-project/pull/96003
When building a thunk, the checks that decide whether an arg needs to be cast to/from an integer or redirected via a pointer didn't match how arg types are changed in `canonicalizeThunkType`. This caused LLVM to ICE when vector types were used as args, since the call instruction was built with incorrect types.
Instead of duplicating these checks, we should check whether the arg type differs between the x64 and AArch64 signatures and then cast or redirect as appropriate.
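
For reference, here is a rough sketch of the reworked exit-thunk argument loop (paraphrasing the patch below, not the exact code: `IRB`, `DL`, `Args`, `F`, `X64Ty`, and `X64TyOffset` come from the surrounding thunk-building code, and the small-argument reload is simplified):

  // Walk the AArch64 args alongside the x64 thunk's parameter types and only
  // cast or indirect when the two types actually differ, so the condition
  // cannot drift out of sync with canonicalizeThunkType.
  for (auto [Arg, X64ArgType] :
       llvm::zip_equal(make_range(F->arg_begin() + 1, F->arg_end()),
                       make_range(X64Ty->param_begin() + X64TyOffset,
                                  X64Ty->param_end()))) {
    if (Arg.getType() == X64ArgType) {
      Args.push_back(&Arg); // Types already agree; pass through unchanged.
      continue;
    }
    Value *Mem = IRB.CreateAlloca(Arg.getType());
    IRB.CreateStore(&Arg, Mem);
    if (DL.getTypeStoreSize(Arg.getType()) <= 8) {
      // Small aggregates/vectors are canonicalized to an integer of the same
      // store size, so reload through memory to reinterpret the bits.
      Args.push_back(IRB.CreateLoad(X64ArgType, Mem));
    } else {
      // Anything larger is passed indirectly via a pointer.
      Args.push_back(Mem);
    }
  }

The entry thunk takes the symmetric approach: if the AArch64 parameter type doesn't match the incoming x64 argument type, the value is reloaded or dereferenced before the call, as shown in the buildEntryThunk hunk below.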
From 8b9e0f0cb483c4b8338587a031a8911d1354a80a Mon Sep 17 00:00:00 2001
From: Daniel Paoliello <danpao at microsoft.com>
Date: Tue, 18 Jun 2024 12:42:50 -0700
Subject: [PATCH] [ARM64EC] Fix thunks for vector args
---
.../AArch64/AArch64Arm64ECCallLowering.cpp | 10 +-
.../CodeGen/AArch64/arm64ec-entry-thunks.ll | 109 ++++++++++++++++
.../CodeGen/AArch64/arm64ec-exit-thunks.ll | 117 ++++++++++++++++++
3 files changed, 231 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 218201f24aaab..b6cd816d78938 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -387,6 +387,7 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT,
SmallVector<Value *> Args;
// Pass the called function in x9.
+ auto X64TyOffset = 1;
Args.push_back(F->arg_begin());
Type *RetTy = Arm64Ty->getReturnType();
@@ -396,10 +397,11 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT,
// pointer.
if (DL.getTypeStoreSize(RetTy) > 8) {
Args.push_back(IRB.CreateAlloca(RetTy));
+ X64TyOffset++;
}
}
- for (auto &Arg : make_range(F->arg_begin() + 1, F->arg_end())) {
+ for (auto [Arg, X64ArgType] : llvm::zip_equal(make_range(F->arg_begin() + 1, F->arg_end()), make_range(X64Ty->param_begin() + X64TyOffset, X64Ty->param_end()))) {
// Translate arguments from AArch64 calling convention to x86 calling
// convention.
//
@@ -414,8 +416,7 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT,
// with an attribute.)
//
// The first argument is the called function, stored in x9.
- if (Arg.getType()->isArrayTy() || Arg.getType()->isStructTy() ||
- DL.getTypeStoreSize(Arg.getType()) > 8) {
+ if (Arg.getType() != X64ArgType) {
Value *Mem = IRB.CreateAlloca(Arg.getType());
IRB.CreateStore(&Arg, Mem);
if (DL.getTypeStoreSize(Arg.getType()) <= 8) {
@@ -488,8 +489,7 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) {
for (unsigned i = ThunkArgOffset, e = PassthroughArgSize; i != e; ++i) {
Value *Arg = Thunk->getArg(i);
Type *ArgTy = Arm64Ty->getParamType(i - ThunkArgOffset);
- if (ArgTy->isArrayTy() || ArgTy->isStructTy() ||
- DL.getTypeStoreSize(ArgTy) > 8) {
+ if (ArgTy != Arg->getType()) {
// Translate array/struct arguments to the expected type.
if (DL.getTypeStoreSize(ArgTy) <= 8) {
Value *CastAlloca = IRB.CreateAlloca(ArgTy);
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
index 0cf678f56e03c..6aeeeed94543d 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
@@ -487,6 +487,109 @@ define void @cxx_method(ptr noundef nonnull align 8 dereferenceable(8) %0, ptr d
ret void
}
+define <4 x i8> @small_vector(<4 x i8> %0) {
+; CHECK-LABEL: .def $ientry_thunk$cdecl$m$m;
+; CHECK: .section .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$m$m
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #192
+; CHECK-NEXT: .seh_stackalloc 192
+; CHECK-NEXT: stp q6, q7, [sp, #16] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q6, 16
+; CHECK-NEXT: stp q8, q9, [sp, #48] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q8, 48
+; CHECK-NEXT: stp q10, q11, [sp, #80] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q10, 80
+; CHECK-NEXT: stp q12, q13, [sp, #112] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q12, 112
+; CHECK-NEXT: stp q14, q15, [sp, #144] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q14, 144
+; CHECK-NEXT: stp x29, x30, [sp, #176] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 176
+; CHECK-NEXT: add x29, sp, #176
+; CHECK-NEXT: .seh_add_fp 176
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: str w0, [sp, #12]
+; CHECK-NEXT: ldr s0, [sp, #12]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: blr x9
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-NEXT: adrp x9, __os_arm64x_dispatch_ret
+; CHECK-NEXT: str s0, [sp, #8]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: ldr x0, [x9, :lo12:__os_arm64x_dispatch_ret]
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldp x29, x30, [sp, #176] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 176
+; CHECK-NEXT: ldp q14, q15, [sp, #144] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q14, 144
+; CHECK-NEXT: ldp q12, q13, [sp, #112] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q12, 112
+; CHECK-NEXT: ldp q10, q11, [sp, #80] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q10, 80
+; CHECK-NEXT: ldp q8, q9, [sp, #48] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q8, 48
+; CHECK-NEXT: ldp q6, q7, [sp, #16] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q6, 16
+; CHECK-NEXT: add sp, sp, #192
+; CHECK-NEXT: .seh_stackalloc 192
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: br x0
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+start:
+ ret <4 x i8> %0
+}
+
+define <8 x i16> @large_vector(<8 x i16> %0) {
+; CHECK-LABEL: .def $ientry_thunk$cdecl$m16$m16;
+; CHECK: .section .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$m16$m16
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp q6, q7, [sp, #-192]! // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_px q6, 192
+; CHECK-NEXT: stp q8, q9, [sp, #32] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q8, 32
+; CHECK-NEXT: stp q10, q11, [sp, #64] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q10, 64
+; CHECK-NEXT: stp q12, q13, [sp, #96] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q12, 96
+; CHECK-NEXT: stp q14, q15, [sp, #128] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q14, 128
+; CHECK-NEXT: str x19, [sp, #160] // 8-byte Folded Spill
+; CHECK-NEXT: .seh_save_reg x19, 160
+; CHECK-NEXT: stp x29, x30, [sp, #168] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 168
+; CHECK-NEXT: add x29, sp, #168
+; CHECK-NEXT: .seh_add_fp 168
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: blr x9
+; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_ret
+; CHECK-NEXT: str q0, [x19]
+; CHECK-NEXT: ldr x0, [x8, :lo12:__os_arm64x_dispatch_ret]
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldp x29, x30, [sp, #168] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 168
+; CHECK-NEXT: ldr x19, [sp, #160] // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg x19, 160
+; CHECK-NEXT: ldp q14, q15, [sp, #128] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q14, 128
+; CHECK-NEXT: ldp q12, q13, [sp, #96] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q12, 96
+; CHECK-NEXT: ldp q10, q11, [sp, #64] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q10, 64
+; CHECK-NEXT: ldp q8, q9, [sp, #32] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q8, 32
+; CHECK-NEXT: ldp q6, q7, [sp], #192 // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_px q6, 192
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: br x0
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+start:
+ ret <8 x i16> %0
+}
; Verify the hybrid bitmap
; CHECK-LABEL: .section .hybmp$x,"yi"
@@ -523,3 +626,9 @@ define void @cxx_method(ptr noundef nonnull align 8 dereferenceable(8) %0, ptr d
; CHECK-NEXT: .symidx "#cxx_method"
; CHECK-NEXT: .symidx $ientry_thunk$cdecl$i8$i8i8
; CHECK-NEXT: .word 1
+; CHECK-NEXT: .symidx "#small_vector"
+; CHECK-NEXT: .symidx $ientry_thunk$cdecl$m$m
+; CHECK-NEXT: .word 1
+; CHECK-NEXT: .symidx "#large_vector"
+; CHECK-NEXT: .symidx $ientry_thunk$cdecl$m16$m16
+; CHECK-NEXT: .word 1
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll
index 7a40fcd85ac58..dcc675839b714 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll
@@ -457,6 +457,109 @@ declare %T2 @simple_struct(%T1, %T2, %T3, %T4) nounwind;
; CHECK-NEXT: .seh_endfunclet
; CHECK-NEXT: .seh_endproc
+declare <4 x i8> @small_vector(<4 x i8> %0) nounwind;
+; CHECK-LABEL: .def $iexit_thunk$cdecl$m$m;
+; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m$m
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: .seh_stackalloc 64
+; CHECK-NEXT: stp x29, x30, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 48
+; CHECK-NEXT: add x29, sp, #48
+; CHECK-NEXT: .seh_add_fp 48
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect
+; CHECK-NEXT: ldr x16, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect]
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: stur s0, [x29, #-4]
+; CHECK-NEXT: blr x16
+; CHECK-NEXT: stur w8, [x29, #-8]
+; CHECK-NEXT: ldur s0, [x29, #-8]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldp x29, x30, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 48
+; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: .seh_stackalloc 64
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: ret
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+; CHECK-LABEL: .def "#small_vector$exit_thunk";
+; CHECK: .section .wowthk$aa,"xr",discard,"#small_vector$exit_thunk"
+; CHECK: .weak_anti_dep small_vector
+; CHECK: .weak_anti_dep "#small_vector"
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: adrp x8, __os_arm64x_check_icall
+; CHECK-NEXT: adrp x11, small_vector
+; CHECK-NEXT: add x11, x11, :lo12:small_vector
+; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall]
+; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$m$m)
+; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$m$m)
+; CHECK-NEXT: blr x8
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: br x11
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+
+declare <8 x i16> @large_vector(<8 x i16> %0) nounwind;
+; CHECK-LABEL: .def $iexit_thunk$cdecl$m16$m16;
+; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m16$m16
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #80
+; CHECK-NEXT: .seh_stackalloc 80
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 64
+; CHECK-NEXT: add x29, sp, #64
+; CHECK-NEXT: .seh_add_fp 64
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: add x1, sp, #32
+; CHECK-NEXT: ldr x16, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect]
+; CHECK-NEXT: str q0, [sp, #32]
+; CHECK-NEXT: blr x16
+; CHECK-NEXT: ldur q0, [x29, #-16]
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 64
+; CHECK-NEXT: add sp, sp, #80
+; CHECK-NEXT: .seh_stackalloc 80
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: ret
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+; CHECK-LABEL: .def "#large_vector$exit_thunk";
+; CHECK: .section .wowthk$aa,"xr",discard,"#large_vector$exit_thunk"
+; CHECK: .weak_anti_dep large_vector
+; CHECK: .weak_anti_dep "#large_vector"
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: adrp x8, __os_arm64x_check_icall
+; CHECK-NEXT: adrp x11, large_vector
+; CHECK-NEXT: add x11, x11, :lo12:large_vector
+; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall]
+; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$m16$m16)
+; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$m16$m16)
+; CHECK-NEXT: blr x8
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: br x11
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+
; CHECK-LABEL: .section .hybmp$x,"yi"
; CHECK-NEXT: .symidx "#func_caller"
; CHECK-NEXT: .symidx $ientry_thunk$cdecl$v$v
@@ -515,6 +618,18 @@ declare %T2 @simple_struct(%T1, %T2, %T3, %T4) nounwind;
; CHECK-NEXT: .symidx "#simple_struct$exit_thunk"
; CHECK-NEXT: .symidx simple_struct
; CHECK-NEXT: .word 0
+; CHECK-NEXT: .symidx small_vector
+; CHECK-NEXT: .symidx $iexit_thunk$cdecl$m$m
+; CHECK-NEXT: .word 4
+; CHECK-NEXT: .symidx "#small_vector$exit_thunk"
+; CHECK-NEXT: .symidx small_vector
+; CHECK-NEXT: .word 0
+; CHECK-NEXT: .symidx large_vector
+; CHECK-NEXT: .symidx $iexit_thunk$cdecl$m16$m16
+; CHECK-NEXT: .word 4
+; CHECK-NEXT: .symidx "#large_vector$exit_thunk"
+; CHECK-NEXT: .symidx large_vector
+; CHECK-NEXT: .word 0
define void @func_caller() nounwind {
call void @no_op()
@@ -529,5 +644,7 @@ define void @func_caller() nounwind {
call [2 x i8] @small_array([2 x i8] [i8 0, i8 0], [2 x float] [float 0.0, float 0.0])
call [3 x i64] @large_array([3 x i64] [i64 0, i64 0, i64 0], [2 x double] [double 0.0, double 0.0], [2 x [2 x i64]] [[2 x i64] [i64 0, i64 0], [2 x i64] [i64 0, i64 0]])
call %T2 @simple_struct(%T1 { i16 0 }, %T2 { i32 0, float 0.0 }, %T3 { i64 0, double 0.0 }, %T4 { i64 0, double 0.0, i8 0 })
+ call <4 x i8> @small_vector(<4 x i8> <i8 0, i8 0, i8 0, i8 0>)
+ call <8 x i16> @large_vector(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
ret void
}