[llvm] [AArch64][SVE] Handle consecutive Predicates in CC_AArch64_Custom_Block (PR #90122)
Zhaoshi Zheng via llvm-commits
llvm-commits at lists.llvm.org
Thu May 23 12:56:12 PDT 2024
https://github.com/zhaoshiz updated https://github.com/llvm/llvm-project/pull/90122
>From 27cd3a438d53413c4f5e2ba99a98b4bdec788d2c Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Tue, 26 Mar 2024 16:01:11 -0700
Subject: [PATCH 1/5] [AArch64][SVE] Handle consecutive Predicates in
CC_AArch64_Custom_Block
For 2D masks passed as function arguments, even those of type
[1 x <vscale x 4 x i1>], the arguments are flagged as InConsecutiveRegs.
This fix checks for mask types and allocates them to P registers.
---
.../AArch64/AArch64CallingConvention.cpp | 14 +++++++++---
.../CodeGen/AArch64/sve-calling-convention.ll | 22 +++++++++++++++++++
2 files changed, 33 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
index bfcafc6442d24..9a2838992eb02 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -38,6 +38,8 @@ static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
static const MCPhysReg ZRegList[] = {AArch64::Z0, AArch64::Z1, AArch64::Z2,
AArch64::Z3, AArch64::Z4, AArch64::Z5,
AArch64::Z6, AArch64::Z7};
+static const MCPhysReg PRegList[] = {AArch64::P0, AArch64::P1, AArch64::P2,
+ AArch64::P3};
static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
@@ -140,9 +142,15 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
RegList = DRegList;
else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector())
RegList = QRegList;
- else if (LocVT.isScalableVector())
- RegList = ZRegList;
- else {
+ else if (LocVT.isScalableVector()) {
+ // Scalable masks should be pass by Predicate registers.
+ if (LocVT == MVT::nxv1i1 || LocVT == MVT::nxv2i1 || LocVT == MVT::nxv4i1 ||
+ LocVT == MVT::nxv8i1 || LocVT == MVT::nxv16i1 ||
+ LocVT == MVT::aarch64svcount)
+ RegList = PRegList;
+ else
+ RegList = ZRegList;
+ } else {
// Not an array we want to split up after all.
return false;
}
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
index 0a45244f12be5..a0eee24275f1e 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
@@ -128,6 +128,14 @@ define <vscale x 4 x i1> @sve_signature_pred(<vscale x 4 x i1> %arg1, <vscale x
ret <vscale x 4 x i1> %arg2
}
+; CHECK-LABEL: name: sve_signature_pred_2d
+; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $p0
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+ ret [1 x <vscale x 4 x i1>] %arg2
+}
+
; CHECK-LABEL: name: sve_signature_vec_caller
; CHECK-DAG: [[ARG2:%[0-9]+]]:zpr = COPY $z1
; CHECK-DAG: [[ARG1:%[0-9]+]]:zpr = COPY $z0
@@ -156,6 +164,20 @@ define <vscale x 4 x i1> @sve_signature_pred_caller(<vscale x 4 x i1> %arg1, <vs
ret <vscale x 4 x i1> %res
}
+; CHECK-LABEL: name: sve_signature_pred_2d_caller
+; CHECK-DAG: [[ARG2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2]]
+; CHECK-DAG: $p1 = COPY [[ARG1]]
+; CHECK-NEXT: BL @sve_signature_pred_2d, csr_aarch64_sve_aapcs
+; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p0
+; CHECK: $p0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $p0
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_caller([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+ %res = call [1 x <vscale x 4 x i1>] @sve_signature_pred_2d([1 x <vscale x 4 x i1>] %arg2, [1 x <vscale x 4 x i1>] %arg1)
+ ret [1 x <vscale x 4 x i1>] %res
+}
+
; Test that functions returning or taking SVE arguments use the correct
; callee-saved set when using the default C calling convention (as opposed
; to aarch64_sve_vector_pcs)
>From 8264e9af78e5014b342c3e297ba8ae5b6acf72bf Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Thu, 9 May 2024 16:46:24 -0700
Subject: [PATCH 2/5] [AArch64][SVE] Handle consecutive Predicates arguments
through stack
Per AAPCS64, when P0~P3 are exhausted or not able to hold a scalable predicate
argument, the argument is allocated to the stack and a pointer is passed
to the callee.
For consecutive predicates in types like [M x <vscale x N x i1>], we
should handle them in the same way as Z registers, as shown by:
https://reviews.llvm.org/D71216
Reference:
https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#parameter-passing
---
.../AArch64/AArch64CallingConvention.cpp | 15 ++++-
.../AArch64/sve-calling-convention-byref.ll | 60 +++++++++++++++++++
.../CodeGen/AArch64/sve-calling-convention.ll | 44 ++++++++++++--
3 files changed, 110 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
index 9a2838992eb02..9a804c12939c4 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -61,11 +61,17 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
// CCAssignFn again we want it to behave as if all remaining registers are
// allocated. This will force the code to pass the tuple indirectly in
// accordance with the PCS.
- bool RegsAllocated[8];
+ bool ZRegsAllocated[8];
for (int I = 0; I < 8; I++) {
- RegsAllocated[I] = State.isAllocated(ZRegList[I]);
+ ZRegsAllocated[I] = State.isAllocated(ZRegList[I]);
State.AllocateReg(ZRegList[I]);
}
+ // The same applies to P registers.
+ bool PRegsAllocated[4];
+ for (int I = 0; I < 4; I++) {
+ PRegsAllocated[I] = State.isAllocated(PRegList[I]);
+ State.AllocateReg(PRegList[I]);
+ }
auto &It = PendingMembers[0];
CCAssignFn *AssignFn =
@@ -81,8 +87,11 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
// Return the register state back to how it was before, leaving any
// unallocated registers available for other smaller types.
for (int I = 0; I < 8; I++)
- if (!RegsAllocated[I])
+ if (!ZRegsAllocated[I])
State.DeallocateReg(ZRegList[I]);
+ for (int I = 0; I < 4; I++)
+ if (!PRegsAllocated[I])
+ State.DeallocateReg(PRegList[I]);
// All pending members have now been allocated
PendingMembers.clear();
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
index 8cb8b1c92fa7e..3a9e29f464011 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
@@ -72,6 +72,66 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_many_svepred_arg(<
ret <vscale x 16 x i1> %ret
}
+; Test that arg2 is passed through x0, i.e., x0 = &%arg2; and return values are loaded from x0:
+; P0 = ldr [x0]
+; P1 = ldr [x0 + sizeof(Px)]
+; P2 = ldr [x0 + 2*sizeof(Px)]
+; P3 = ldr [x0 + 3*sizeof(Px)]
+define aarch64_sve_vector_pcs [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_2d_4x([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_2d_4x
+; CHECK: [[BASE:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[BASE]], killed [[OFFSET1]]
+; CHECK: [[PRED1:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR1]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET2]]
+; CHECK: [[PRED2:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR2]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET3]]
+; CHECK: [[PRED3:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR3]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: $p0 = COPY [[PRED0]]
+; CHECK: $p1 = COPY [[PRED1]]
+; CHECK: $p2 = COPY [[PRED2]]
+; CHECK: $p3 = COPY [[PRED3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+ ret [4 x <vscale x 16 x i1>] %arg2
+}
+
+; Test that arg1 is stored to the stack from p0~p3; and the stack location is passed throuch x0 to setup the call:
+; str P0, [stack_loc_for_args]
+; str P1, [stack_loc_for_args + sizeof(Px)]
+; str P2, [stack_loc_for_args + 2*sizeof(Px)]
+; str P3, [stack_loc_for_args + 3*sizeof(Px)]
+; x0 = stack_loc_for_args
+define [4 x <vscale x 16 x i1>] @caller_with_svepred_arg_2d_4x([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_2d_4x
+; CHECK: stack:
+; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
+; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK: [[PRED3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[PRED2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[PRED1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[PRED0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[STACK:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], [[OFFSET3]]
+; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+; CHECK: STR_PXI [[PRED3]], killed [[ADDR3]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], [[OFFSET2]]
+; CHECK: STR_PXI [[PRED2]], killed [[ADDR2]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[STACK]], [[OFFSET1]]
+; CHECK: STR_PXI [[PRED1]], killed [[ADDR1]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
+; CHECK: $x0 = COPY [[STACK]]
+; CHECK: BL @callee_with_svepred_arg_2d_4x, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $p1, implicit $p2, implicit $p3, implicit $x0, implicit-def $sp, implicit-def $p0, implicit-def $p1, implicit-def $p2, implicit-def $p3
+; CHECK: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ %res = call [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_2d_4x([4 x <vscale x 16 x i1>] %arg2, [4 x <vscale x 16 x i1>] %arg1)
+ ret [4 x <vscale x 16 x i1>] %res
+}
+
; Test that z8 and z9, passed by reference, are loaded from a location that is passed on the stack.
; i.e. x0 = %x0
; :
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
index a0eee24275f1e..e6bb660adbf66 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
@@ -128,14 +128,26 @@ define <vscale x 4 x i1> @sve_signature_pred(<vscale x 4 x i1> %arg1, <vscale x
ret <vscale x 4 x i1> %arg2
}
-; CHECK-LABEL: name: sve_signature_pred_2d
+; Test that scalable predicate argument in [1 x <vscale x 4 x i1>] type are properly assigned to P registers.
+; CHECK-LABEL: name: sve_signature_pred_2d_1x
; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p1
; CHECK: $p0 = COPY [[RES]]
; CHECK: RET_ReallyLR implicit $p0
-define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_1x([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
ret [1 x <vscale x 4 x i1>] %arg2
}
+; Test that upto to two scalable predicate argument in [2 x <vscale x 4 x i1>] type can be assigned to P registers.
+; CHECK-LABEL: name: sve_signature_pred_2d_2x
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p2
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2d_2x([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
+ ret [2 x <vscale x 4 x i1>] %arg2
+}
+
; CHECK-LABEL: name: sve_signature_vec_caller
; CHECK-DAG: [[ARG2:%[0-9]+]]:zpr = COPY $z1
; CHECK-DAG: [[ARG1:%[0-9]+]]:zpr = COPY $z0
@@ -164,20 +176,40 @@ define <vscale x 4 x i1> @sve_signature_pred_caller(<vscale x 4 x i1> %arg1, <vs
ret <vscale x 4 x i1> %res
}
-; CHECK-LABEL: name: sve_signature_pred_2d_caller
+; CHECK-LABEL: name: sve_signature_pred_2d_1x_caller
; CHECK-DAG: [[ARG2:%[0-9]+]]:ppr = COPY $p1
; CHECK-DAG: [[ARG1:%[0-9]+]]:ppr = COPY $p0
; CHECK-DAG: $p0 = COPY [[ARG2]]
; CHECK-DAG: $p1 = COPY [[ARG1]]
-; CHECK-NEXT: BL @sve_signature_pred_2d, csr_aarch64_sve_aapcs
+; CHECK-NEXT: BL @sve_signature_pred_2d_1x, csr_aarch64_sve_aapcs
; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p0
; CHECK: $p0 = COPY [[RES]]
; CHECK: RET_ReallyLR implicit $p0
-define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_caller([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
- %res = call [1 x <vscale x 4 x i1>] @sve_signature_pred_2d([1 x <vscale x 4 x i1>] %arg2, [1 x <vscale x 4 x i1>] %arg1)
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_1x_caller([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+ %res = call [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_1x([1 x <vscale x 4 x i1>] %arg2, [1 x <vscale x 4 x i1>] %arg1)
ret [1 x <vscale x 4 x i1>] %res
}
+; CHECK-LABEL: name: sve_signature_pred_2d_2x_caller
+; CHECK-DAG: [[ARG2_2:%[0-9]+]]:ppr = COPY $p3
+; CHECK-DAG: [[ARG2_1:%[0-9]+]]:ppr = COPY $p2
+; CHECK-DAG: [[ARG1_2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1_1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2_1]]
+; CHECK-DAG: $p1 = COPY [[ARG2_2]]
+; CHECK-DAG: $p2 = COPY [[ARG1_1]]
+; CHECK-DAG: $p3 = COPY [[ARG1_2]]
+; CHECK-NEXT: BL @sve_signature_pred_2d_2x, csr_aarch64_sve_aapcs
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2d_2x_caller([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
+ %res = call [2 x <vscale x 4 x i1>] @sve_signature_pred_2d_2x([2 x <vscale x 4 x i1>] %arg2, [2 x <vscale x 4 x i1>] %arg1)
+ ret [2 x <vscale x 4 x i1>] %res
+}
+
; Test that functions returning or taking SVE arguments use the correct
; callee-saved set when using the default C calling convention (as opposed
; to aarch64_sve_vector_pcs)
>From cc600d27e04b594d1698bda8635d1dad3abcddd7 Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Fri, 10 May 2024 15:20:26 -0700
Subject: [PATCH 3/5] [AArch64][SVE] Additional test on passing predicate
argument by the stack
Adding a test case where the smaller consecutive predicates (arg1 and arg3 below)
are passed in P0~P3 and the larger one (arg2 below) is passed through the stack:
[2 x <vscale x 16 x i1>] callee (
[2 x <vscale x 16 x i1>] arg1,
[4 x <vscale x 16 x i1>] arg2,
[2 x <vscale x 16 x i1>] arg3)
---
.../AArch64/sve-calling-convention-byref.ll | 46 +++++++++++++++++++
1 file changed, 46 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
index 3a9e29f464011..8a7a35fb58013 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
@@ -132,6 +132,52 @@ define [4 x <vscale x 16 x i1>] @caller_with_svepred_arg_2d_4x([4 x <vscale x 16
ret [4 x <vscale x 16 x i1>] %res
}
+; Test that arg1 and arg3 are passed via P0~P3, arg1 is passed indirectly through address on stack in x0
+define aarch64_sve_vector_pcs [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_2d_mixed([2 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2, [2 x <vscale x 16 x i1>] %arg3) nounwind {
+; CHECK: name: callee_with_svepred_arg_2d_mixed
+; CHECK: [[P3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[P2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[X0:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK: [[P1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[P0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[X0]], killed [[OFFSET3]]
+; CHECK: [[P7:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR3]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[X0]], killed [[OFFSET2]]
+; CHECK: [[P6:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR2]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[X0]], killed [[OFFSET1]]
+; CHECK: [[P5:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR1]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[P4:%[0-9]+]]:ppr = LDR_PXI [[X0]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[RES0:%[0-9]+]]:ppr = AND_PPzPP [[P0]], [[P0]], killed [[P4]]
+; CHECK: [[RES1:%[0-9]+]]:ppr = AND_PPzPP [[P1]], [[P1]], killed [[P5]]
+; CHECK: [[RES2:%[0-9]+]]:ppr = AND_PPzPP [[P2]], [[P2]], killed [[P6]]
+; CHECK: [[RES3:%[0-9]+]]:ppr = AND_PPzPP [[P3]], [[P3]], killed [[P7]]
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: $p2 = COPY [[RES2]]
+; CHECK: $p3 = COPY [[RES3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+ %p0 = extractvalue [2 x <vscale x 16 x i1>] %arg1, 0
+ %p1 = extractvalue [2 x <vscale x 16 x i1>] %arg1, 1
+ %p2 = extractvalue [2 x <vscale x 16 x i1>] %arg3, 0
+ %p3 = extractvalue [2 x <vscale x 16 x i1>] %arg3, 1
+ %p4 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 0
+ %p5 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 1
+ %p6 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 2
+ %p7 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 3
+ %r0 = and <vscale x 16 x i1> %p0, %p4
+ %r1 = and <vscale x 16 x i1> %p1, %p5
+ %r2 = and <vscale x 16 x i1> %p2, %p6
+ %r3 = and <vscale x 16 x i1> %p3, %p7
+ %1 = insertvalue [4 x <vscale x 16 x i1>] undef, <vscale x 16 x i1> %r0, 0
+ %2 = insertvalue [4 x <vscale x 16 x i1>] %1, <vscale x 16 x i1> %r1, 1
+ %3 = insertvalue [4 x <vscale x 16 x i1>] %2, <vscale x 16 x i1> %r2, 2
+ %4 = insertvalue [4 x <vscale x 16 x i1>] %3, <vscale x 16 x i1> %r3, 3
+ ret [4 x <vscale x 16 x i1>] %4
+}
+
; Test that z8 and z9, passed by reference, are loaded from a location that is passed on the stack.
; i.e. x0 = %x0
; :
>From 2312a0a63c944af76736f13164b9e3879837e60f Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Thu, 16 May 2024 15:40:55 -0700
Subject: [PATCH 4/5] [AArch64][SVE] Remove assertions on arg types of 1-element
arrays of scalable vector
Function arguments whose types are 1-element arrays of scalable vectors,
e.g. [1 x <vscale x 16 x i1>], are flagged as both InConsecutiveRegs and
InConsecutiveRegsLast. This triggers assertions when lowering the argument
through the stack. Remove those assertions since the existing code can handle
1-element array types as well.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 2 --
.../AArch64/sve-calling-convention-byref.ll | 31 +++++++++++++++++++
2 files changed, 31 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7344387ffe552..8ba12fea19bc7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7178,7 +7178,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
unsigned NumParts = 1;
if (Ins[i].Flags.isInConsecutiveRegs()) {
- assert(!Ins[i].Flags.isInConsecutiveRegsLast());
while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
++NumParts;
}
@@ -8175,7 +8174,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
uint64_t PartSize = StoreSize;
unsigned NumParts = 1;
if (Outs[i].Flags.isInConsecutiveRegs()) {
- assert(!Outs[i].Flags.isInConsecutiveRegsLast());
while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
++NumParts;
StoreSize *= NumParts;
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
index 8a7a35fb58013..d8dc9f46e7fcb 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
@@ -72,6 +72,37 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_many_svepred_arg(<
ret <vscale x 16 x i1> %ret
}
+; Test that arg2 is passed through x0, i.e., x0 = &%arg2; and return values are loaded from x0:
+; P0 = ldr [x0]
+define aarch64_sve_vector_pcs <vscale x 16 x i1> @callee_with_svepred_arg_2d_4x_1x([4 x <vscale x 16 x i1>] %arg1, [1 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_2d_4x_1x
+; CHECK: [[BASE:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK: [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: $p0 = COPY [[PRED0]]
+; CHECK: RET_ReallyLR implicit $p0
+ %res = extractvalue [1 x <vscale x 16 x i1>] %arg2, 0
+ ret <vscale x 16 x i1> %res
+}
+
+; Test that arg1 is stored to the stack from p0; and the stack location is passed throuch x0 to setup the call:
+; str P0, [stack_loc_for_args]
+; x0 = stack_loc_for_args
+define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_svepred_arg_2d_4x_1x([1 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_2d_4x_1x
+; CHECK: stack:
+; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 2, alignment: 2,
+; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK: [[PRED0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
+; CHECK: [[STACK:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0
+; CHECK: $x0 = COPY [[STACK]]
+; CHECK: BL @callee_with_svepred_arg_2d_4x_1x, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $p1, implicit $p2, implicit $p3, implicit $x0, implicit-def $sp, implicit-def $p0
+; CHECK: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ %res = call <vscale x 16 x i1> @callee_with_svepred_arg_2d_4x_1x([4 x <vscale x 16 x i1>] %arg2, [1 x <vscale x 16 x i1>] %arg1)
+ ret <vscale x 16 x i1> %res
+}
+
; Test that arg2 is passed through x0, i.e., x0 = &%arg2; and return values are loaded from x0:
; P0 = ldr [x0]
; P1 = ldr [x0 + sizeof(Px)]
>From e8a0cefcf4483492773769e765f0321ffd0efee8 Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Wed, 22 May 2024 17:14:57 -0700
Subject: [PATCH 5/5] [AArch64][SVE] Add test cases for scalable predicate
argument in nxv32i1 types
Check that a <vscale x 32 x i1> predicate argument is assigned to two
P registers, and is passed through the stack if not enough P registers are
available.
Also rename the functions that take arrays of scalable predicates so that
their names spell out the argument types explicitly, e.g. 2xv16i1.
---
.../AArch64/sve-calling-convention-byref.ll | 88 ++++++++++++++---
.../CodeGen/AArch64/sve-calling-convention.ll | 96 ++++++++++++++++---
2 files changed, 157 insertions(+), 27 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
index d8dc9f46e7fcb..8ce24ceb33d71 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
@@ -74,8 +74,8 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_many_svepred_arg(<
; Test that arg2 is passed through x0, i.e., x0 = &%arg2; and return values are loaded from x0:
; P0 = ldr [x0]
-define aarch64_sve_vector_pcs <vscale x 16 x i1> @callee_with_svepred_arg_2d_4x_1x([4 x <vscale x 16 x i1>] %arg1, [1 x <vscale x 16 x i1>] %arg2) {
-; CHECK: name: callee_with_svepred_arg_2d_4x_1x
+define aarch64_sve_vector_pcs <vscale x 16 x i1> @callee_with_svepred_arg_4xv16i1_1xv16i1([4 x <vscale x 16 x i1>] %arg1, [1 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_4xv16i1_1xv16i1
; CHECK: [[BASE:%[0-9]+]]:gpr64common = COPY $x0
; CHECK: [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 1 x s16>))
; CHECK: $p0 = COPY [[PRED0]]
@@ -87,8 +87,8 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @callee_with_svepred_arg_2d_4x_
; Test that arg1 is stored to the stack from p0; and the stack location is passed throuch x0 to setup the call:
; str P0, [stack_loc_for_args]
; x0 = stack_loc_for_args
-define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_svepred_arg_2d_4x_1x([1 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
-; CHECK: name: caller_with_svepred_arg_2d_4x_1x
+define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_svepred_arg_1xv16i1_4xv16i1([1 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_1xv16i1_4xv16i1
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 2, alignment: 2,
; CHECK-NEXT: stack-id: scalable-vector,
@@ -97,9 +97,9 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_svepred_arg_2d_4x_
; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
; CHECK: [[STACK:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0
; CHECK: $x0 = COPY [[STACK]]
-; CHECK: BL @callee_with_svepred_arg_2d_4x_1x, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $p1, implicit $p2, implicit $p3, implicit $x0, implicit-def $sp, implicit-def $p0
+; CHECK: BL @callee_with_svepred_arg_4xv16i1_1xv16i1, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $p1, implicit $p2, implicit $p3, implicit $x0, implicit-def $sp, implicit-def $p0
; CHECK: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
- %res = call <vscale x 16 x i1> @callee_with_svepred_arg_2d_4x_1x([4 x <vscale x 16 x i1>] %arg2, [1 x <vscale x 16 x i1>] %arg1)
+ %res = call <vscale x 16 x i1> @callee_with_svepred_arg_4xv16i1_1xv16i1([4 x <vscale x 16 x i1>] %arg2, [1 x <vscale x 16 x i1>] %arg1)
ret <vscale x 16 x i1> %res
}
@@ -108,8 +108,8 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_svepred_arg_2d_4x_
; P1 = ldr [x0 + sizeof(Px)]
; P2 = ldr [x0 + 2*sizeof(Px)]
; P3 = ldr [x0 + 3*sizeof(Px)]
-define aarch64_sve_vector_pcs [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_2d_4x([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
-; CHECK: name: callee_with_svepred_arg_2d_4x
+define aarch64_sve_vector_pcs [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_4xv16i1_4xv16i1([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_4xv16i1_4xv16i1
; CHECK: [[BASE:%[0-9]+]]:gpr64common = COPY $x0
; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[BASE]], killed [[OFFSET1]]
@@ -135,8 +135,8 @@ define aarch64_sve_vector_pcs [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_
; str P2, [stack_loc_for_args + 2*sizeof(Px)]
; str P3, [stack_loc_for_args + 3*sizeof(Px)]
; x0 = stack_loc_for_args
-define [4 x <vscale x 16 x i1>] @caller_with_svepred_arg_2d_4x([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
-; CHECK: name: caller_with_svepred_arg_2d_4x
+define [4 x <vscale x 16 x i1>] @caller_with_svepred_arg_4xv16i1_4xv16i1([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_4xv16i1_4xv16i1
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
; CHECK-NEXT: stack-id: scalable-vector,
@@ -157,15 +157,75 @@ define [4 x <vscale x 16 x i1>] @caller_with_svepred_arg_2d_4x([4 x <vscale x 16
; CHECK: STR_PXI [[PRED1]], killed [[ADDR1]], 0 :: (store (<vscale x 1 x s16>))
; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
; CHECK: $x0 = COPY [[STACK]]
-; CHECK: BL @callee_with_svepred_arg_2d_4x, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $p1, implicit $p2, implicit $p3, implicit $x0, implicit-def $sp, implicit-def $p0, implicit-def $p1, implicit-def $p2, implicit-def $p3
+; CHECK: BL @callee_with_svepred_arg_4xv16i1_4xv16i1, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $p1, implicit $p2, implicit $p3, implicit $x0, implicit-def $sp, implicit-def $p0, implicit-def $p1, implicit-def $p2, implicit-def $p3
; CHECK: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
- %res = call [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_2d_4x([4 x <vscale x 16 x i1>] %arg2, [4 x <vscale x 16 x i1>] %arg1)
+ %res = call [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_4xv16i1_4xv16i1([4 x <vscale x 16 x i1>] %arg2, [4 x <vscale x 16 x i1>] %arg1)
ret [4 x <vscale x 16 x i1>] %res
}
+; Test that arg2 is passed through x0, i.e., x0 = &%arg2; and return values are loaded from x0:
+; P0 = ldr [x0]
+; P1 = ldr [x0 + sizeof(Px)]
+; P2 = ldr [x0 + 2*sizeof(Px)]
+; P3 = ldr [x0 + 3*sizeof(Px)]
+define aarch64_sve_vector_pcs [2 x <vscale x 32 x i1>] @callee_with_svepred_arg_1xv16i1_2xv32i1([1 x <vscale x 16 x i1>] %arg1, [2 x <vscale x 32 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_1xv16i1_2xv32i1
+; CHECK: [[BASE:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[BASE]], killed [[OFFSET1]]
+; CHECK: [[PRED1:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR1]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET2]]
+; CHECK: [[PRED2:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR2]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET3]]
+; CHECK: [[PRED3:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR3]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: $p0 = COPY [[PRED0]]
+; CHECK: $p1 = COPY [[PRED1]]
+; CHECK: $p2 = COPY [[PRED2]]
+; CHECK: $p3 = COPY [[PRED3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+ ret [2 x <vscale x 32 x i1>] %arg2
+}
+
+; Test that arg1 is stored to the stack from p0~p3; and the stack location is passed throuch x0 to setup the call:
+; str P0, [stack_loc_for_args]
+; str P1, [stack_loc_for_args + sizeof(Px)]
+; str P2, [stack_loc_for_args + 2*sizeof(Px)]
+; str P3, [stack_loc_for_args + 3*sizeof(Px)]
+; x0 = stack_loc_for_args
+define [2 x <vscale x 32 x i1>] @caller_with_svepred_arg_2xv32i1_1xv16i1([2 x <vscale x 32 x i1>] %arg1, [1 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_2xv32i1_1xv16i1
+; CHECK: stack:
+; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
+; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK: [[PRED3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[PRED2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[PRED1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[PRED0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[STACK:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], killed [[OFFSET3]]
+; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+; CHECK: STR_PXI [[PRED3]], killed [[ADDR3]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], killed [[OFFSET2]]
+; CHECK: STR_PXI [[PRED2]], killed [[ADDR2]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[STACK]], killed [[OFFSET1]]
+; CHECK: STR_PXI [[PRED1]], killed [[ADDR1]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
+; CHECK: $x0 = COPY [[STACK]]
+; CHECK: BL @callee_with_svepred_arg_1xv16i1_2xv32i1, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $x0, implicit-def $sp, implicit-def $p0, implicit-def $p1, implicit-def $p2, implicit-def $p3
+; CHECK: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ %res = call [2 x <vscale x 32 x i1>] @callee_with_svepred_arg_1xv16i1_2xv32i1([1 x <vscale x 16 x i1>] %arg2, [2 x <vscale x 32 x i1>] %arg1)
+ ret [2 x <vscale x 32 x i1>] %res
+}
+
; Test that arg1 and arg3 are passed via P0~P3, arg1 is passed indirectly through address on stack in x0
-define aarch64_sve_vector_pcs [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_2d_mixed([2 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2, [2 x <vscale x 16 x i1>] %arg3) nounwind {
-; CHECK: name: callee_with_svepred_arg_2d_mixed
+define aarch64_sve_vector_pcs [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_2xv16i1_4xv16i1_2xv16i1([2 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2, [2 x <vscale x 16 x i1>] %arg3) nounwind {
+; CHECK: name: callee_with_svepred_arg_2xv16i1_4xv16i1_2xv16i1
; CHECK: [[P3:%[0-9]+]]:ppr = COPY $p3
; CHECK: [[P2:%[0-9]+]]:ppr = COPY $p2
; CHECK: [[X0:%[0-9]+]]:gpr64common = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
index e6bb660adbf66..bfb750517cbf9 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
@@ -129,25 +129,51 @@ define <vscale x 4 x i1> @sve_signature_pred(<vscale x 4 x i1> %arg1, <vscale x
}
; Test that scalable predicate argument in [1 x <vscale x 4 x i1>] type are properly assigned to P registers.
-; CHECK-LABEL: name: sve_signature_pred_2d_1x
+; CHECK-LABEL: name: sve_signature_pred_1xv4i1
; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p1
; CHECK: $p0 = COPY [[RES]]
; CHECK: RET_ReallyLR implicit $p0
-define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_1x([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_1xv4i1([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
ret [1 x <vscale x 4 x i1>] %arg2
}
-; Test that upto to two scalable predicate argument in [2 x <vscale x 4 x i1>] type can be assigned to P registers.
-; CHECK-LABEL: name: sve_signature_pred_2d_2x
+; Test that up to two scalable predicate arguments in [2 x <vscale x 4 x i1>] type can be assigned to P registers.
+; CHECK-LABEL: name: sve_signature_pred_2xv4i1
; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p3
; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p2
; CHECK: $p0 = COPY [[RES0]]
; CHECK: $p1 = COPY [[RES1]]
; CHECK: RET_ReallyLR implicit $p0, implicit $p1
-define [2 x <vscale x 4 x i1>] @sve_signature_pred_2d_2x([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
ret [2 x <vscale x 4 x i1>] %arg2
}
+; Test that a scalable predicate argument in [1 x <vscale x 32 x i1>] type is assigned to two P registers.
+; CHECK-LABEL: name: sve_signature_pred_1xv32i1
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p2
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [1 x <vscale x 32 x i1>] @sve_signature_pred_1xv32i1([1 x <vscale x 32 x i1>] %arg1, [1 x <vscale x 32 x i1>] %arg2) nounwind {
+ ret [1 x <vscale x 32 x i1>] %arg2
+}
+
+; Test that a scalable predicate argument in [2 x <vscale x 32 x i1>] type is assigned to four P registers.
+; CHECK-LABEL: name: sve_signature_pred_2xv32i1
+; CHECK: [[RES3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[RES2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: $p2 = COPY [[RES2]]
+; CHECK: $p3 = COPY [[RES3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+define [2 x <vscale x 32 x i1>] @sve_signature_pred_2xv32i1([2 x <vscale x 32 x i1>] %arg1) nounwind {
+ ret [2 x <vscale x 32 x i1>] %arg1
+}
+
; CHECK-LABEL: name: sve_signature_vec_caller
; CHECK-DAG: [[ARG2:%[0-9]+]]:zpr = COPY $z1
; CHECK-DAG: [[ARG1:%[0-9]+]]:zpr = COPY $z0
@@ -176,21 +202,21 @@ define <vscale x 4 x i1> @sve_signature_pred_caller(<vscale x 4 x i1> %arg1, <vs
ret <vscale x 4 x i1> %res
}
-; CHECK-LABEL: name: sve_signature_pred_2d_1x_caller
+; CHECK-LABEL: name: sve_signature_pred_1xv4i1_caller
; CHECK-DAG: [[ARG2:%[0-9]+]]:ppr = COPY $p1
; CHECK-DAG: [[ARG1:%[0-9]+]]:ppr = COPY $p0
; CHECK-DAG: $p0 = COPY [[ARG2]]
; CHECK-DAG: $p1 = COPY [[ARG1]]
-; CHECK-NEXT: BL @sve_signature_pred_2d_1x, csr_aarch64_sve_aapcs
+; CHECK-NEXT: BL @sve_signature_pred_1xv4i1, csr_aarch64_sve_aapcs
; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p0
; CHECK: $p0 = COPY [[RES]]
; CHECK: RET_ReallyLR implicit $p0
-define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_1x_caller([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
- %res = call [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_1x([1 x <vscale x 4 x i1>] %arg2, [1 x <vscale x 4 x i1>] %arg1)
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_1xv4i1_caller([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+ %res = call [1 x <vscale x 4 x i1>] @sve_signature_pred_1xv4i1([1 x <vscale x 4 x i1>] %arg2, [1 x <vscale x 4 x i1>] %arg1)
ret [1 x <vscale x 4 x i1>] %res
}
-; CHECK-LABEL: name: sve_signature_pred_2d_2x_caller
+; CHECK-LABEL: name: sve_signature_pred_2xv4i1_caller
; CHECK-DAG: [[ARG2_2:%[0-9]+]]:ppr = COPY $p3
; CHECK-DAG: [[ARG2_1:%[0-9]+]]:ppr = COPY $p2
; CHECK-DAG: [[ARG1_2:%[0-9]+]]:ppr = COPY $p1
@@ -199,17 +225,61 @@ define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_1x_caller([1 x <vscale x 4
; CHECK-DAG: $p1 = COPY [[ARG2_2]]
; CHECK-DAG: $p2 = COPY [[ARG1_1]]
; CHECK-DAG: $p3 = COPY [[ARG1_2]]
-; CHECK-NEXT: BL @sve_signature_pred_2d_2x, csr_aarch64_sve_aapcs
+; CHECK-NEXT: BL @sve_signature_pred_2xv4i1, csr_aarch64_sve_aapcs
; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
; CHECK: $p0 = COPY [[RES0]]
; CHECK: $p1 = COPY [[RES1]]
; CHECK: RET_ReallyLR implicit $p0, implicit $p1
-define [2 x <vscale x 4 x i1>] @sve_signature_pred_2d_2x_caller([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
- %res = call [2 x <vscale x 4 x i1>] @sve_signature_pred_2d_2x([2 x <vscale x 4 x i1>] %arg2, [2 x <vscale x 4 x i1>] %arg1)
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1_caller([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
+ %res = call [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg2, [2 x <vscale x 4 x i1>] %arg1)
ret [2 x <vscale x 4 x i1>] %res
}
+; CHECK-LABEL: name: sve_signature_pred_1xv32i1_caller
+; CHECK-DAG: [[ARG2_2:%[0-9]+]]:ppr = COPY $p3
+; CHECK-DAG: [[ARG2_1:%[0-9]+]]:ppr = COPY $p2
+; CHECK-DAG: [[ARG1_2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1_1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2_1]]
+; CHECK-DAG: $p1 = COPY [[ARG2_2]]
+; CHECK-DAG: $p2 = COPY [[ARG1_1]]
+; CHECK-DAG: $p3 = COPY [[ARG1_2]]
+; CHECK-NEXT: BL @sve_signature_pred_1xv32i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [1 x <vscale x 32 x i1>] @sve_signature_pred_1xv32i1_caller([1 x <vscale x 32 x i1>] %arg1, [1 x <vscale x 32 x i1>] %arg2) nounwind {
+ %res = call [1 x <vscale x 32 x i1>] @sve_signature_pred_1xv32i1([1 x <vscale x 32 x i1>] %arg2, [1 x <vscale x 32 x i1>] %arg1)
+ ret [1 x <vscale x 32 x i1>] %res
+}
+
+; CHECK-LABEL: name: sve_signature_pred_2xv32i1_caller
+; CHECK-DAG: [[ARG3:%[0-9]+]]:ppr = COPY $p3
+; CHECK-DAG: [[ARG2:%[0-9]+]]:ppr = COPY $p2
+; CHECK-DAG: [[ARG1:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG0:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG0]]
+; CHECK-DAG: $p1 = COPY [[ARG1]]
+; CHECK-DAG: $p2 = COPY [[ARG2]]
+; CHECK-DAG: $p3 = COPY [[ARG3]]
+; CHECK-NEXT: BL @sve_signature_pred_2xv32i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[RES2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[RES3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: $p2 = COPY [[RES2]]
+; CHECK: $p3 = COPY [[RES3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+define [2 x <vscale x 32 x i1>] @sve_signature_pred_2xv32i1_caller([2 x <vscale x 32 x i1>] %arg1) {
+ %res = call [2 x <vscale x 32 x i1>] @sve_signature_pred_2xv32i1([2 x <vscale x 32 x i1>] %arg1)
+ ret [2 x <vscale x 32 x i1>] %res
+}
+
; Test that functions returning or taking SVE arguments use the correct
; callee-saved set when using the default C calling convention (as opposed
; to aarch64_sve_vector_pcs)
More information about the llvm-commits
mailing list