[llvm] [AArch64][SVE] Handle consecutive Predicates in CC_AArch64_Custom_Block (PR #90122)
Zhaoshi Zheng via llvm-commits
llvm-commits at lists.llvm.org
Thu May 9 20:06:25 PDT 2024
https://github.com/zhaoshiz updated https://github.com/llvm/llvm-project/pull/90122
>From 27cd3a438d53413c4f5e2ba99a98b4bdec788d2c Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Tue, 26 Mar 2024 16:01:11 -0700
Subject: [PATCH 1/2] [AArch64][SVE] Handle consecutive Predicates in
CC_AArch64_Custom_Block
2d masks passed as function arguments, even ones of type
[1 x <vscale x 4 x i1>], are flagged as InConsecutiveRegs. This fix checks for
mask types and allocates them to P registers.
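A minimal sketch of the dispatch this change adds, with a simplified stand-in
for MVT (the enum, helper, and names below are hypothetical, not LLVM's API):
scalable i1 vectors and aarch64svcount pick the P-register list, while any
other scalable vector picks the Z-register list.

#include <cstdio>

// Hypothetical stand-in for the subset of MVT the check cares about.
enum class VT { nxv4i1, nxv16i1, nxv4i32, aarch64svcount };

// Mirrors the new dispatch: scalable masks (nxv{1,2,4,8,16}i1) and svcount
// are allocated from the P-register list; other scalable vectors from Z.
static const char *regListFor(VT T) {
  switch (T) {
  case VT::nxv4i1:
  case VT::nxv16i1:
  case VT::aarch64svcount:
    return "PRegList (p0-p3)";
  default:
    return "ZRegList (z0-z7)";
  }
}

int main() {
  std::printf("<vscale x 4 x i1>  -> %s\n", regListFor(VT::nxv4i1));
  std::printf("<vscale x 4 x i32> -> %s\n", regListFor(VT::nxv4i32));
}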
---
.../AArch64/AArch64CallingConvention.cpp | 14 +++++++++---
.../CodeGen/AArch64/sve-calling-convention.ll | 22 +++++++++++++++++++
2 files changed, 33 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
index bfcafc6442d24..9a2838992eb02 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -38,6 +38,8 @@ static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
static const MCPhysReg ZRegList[] = {AArch64::Z0, AArch64::Z1, AArch64::Z2,
AArch64::Z3, AArch64::Z4, AArch64::Z5,
AArch64::Z6, AArch64::Z7};
+static const MCPhysReg PRegList[] = {AArch64::P0, AArch64::P1, AArch64::P2,
+ AArch64::P3};
static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
@@ -140,9 +142,15 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
RegList = DRegList;
else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector())
RegList = QRegList;
- else if (LocVT.isScalableVector())
- RegList = ZRegList;
- else {
+ else if (LocVT.isScalableVector()) {
+ // Scalable masks should be passed in predicate registers.
+ if (LocVT == MVT::nxv1i1 || LocVT == MVT::nxv2i1 || LocVT == MVT::nxv4i1 ||
+ LocVT == MVT::nxv8i1 || LocVT == MVT::nxv16i1 ||
+ LocVT == MVT::aarch64svcount)
+ RegList = PRegList;
+ else
+ RegList = ZRegList;
+ } else {
// Not an array we want to split up after all.
return false;
}
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
index 0a45244f12be5..a0eee24275f1e 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
@@ -128,6 +128,14 @@ define <vscale x 4 x i1> @sve_signature_pred(<vscale x 4 x i1> %arg1, <vscale x
ret <vscale x 4 x i1> %arg2
}
+; CHECK-LABEL: name: sve_signature_pred_2d
+; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $p0
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+ ret [1 x <vscale x 4 x i1>] %arg2
+}
+
; CHECK-LABEL: name: sve_signature_vec_caller
; CHECK-DAG: [[ARG2:%[0-9]+]]:zpr = COPY $z1
; CHECK-DAG: [[ARG1:%[0-9]+]]:zpr = COPY $z0
@@ -156,6 +164,20 @@ define <vscale x 4 x i1> @sve_signature_pred_caller(<vscale x 4 x i1> %arg1, <vs
ret <vscale x 4 x i1> %res
}
+; CHECK-LABEL: name: sve_signature_pred_2d_caller
+; CHECK-DAG: [[ARG2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2]]
+; CHECK-DAG: $p1 = COPY [[ARG1]]
+; CHECK-NEXT: BL @sve_signature_pred_2d, csr_aarch64_sve_aapcs
+; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p0
+; CHECK: $p0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $p0
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_caller([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+ %res = call [1 x <vscale x 4 x i1>] @sve_signature_pred_2d([1 x <vscale x 4 x i1>] %arg2, [1 x <vscale x 4 x i1>] %arg1)
+ ret [1 x <vscale x 4 x i1>] %res
+}
+
; Test that functions returning or taking SVE arguments use the correct
; callee-saved set when using the default C calling convention (as opposed
; to aarch64_sve_vector_pcs)
>From 8264e9af78e5014b342c3e297ba8ae5b6acf72bf Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Thu, 9 May 2024 16:46:24 -0700
Subject: [PATCH 2/2] [AArch64][SVE] Handle consecutive Predicates arguments
through stack
Per the AAPCS64, when P0-P3 are exhausted or cannot hold a scalable predicate
argument, the argument is allocated to the stack and a pointer to it is passed
to the callee.
For consecutive predicates in types like [M x <vscale x N x i1>], we should
handle them the same way as consecutive Z-register arguments, following:
https://reviews.llvm.org/D71216
Reference:
https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#parameter-passing
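A rough sketch of the bookkeeping pattern this patch extends from Z to P
registers (hypothetical types, not the real CCState interface): temporarily
mark the whole register list allocated so a re-run of the assignment function
takes the indirect path, then hand back the registers that were actually free.

#include <array>
#include <bitset>
#include <cstddef>

// Hypothetical stand-in for CCState's allocation map.
struct FakeState {
  std::bitset<16> Alloc;
  bool isAllocated(unsigned R) const { return Alloc[R]; }
  void AllocateReg(unsigned R) { Alloc[R] = true; }
  void DeallocateReg(unsigned R) { Alloc[R] = false; }
};

// Exhaust Regs on purpose, run the (elided) assignment, then restore.
template <std::size_t N>
void withAllAllocated(FakeState &State, const std::array<unsigned, N> &Regs) {
  std::array<bool, N> Was{};
  for (std::size_t I = 0; I < N; ++I) {
    Was[I] = State.isAllocated(Regs[I]);
    State.AllocateReg(Regs[I]);
  }
  // ... re-run the CCAssignFn here; it now allocates the block to the stack ...
  for (std::size_t I = 0; I < N; ++I)
    if (!Was[I])
      State.DeallocateReg(Regs[I]); // keep free registers free for later args
}

int main() {
  FakeState S;
  S.AllocateReg(0); // pretend p0 already holds an earlier argument
  withAllAllocated(S, std::array<unsigned, 4>{0, 1, 2, 3});
  return S.isAllocated(0) && !S.isAllocated(1) ? 0 : 1; // p0 kept, p1-p3 freed
}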
---
.../AArch64/AArch64CallingConvention.cpp | 15 ++++-
.../AArch64/sve-calling-convention-byref.ll | 60 +++++++++++++++++++
.../CodeGen/AArch64/sve-calling-convention.ll | 44 ++++++++++++--
3 files changed, 110 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
index 9a2838992eb02..9a804c12939c4 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -61,11 +61,17 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
// CCAssignFn again we want it to behave as if all remaining registers are
// allocated. This will force the code to pass the tuple indirectly in
// accordance with the PCS.
- bool RegsAllocated[8];
+ bool ZRegsAllocated[8];
for (int I = 0; I < 8; I++) {
- RegsAllocated[I] = State.isAllocated(ZRegList[I]);
+ ZRegsAllocated[I] = State.isAllocated(ZRegList[I]);
State.AllocateReg(ZRegList[I]);
}
+ // The same applies to P registers.
+ bool PRegsAllocated[4];
+ for (int I = 0; I < 4; I++) {
+ PRegsAllocated[I] = State.isAllocated(PRegList[I]);
+ State.AllocateReg(PRegList[I]);
+ }
auto &It = PendingMembers[0];
CCAssignFn *AssignFn =
@@ -81,8 +87,11 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
// Return the register state back to how it was before, leaving any
// unallocated registers available for other smaller types.
for (int I = 0; I < 8; I++)
- if (!RegsAllocated[I])
+ if (!ZRegsAllocated[I])
State.DeallocateReg(ZRegList[I]);
+ for (int I = 0; I < 4; I++)
+ if (!PRegsAllocated[I])
+ State.DeallocateReg(PRegList[I]);
// All pending members have now been allocated
PendingMembers.clear();
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
index 8cb8b1c92fa7e..3a9e29f464011 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
@@ -72,6 +72,66 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_many_svepred_arg(<
ret <vscale x 16 x i1> %ret
}
+; Test that %arg2 is passed indirectly through x0 (x0 = &%arg2) and that its four predicates are loaded from x0 (the offset arithmetic is sketched after this file's diff):
+; P0 = ldr [x0]
+; P1 = ldr [x0 + sizeof(Px)]
+; P2 = ldr [x0 + 2*sizeof(Px)]
+; P3 = ldr [x0 + 3*sizeof(Px)]
+define aarch64_sve_vector_pcs [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_2d_4x([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_2d_4x
+; CHECK: [[BASE:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[BASE]], killed [[OFFSET1]]
+; CHECK: [[PRED1:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR1]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET2]]
+; CHECK: [[PRED2:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR2]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET3]]
+; CHECK: [[PRED3:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR3]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: $p0 = COPY [[PRED0]]
+; CHECK: $p1 = COPY [[PRED1]]
+; CHECK: $p2 = COPY [[PRED2]]
+; CHECK: $p3 = COPY [[PRED3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+ ret [4 x <vscale x 16 x i1>] %arg2
+}
+
+; Test that %arg1 is stored to the stack from p0-p3, and that the stack location is passed through x0 to set up the call:
+; str P0, [stack_loc_for_args]
+; str P1, [stack_loc_for_args + sizeof(Px)]
+; str P2, [stack_loc_for_args + 2*sizeof(Px)]
+; str P3, [stack_loc_for_args + 3*sizeof(Px)]
+; x0 = stack_loc_for_args
+define [4 x <vscale x 16 x i1>] @caller_with_svepred_arg_2d_4x([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_2d_4x
+; CHECK: stack:
+; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
+; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK: [[PRED3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[PRED2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[PRED1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[PRED0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[STACK:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], [[OFFSET3]]
+; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+; CHECK: STR_PXI [[PRED3]], killed [[ADDR3]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], [[OFFSET2]]
+; CHECK: STR_PXI [[PRED2]], killed [[ADDR2]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[STACK]], [[OFFSET1]]
+; CHECK: STR_PXI [[PRED1]], killed [[ADDR1]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
+; CHECK: $x0 = COPY [[STACK]]
+; CHECK: BL @callee_with_svepred_arg_2d_4x, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $p1, implicit $p2, implicit $p3, implicit $x0, implicit-def $sp, implicit-def $p0, implicit-def $p1, implicit-def $p2, implicit-def $p3
+; CHECK: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ %res = call [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_2d_4x([4 x <vscale x 16 x i1>] %arg2, [4 x <vscale x 16 x i1>] %arg1)
+ ret [4 x <vscale x 16 x i1>] %res
+}
+
; Test that z8 and z9, passed by reference, are loaded from a location that is passed on the stack.
; i.e. x0 = %x0
; :
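The CNTD/CNTW offsets in the CHECK lines above are multiples of the predicate
register size: a predicate holds VL/8 bits, i.e. VL/64 bytes, which is exactly
what CNTD (the count of 64-bit elements) returns, and CNTW = 2*CNTD covers the
2*sizeof(P) slot. A small sketch of that arithmetic, with an assumed example
vector length:

#include <cstdint>
#include <cstdio>

static constexpr uint64_t VL = 256;        // assumed vector length in bits
static uint64_t cntd() { return VL / 64; } // == sizeof(P) in bytes
static uint64_t cntw() { return VL / 32; } // == 2 * sizeof(P)

int main() {
  // The I-th predicate of the indirectly passed block lives at x0 + I*sizeof(P).
  for (unsigned I = 0; I < 4; ++I)
    std::printf("p%u <- [x0 + %llu]\n", I, (unsigned long long)(I * cntd()));
  std::printf("cntw() == 2*cntd(): %d\n", cntw() == 2 * cntd());
}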
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
index a0eee24275f1e..e6bb660adbf66 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
@@ -128,14 +128,26 @@ define <vscale x 4 x i1> @sve_signature_pred(<vscale x 4 x i1> %arg1, <vscale x
ret <vscale x 4 x i1> %arg2
}
-; CHECK-LABEL: name: sve_signature_pred_2d
+; Test that a scalable predicate argument of type [1 x <vscale x 4 x i1>] is properly assigned to a P register.
+; CHECK-LABEL: name: sve_signature_pred_2d_1x
; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p1
; CHECK: $p0 = COPY [[RES]]
; CHECK: RET_ReallyLR implicit $p0
-define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_1x([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
ret [1 x <vscale x 4 x i1>] %arg2
}
+; Test that two scalable predicate arguments of type [2 x <vscale x 4 x i1>] can be assigned to P registers, exactly filling p0-p3 (see the walk-through after the patch).
+; CHECK-LABEL: name: sve_signature_pred_2d_2x
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p2
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2d_2x([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
+ ret [2 x <vscale x 4 x i1>] %arg2
+}
+
; CHECK-LABEL: name: sve_signature_vec_caller
; CHECK-DAG: [[ARG2:%[0-9]+]]:zpr = COPY $z1
; CHECK-DAG: [[ARG1:%[0-9]+]]:zpr = COPY $z0
@@ -164,20 +176,40 @@ define <vscale x 4 x i1> @sve_signature_pred_caller(<vscale x 4 x i1> %arg1, <vs
ret <vscale x 4 x i1> %res
}
-; CHECK-LABEL: name: sve_signature_pred_2d_caller
+; CHECK-LABEL: name: sve_signature_pred_2d_1x_caller
; CHECK-DAG: [[ARG2:%[0-9]+]]:ppr = COPY $p1
; CHECK-DAG: [[ARG1:%[0-9]+]]:ppr = COPY $p0
; CHECK-DAG: $p0 = COPY [[ARG2]]
; CHECK-DAG: $p1 = COPY [[ARG1]]
-; CHECK-NEXT: BL @sve_signature_pred_2d, csr_aarch64_sve_aapcs
+; CHECK-NEXT: BL @sve_signature_pred_2d_1x, csr_aarch64_sve_aapcs
; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p0
; CHECK: $p0 = COPY [[RES]]
; CHECK: RET_ReallyLR implicit $p0
-define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_caller([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
- %res = call [1 x <vscale x 4 x i1>] @sve_signature_pred_2d([1 x <vscale x 4 x i1>] %arg2, [1 x <vscale x 4 x i1>] %arg1)
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_1x_caller([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+ %res = call [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_1x([1 x <vscale x 4 x i1>] %arg2, [1 x <vscale x 4 x i1>] %arg1)
ret [1 x <vscale x 4 x i1>] %res
}
+; CHECK-LABEL: name: sve_signature_pred_2d_2x_caller
+; CHECK-DAG: [[ARG2_2:%[0-9]+]]:ppr = COPY $p3
+; CHECK-DAG: [[ARG2_1:%[0-9]+]]:ppr = COPY $p2
+; CHECK-DAG: [[ARG1_2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1_1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2_1]]
+; CHECK-DAG: $p1 = COPY [[ARG2_2]]
+; CHECK-DAG: $p2 = COPY [[ARG1_1]]
+; CHECK-DAG: $p3 = COPY [[ARG1_2]]
+; CHECK-NEXT: BL @sve_signature_pred_2d_2x, csr_aarch64_sve_aapcs
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2d_2x_caller([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
+ %res = call [2 x <vscale x 4 x i1>] @sve_signature_pred_2d_2x([2 x <vscale x 4 x i1>] %arg2, [2 x <vscale x 4 x i1>] %arg1)
+ ret [2 x <vscale x 4 x i1>] %res
+}
+
; Test that functions returning or taking SVE arguments use the correct
; callee-saved set when using the default C calling convention (as opposed
; to aarch64_sve_vector_pcs)
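A toy walk-through (not LLVM code) of why the 2x tests exactly fill the
predicate registers, assuming the four-entry PRegList added above: two
[2 x <vscale x 4 x i1>] arguments need four P registers; a third such argument
would find no free block and be passed indirectly, which is what the byref
tests exercise.

#include <cstdio>

int main() {
  const int NumPRegs = 4; // p0-p3, as in PRegList
  int NextFree = 0;
  auto allocBlock = [&](int Size, const char *Name) {
    if (NextFree + Size <= NumPRegs) {
      std::printf("%s -> p%d..p%d\n", Name, NextFree, NextFree + Size - 1);
      NextFree += Size;
    } else {
      std::printf("%s -> stack, pointer in a GPR\n", Name);
    }
  };
  allocBlock(2, "arg1"); // p0, p1
  allocBlock(2, "arg2"); // p2, p3
  allocBlock(2, "arg3"); // no room left: passed indirectly
}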