[llvm] [AArch64][SVE] Handle consecutive Predicates in CC_AArch64_Custom_Block (PR #90122)

Zhaoshi Zheng via llvm-commits llvm-commits at lists.llvm.org
Thu May 9 20:06:25 PDT 2024


https://github.com/zhaoshiz updated https://github.com/llvm/llvm-project/pull/90122

From 27cd3a438d53413c4f5e2ba99a98b4bdec788d2c Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Tue, 26 Mar 2024 16:01:11 -0700
Subject: [PATCH 1/2] [AArch64][SVE] Handle consecutive Predicates in
 CC_AArch64_Custom_Block

2D masks passed as function arguments, even of type [1 x <vscale x 4 x i1>],
are flagged as InConsecutiveRegs. This fix checks for scalable mask types
and allocates them to P registers.
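
To make the intent concrete, below is a minimal standalone C++ sketch of
the new dispatch. SimpleVT, the string register lists, and the
isScalablePredicate helper are toy stand-ins for LLVM's MVT and MCPhysReg
tables, not the actual API:

  #include <cstdio>

  // Toy stand-in for MVT::SimpleValueType.
  enum class SimpleVT { nxv4i1, nxv16i1, nxv4i32, f64 };

  // Toy stand-ins for the PRegList/ZRegList tables in
  // AArch64CallingConvention.cpp.
  static const char *PRegList[] = {"P0", "P1", "P2", "P3"};
  static const char *ZRegList[] = {"Z0", "Z1", "Z2", "Z3",
                                   "Z4", "Z5", "Z6", "Z7"};

  // The real code checks nxv1i1/nxv2i1/nxv4i1/nxv8i1/nxv16i1 and
  // aarch64svcount; this toy version models only two of them.
  static bool isScalablePredicate(SimpleVT VT) {
    return VT == SimpleVT::nxv4i1 || VT == SimpleVT::nxv16i1;
  }

  int main() {
    // A member of [1 x <vscale x 4 x i1>] now selects the P-register
    // list instead of falling through to the Z-register list.
    SimpleVT LocVT = SimpleVT::nxv4i1;
    const char *const *RegList =
        isScalablePredicate(LocVT) ? PRegList : ZRegList;
    std::printf("first candidate register: %s\n", RegList[0]);
    return 0;
  }
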
---
 .../AArch64/AArch64CallingConvention.cpp      | 14 +++++++++---
 .../CodeGen/AArch64/sve-calling-convention.ll | 22 +++++++++++++++++++
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
index bfcafc6442d24..9a2838992eb02 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -38,6 +38,8 @@ static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
 static const MCPhysReg ZRegList[] = {AArch64::Z0, AArch64::Z1, AArch64::Z2,
                                      AArch64::Z3, AArch64::Z4, AArch64::Z5,
                                      AArch64::Z6, AArch64::Z7};
+static const MCPhysReg PRegList[] = {AArch64::P0, AArch64::P1, AArch64::P2,
+                                     AArch64::P3};
 
 static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
                              MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
@@ -140,9 +142,15 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
     RegList = DRegList;
   else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector())
     RegList = QRegList;
-  else if (LocVT.isScalableVector())
-    RegList = ZRegList;
-  else {
+  else if (LocVT.isScalableVector()) {
+    // Scalable masks should be passed in Predicate registers.
+    if (LocVT == MVT::nxv1i1 || LocVT == MVT::nxv2i1 || LocVT == MVT::nxv4i1 ||
+        LocVT == MVT::nxv8i1 || LocVT == MVT::nxv16i1 ||
+        LocVT == MVT::aarch64svcount)
+      RegList = PRegList;
+    else
+      RegList = ZRegList;
+  } else {
     // Not an array we want to split up after all.
     return false;
   }
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
index 0a45244f12be5..a0eee24275f1e 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
@@ -128,6 +128,14 @@ define <vscale x 4 x i1> @sve_signature_pred(<vscale x 4 x i1> %arg1, <vscale x
   ret <vscale x 4 x i1> %arg2
 }
 
+; CHECK-LABEL: name: sve_signature_pred_2d
+; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $p0
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+  ret [1 x <vscale x 4 x i1>] %arg2
+}
+
 ; CHECK-LABEL: name: sve_signature_vec_caller
 ; CHECK-DAG: [[ARG2:%[0-9]+]]:zpr = COPY $z1
 ; CHECK-DAG: [[ARG1:%[0-9]+]]:zpr = COPY $z0
@@ -156,6 +164,20 @@ define <vscale x 4 x i1> @sve_signature_pred_caller(<vscale x 4 x i1> %arg1, <vs
   ret <vscale x 4 x i1> %res
 }
 
+; CHECK-LABEL: name: sve_signature_pred_2d_caller
+; CHECK-DAG: [[ARG2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2]]
+; CHECK-DAG: $p1 = COPY [[ARG1]]
+; CHECK-NEXT: BL @sve_signature_pred_2d, csr_aarch64_sve_aapcs
+; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p0
+; CHECK: $p0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $p0
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_caller([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+  %res = call [1 x <vscale x 4 x i1>] @sve_signature_pred_2d([1 x <vscale x 4 x i1>] %arg2, [1 x <vscale x 4 x i1>] %arg1)
+  ret [1 x <vscale x 4 x i1>] %res
+}
+
 ; Test that functions returning or taking SVE arguments use the correct
 ; callee-saved set when using the default C calling convention (as opposed
 ; to aarch64_sve_vector_pcs)

From 8264e9af78e5014b342c3e297ba8ae5b6acf72bf Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Thu, 9 May 2024 16:46:24 -0700
Subject: [PATCH 2/2] [AArch64][SVE] Handle consecutive Predicates arguments
 through stack

Per AAPCS64, when P0-P3 are exhausted or unable to hold a scalable predicate
argument, the argument is allocated on the stack and a pointer to it is
passed to the callee.

Consecutive predicates in types like [M x <vscale x N x i1>] should be
handled the same way as consecutive Z registers, as done in:
https://reviews.llvm.org/D71216

Reference:
https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#parameter-passing
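
The mechanism mirrors what finishStackBlock already does for Z registers:
record which registers were genuinely taken, mark all of them allocated so
that re-running the assignment function is forced down the indirect (stack
plus pointer) path, and afterwards restore the ones that were only faked.
Below is a minimal standalone C++ sketch of that save/allocate-all/restore
pattern; RegState is a hypothetical stand-in for LLVM's CCState, not the
actual API:

  #include <array>
  #include <cstdio>

  // Hypothetical stand-in for CCState's allocation bookkeeping.
  struct RegState {
    std::array<bool, 4> PAllocated{}; // P0..P3
    bool isAllocated(int R) const { return PAllocated[R]; }
    void allocateReg(int R) { PAllocated[R] = true; }
    void deallocateReg(int R) { PAllocated[R] = false; }
  };

  int main() {
    RegState State;
    State.allocateReg(0); // suppose P0 already holds an earlier argument

    // Save the real allocation state, then mark every P register as
    // allocated so a re-run of the assignment function cannot pick one
    // and must assign the pending members indirectly.
    std::array<bool, 4> PRegsAllocated{};
    for (int I = 0; I < 4; ++I) {
      PRegsAllocated[I] = State.isAllocated(I);
      State.allocateReg(I);
    }

    // ... re-run the per-member CCAssignFn here; with no P registers
    // free, each member is allocated to the stack per the PCS ...

    // Restore the registers that were only faked, leaving them free
    // for later, smaller arguments.
    for (int I = 0; I < 4; ++I)
      if (!PRegsAllocated[I])
        State.deallocateReg(I);

    for (int I = 0; I < 4; ++I)
      std::printf("P%d allocated: %d\n", I, (int)State.isAllocated(I));
    return 0;
  }
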
---
 .../AArch64/AArch64CallingConvention.cpp      | 15 ++++-
 .../AArch64/sve-calling-convention-byref.ll   | 60 +++++++++++++++++++
 .../CodeGen/AArch64/sve-calling-convention.ll | 44 ++++++++++++--
 3 files changed, 110 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
index 9a2838992eb02..9a804c12939c4 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -61,11 +61,17 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
     // CCAssignFn again we want it to behave as if all remaining registers are
     // allocated. This will force the code to pass the tuple indirectly in
     // accordance with the PCS.
-    bool RegsAllocated[8];
+    bool ZRegsAllocated[8];
     for (int I = 0; I < 8; I++) {
-      RegsAllocated[I] = State.isAllocated(ZRegList[I]);
+      ZRegsAllocated[I] = State.isAllocated(ZRegList[I]);
       State.AllocateReg(ZRegList[I]);
     }
+    // The same applies to P registers.
+    bool PRegsAllocated[4];
+    for (int I = 0; I < 4; I++) {
+      PRegsAllocated[I] = State.isAllocated(PRegList[I]);
+      State.AllocateReg(PRegList[I]);
+    }
 
     auto &It = PendingMembers[0];
     CCAssignFn *AssignFn =
@@ -81,8 +87,11 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
     // Return the register state back to how it was before, leaving any
     // unallocated registers available for other smaller types.
     for (int I = 0; I < 8; I++)
-      if (!RegsAllocated[I])
+      if (!ZRegsAllocated[I])
         State.DeallocateReg(ZRegList[I]);
+    for (int I = 0; I < 4; I++)
+      if (!PRegsAllocated[I])
+        State.DeallocateReg(PRegList[I]);
 
     // All pending members have now been allocated
     PendingMembers.clear();
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
index 8cb8b1c92fa7e..3a9e29f464011 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
@@ -72,6 +72,66 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_many_svepred_arg(<
   ret <vscale x 16 x i1> %ret
 }
 
+; Test that arg2 is passed indirectly through x0 (i.e., x0 = &%arg2) and that the return values are loaded from x0:
+;     P0 = ldr [x0]
+;     P1 = ldr [x0 +   sizeof(Px)]
+;     P2 = ldr [x0 + 2*sizeof(Px)]
+;     P3 = ldr [x0 + 3*sizeof(Px)]
+define aarch64_sve_vector_pcs [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_2d_4x([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_2d_4x
+; CHECK:    [[BASE:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK:    [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK:    [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[BASE]], killed [[OFFSET1]]
+; CHECK:    [[PRED1:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR1]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK:    [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK:    [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET2]]
+; CHECK:    [[PRED2:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR2]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK:    [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK:    [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET3]]
+; CHECK:    [[PRED3:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR3]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK:    [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK:    $p0 = COPY [[PRED0]]
+; CHECK:    $p1 = COPY [[PRED1]]
+; CHECK:    $p2 = COPY [[PRED2]]
+; CHECK:    $p3 = COPY [[PRED3]]
+; CHECK:    RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+  ret [4 x <vscale x 16 x i1>] %arg2
+}
+
+; Test that arg1 is stored to the stack from p0-p3, and that the stack location is passed through x0 to set up the call:
+;     str P0, [stack_loc_for_args]
+;     str P1, [stack_loc_for_args +   sizeof(Px)]
+;     str P2, [stack_loc_for_args + 2*sizeof(Px)]
+;     str P3, [stack_loc_for_args + 3*sizeof(Px)]
+;     x0 = stack_loc_for_args
+define [4 x <vscale x 16 x i1>] @caller_with_svepred_arg_2d_4x([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_2d_4x
+; CHECK: stack:
+; CHECK:      - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
+; CHECK-NEXT:     stack-id: scalable-vector,
+; CHECK:    [[PRED3:%[0-9]+]]:ppr = COPY $p3
+; CHECK:    [[PRED2:%[0-9]+]]:ppr = COPY $p2
+; CHECK:    [[PRED1:%[0-9]+]]:ppr = COPY $p1
+; CHECK:    [[PRED0:%[0-9]+]]:ppr = COPY $p0
+; CHECK:    [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK:    [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK:    [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK:    [[STACK:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0
+; CHECK:    [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], [[OFFSET3]]
+; CHECK:    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+; CHECK:    STR_PXI [[PRED3]], killed [[ADDR3]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK:    [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], [[OFFSET2]]
+; CHECK:    STR_PXI [[PRED2]], killed [[ADDR2]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK:    [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[STACK]], [[OFFSET1]]
+; CHECK:    STR_PXI [[PRED1]], killed [[ADDR1]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK:    STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
+; CHECK:    $x0 = COPY [[STACK]]
+; CHECK:    BL @callee_with_svepred_arg_2d_4x, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $p1, implicit $p2, implicit $p3, implicit $x0, implicit-def $sp, implicit-def $p0, implicit-def $p1, implicit-def $p2, implicit-def $p3
+; CHECK:    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  %res = call [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_2d_4x([4 x <vscale x 16 x i1>] %arg2, [4 x <vscale x 16 x i1>] %arg1)
+  ret [4 x <vscale x 16 x i1>] %res
+}
+
 ; Test that z8 and z9, passed by reference, are loaded from a location that is passed on the stack.
 ; i.e.     x0 =   %x0
 ;             :
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
index a0eee24275f1e..e6bb660adbf66 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
@@ -128,14 +128,26 @@ define <vscale x 4 x i1> @sve_signature_pred(<vscale x 4 x i1> %arg1, <vscale x
   ret <vscale x 4 x i1> %arg2
 }
 
-; CHECK-LABEL: name: sve_signature_pred_2d
+; Test that a scalable predicate argument of type [1 x <vscale x 4 x i1>] is properly assigned to a P register.
+; CHECK-LABEL: name: sve_signature_pred_2d_1x
 ; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p1
 ; CHECK: $p0 = COPY [[RES]]
 ; CHECK: RET_ReallyLR implicit $p0
-define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_1x([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
   ret [1 x <vscale x 4 x i1>] %arg2
 }
 
+; Test that up to two arguments of type [2 x <vscale x 4 x i1>] can be assigned to P registers.
+; CHECK-LABEL: name: sve_signature_pred_2d_2x
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p2
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2d_2x([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
+  ret [2 x <vscale x 4 x i1>] %arg2
+}
+
 ; CHECK-LABEL: name: sve_signature_vec_caller
 ; CHECK-DAG: [[ARG2:%[0-9]+]]:zpr = COPY $z1
 ; CHECK-DAG: [[ARG1:%[0-9]+]]:zpr = COPY $z0
@@ -164,20 +176,40 @@ define <vscale x 4 x i1> @sve_signature_pred_caller(<vscale x 4 x i1> %arg1, <vs
   ret <vscale x 4 x i1> %res
 }
 
-; CHECK-LABEL: name: sve_signature_pred_2d_caller
+; CHECK-LABEL: name: sve_signature_pred_2d_1x_caller
 ; CHECK-DAG: [[ARG2:%[0-9]+]]:ppr = COPY $p1
 ; CHECK-DAG: [[ARG1:%[0-9]+]]:ppr = COPY $p0
 ; CHECK-DAG: $p0 = COPY [[ARG2]]
 ; CHECK-DAG: $p1 = COPY [[ARG1]]
-; CHECK-NEXT: BL @sve_signature_pred_2d, csr_aarch64_sve_aapcs
+; CHECK-NEXT: BL @sve_signature_pred_2d_1x, csr_aarch64_sve_aapcs
 ; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p0
 ; CHECK: $p0 = COPY [[RES]]
 ; CHECK: RET_ReallyLR implicit $p0
-define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_caller([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
-  %res = call [1 x <vscale x 4 x i1>] @sve_signature_pred_2d([1 x <vscale x 4 x i1>] %arg2, [1 x <vscale x 4 x i1>] %arg1)
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_1x_caller([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+  %res = call [1 x <vscale x 4 x i1>] @sve_signature_pred_2d_1x([1 x <vscale x 4 x i1>] %arg2, [1 x <vscale x 4 x i1>] %arg1)
   ret [1 x <vscale x 4 x i1>] %res
 }
 
+; CHECK-LABEL: name: sve_signature_pred_2d_2x_caller
+; CHECK-DAG: [[ARG2_2:%[0-9]+]]:ppr = COPY $p3
+; CHECK-DAG: [[ARG2_1:%[0-9]+]]:ppr = COPY $p2
+; CHECK-DAG: [[ARG1_2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1_1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2_1]]
+; CHECK-DAG: $p1 = COPY [[ARG2_2]]
+; CHECK-DAG: $p2 = COPY [[ARG1_1]]
+; CHECK-DAG: $p3 = COPY [[ARG1_2]]
+; CHECK-NEXT: BL @sve_signature_pred_2d_2x, csr_aarch64_sve_aapcs
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2d_2x_caller([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
+  %res = call [2 x <vscale x 4 x i1>] @sve_signature_pred_2d_2x([2 x <vscale x 4 x i1>] %arg2, [2 x <vscale x 4 x i1>] %arg1)
+  ret [2 x <vscale x 4 x i1>] %res
+}
+
 ; Test that functions returning or taking SVE arguments use the correct
 ; callee-saved set when using the default C calling convention (as opposed
 ; to aarch64_sve_vector_pcs)


