[llvm] f492471 - [AArch64][SVE] Lower function arguments in types of arrays of predicates (#90122)

via llvm-commits llvm-commits at lists.llvm.org
Fri May 24 10:09:53 PDT 2024


Author: Zhaoshi Zheng
Date: 2024-05-24T10:09:49-07:00
New Revision: f49247129f3e873841bc6c3fec4bdc7c9d6f1dd7

URL: https://github.com/llvm/llvm-project/commit/f49247129f3e873841bc6c3fec4bdc7c9d6f1dd7
DIFF: https://github.com/llvm/llvm-project/commit/f49247129f3e873841bc6c3fec4bdc7c9d6f1dd7.diff

LOG: [AArch64][SVE] Lower function arguments in types of arrays of predicates (#90122)

Function arguments whose types are arrays of scalable predicates, even of type [1 x <vscale x 16 x i1>], should be allocated to the available registers P0 ~ P3, or stored on the stack with a pointer to the stack location passed to the callee.

For details, see https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#parameter-passing, steps C.7 and C.8.
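As a rough sketch of the rule (modeled on the tests added below; @example is a hypothetical function, and the comment states the expected assignment rather than verified compiler output):

    ; %arg2 is expected in p1, since p0 is already taken by %arg1
    define <vscale x 16 x i1> @example(<vscale x 16 x i1> %arg1,
                                       [1 x <vscale x 16 x i1>] %arg2) {
      %p = extractvalue [1 x <vscale x 16 x i1>] %arg2, 0
      ret <vscale x 16 x i1> %p
    }

Once P0 ~ P3 are exhausted, such an array is instead passed indirectly: the caller stores the predicates to a stack slot and passes the address of that slot in a GPR (x0 in the tests below).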

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
    llvm/test/CodeGen/AArch64/sve-calling-convention.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
index bfcafc6442d24..9a804c12939c4 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -38,6 +38,8 @@ static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
 static const MCPhysReg ZRegList[] = {AArch64::Z0, AArch64::Z1, AArch64::Z2,
                                      AArch64::Z3, AArch64::Z4, AArch64::Z5,
                                      AArch64::Z6, AArch64::Z7};
+static const MCPhysReg PRegList[] = {AArch64::P0, AArch64::P1, AArch64::P2,
+                                     AArch64::P3};
 
 static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
                              MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
@@ -59,11 +61,17 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
     // CCAssignFn again we want it to behave as if all remaining registers are
     // allocated. This will force the code to pass the tuple indirectly in
     // accordance with the PCS.
-    bool RegsAllocated[8];
+    bool ZRegsAllocated[8];
     for (int I = 0; I < 8; I++) {
-      RegsAllocated[I] = State.isAllocated(ZRegList[I]);
+      ZRegsAllocated[I] = State.isAllocated(ZRegList[I]);
       State.AllocateReg(ZRegList[I]);
     }
+    // The same applies to P registers.
+    bool PRegsAllocated[4];
+    for (int I = 0; I < 4; I++) {
+      PRegsAllocated[I] = State.isAllocated(PRegList[I]);
+      State.AllocateReg(PRegList[I]);
+    }
 
     auto &It = PendingMembers[0];
     CCAssignFn *AssignFn =
@@ -79,8 +87,11 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
     // Return the register state back to how it was before, leaving any
     // unallocated registers available for other smaller types.
     for (int I = 0; I < 8; I++)
-      if (!RegsAllocated[I])
+      if (!ZRegsAllocated[I])
         State.DeallocateReg(ZRegList[I]);
+    for (int I = 0; I < 4; I++)
+      if (!PRegsAllocated[I])
+        State.DeallocateReg(PRegList[I]);
 
     // All pending members have now been allocated
     PendingMembers.clear();
@@ -140,9 +151,15 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
     RegList = DRegList;
   else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector())
     RegList = QRegList;
-  else if (LocVT.isScalableVector())
-    RegList = ZRegList;
-  else {
+  else if (LocVT.isScalableVector()) {
+    // Scalable masks should be passed in predicate registers.
+    if (LocVT == MVT::nxv1i1 || LocVT == MVT::nxv2i1 || LocVT == MVT::nxv4i1 ||
+        LocVT == MVT::nxv8i1 || LocVT == MVT::nxv16i1 ||
+        LocVT == MVT::aarch64svcount)
+      RegList = PRegList;
+    else
+      RegList = ZRegList;
+  } else {
     // Not an array we want to split up after all.
     return false;
   }

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index df17767138d90..25ba8d8500306 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7248,7 +7248,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
       uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
       unsigned NumParts = 1;
       if (Ins[i].Flags.isInConsecutiveRegs()) {
-        assert(!Ins[i].Flags.isInConsecutiveRegsLast());
         while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
           ++NumParts;
       }
@@ -8245,7 +8244,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
       uint64_t PartSize = StoreSize;
       unsigned NumParts = 1;
       if (Outs[i].Flags.isInConsecutiveRegs()) {
-        assert(!Outs[i].Flags.isInConsecutiveRegsLast());
         while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
           ++NumParts;
         StoreSize *= NumParts;

diff  --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
index 8cb8b1c92fa7e..8ce24ceb33d71 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
@@ -72,6 +72,203 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_many_svepred_arg(<
   ret <vscale x 16 x i1> %ret
 }
 
+; Test that arg2 is passed through x0, i.e., x0 = &%arg2; and the return value is loaded from x0:
+;     P0 = ldr [x0]
+define aarch64_sve_vector_pcs <vscale x 16 x i1> @callee_with_svepred_arg_4xv16i1_1xv16i1([4 x <vscale x 16 x i1>] %arg1, [1 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_4xv16i1_1xv16i1
+; CHECK:    [[BASE:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK:    [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK:    $p0 = COPY [[PRED0]]
+; CHECK:    RET_ReallyLR implicit $p0
+  %res = extractvalue [1 x <vscale x 16 x i1>] %arg2, 0
+  ret <vscale x 16 x i1> %res
+}
+
+; Test that arg1 is stored to the stack from p0, and the stack location is passed through x0 to set up the call:
+;     str P0, [stack_loc_for_args]
+;     x0 = stack_loc_for_args
+define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_svepred_arg_1xv16i1_4xv16i1([1 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_1xv16i1_4xv16i1
+; CHECK: stack:
+; CHECK:      - { id: 0, name: '', type: default, offset: 0, size: 2, alignment: 2,
+; CHECK-NEXT:     stack-id: scalable-vector,
+; CHECK:    [[PRED0:%[0-9]+]]:ppr = COPY $p0
+; CHECK:    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+; CHECK:    STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
+; CHECK:    [[STACK:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0
+; CHECK:    $x0 = COPY [[STACK]]
+; CHECK:    BL @callee_with_svepred_arg_4xv16i1_1xv16i1, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $p1, implicit $p2, implicit $p3, implicit $x0, implicit-def $sp, implicit-def $p0
+; CHECK:    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  %res = call <vscale x 16 x i1> @callee_with_svepred_arg_4xv16i1_1xv16i1([4 x <vscale x 16 x i1>] %arg2, [1 x <vscale x 16 x i1>] %arg1)
+  ret <vscale x 16 x i1> %res
+}
+
+; Test that arg2 is passed through x0, i.e., x0 = &%arg2; and return values are loaded from x0:
+;     P0 = ldr [x0]
+;     P1 = ldr [x0 +   sizeof(Px)]
+;     P2 = ldr [x0 + 2*sizeof(Px)]
+;     P3 = ldr [x0 + 3*sizeof(Px)]
+define aarch64_sve_vector_pcs [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_4xv16i1_4xv16i1([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_4xv16i1_4xv16i1
+; CHECK:    [[BASE:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK:    [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK:    [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[BASE]], killed [[OFFSET1]]
+; CHECK:    [[PRED1:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR1]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK:    [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK:    [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET2]]
+; CHECK:    [[PRED2:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR2]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK:    [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK:    [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET3]]
+; CHECK:    [[PRED3:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR3]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK:    [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK:    $p0 = COPY [[PRED0]]
+; CHECK:    $p1 = COPY [[PRED1]]
+; CHECK:    $p2 = COPY [[PRED2]]
+; CHECK:    $p3 = COPY [[PRED3]]
+; CHECK:    RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+  ret [4 x <vscale x 16 x i1>] %arg2
+}
+
+; Test that arg1 is stored to the stack from p0~p3, and the stack location is passed through x0 to set up the call:
+;     str P0, [stack_loc_for_args]
+;     str P1, [stack_loc_for_args +   sizeof(Px)]
+;     str P2, [stack_loc_for_args + 2*sizeof(Px)]
+;     str P3, [stack_loc_for_args + 3*sizeof(Px)]
+;     x0 = stack_loc_for_args
+define [4 x <vscale x 16 x i1>] @caller_with_svepred_arg_4xv16i1_4xv16i1([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_4xv16i1_4xv16i1
+; CHECK: stack:
+; CHECK:      - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
+; CHECK-NEXT:     stack-id: scalable-vector,
+; CHECK:    [[PRED3:%[0-9]+]]:ppr = COPY $p3
+; CHECK:    [[PRED2:%[0-9]+]]:ppr = COPY $p2
+; CHECK:    [[PRED1:%[0-9]+]]:ppr = COPY $p1
+; CHECK:    [[PRED0:%[0-9]+]]:ppr = COPY $p0
+; CHECK:    [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK:    [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK:    [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK:    [[STACK:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0
+; CHECK:    [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], [[OFFSET3]]
+; CHECK:    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+; CHECK:    STR_PXI [[PRED3]], killed [[ADDR3]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK:    [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], [[OFFSET2]]
+; CHECK:    STR_PXI [[PRED2]], killed [[ADDR2]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK:    [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[STACK]], [[OFFSET1]]
+; CHECK:    STR_PXI [[PRED1]], killed [[ADDR1]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK:    STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
+; CHECK:    $x0 = COPY [[STACK]]
+; CHECK:    BL @callee_with_svepred_arg_4xv16i1_4xv16i1, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $p1, implicit $p2, implicit $p3, implicit $x0, implicit-def $sp, implicit-def $p0, implicit-def $p1, implicit-def $p2, implicit-def $p3
+; CHECK:    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  %res = call [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_4xv16i1_4xv16i1([4 x <vscale x 16 x i1>] %arg2, [4 x <vscale x 16 x i1>] %arg1)
+  ret [4 x <vscale x 16 x i1>] %res
+}
+
+; Test that arg2 is passed through x0, i.e., x0 = &%arg2; and return values are loaded from x0:
+;     P0 = ldr [x0]
+;     P1 = ldr [x0 +   sizeof(Px)]
+;     P2 = ldr [x0 + 2*sizeof(Px)]
+;     P3 = ldr [x0 + 3*sizeof(Px)]
+define aarch64_sve_vector_pcs [2 x <vscale x 32 x i1>] @callee_with_svepred_arg_1xv16i1_2xv32i1([1 x <vscale x 16 x i1>] %arg1, [2 x <vscale x 32 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_1xv16i1_2xv32i1
+; CHECK:    [[BASE:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK:    [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK:    [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[BASE]], killed [[OFFSET1]]
+; CHECK:    [[PRED1:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR1]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK:    [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK:    [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET2]]
+; CHECK:    [[PRED2:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR2]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK:    [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK:    [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET3]]
+; CHECK:    [[PRED3:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR3]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK:    [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK:    $p0 = COPY [[PRED0]]
+; CHECK:    $p1 = COPY [[PRED1]]
+; CHECK:    $p2 = COPY [[PRED2]]
+; CHECK:    $p3 = COPY [[PRED3]]
+; CHECK:    RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+  ret [2 x <vscale x 32 x i1>] %arg2
+}
+
+; Test that arg1 is stored to the stack from p0~p3, and the stack location is passed through x0 to set up the call:
+;     str P0, [stack_loc_for_args]
+;     str P1, [stack_loc_for_args +   sizeof(Px)]
+;     str P2, [stack_loc_for_args + 2*sizeof(Px)]
+;     str P3, [stack_loc_for_args + 3*sizeof(Px)]
+;     x0 = stack_loc_for_args
+define [2 x <vscale x 32 x i1>] @caller_with_svepred_arg_2xv32i1_1xv16i1([2 x <vscale x 32 x i1>] %arg1, [1 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_2xv32i1_1xv16i1
+; CHECK: stack:
+; CHECK:      - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
+; CHECK-NEXT:     stack-id: scalable-vector,
+; CHECK:    [[PRED3:%[0-9]+]]:ppr = COPY $p3
+; CHECK:    [[PRED2:%[0-9]+]]:ppr = COPY $p2
+; CHECK:    [[PRED1:%[0-9]+]]:ppr = COPY $p1
+; CHECK:    [[PRED0:%[0-9]+]]:ppr = COPY $p0
+; CHECK:    [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK:    [[STACK:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0
+; CHECK:    [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], killed [[OFFSET3]]
+; CHECK:    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+; CHECK:    STR_PXI [[PRED3]], killed [[ADDR3]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK:    [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK:    [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], killed [[OFFSET2]]
+; CHECK:    STR_PXI [[PRED2]], killed [[ADDR2]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK:    [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK:    [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[STACK]], killed [[OFFSET1]]
+; CHECK:    STR_PXI [[PRED1]], killed [[ADDR1]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK:    STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
+; CHECK:    $x0 = COPY [[STACK]]
+; CHECK:    BL @callee_with_svepred_arg_1xv16i1_2xv32i1, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $x0, implicit-def $sp, implicit-def $p0, implicit-def $p1, implicit-def $p2, implicit-def $p3
+; CHECK:    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  %res = call [2 x <vscale x 32 x i1>] @callee_with_svepred_arg_1xv16i1_2xv32i1([1 x <vscale x 16 x i1>] %arg2, [2 x <vscale x 32 x i1>] %arg1)
+  ret [2 x <vscale x 32 x i1>] %res
+}
+
+; Test that arg1 and arg3 are passed via P0~P3, and arg2 is passed indirectly through an address on the stack in x0
+define aarch64_sve_vector_pcs [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_2xv16i1_4xv16i1_2xv16i1([2 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2, [2 x <vscale x 16 x i1>] %arg3) nounwind {
+; CHECK: name: callee_with_svepred_arg_2xv16i1_4xv16i1_2xv16i1
+; CHECK:    [[P3:%[0-9]+]]:ppr = COPY $p3
+; CHECK:    [[P2:%[0-9]+]]:ppr = COPY $p2
+; CHECK:    [[X0:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK:    [[P1:%[0-9]+]]:ppr = COPY $p1
+; CHECK:    [[P0:%[0-9]+]]:ppr = COPY $p0
+; CHECK:    [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK:    [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[X0]], killed [[OFFSET3]]
+; CHECK:    [[P7:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR3]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK:    [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK:    [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[X0]], killed [[OFFSET2]]
+; CHECK:    [[P6:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR2]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK:    [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK:    [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[X0]], killed [[OFFSET1]]
+; CHECK:    [[P5:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR1]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK:    [[P4:%[0-9]+]]:ppr = LDR_PXI [[X0]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK:    [[RES0:%[0-9]+]]:ppr = AND_PPzPP [[P0]], [[P0]], killed [[P4]]
+; CHECK:    [[RES1:%[0-9]+]]:ppr = AND_PPzPP [[P1]], [[P1]], killed [[P5]]
+; CHECK:    [[RES2:%[0-9]+]]:ppr = AND_PPzPP [[P2]], [[P2]], killed [[P6]]
+; CHECK:    [[RES3:%[0-9]+]]:ppr = AND_PPzPP [[P3]], [[P3]], killed [[P7]]
+; CHECK:    $p0 = COPY [[RES0]]
+; CHECK:    $p1 = COPY [[RES1]]
+; CHECK:    $p2 = COPY [[RES2]]
+; CHECK:    $p3 = COPY [[RES3]]
+; CHECK:    RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+  %p0 = extractvalue [2 x <vscale x 16 x i1>] %arg1, 0
+  %p1 = extractvalue [2 x <vscale x 16 x i1>] %arg1, 1
+  %p2 = extractvalue [2 x <vscale x 16 x i1>] %arg3, 0
+  %p3 = extractvalue [2 x <vscale x 16 x i1>] %arg3, 1
+  %p4 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 0
+  %p5 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 1
+  %p6 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 2
+  %p7 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 3
+  %r0 = and <vscale x 16 x i1> %p0, %p4
+  %r1 = and <vscale x 16 x i1> %p1, %p5
+  %r2 = and <vscale x 16 x i1> %p2, %p6
+  %r3 = and <vscale x 16 x i1> %p3, %p7
+  %1 = insertvalue  [4 x <vscale x 16 x i1>] undef, <vscale x 16 x i1> %r0, 0
+  %2 = insertvalue  [4 x <vscale x 16 x i1>]    %1, <vscale x 16 x i1> %r1, 1
+  %3 = insertvalue  [4 x <vscale x 16 x i1>]    %2, <vscale x 16 x i1> %r2, 2
+  %4 = insertvalue  [4 x <vscale x 16 x i1>]    %3, <vscale x 16 x i1> %r3, 3
+  ret [4 x <vscale x 16 x i1>] %4
+}
+
 ; Test that z8 and z9, passed by reference, are loaded from a location that is passed on the stack.
 ; i.e.     x0 =   %x0
 ;             :

diff  --git a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
index 0a45244f12be5..bfb750517cbf9 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
@@ -128,6 +128,52 @@ define <vscale x 4 x i1> @sve_signature_pred(<vscale x 4 x i1> %arg1, <vscale x
   ret <vscale x 4 x i1> %arg2
 }
 
+; Test that a scalable predicate argument of type [1 x <vscale x 4 x i1>] is properly assigned to P registers.
+; CHECK-LABEL: name: sve_signature_pred_1xv4i1
+; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $p0
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_1xv4i1([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+  ret [1 x <vscale x 4 x i1>] %arg2
+}
+
+; Test that up to two scalable predicate arguments of type [2 x <vscale x 4 x i1>] can be assigned to P registers.
+; CHECK-LABEL: name: sve_signature_pred_2xv4i1
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p2
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
+  ret [2 x <vscale x 4 x i1>] %arg2
+}
+
+; Test that a scalable predicate argument of type [1 x <vscale x 32 x i1>] is assigned to two P registers.
+; CHECK-LABEL: name: sve_signature_pred_1xv32i1
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p2
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [1 x <vscale x 32 x i1>] @sve_signature_pred_1xv32i1([1 x <vscale x 32 x i1>] %arg1, [1 x <vscale x 32 x i1>] %arg2) nounwind {
+  ret [1 x <vscale x 32 x i1>] %arg2
+}
+
+; Test that a scalable predicate argument of type [2 x <vscale x 32 x i1>] is assigned to four P registers.
+; CHECK-LABEL: name: sve_signature_pred_2xv32i1
+; CHECK: [[RES3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[RES2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: $p2 = COPY [[RES2]]
+; CHECK: $p3 = COPY [[RES3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+define [2 x <vscale x 32 x i1>] @sve_signature_pred_2xv32i1([2 x <vscale x 32 x i1>] %arg1) nounwind {
+  ret [2 x <vscale x 32 x i1>] %arg1
+}
+
 ; CHECK-LABEL: name: sve_signature_vec_caller
 ; CHECK-DAG: [[ARG2:%[0-9]+]]:zpr = COPY $z1
 ; CHECK-DAG: [[ARG1:%[0-9]+]]:zpr = COPY $z0
@@ -156,6 +202,84 @@ define <vscale x 4 x i1> @sve_signature_pred_caller(<vscale x 4 x i1> %arg1, <vs
   ret <vscale x 4 x i1> %res
 }
 
+; CHECK-LABEL: name: sve_signature_pred_1xv4i1_caller
+; CHECK-DAG: [[ARG2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2]]
+; CHECK-DAG: $p1 = COPY [[ARG1]]
+; CHECK-NEXT: BL @sve_signature_pred_1xv4i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p0
+; CHECK: $p0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $p0
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_1xv4i1_caller([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+  %res = call [1 x <vscale x 4 x i1>] @sve_signature_pred_1xv4i1([1 x <vscale x 4 x i1>] %arg2, [1 x <vscale x 4 x i1>] %arg1)
+  ret [1 x <vscale x 4 x i1>] %res
+}
+
+; CHECK-LABEL: name: sve_signature_pred_2xv4i1_caller
+; CHECK-DAG: [[ARG2_2:%[0-9]+]]:ppr = COPY $p3
+; CHECK-DAG: [[ARG2_1:%[0-9]+]]:ppr = COPY $p2
+; CHECK-DAG: [[ARG1_2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1_1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2_1]]
+; CHECK-DAG: $p1 = COPY [[ARG2_2]]
+; CHECK-DAG: $p2 = COPY [[ARG1_1]]
+; CHECK-DAG: $p3 = COPY [[ARG1_2]]
+; CHECK-NEXT: BL @sve_signature_pred_2xv4i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1_caller([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
+  %res = call [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg2, [2 x <vscale x 4 x i1>] %arg1)
+  ret [2 x <vscale x 4 x i1>] %res
+}
+
+; CHECK-LABEL: name: sve_signature_pred_1xv32i1_caller
+; CHECK-DAG: [[ARG2_2:%[0-9]+]]:ppr = COPY $p3
+; CHECK-DAG: [[ARG2_1:%[0-9]+]]:ppr = COPY $p2
+; CHECK-DAG: [[ARG1_2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1_1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2_1]]
+; CHECK-DAG: $p1 = COPY [[ARG2_2]]
+; CHECK-DAG: $p2 = COPY [[ARG1_1]]
+; CHECK-DAG: $p3 = COPY [[ARG1_2]]
+; CHECK-NEXT: BL @sve_signature_pred_1xv32i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [1 x <vscale x 32 x i1>] @sve_signature_pred_1xv32i1_caller([1 x <vscale x 32 x i1>] %arg1, [1 x <vscale x 32 x i1>] %arg2) nounwind {
+  %res = call [1 x <vscale x 32 x i1>] @sve_signature_pred_1xv32i1([1 x <vscale x 32 x i1>] %arg2, [1 x <vscale x 32 x i1>] %arg1)
+  ret [1 x <vscale x 32 x i1>] %res
+}
+
+; CHECK-LABEL: name: sve_signature_pred_2xv32i1_caller
+; CHECK-DAG: [[ARG3:%[0-9]+]]:ppr = COPY $p3
+; CHECK-DAG: [[ARG2:%[0-9]+]]:ppr = COPY $p2
+; CHECK-DAG: [[ARG1:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG0:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG0]]
+; CHECK-DAG: $p1 = COPY [[ARG1]]
+; CHECK-DAG: $p2 = COPY [[ARG2]]
+; CHECK-DAG: $p3 = COPY [[ARG3]]
+; CHECK-NEXT: BL @sve_signature_pred_2xv32i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[RES2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[RES3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: $p2 = COPY [[RES2]]
+; CHECK: $p3 = COPY [[RES3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+define [2 x <vscale x 32 x i1>] @sve_signature_pred_2xv32i1_caller([2 x <vscale x 32 x i1>] %arg1) {
+  %res = call [2 x <vscale x 32 x i1>] @sve_signature_pred_2xv32i1([2 x <vscale x 32 x i1>] %arg1)
+  ret [2 x <vscale x 32 x i1>] %res
+}
+
 ; Test that functions returning or taking SVE arguments use the correct
 ; callee-saved set when using the default C calling convention (as opposed
 ; to aarch64_sve_vector_pcs)

