[llvm] 3f78605 - [PowerPC] Add paired vector load and store builtins and intrinsics
Baptiste Saleil via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 13 10:35:53 PST 2020
Author: Baptiste Saleil
Date: 2020-11-13T12:35:10-06:00
New Revision: 3f78605a8cb121d005c0ad11cce83cf58be983f2
URL: https://github.com/llvm/llvm-project/commit/3f78605a8cb121d005c0ad11cce83cf58be983f2
DIFF: https://github.com/llvm/llvm-project/commit/3f78605a8cb121d005c0ad11cce83cf58be983f2.diff
LOG: [PowerPC] Add paired vector load and store builtins and intrinsics
This patch adds the Clang builtins and LLVM intrinsics to load and store vector pairs.
Differential Revision: https://reviews.llvm.org/D90799
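For reference, a minimal usage sketch of the new builtins, modelled on the Clang tests
added below (the function name and the compile invocation are illustrative assumptions,
not part of the patch; an MMA/pwr10-enabled compile is assumed):

  // Hypothetical invocation: clang -mcpu=pwr10 -c pair_copy.c
  // Load a vector pair from vpp + offset, then store it to vp2 + offset.
  void pair_copy(const __vector_pair *vpp, signed long long offset,
                 const __vector_pair *vp2) {
    __vector_pair vp = __builtin_mma_lxvp(offset, vpp);
    __builtin_mma_stxvp(vp, offset, vp2);
  }

At the IR level these map to the new intrinsics, declared in the tests as:

  declare <256 x i1> @llvm.ppc.mma.lxvp(i8*)
  declare void @llvm.ppc.mma.stxvp(<256 x i1>, i8*)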
Added:
llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll
llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll
Modified:
clang/include/clang/Basic/BuiltinsPPC.def
clang/lib/CodeGen/CGBuiltin.cpp
clang/test/CodeGen/builtins-ppc-mma.c
clang/test/Sema/ppc-mma-types.c
llvm/include/llvm/IR/IntrinsicsPowerPC.td
llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
llvm/lib/Target/PowerPC/PPCISelLowering.cpp
llvm/lib/Target/PowerPC/PPCISelLowering.h
llvm/lib/Target/PowerPC/PPCInstrInfo.td
llvm/lib/Target/PowerPC/PPCInstrPrefix.td
llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def
index f35a49b681cc..78ce77043b6f 100644
--- a/clang/include/clang/Basic/BuiltinsPPC.def
+++ b/clang/include/clang/Basic/BuiltinsPPC.def
@@ -738,6 +738,8 @@ MMA_BUILTIN(pmxvbf16ger2pp, "vW512*VVi15i15i3", true)
MMA_BUILTIN(pmxvbf16ger2pn, "vW512*VVi15i15i3", true)
MMA_BUILTIN(pmxvbf16ger2np, "vW512*VVi15i15i3", true)
MMA_BUILTIN(pmxvbf16ger2nn, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(lxvp, "W256SLLiW256C*", false)
+MMA_BUILTIN(stxvp, "vW256SLLiW256C*", false)
// FIXME: Obviously incomplete.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 0b8259221d8f..0ea149e0cbde 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14776,6 +14776,19 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
break;
#include "clang/Basic/BuiltinsPPC.def"
}
+ if (BuiltinID == PPC::BI__builtin_mma_lxvp ||
+ BuiltinID == PPC::BI__builtin_mma_stxvp) {
+ if (BuiltinID == PPC::BI__builtin_mma_lxvp) {
+ Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy);
+ Ops[0] = Builder.CreateGEP(Ops[1], Ops[0]);
+ } else {
+ Ops[2] = Builder.CreateBitCast(Ops[2], Int8PtrTy);
+ Ops[1] = Builder.CreateGEP(Ops[2], Ops[1]);
+ }
+ Ops.pop_back();
+ llvm::Function *F = CGM.getIntrinsic(ID);
+ return Builder.CreateCall(F, Ops, "");
+ }
SmallVector<Value*, 4> CallOps;
if (Accumulate) {
Address Addr = EmitPointerWithAlignment(E->getArg(0));
diff --git a/clang/test/CodeGen/builtins-ppc-mma.c b/clang/test/CodeGen/builtins-ppc-mma.c
index 820f72653876..88ca36aa6714 100644
--- a/clang/test/CodeGen/builtins-ppc-mma.c
+++ b/clang/test/CodeGen/builtins-ppc-mma.c
@@ -1036,3 +1036,162 @@ void test65(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns
__builtin_mma_pmxvbf16ger2nn(&vq, vc, vc, 0, 0, 0);
*((__vector_quad *)resp) = vq;
}
+
+// CHECK-LABEL: @test66(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8*
+// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP1]], i8* [[TMP2]])
+// CHECK-NEXT: ret void
+//
+void test66(const __vector_pair *vpp, const __vector_pair *vp2) {
+ __vector_pair vp = __builtin_mma_lxvp(0LL, vpp);
+ __builtin_mma_stxvp(vp, 0LL, vp2);
+}
+
+// CHECK-LABEL: @test67(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 [[OFFSET:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8*
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[OFFSET]]
+// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]])
+// CHECK-NEXT: ret void
+//
+void test67(const __vector_pair *vpp, signed long long offset, const __vector_pair *vp2) {
+ __vector_pair vp = __builtin_mma_lxvp(offset, vpp);
+ __builtin_mma_stxvp(vp, offset, vp2);
+}
+
+// CHECK-LABEL: @test68(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 18
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8*
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 18
+// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]])
+// CHECK-NEXT: ret void
+//
+void test68(const __vector_pair *vpp, const __vector_pair *vp2) {
+ __vector_pair vp = __builtin_mma_lxvp(18LL, vpp);
+ __builtin_mma_stxvp(vp, 18LL, vp2);
+}
+
+// CHECK-LABEL: @test69(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8*
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 1
+// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]])
+// CHECK-NEXT: ret void
+//
+void test69(const __vector_pair *vpp, const __vector_pair *vp2) {
+ __vector_pair vp = __builtin_mma_lxvp(1LL, vpp);
+ __builtin_mma_stxvp(vp, 1LL, vp2);
+}
+
+// CHECK-LABEL: @test70(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 42
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8*
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 42
+// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]])
+// CHECK-NEXT: ret void
+//
+void test70(const __vector_pair *vpp, const __vector_pair *vp2) {
+ __vector_pair vp = __builtin_mma_lxvp(42LL, vpp);
+ __builtin_mma_stxvp(vp, 42LL, vp2);
+}
+
+// CHECK-LABEL: @test71(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <256 x i1>, <256 x i1>* [[VPP:%.*]], i64 128
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <256 x i1>* [[TMP0]] to i8*
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = getelementptr <256 x i1>, <256 x i1>* [[VP2:%.*]], i64 128
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <256 x i1>* [[TMP3]] to i8*
+// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]])
+// CHECK-NEXT: ret void
+//
+void test71(const __vector_pair *vpp, const __vector_pair *vp2) {
+ __vector_pair vp = __builtin_mma_lxvp(32768LL, vpp);
+ __builtin_mma_stxvp(vp, 32768LL, vp2);
+}
+
+// CHECK-LABEL: @test72(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 32799
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8*
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 32799
+// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]])
+// CHECK-NEXT: ret void
+//
+void test72(const __vector_pair *vpp, const __vector_pair *vp2) {
+ __vector_pair vp = __builtin_mma_lxvp(32799LL, vpp);
+ __builtin_mma_stxvp(vp, 32799LL, vp2);
+}
+
+// CHECK-LABEL: @test73(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>*
+// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2:!tbaa !.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i64 8
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> [[TMP1]], <256 x i1> [[TMP4]], <16 x i8> [[VC:%.*]], i32 0, i32 0)
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT: store <512 x i1> [[TMP5]], <512 x i1>* [[TMP6]], align 64, [[TBAA2]]
+// CHECK-NEXT: ret void
+//
+void test73(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) {
+ __vector_quad vq = *((__vector_quad *)vqp);
+ __vector_pair vp = __builtin_mma_lxvp(8LL, vpp);
+ __builtin_mma_pmxvf64gernn(&vq, vp, vc, 0, 0);
+ *((__vector_quad *)resp) = vq;
+}
+
+// CHECK-LABEL: @test74(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>*
+// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]])
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, [[TBAA2]]
+// CHECK-NEXT: ret void
+//
+void test74(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) {
+ __vector_quad vq = *((__vector_quad *)vqp);
+ __vector_pair vp = __builtin_mma_lxvp(0LL, vpp);
+ __builtin_mma_xvf64gernp(&vq, vp, vc);
+ *((__vector_quad *)resp) = vq;
+}
+
+// CHECK-LABEL: @test75(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>*
+// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2:!tbaa !.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[OFFS:%.*]]
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP1]], <256 x i1> [[TMP4]], <16 x i8> [[VC:%.*]])
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT: store <512 x i1> [[TMP5]], <512 x i1>* [[TMP6]], align 64, [[TBAA2]]
+// CHECK-NEXT: ret void
+//
+void test75(unsigned char *vqp, signed long long offs, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) {
+ __vector_quad vq = *((__vector_quad *)vqp);
+ __vector_pair vp = __builtin_mma_lxvp(offs, vpp);
+ __builtin_mma_xvf64gernp(&vq, vp, vc);
+ *((__vector_quad *)resp) = vq;
+}
diff --git a/clang/test/Sema/ppc-mma-types.c b/clang/test/Sema/ppc-mma-types.c
index 96644a4d9bbd..840e34845f58 100644
--- a/clang/test/Sema/ppc-mma-types.c
+++ b/clang/test/Sema/ppc-mma-types.c
@@ -319,3 +319,17 @@ void testVPOperators4(int v, void *ptr) {
__vector_pair vp2 = (__vector_pair)vpp; // expected-error {{used type '__vector_pair' where arithmetic or pointer type is required}}
}
+void testBuiltinTypes1(const __vector_pair *vpp, const __vector_pair *vp2, float f) {
+ __vector_pair vp = __builtin_mma_lxvp(f, vpp); // expected-error {{passing 'float' to parameter of incompatible type 'long long'}}
+ __builtin_mma_stxvp(vp, 32799, vp2); // expected-error {{passing 'int' to parameter of incompatible type 'long long'}}
+}
+
+void testBuiltinTypes2(__vector_pair *vpp, const __vector_pair *vp2, unsigned char c) {
+ __vector_pair vp = __builtin_mma_lxvp(6LL, vpp); // expected-error {{passing '__vector_pair *' to parameter of incompatible type 'const __vector_pair *'}}
+ __builtin_mma_stxvp(vp, c, vp2); // expected-error {{passing 'unsigned char' to parameter of incompatible type 'long long'}}
+}
+
+void testBuiltinTypes3(vector int v, __vector_pair *vp2, signed long long ll, unsigned short s) {
+ __vector_pair vp = __builtin_mma_lxvp(ll, v); // expected-error {{passing '__vector int' (vector of 4 'int' values) to parameter of incompatible type 'const __vector_pair *'}}
+ __builtin_mma_stxvp(vp, ll, s); // expected-error {{passing 'unsigned short' to parameter of incompatible type 'const __vector_pair *'}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index ac994548c506..fa5000d42482 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1422,6 +1422,14 @@ let TargetPrefix = "ppc" in {
def int_ppc_mma_xxsetaccz :
Intrinsic<[llvm_v512i1_ty], [], [IntrNoMem]>;
+ def int_ppc_mma_lxvp :
+ Intrinsic<[llvm_v256i1_ty], [llvm_ptr_ty],
+ [IntrReadMem, IntrArgMemOnly]>;
+
+ def int_ppc_mma_stxvp :
+ Intrinsic<[], [llvm_v256i1_ty, llvm_ptr_ty],
+ [IntrWriteMem, IntrArgMemOnly]>;
+
// MMA Reduced-Precision: Outer Product Intrinsic Definitions.
defm int_ppc_mma_xvi4ger8 :
PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index c67cf897c397..a66a015ac2ef 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -293,6 +293,13 @@ namespace {
Align(16));
}
+ /// SelectAddrImmX34 - Returns true if the address N can be represented by
+ /// a base register plus a signed 34-bit displacement. Suitable for use by
+ /// PSTXVP and friends.
+ bool SelectAddrImmX34(SDValue N, SDValue &Disp, SDValue &Base) {
+ return PPCLowering->SelectAddressRegImm34(N, Disp, Base, *CurDAG);
+ }
+
// Select an address into a single register.
bool SelectAddr(SDValue N, SDValue &Base) {
Base = N;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 5b5504a458ed..2a77d53a7817 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -2399,6 +2399,20 @@ bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
return false;
}
+/// isIntS34Immediate - This method tests if value of node given can be
+/// accurately represented as a sign extension from a 34-bit value. If so,
+/// this returns true and the immediate.
+bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) {
+ if (!isa<ConstantSDNode>(N))
+ return false;
+
+ Imm = (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+ return isInt<34>(Imm);
+}
+bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) {
+ return isIntS34Immediate(Op.getNode(), Imm);
+}
+
/// SelectAddressRegReg - Given the specified addressed, check to see if it
/// can be represented as an indexed [r+r] operation. Returns false if it
/// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is
@@ -2599,6 +2613,55 @@ bool PPCTargetLowering::SelectAddressRegImm(
return true; // [r+0]
}
+/// Similar to the 16-bit case but for instructions that take a 34-bit
+/// displacement field (prefixed loads/stores).
+bool PPCTargetLowering::SelectAddressRegImm34(SDValue N, SDValue &Disp,
+ SDValue &Base,
+ SelectionDAG &DAG) const {
+ // Only on 64-bit targets.
+ if (N.getValueType() != MVT::i64)
+ return false;
+
+ SDLoc dl(N);
+ int64_t Imm = 0;
+
+ if (N.getOpcode() == ISD::ADD) {
+ if (!isIntS34Immediate(N.getOperand(1), Imm))
+ return false;
+ Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
+ Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+ else
+ Base = N.getOperand(0);
+ return true;
+ }
+
+ if (N.getOpcode() == ISD::OR) {
+ if (!isIntS34Immediate(N.getOperand(1), Imm))
+ return false;
+ // If this is an or of disjoint bitfields, we can codegen this as an add
+ // (for better address arithmetic) if the LHS and RHS of the OR are
+ // provably disjoint.
+ KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
+ if ((LHSKnown.Zero.getZExtValue() | ~(uint64_t)Imm) != ~0ULL)
+ return false;
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
+ Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+ else
+ Base = N.getOperand(0);
+ Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
+ return true;
+ }
+
+ if (isIntS34Immediate(N, Imm)) { // If the address is a 34-bit const.
+ Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
+ Base = DAG.getRegister(PPC::ZERO8, N.getValueType());
+ return true;
+ }
+
+ return false;
+}
+
/// SelectAddressRegRegOnly - Given the specified addressed, force it to be
/// represented as an indexed [r+r] operation.
bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 70dcef4658ff..ca7c68624c68 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -770,6 +770,8 @@ namespace llvm {
bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
SelectionDAG &DAG,
MaybeAlign EncodingAlignment) const;
+ bool SelectAddressRegImm34(SDValue N, SDValue &Disp, SDValue &Base,
+ SelectionDAG &DAG) const;
/// SelectAddressRegRegOnly - Given the specified addressed, force it to be
/// represented as an indexed [r+r] operation.
@@ -1325,6 +1327,8 @@ namespace llvm {
bool isIntS16Immediate(SDNode *N, int16_t &Imm);
bool isIntS16Immediate(SDValue Op, int16_t &Imm);
+ bool isIntS34Immediate(SDNode *N, int64_t &Imm);
+ bool isIntS34Immediate(SDValue Op, int64_t &Imm);
bool convertToNonDenormSingle(APInt &ArgAPInt);
bool convertToNonDenormSingle(APFloat &ArgAPFloat);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 1a128f1ddf0d..2e77d04d4a79 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -1031,11 +1031,13 @@ def pred : Operand<OtherVT> {
// Define PowerPC specific addressing mode.
// d-form
-def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; // "stb"
+def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; // "stb"
// ds-form
-def iaddrX4 : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std"
+def iaddrX4 : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std"
// dq-form
-def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv"
+def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv"
+// 8LS:d-form
+def iaddrX34 : ComplexPattern<iPTR, 2, "SelectAddrImmX34", [], []>; // "pstxvp"
// Below forms are all x-form addressing mode, use three different ones so we
// can make a accurate check for x-form instructions in ISEL.
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
index 9f20bfcebe3c..e1b76bb3bd00 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -1654,6 +1654,24 @@ let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops, PrefixInstrs] i
"pstxvp $XTp, $D_RA", IIC_LdStLFD>;
}
+let Predicates = [PairedVectorMemops] in {
+ // Intrinsics for Paired Vector Loads.
+ def : Pat<(v256i1 (int_ppc_mma_lxvp iaddrX16:$src)), (LXVP memrix16:$src)>;
+ def : Pat<(v256i1 (int_ppc_mma_lxvp xaddrX16:$src)), (LXVPX xaddrX16:$src)>;
+ let Predicates = [PairedVectorMemops, PrefixInstrs] in {
+ def : Pat<(v256i1 (int_ppc_mma_lxvp iaddrX34:$src)), (PLXVP memri34:$src)>;
+ }
+ // Intrinsics for Paired Vector Stores.
+ def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, iaddrX16:$dst),
+ (STXVP $XSp, memrix16:$dst)>;
+ def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, xaddrX16:$dst),
+ (STXVPX $XSp, xaddrX16:$dst)>;
+ let Predicates = [PairedVectorMemops, PrefixInstrs] in {
+ def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, iaddrX34:$dst),
+ (PSTXVP $XSp, memri34:$dst)>;
+ }
+}
+
// TODO: We have an added complexity of 500 here. This is only a temporary
// solution to have tablegen consider these patterns first. The way we do
// addressing for PowerPC is complex depending on available D form, X form, or
diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
index 4f1825bfc1c5..ccbaea88d2f1 100644
--- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
@@ -60,6 +60,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
@@ -277,8 +278,11 @@ static Value *GetPointerOperand(Value *MemI) {
} else if (StoreInst *SMemI = dyn_cast<StoreInst>(MemI)) {
return SMemI->getPointerOperand();
} else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(MemI)) {
- if (IMemI->getIntrinsicID() == Intrinsic::prefetch)
+ if (IMemI->getIntrinsicID() == Intrinsic::prefetch ||
+ IMemI->getIntrinsicID() == Intrinsic::ppc_mma_lxvp)
return IMemI->getArgOperand(0);
+ if (IMemI->getIntrinsicID() == Intrinsic::ppc_mma_stxvp)
+ return IMemI->getArgOperand(1);
}
return nullptr;
@@ -345,9 +349,13 @@ SmallVector<Bucket, 16> PPCLoopInstrFormPrep::collectCandidates(
MemI = SMemI;
PtrValue = SMemI->getPointerOperand();
} else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(&J)) {
- if (IMemI->getIntrinsicID() == Intrinsic::prefetch) {
+ if (IMemI->getIntrinsicID() == Intrinsic::prefetch ||
+ IMemI->getIntrinsicID() == Intrinsic::ppc_mma_lxvp) {
MemI = IMemI;
PtrValue = IMemI->getArgOperand(0);
+ } else if (IMemI->getIntrinsicID() == Intrinsic::ppc_mma_stxvp) {
+ MemI = IMemI;
+ PtrValue = IMemI->getArgOperand(1);
} else continue;
} else continue;
@@ -827,6 +835,11 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
if (ST && ST->hasAltivec() &&
PtrValue->getType()->getPointerElementType()->isVectorTy())
return false;
+ // There are no update forms for P10 lxvp/stxvp intrinsic.
+ auto *II = dyn_cast<IntrinsicInst>(I);
+ if (II && ((II->getIntrinsicID() == Intrinsic::ppc_mma_lxvp) ||
+ II->getIntrinsicID() == Intrinsic::ppc_mma_stxvp))
+ return false;
// See getPreIndexedAddressParts, the displacement for LDU/STDU has to
// be 4's multiple (DS-form). For i64 loads/stores when the displacement
// fits in a 16-bit signed field but isn't a multiple of 4, it will be
@@ -864,7 +877,13 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
// Check if a load/store has DQ form.
auto isDQFormCandidate = [&] (const Instruction *I, const Value *PtrValue) {
assert((PtrValue && I) && "Invalid parameter!");
- return !isa<IntrinsicInst>(I) && ST && ST->hasP9Vector() &&
+ // Check if it is a P10 lxvp/stxvp intrinsic.
+ auto *II = dyn_cast<IntrinsicInst>(I);
+ if (II)
+ return II->getIntrinsicID() == Intrinsic::ppc_mma_lxvp ||
+ II->getIntrinsicID() == Intrinsic::ppc_mma_stxvp;
+ // Check if it is a P9 vector load/store.
+ return ST && ST->hasP9Vector() &&
(PtrValue->getType()->getPointerElementType()->isVectorTy());
};
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index f3134460514c..be4f3354ede4 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -1223,7 +1223,8 @@ bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
case Intrinsic::ppc_vsx_lxvd2x_be:
case Intrinsic::ppc_vsx_lxvw4x_be:
case Intrinsic::ppc_vsx_lxvl:
- case Intrinsic::ppc_vsx_lxvll: {
+ case Intrinsic::ppc_vsx_lxvll:
+ case Intrinsic::ppc_mma_lxvp: {
Info.PtrVal = Inst->getArgOperand(0);
Info.ReadMem = true;
Info.WriteMem = false;
@@ -1239,7 +1240,8 @@ bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
case Intrinsic::ppc_vsx_stxvd2x_be:
case Intrinsic::ppc_vsx_stxvw4x_be:
case Intrinsic::ppc_vsx_stxvl:
- case Intrinsic::ppc_vsx_stxvll: {
+ case Intrinsic::ppc_vsx_stxvll:
+ case Intrinsic::ppc_mma_stxvp: {
Info.PtrVal = Inst->getArgOperand(1);
Info.ReadMem = false;
Info.WriteMem = true;
diff --git a/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll b/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll
new file mode 100644
index 000000000000..a9041d8d9782
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | FileCheck %s \
+; RUN: --check-prefix=CHECK-BE
+
+; This test checks that LSR properly recognizes lxvp/stxvp as load/store
+; intrinsics to avoid generating x-form instructions instead of d-forms.
+
+declare <256 x i1> @llvm.ppc.mma.lxvp(i8*)
+declare void @llvm.ppc.mma.stxvp(<256 x i1>, i8*)
+define void @foo(i32 zeroext %n, <256 x i1>* %ptr, <256 x i1>* %ptr2) {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmplwi r3, 0
+; CHECK-NEXT: beqlr cr0
+; CHECK-NEXT: # %bb.1: # %for.body.lr.ph
+; CHECK-NEXT: clrldi r6, r3, 32
+; CHECK-NEXT: addi r3, r4, 64
+; CHECK-NEXT: addi r4, r5, 64
+; CHECK-NEXT: mtctr r6
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .LBB0_2: # %for.body
+; CHECK-NEXT: #
+; CHECK-NEXT: lxvp vsp0, -64(r3)
+; CHECK-NEXT: lxvp vsp2, -32(r3)
+; CHECK-NEXT: lxvp vsp4, 0(r3)
+; CHECK-NEXT: lxvp vsp6, 32(r3)
+; CHECK-NEXT: addi r3, r3, 1
+; CHECK-NEXT: stxvp vsp0, -64(r4)
+; CHECK-NEXT: stxvp vsp2, -32(r4)
+; CHECK-NEXT: stxvp vsp4, 0(r4)
+; CHECK-NEXT: stxvp vsp6, 32(r4)
+; CHECK-NEXT: addi r4, r4, 1
+; CHECK-NEXT: bdnz .LBB0_2
+; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: foo:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: cmplwi r3, 0
+; CHECK-BE-NEXT: beqlr cr0
+; CHECK-BE-NEXT: # %bb.1: # %for.body.lr.ph
+; CHECK-BE-NEXT: clrldi r6, r3, 32
+; CHECK-BE-NEXT: addi r3, r4, 64
+; CHECK-BE-NEXT: addi r4, r5, 64
+; CHECK-BE-NEXT: mtctr r6
+; CHECK-BE-NEXT: .p2align 4
+; CHECK-BE-NEXT: .LBB0_2: # %for.body
+; CHECK-BE-NEXT: #
+; CHECK-BE-NEXT: lxvp vsp0, -64(r3)
+; CHECK-BE-NEXT: lxvp vsp2, -32(r3)
+; CHECK-BE-NEXT: lxvp vsp4, 0(r3)
+; CHECK-BE-NEXT: lxvp vsp6, 32(r3)
+; CHECK-BE-NEXT: addi r3, r3, 1
+; CHECK-BE-NEXT: stxvp vsp0, -64(r4)
+; CHECK-BE-NEXT: stxvp vsp2, -32(r4)
+; CHECK-BE-NEXT: stxvp vsp4, 0(r4)
+; CHECK-BE-NEXT: stxvp vsp6, 32(r4)
+; CHECK-BE-NEXT: addi r4, r4, 1
+; CHECK-BE-NEXT: bdnz .LBB0_2
+; CHECK-BE-NEXT: # %bb.3: # %for.cond.cleanup
+; CHECK-BE-NEXT: blr
+entry:
+ %cmp35.not = icmp eq i32 %n, 0
+ br i1 %cmp35.not, label %for.cond.cleanup, label %for.body.lr.ph
+
+for.body.lr.ph:
+ %0 = bitcast <256 x i1>* %ptr to i8*
+ %1 = bitcast <256 x i1>* %ptr2 to i8*
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %2 = getelementptr i8, i8* %0, i64 %indvars.iv
+ %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2)
+ %add2 = add nuw nsw i64 %indvars.iv, 32
+ %4 = getelementptr i8, i8* %0, i64 %add2
+ %5 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %4)
+ %add4 = add nuw nsw i64 %indvars.iv, 64
+ %6 = getelementptr i8, i8* %0, i64 %add4
+ %7 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %6)
+ %add6 = add nuw nsw i64 %indvars.iv, 96
+ %8 = getelementptr i8, i8* %0, i64 %add6
+ %9 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %8)
+ %10 = getelementptr i8, i8* %1, i64 %indvars.iv
+ tail call void @llvm.ppc.mma.stxvp(<256 x i1> %3, i8* %10)
+ %11 = getelementptr i8, i8* %1, i64 %add2
+ tail call void @llvm.ppc.mma.stxvp(<256 x i1> %5, i8* %11)
+ %12 = getelementptr i8, i8* %1, i64 %add4
+ tail call void @llvm.ppc.mma.stxvp(<256 x i1> %7, i8* %12)
+ %13 = getelementptr i8, i8* %1, i64 %add6
+ tail call void @llvm.ppc.mma.stxvp(<256 x i1> %9, i8* %13)
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
diff --git a/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll
new file mode 100644
index 000000000000..816a28a61241
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -disable-lsr \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 < %s | FileCheck %s
+; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -disable-lsr \
+; RUN: -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr10 < %s | FileCheck %s \
+; RUN: --check-prefix=CHECK-BE
+
+; This test checks that the PPCLoopInstrFormPrep pass supports the lxvp and stxvp
+; intrinsics so we generate more dq-form instructions instead of x-forms.
+
+%_elem_type_of_x = type <{ double }>
+%_elem_type_of_y = type <{ double }>
+
+define void @foo(i64* %.n, [0 x %_elem_type_of_x]* %.x, [0 x %_elem_type_of_y]* %.y, <2 x double>* %.sum) {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ld r5, 0(r3)
+; CHECK-NEXT: cmpdi r5, 1
+; CHECK-NEXT: bltlr cr0
+; CHECK-NEXT: # %bb.1: # %_loop_1_do_.lr.ph
+; CHECK-NEXT: addi r3, r4, 1
+; CHECK-NEXT: addi r4, r5, -1
+; CHECK-NEXT: lxv vs0, 0(r6)
+; CHECK-NEXT: rldicl r4, r4, 60, 4
+; CHECK-NEXT: addi r4, r4, 1
+; CHECK-NEXT: mtctr r4
+; CHECK-NEXT: .p2align 5
+; CHECK-NEXT: .LBB0_2: # %_loop_1_do_
+; CHECK-NEXT: #
+; CHECK-NEXT: lxvp vsp2, 0(r3)
+; CHECK-NEXT: lxvp vsp4, 32(r3)
+; CHECK-NEXT: addi r3, r3, 128
+; CHECK-NEXT: xvadddp vs0, vs0, vs3
+; CHECK-NEXT: xvadddp vs0, vs0, vs2
+; CHECK-NEXT: xvadddp vs0, vs0, vs5
+; CHECK-NEXT: xvadddp vs0, vs0, vs4
+; CHECK-NEXT: bdnz .LBB0_2
+; CHECK-NEXT: # %bb.3: # %_loop_1_loopHeader_._return_bb_crit_edge
+; CHECK-NEXT: stxv vs0, 0(r6)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: foo:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: ld r5, 0(r3)
+; CHECK-BE-NEXT: cmpdi r5, 1
+; CHECK-BE-NEXT: bltlr cr0
+; CHECK-BE-NEXT: # %bb.1: # %_loop_1_do_.lr.ph
+; CHECK-BE-NEXT: addi r3, r4, 1
+; CHECK-BE-NEXT: addi r4, r5, -1
+; CHECK-BE-NEXT: lxv vs0, 0(r6)
+; CHECK-BE-NEXT: rldicl r4, r4, 60, 4
+; CHECK-BE-NEXT: addi r4, r4, 1
+; CHECK-BE-NEXT: mtctr r4
+; CHECK-BE-NEXT: .p2align 5
+; CHECK-BE-NEXT: .LBB0_2: # %_loop_1_do_
+; CHECK-BE-NEXT: #
+; CHECK-BE-NEXT: lxvp vsp2, 0(r3)
+; CHECK-BE-NEXT: lxvp vsp4, 32(r3)
+; CHECK-BE-NEXT: addi r3, r3, 128
+; CHECK-BE-NEXT: xvadddp vs0, vs0, vs2
+; CHECK-BE-NEXT: xvadddp vs0, vs0, vs3
+; CHECK-BE-NEXT: xvadddp vs0, vs0, vs4
+; CHECK-BE-NEXT: xvadddp vs0, vs0, vs5
+; CHECK-BE-NEXT: bdnz .LBB0_2
+; CHECK-BE-NEXT: # %bb.3: # %_loop_1_loopHeader_._return_bb_crit_edge
+; CHECK-BE-NEXT: stxv vs0, 0(r6)
+; CHECK-BE-NEXT: blr
+entry:
+ %_val_n_2 = load i64, i64* %.n, align 8
+ %_grt_tmp7 = icmp slt i64 %_val_n_2, 1
+ br i1 %_grt_tmp7, label %_return_bb, label %_loop_1_do_.lr.ph
+
+_loop_1_do_.lr.ph: ; preds = %entry
+ %x_rvo_based_addr_5 = getelementptr inbounds [0 x %_elem_type_of_x], [0 x %_elem_type_of_x]* %.x, i64 0, i64 -1
+ %.sum.promoted = load <2 x double>, <2 x double>* %.sum, align 16
+ br label %_loop_1_do_
+
+_loop_1_do_: ; preds = %_loop_1_do_.lr.ph, %_loop_1_do_
+ %_val_sum_9 = phi <2 x double> [ %.sum.promoted, %_loop_1_do_.lr.ph ], [ %_add_tmp49, %_loop_1_do_ ]
+ %i.08 = phi i64 [ 1, %_loop_1_do_.lr.ph ], [ %_loop_1_update_loop_ix, %_loop_1_do_ ]
+ %x_ix_dim_0_6 = getelementptr %_elem_type_of_x, %_elem_type_of_x* %x_rvo_based_addr_5, i64 %i.08
+ %x_ix_dim_0_ = bitcast %_elem_type_of_x* %x_ix_dim_0_6 to i8*
+ %0 = getelementptr i8, i8* %x_ix_dim_0_, i64 1
+ %1 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %0)
+ %2 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %1)
+ %.fca.0.extract1 = extractvalue { <16 x i8>, <16 x i8> } %2, 0
+ %.fca.1.extract2 = extractvalue { <16 x i8>, <16 x i8> } %2, 1
+ %3 = getelementptr i8, i8* %x_ix_dim_0_, i64 33
+ %4 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %3)
+ %5 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %4)
+ %.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %5, 0
+ %.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %5, 1
+ %6 = bitcast <16 x i8> %.fca.0.extract1 to <2 x double>
+ %_add_tmp23 = fadd contract <2 x double> %_val_sum_9, %6
+ %7 = bitcast <16 x i8> %.fca.1.extract2 to <2 x double>
+ %_add_tmp32 = fadd contract <2 x double> %_add_tmp23, %7
+ %8 = bitcast <16 x i8> %.fca.0.extract to <2 x double>
+ %_add_tmp40 = fadd contract <2 x double> %_add_tmp32, %8
+ %9 = bitcast <16 x i8> %.fca.1.extract to <2 x double>
+ %_add_tmp49 = fadd contract <2 x double> %_add_tmp40, %9
+ %_loop_1_update_loop_ix = add nuw nsw i64 %i.08, 16
+ %_grt_tmp = icmp sgt i64 %_loop_1_update_loop_ix, %_val_n_2
+ br i1 %_grt_tmp, label %_loop_1_loopHeader_._return_bb_crit_edge, label %_loop_1_do_
+
+_loop_1_loopHeader_._return_bb_crit_edge: ; preds = %_loop_1_do_
+ store <2 x double> %_add_tmp49, <2 x double>* %.sum, align 16
+ br label %_return_bb
+
+_return_bb: ; preds = %_loop_1_loopHeader_._return_bb_crit_edge, %entry
+ ret void
+}
+
+declare <256 x i1> @llvm.ppc.mma.lxvp(i8*)
+declare { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1>)
diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
index a0f855200b68..0eb633ab3f2c 100644
--- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
@@ -698,3 +698,315 @@ entry:
declare <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1>, <256 x i1>, <16 x i8>, i32, i32)
declare <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1>, <256 x i1>, <16 x i8>)
+
+; Function Attrs: nounwind
+define void @test_ldst_1(<256 x i1>* %vpp, <256 x i1>* %vp2) {
+; CHECK-LABEL: test_ldst_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxvp vsp0, 0(r3)
+; CHECK-NEXT: stxvp vsp0, 0(r4)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_ldst_1:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxvp vsp0, 0(r3)
+; CHECK-BE-NEXT: stxvp vsp0, 0(r4)
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = bitcast <256 x i1>* %vpp to i8*
+ %1 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %0)
+ %2 = bitcast <256 x i1>* %vp2 to i8*
+ tail call void @llvm.ppc.mma.stxvp(<256 x i1> %1, i8* %2)
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind readonly
+declare <256 x i1> @llvm.ppc.mma.lxvp(i8*)
+
+; Function Attrs: argmemonly nounwind writeonly
+declare void @llvm.ppc.mma.stxvp(<256 x i1>, i8*)
+
+; Function Attrs: nounwind
+define void @test_ldst_2(<256 x i1>* %vpp, i64 %offset, <256 x i1>* %vp2) {
+; CHECK-LABEL: test_ldst_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxvpx vsp0, r3, r4
+; CHECK-NEXT: stxvpx vsp0, r5, r4
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_ldst_2:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxvpx vsp0, r3, r4
+; CHECK-BE-NEXT: stxvpx vsp0, r5, r4
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = bitcast <256 x i1>* %vpp to i8*
+ %1 = getelementptr i8, i8* %0, i64 %offset
+ %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1)
+ %3 = bitcast <256 x i1>* %vp2 to i8*
+ %4 = getelementptr i8, i8* %3, i64 %offset
+ tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4)
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @test_ldst_3(<256 x i1>* %vpp, <256 x i1>* %vp2) {
+; CHECK-LABEL: test_ldst_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li r5, 18
+; CHECK-NEXT: lxvpx vsp0, r3, r5
+; CHECK-NEXT: stxvpx vsp0, r4, r5
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_ldst_3:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: li r5, 18
+; CHECK-BE-NEXT: lxvpx vsp0, r3, r5
+; CHECK-BE-NEXT: stxvpx vsp0, r4, r5
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = bitcast <256 x i1>* %vpp to i8*
+ %1 = getelementptr i8, i8* %0, i64 18
+ %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1)
+ %3 = bitcast <256 x i1>* %vp2 to i8*
+ %4 = getelementptr i8, i8* %3, i64 18
+ tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4)
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @test_ldst_4(<256 x i1>* %vpp, <256 x i1>* %vp2) {
+; CHECK-LABEL: test_ldst_4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li r5, 1
+; CHECK-NEXT: lxvpx vsp0, r3, r5
+; CHECK-NEXT: stxvpx vsp0, r4, r5
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_ldst_4:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: li r5, 1
+; CHECK-BE-NEXT: lxvpx vsp0, r3, r5
+; CHECK-BE-NEXT: stxvpx vsp0, r4, r5
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = bitcast <256 x i1>* %vpp to i8*
+ %1 = getelementptr i8, i8* %0, i64 1
+ %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1)
+ %3 = bitcast <256 x i1>* %vp2 to i8*
+ %4 = getelementptr i8, i8* %3, i64 1
+ tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4)
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @test_ldst_5(<256 x i1>* %vpp, <256 x i1>* %vp2) {
+; CHECK-LABEL: test_ldst_5:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li r5, 42
+; CHECK-NEXT: lxvpx vsp0, r3, r5
+; CHECK-NEXT: stxvpx vsp0, r4, r5
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_ldst_5:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: li r5, 42
+; CHECK-BE-NEXT: lxvpx vsp0, r3, r5
+; CHECK-BE-NEXT: stxvpx vsp0, r4, r5
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = bitcast <256 x i1>* %vpp to i8*
+ %1 = getelementptr i8, i8* %0, i64 42
+ %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1)
+ %3 = bitcast <256 x i1>* %vp2 to i8*
+ %4 = getelementptr i8, i8* %3, i64 42
+ tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4)
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @test_ldst_6(<256 x i1>* %vpp, <256 x i1>* %vp2) {
+; CHECK-LABEL: test_ldst_6:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxvp vsp0, 4096(r3)
+; CHECK-NEXT: stxvp vsp0, 4096(r4)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_ldst_6:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxvp vsp0, 4096(r3)
+; CHECK-BE-NEXT: stxvp vsp0, 4096(r4)
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = getelementptr <256 x i1>, <256 x i1>* %vpp, i64 128
+ %1 = bitcast <256 x i1>* %0 to i8*
+ %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1)
+ %3 = getelementptr <256 x i1>, <256 x i1>* %vp2, i64 128
+ %4 = bitcast <256 x i1>* %3 to i8*
+ tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4)
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @test_ldst_7(<256 x i1>* %vpp, <256 x i1>* %vp2) {
+; FIXME: A prefixed load (plxvp) is expected here as the offset in this
+; test case is a constant that fits within 34-bits.
+; CHECK-LABEL: test_ldst_7:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li r5, 0
+; CHECK-NEXT: ori r5, r5, 32799
+; CHECK-NEXT: lxvpx vsp0, r3, r5
+; CHECK-NEXT: stxvpx vsp0, r4, r5
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_ldst_7:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: li r5, 0
+; CHECK-BE-NEXT: ori r5, r5, 32799
+; CHECK-BE-NEXT: lxvpx vsp0, r3, r5
+; CHECK-BE-NEXT: stxvpx vsp0, r4, r5
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = bitcast <256 x i1>* %vpp to i8*
+ %1 = getelementptr i8, i8* %0, i64 32799
+ %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1)
+ %3 = bitcast <256 x i1>* %vp2 to i8*
+ %4 = getelementptr i8, i8* %3, i64 32799
+ tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4)
+ ret void
+}
+
+; Function Attrs: nofree nounwind
+define void @test_ldst_8(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) {
+; CHECK-LABEL: test_ldst_8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxv vs1, 32(r3)
+; CHECK-NEXT: lxv vs0, 48(r3)
+; CHECK-NEXT: lxv vs3, 0(r3)
+; CHECK-NEXT: lxv vs2, 16(r3)
+; CHECK-NEXT: li r3, 8
+; CHECK-NEXT: lxvpx vsp4, r4, r3
+; CHECK-NEXT: xxmtacc acc0
+; CHECK-NEXT: pmxvf64gernn acc0, vsp4, v2, 0, 0
+; CHECK-NEXT: xxmfacc acc0
+; CHECK-NEXT: stxv vs0, 48(r7)
+; CHECK-NEXT: stxv vs1, 32(r7)
+; CHECK-NEXT: stxv vs2, 16(r7)
+; CHECK-NEXT: stxv vs3, 0(r7)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_ldst_8:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: lxv vs0, 0(r3)
+; CHECK-BE-NEXT: lxv vs3, 48(r3)
+; CHECK-BE-NEXT: lxv vs2, 32(r3)
+; CHECK-BE-NEXT: li r3, 8
+; CHECK-BE-NEXT: lxvpx vsp4, r4, r3
+; CHECK-BE-NEXT: xxmtacc acc0
+; CHECK-BE-NEXT: pmxvf64gernn acc0, vsp4, v2, 0, 0
+; CHECK-BE-NEXT: xxmfacc acc0
+; CHECK-BE-NEXT: stxv vs1, 16(r7)
+; CHECK-BE-NEXT: stxv vs0, 0(r7)
+; CHECK-BE-NEXT: stxv vs3, 48(r7)
+; CHECK-BE-NEXT: stxv vs2, 32(r7)
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = bitcast i8* %vqp to <512 x i1>*
+ %1 = load <512 x i1>, <512 x i1>* %0, align 64
+ %2 = bitcast <256 x i1>* %vpp to i8*
+ %3 = getelementptr i8, i8* %2, i64 8
+ %4 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %3)
+ %5 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %1, <256 x i1> %4, <16 x i8> %vc, i32 0, i32 0)
+ %6 = bitcast i8* %resp to <512 x i1>*
+ store <512 x i1> %5, <512 x i1>* %6, align 64
+ ret void
+}
+
+; Function Attrs: nofree nounwind
+define void @test_ldst_9(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) {
+; CHECK-LABEL: test_ldst_9:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxv vs1, 32(r3)
+; CHECK-NEXT: lxv vs0, 48(r3)
+; CHECK-NEXT: lxv vs3, 0(r3)
+; CHECK-NEXT: lxv vs2, 16(r3)
+; CHECK-NEXT: lxvp vsp4, 0(r4)
+; CHECK-NEXT: xxmtacc acc0
+; CHECK-NEXT: xvf64gernp acc0, vsp4, v2
+; CHECK-NEXT: xxmfacc acc0
+; CHECK-NEXT: stxv vs0, 48(r7)
+; CHECK-NEXT: stxv vs1, 32(r7)
+; CHECK-NEXT: stxv vs2, 16(r7)
+; CHECK-NEXT: stxv vs3, 0(r7)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_ldst_9:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: lxv vs0, 0(r3)
+; CHECK-BE-NEXT: lxv vs3, 48(r3)
+; CHECK-BE-NEXT: lxv vs2, 32(r3)
+; CHECK-BE-NEXT: lxvp vsp4, 0(r4)
+; CHECK-BE-NEXT: xxmtacc acc0
+; CHECK-BE-NEXT: xvf64gernp acc0, vsp4, v2
+; CHECK-BE-NEXT: xxmfacc acc0
+; CHECK-BE-NEXT: stxv vs1, 16(r7)
+; CHECK-BE-NEXT: stxv vs0, 0(r7)
+; CHECK-BE-NEXT: stxv vs3, 48(r7)
+; CHECK-BE-NEXT: stxv vs2, 32(r7)
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = bitcast i8* %vqp to <512 x i1>*
+ %1 = load <512 x i1>, <512 x i1>* %0, align 64
+ %2 = bitcast <256 x i1>* %vpp to i8*
+ %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2)
+ %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %1, <256 x i1> %3, <16 x i8> %vc)
+ %5 = bitcast i8* %resp to <512 x i1>*
+ store <512 x i1> %4, <512 x i1>* %5, align 64
+ ret void
+}
+
+; Function Attrs: nofree nounwind
+define void @test_ldst_10(i8* nocapture readonly %vqp, i64 %offs, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) {
+; CHECK-LABEL: test_ldst_10:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxv vs1, 32(r3)
+; CHECK-NEXT: lxv vs0, 48(r3)
+; CHECK-NEXT: lxv vs3, 0(r3)
+; CHECK-NEXT: lxv vs2, 16(r3)
+; CHECK-NEXT: lxvp vsp4, 0(r5)
+; CHECK-NEXT: xxmtacc acc0
+; CHECK-NEXT: xvf64gernp acc0, vsp4, v2
+; CHECK-NEXT: xxmfacc acc0
+; CHECK-NEXT: stxv vs0, 48(r9)
+; CHECK-NEXT: stxv vs1, 32(r9)
+; CHECK-NEXT: stxv vs2, 16(r9)
+; CHECK-NEXT: stxv vs3, 0(r9)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_ldst_10:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: lxv vs0, 0(r3)
+; CHECK-BE-NEXT: lxv vs3, 48(r3)
+; CHECK-BE-NEXT: lxv vs2, 32(r3)
+; CHECK-BE-NEXT: lxvp vsp4, 0(r5)
+; CHECK-BE-NEXT: xxmtacc acc0
+; CHECK-BE-NEXT: xvf64gernp acc0, vsp4, v2
+; CHECK-BE-NEXT: xxmfacc acc0
+; CHECK-BE-NEXT: stxv vs1, 16(r9)
+; CHECK-BE-NEXT: stxv vs0, 0(r9)
+; CHECK-BE-NEXT: stxv vs3, 48(r9)
+; CHECK-BE-NEXT: stxv vs2, 32(r9)
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = bitcast i8* %vqp to <512 x i1>*
+ %1 = load <512 x i1>, <512 x i1>* %0, align 64
+ %2 = bitcast <256 x i1>* %vpp to i8*
+ %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2)
+ %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %1, <256 x i1> %3, <16 x i8> %vc)
+ %5 = bitcast i8* %resp to <512 x i1>*
+ store <512 x i1> %4, <512 x i1>* %5, align 64
+ ret void
+}