[llvm] 5a8b196 - [PowerPC] handle more splat loads without stack operation
Chen Zheng via llvm-commits
llvm-commits@lists.llvm.org
Tue Nov 2 22:33:13 PDT 2021
Author: Chen Zheng
Date: 2021-11-03T05:17:41Z
New Revision: 5a8b19634002f104c4d389ee53bce3ba858eda60
URL: https://github.com/llvm/llvm-project/commit/5a8b19634002f104c4d389ee53bce3ba858eda60
DIFF: https://github.com/llvm/llvm-project/commit/5a8b19634002f104c4d389ee53bce3ba858eda60.diff
LOG: [PowerPC] handle more splat loads without stack operation
This mostly improves code generation for splat loads on Power7.
Reviewed By: jsji
Differential Revision: https://reviews.llvm.org/D106555
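For illustration, the pattern this patch targets is a scalar load whose value feeds every lane of a build_vector. A minimal sketch using Clang vector extensions (the function name is illustrative, not from the patch):

    typedef int v4si __attribute__((vector_size(16)));

    // A loaded i32 splatted into all four lanes. Before this patch,
    // Power7 lowered this through a stack store/reload; with it, the
    // build_vector becomes PPCISD::LD_SPLAT and selects to
    // lfiwzx + xxspltw (see the load-and-splat.ll updates below).
    v4si splat_load(const int *p) {
      int x = *p;
      return (v4si){x, x, x, x};
    }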
Added:
Modified:
llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
llvm/lib/Target/PowerPC/PPCISelLowering.cpp
llvm/lib/Target/PowerPC/PPCISelLowering.h
llvm/lib/Target/PowerPC/PPCInstrVSX.td
llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
llvm/test/CodeGen/PowerPC/load-and-splat.ll
llvm/test/CodeGen/PowerPC/scalar_vector_test_3.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 93fae891dd9b3..e192efd9dff21 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -5825,6 +5825,68 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
return;
}
}
+ case PPCISD::LD_SPLAT: {
+ // For v16i8 and v8i16, if the target has no direct move, we can still
+ // handle this without using the stack.
+ if (Subtarget->hasAltivec() && !Subtarget->hasDirectMove()) {
+ SDValue ZeroReg =
+ CurDAG->getRegister(Subtarget->isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+ Subtarget->isPPC64() ? MVT::i64 : MVT::i32);
+ unsigned LIOpcode = Subtarget->isPPC64() ? PPC::LI8 : PPC::LI;
+ EVT Type = N->getValueType(0);
+ if (Type == MVT::v16i8 || Type == MVT::v8i16) {
+ // v16i8 LD_SPLAT addr
+ // ======>
+ // Mask = LVSR/LVSL 0, addr
+ // LoadLow = LVX 0, addr
+ // Perm = VPERM LoadLow, LoadLow, Mask
+ // Splat = VSPLTB 15/0, Perm
+ //
+ // v8i16 LD_SPLAT addr
+ // ======>
+ // Mask = LVSR/LVSL 0, addr
+ // LoadLow = LVX 0, addr
+ // LoadHigh = LVX (LI 1), addr
+ // Perm = VPERM LoadLow, LoadHigh, Mask
+ // Splat = VSPLTH 7/0, Perm
+ unsigned SplatOp = (Type == MVT::v16i8) ? PPC::VSPLTB : PPC::VSPLTH;
+ unsigned SplatElemIndex =
+ Subtarget->isLittleEndian() ? ((Type == MVT::v16i8) ? 15 : 7) : 0;
+
+ SDNode *Mask = CurDAG->getMachineNode(
+ Subtarget->isLittleEndian() ? PPC::LVSR : PPC::LVSL, dl, Type,
+ ZeroReg, N->getOperand(1));
+
+ SDNode *LoadLow = CurDAG->getMachineNode(
+ PPC::LVX, dl, MVT::v16i8, MVT::Other,
+ {ZeroReg, N->getOperand(1), N->getOperand(0)});
+
+ SDNode *LoadHigh = LoadLow;
+ if (Type == MVT::v8i16) {
+ LoadHigh = CurDAG->getMachineNode(
+ PPC::LVX, dl, MVT::v16i8, MVT::Other,
+ {SDValue(CurDAG->getMachineNode(
+ LIOpcode, dl, MVT::i32,
+ CurDAG->getTargetConstant(1, dl, MVT::i8)),
+ 0),
+ N->getOperand(1), SDValue(LoadLow, 1)});
+ }
+
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(LoadHigh, 1));
+ transferMemOperands(N, LoadHigh);
+
+ SDNode *Perm =
+ CurDAG->getMachineNode(PPC::VPERM, dl, Type, SDValue(LoadLow, 0),
+ SDValue(LoadHigh, 0), SDValue(Mask, 0));
+ CurDAG->SelectNodeTo(
+ N, SplatOp, Type,
+ CurDAG->getTargetConstant(SplatElemIndex, dl, MVT::i8),
+ SDValue(Perm, 0));
+ return;
+ }
+ }
+ break;
+ }
}
SelectCode(N);
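The sequence selected above is the classic Altivec unaligned-load idiom: lvx ignores the low four address bits and loads the aligned 16-byte block containing the address, and vperm with an lvsl-generated mask shifts the data into place. A rough C++ model of the big-endian v8i16 path (the LE path uses lvsr with the operands reversed; all names here are illustrative):

    #include <cstdint>
    #include <cstring>

    // Emulates: Mask = lvsl 0, addr; LoadLow = lvx 0, addr;
    // LoadHigh = lvx (li 1), addr; Perm = vperm LoadLow, LoadHigh, Mask.
    // Assumes the bytes of both aligned blocks are readable, as they
    // always are for lvx on real hardware.
    uint16_t splat_halfword(const uint8_t *p) {
      uintptr_t a = reinterpret_cast<uintptr_t>(p);
      unsigned sh = a & 15; // lvsl mask selects bytes sh, sh+1, ...
      const uint8_t *lo = reinterpret_cast<const uint8_t *>(a & ~uintptr_t(15));
      const uint8_t *hi = reinterpret_cast<const uint8_t *>((a + 1) & ~uintptr_t(15));
      uint8_t both[32];
      std::memcpy(both, lo, 16);      // LoadLow
      std::memcpy(both + 16, hi, 16); // LoadHigh
      // vperm picks bytes sh and sh+1: exactly the halfword at p, even
      // when it straddles a block boundary (sh == 15 reads both[16],
      // the first byte of the next block).
      return uint16_t(uint16_t(both[sh]) << 8 | both[sh + 1]);
    }

The second lvx at offset 1 is what makes the halfword case safe: a single byte can never straddle a 16-byte block, which is why the v16i8 path reuses LoadLow for both VPERM operands.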
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 36d08415290a9..f2727b2381098 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1712,6 +1712,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::EXTRACT_VSX_REG: return "PPCISD::EXTRACT_VSX_REG";
case PPCISD::XXMFACC: return "PPCISD::XXMFACC";
case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
+ case PPCISD::ZEXT_LD_SPLAT: return "PPCISD::ZEXT_LD_SPLAT";
+ case PPCISD::SEXT_LD_SPLAT: return "PPCISD::SEXT_LD_SPLAT";
case PPCISD::FNMSUB: return "PPCISD::FNMSUB";
case PPCISD::STRICT_FADDRTZ:
return "PPCISD::STRICT_FADDRTZ";
@@ -9060,6 +9062,34 @@ bool llvm::checkConvertToNonDenormSingle(APFloat &ArgAPFloat) {
return (!LosesInfo && !APFloatToConvert.isDenormal());
}
+static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op,
+ unsigned &Opcode) {
+ const SDNode *InputNode = Op.getOperand(0).getNode();
+ if (!InputNode || !ISD::isUNINDEXEDLoad(InputNode))
+ return false;
+
+ if (!Subtarget.hasVSX())
+ return false;
+
+ EVT Ty = Op->getValueType(0);
+ if (Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32 ||
+ Ty == MVT::v8i16 || Ty == MVT::v16i8)
+ return true;
+
+ if (Ty == MVT::v2i64) {
+ // Check the extension type when the input is an i32 load and the
+ // output vector type is v2i64.
+ if (cast<LoadSDNode>(Op.getOperand(0))->getMemoryVT() == MVT::i32) {
+ if (ISD::isZEXTLoad(InputNode))
+ Opcode = PPCISD::ZEXT_LD_SPLAT;
+ if (ISD::isSEXTLoad(InputNode))
+ Opcode = PPCISD::SEXT_LD_SPLAT;
+ }
+ return true;
+ }
+ return false;
+}
+
// If this is a case we can't handle, return null and let the default
// expansion code take care of it. If we CAN select this case, and if it
// selects to a single instruction, return Op. Otherwise, if we can codegen
@@ -9123,17 +9153,17 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
}
if (!BVNIsConstantSplat || SplatBitSize > 32) {
+ unsigned NewOpcode = PPCISD::LD_SPLAT;
- bool IsPermutedLoad = false;
- const SDValue *InputLoad =
- getNormalLoadInput(Op.getOperand(0), IsPermutedLoad);
// Handle load-and-splat patterns as we have instructions that will do this
// in one go.
- if (InputLoad && DAG.isSplatValue(Op, true)) {
+ if (DAG.isSplatValue(Op, true) &&
+ isValidSplatLoad(Subtarget, Op, NewOpcode)) {
+ const SDValue *InputLoad = &Op.getOperand(0);
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
- // We have handling for 4 and 8 byte elements.
- unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits();
+ unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits() *
+ ((NewOpcode == PPCISD::LD_SPLAT) ? 1 : 2);
// Checking for a single use of this load, we have to check for vector
// width (128 bits) / ElementSize uses (since each operand of the
@@ -9142,18 +9172,54 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
for (SDValue BVInOp : Op->ops())
if (BVInOp.isUndef())
NumUsesOfInputLD--;
+
+ // Exclude some cases where LD_SPLAT is worse than scalar_to_vector:
+ // The cases below should also arise for "lfiwzx/lfiwax + LE target +
+ // index 1", "lxvrhx + BE target + index 7" and "lxvrbx + BE target +
+ // index 15", but isValidSplatLoad() currently only returns true when
+ // the data at index 0 is not undef. So we will not get into trouble
+ // for these cases.
+ //
+ // case 1 - lfiwzx/lfiwax
+ // 1.1: load result is i32 and is sign/zero extend to i64;
+ // 1.2: build a v2i64 vector type with above loaded value;
+ // 1.3: the vector has only one value at index 0, others are all undef;
+ // 1.4: on BE target, so that lfiwzx/lfiwax does not need any permute.
+ if (NumUsesOfInputLD == 1 &&
+ (Op->getValueType(0) == MVT::v2i64 && NewOpcode != PPCISD::LD_SPLAT &&
+ !Subtarget.isLittleEndian() && Subtarget.hasVSX() &&
+ Subtarget.hasLFIWAX()))
+ return SDValue();
+
+ // case 2 - lxvrbx
+ // 2.1: load result is i8;
+ // 2.2: build a v16i8 vector with above loaded value;
+ // 2.3: the vector has only one value at index 0, others are all undef;
+ // 2.4: on LE target, so that lxvrbx does not need any permute.
+ if (NumUsesOfInputLD == 1 && Subtarget.isLittleEndian() &&
+ Subtarget.isISA3_1() && Op->getValueType(0) == MVT::v16i8)
+ return SDValue();
+
+ // case 3 - lxvrhx
+ // 3.1: load result is i16;
+ // 3.2: build a v8i16 vector with above loaded value;
+ // 3.3: the vector has only one value at index 0, others are all undef;
+ // 3.4: on LE target, so that lxvrhx does not need any permute.
+ if (NumUsesOfInputLD == 1 && Subtarget.isLittleEndian() &&
+ Subtarget.isISA3_1() && Op->getValueType(0) == MVT::v8i16)
+ return SDValue();
+
assert(NumUsesOfInputLD > 0 && "No uses of input LD of a build_vector?");
if (InputLoad->getNode()->hasNUsesOfValue(NumUsesOfInputLD, 0) &&
- ((Subtarget.hasVSX() && ElementSize == 64) ||
- (Subtarget.hasP9Vector() && ElementSize == 32))) {
+ Subtarget.hasVSX()) {
SDValue Ops[] = {
LD->getChain(), // Chain
LD->getBasePtr(), // Ptr
DAG.getValueType(Op.getValueType()) // VT
};
SDValue LdSplt = DAG.getMemIntrinsicNode(
- PPCISD::LD_SPLAT, dl, DAG.getVTList(Op.getValueType(), MVT::Other),
- Ops, LD->getMemoryVT(), LD->getMemOperand());
+ NewOpcode, dl, DAG.getVTList(Op.getValueType(), MVT::Other), Ops,
+ LD->getMemoryVT(), LD->getMemOperand());
// Replace all uses of the output chain of the original load with the
// output chain of the new load.
DAG.ReplaceAllUsesOfValueWith(InputLoad->getValue(1),
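The extension handling above can be seen at the source level; a minimal sketch with Clang vector extensions (function names illustrative):

    typedef long long v2di __attribute__((vector_size(16)));

    // An i32 load sign-extended to i64 and splatted into v2i64 becomes
    // PPCISD::SEXT_LD_SPLAT; per the test updates below it now selects
    // to lfiwax + xxspltd even on Power7 (test5 in load-and-splat.ll).
    v2di sext_splat(const int *p) {
      long long x = *p; // sign-extending load
      return (v2di){x, x};
    }

    // The unsigned variant maps to ZEXT_LD_SPLAT and lfiwzx (test6).
    v2di zext_splat(const unsigned *p) {
      long long x = *p; // zero-extending load
      return (v2di){x, x};
    }

This is also why ElementSize is scaled by two for the extending opcodes: the memory VT is i32, but each vector element consumes 64 bits, so the single-use accounting must count v2i64 lanes rather than i32 loads.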
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index f557f4031667e..450bc48ec1439 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -559,6 +559,14 @@ namespace llvm {
/// instructions such as LXVDSX, LXVWSX.
LD_SPLAT,
+ /// VSRC, CHAIN = ZEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory
+ /// that zero-extends.
+ ZEXT_LD_SPLAT,
+
+ /// VSRC, CHAIN = SEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory
+ /// that sign-extends.
+ SEXT_LD_SPLAT,
+
/// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
/// Maps directly to an stxvd2x instruction that will be preceded by
/// an xxswapd.
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index b00e58010b06b..c25e9f8da933f 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -138,6 +138,10 @@ def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCldsplat : SDNode<"PPCISD::LD_SPLAT", SDT_PPCldsplat,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def PPCzextldsplat : SDNode<"PPCISD::ZEXT_LD_SPLAT", SDT_PPCldsplat,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def PPCsextldsplat : SDNode<"PPCISD::SEXT_LD_SPLAT", SDT_PPCldsplat,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCSToV : SDNode<"PPCISD::SCALAR_TO_VECTOR_PERMUTED",
SDTypeProfile<1, 1, []>, []>;
@@ -2827,10 +2831,20 @@ def : Pat<(v4f32 (build_vector (f32 (fpround f64:$A)), (f32 (fpround f64:$A)),
def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)),
(v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>;
+
+// Splat loads.
def : Pat<(v2f64 (PPCldsplat ForceXForm:$A)),
(v2f64 (LXVDSX ForceXForm:$A))>;
+def : Pat<(v4f32 (PPCldsplat ForceXForm:$A)),
+ (v4f32 (XXSPLTW (SUBREG_TO_REG (i64 1), (LFIWZX ForceXForm:$A), sub_64), 1))>;
def : Pat<(v2i64 (PPCldsplat ForceXForm:$A)),
(v2i64 (LXVDSX ForceXForm:$A))>;
+def : Pat<(v4i32 (PPCldsplat ForceXForm:$A)),
+ (v4i32 (XXSPLTW (SUBREG_TO_REG (i64 1), (LFIWZX ForceXForm:$A), sub_64), 1))>;
+def : Pat<(v2i64 (PPCzextldsplat ForceXForm:$A)),
+ (v2i64 (XXPERMDIs (LFIWZX ForceXForm:$A), 0))>;
+def : Pat<(v2i64 (PPCsextldsplat ForceXForm:$A)),
+ (v2i64 (XXPERMDIs (LFIWAX ForceXForm:$A), 0))>;
// Build vectors of floating point converted to i64.
def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)),
@@ -3540,6 +3554,14 @@ def : Pat<(v16i8 (PPCmtvsrz i32:$A)),
def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A,
immSExt5NonZero:$A, immSExt5NonZero:$A)),
(v4i32 (VSPLTISW imm:$A))>;
+
+// Splat loads.
+// Note that we use MTVSRD without checking for PPC64, because we only care
+// about the lowest 16/8 bits.
+def : Pat<(v8i16 (PPCldsplat ForceXForm:$A)),
+ (v8i16 (VSPLTHs 3, (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (LHZX ForceXForm:$A), sub_32))))>;
+def : Pat<(v16i8 (PPCldsplat ForceXForm:$A)),
+ (v16i8 (VSPLTBs 7, (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (LBZX ForceXForm:$A), sub_32))))>;
} // HasVSX, HasDirectMove
// Big endian VSX subtarget with direct moves.
@@ -4087,6 +4109,10 @@ def : Pat<(v4f32 (PPCldsplat ForceXForm:$A)),
(v4f32 (LXVWSX ForceXForm:$A))>;
def : Pat<(v4i32 (PPCldsplat ForceXForm:$A)),
(v4i32 (LXVWSX ForceXForm:$A))>;
+def : Pat<(v8i16 (PPCldsplat ForceXForm:$A)),
+ (v8i16 (VSPLTHs 3, (LXSIHZX ForceXForm:$A)))>;
+def : Pat<(v16i8 (PPCldsplat ForceXForm:$A)),
+ (v16i8 (VSPLTBs 7, (LXSIBZX ForceXForm:$A)))>;
} // HasVSX, HasP9Vector
// Any Power9 VSX subtarget with equivalent length but better Power10 VSX
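The new direct-move and Power9 patterns above fire on splats of small integer loads; a sketch of a triggering function (name illustrative):

    typedef short v8hi __attribute__((vector_size(16)));

    // With direct moves (Power8) this now selects lhzx + mtvsrd +
    // vsplth; with HasP9Vector it becomes lxsihzx + vsplth. Either way
    // the old stack store/reload disappears (see test7 in
    // load-and-splat.ll below).
    v8hi splat_halfwords(const short *p) {
      short x = *p;
      return (v8hi){x, x, x, x, x, x, x, x};
    }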
diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index 4bbb6ed85a6ce..f644120019f8b 100644
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -603,14 +603,24 @@ bool PPCMIPeephole::simplifyCode(void) {
ToErase = &MI;
Simplified = true;
}
- } else if ((Immed == 0 || Immed == 3) && DefOpc == PPC::XXPERMDIs &&
+ } else if ((Immed == 0 || Immed == 3 || Immed == 2) &&
+ DefOpc == PPC::XXPERMDIs &&
(DefMI->getOperand(2).getImm() == 0 ||
DefMI->getOperand(2).getImm() == 3)) {
+ ToErase = &MI;
+ Simplified = true;
+ // Swap of a splat, convert to copy.
+ if (Immed == 2) {
+ LLVM_DEBUG(dbgs() << "Optimizing swap(splat) => copy(splat): ");
+ LLVM_DEBUG(MI.dump());
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(1));
+ break;
+ }
// Splat fed by another splat - switch the output of the first
// and remove the second.
DefMI->getOperand(0).setReg(MI.getOperand(0).getReg());
- ToErase = &MI;
- Simplified = true;
LLVM_DEBUG(dbgs() << "Removing redundant splat: ");
LLVM_DEBUG(MI.dump());
}
diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
index 0d5bb96e57fde..655ab5c08a962 100644
--- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
+++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
@@ -640,24 +640,20 @@ entry:
define dso_local <16 x i8> @no_RAUW_in_combine_during_legalize(i32* nocapture readonly %ptr, i32 signext %offset) local_unnamed_addr #0 {
; CHECK-P8-LABEL: no_RAUW_in_combine_during_legalize:
; CHECK-P8: # %bb.0: # %entry
-; CHECK-P8-NEXT: addis r5, r2, .LCPI16_0@toc@ha
; CHECK-P8-NEXT: sldi r4, r4, 2
-; CHECK-P8-NEXT: xxlxor v4, v4, v4
-; CHECK-P8-NEXT: addi r5, r5, .LCPI16_0@toc@l
-; CHECK-P8-NEXT: lxsiwzx v2, r3, r4
-; CHECK-P8-NEXT: lvx v3, 0, r5
-; CHECK-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-P8-NEXT: xxlxor v3, v3, v3
+; CHECK-P8-NEXT: lfiwzx f0, r3, r4
+; CHECK-P8-NEXT: xxspltd v2, f0, 0
+; CHECK-P8-NEXT: vmrglb v2, v3, v2
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: no_RAUW_in_combine_during_legalize:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: sldi r4, r4, 2
-; CHECK-P9-NEXT: xxlxor v4, v4, v4
-; CHECK-P9-NEXT: lxsiwzx v2, r3, r4
-; CHECK-P9-NEXT: addis r3, r2, .LCPI16_0@toc@ha
-; CHECK-P9-NEXT: addi r3, r3, .LCPI16_0@toc@l
-; CHECK-P9-NEXT: lxv v3, 0(r3)
-; CHECK-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-P9-NEXT: xxlxor v3, v3, v3
+; CHECK-P9-NEXT: lfiwzx f0, r3, r4
+; CHECK-P9-NEXT: xxspltd v2, f0, 0
+; CHECK-P9-NEXT: vmrglb v2, v3, v2
; CHECK-P9-NEXT: blr
;
; CHECK-P9-BE-LABEL: no_RAUW_in_combine_during_legalize:
@@ -682,12 +678,9 @@ define dso_local <16 x i8> @no_RAUW_in_combine_during_legalize(i32* nocapture re
; CHECK-P7-LABEL: no_RAUW_in_combine_during_legalize:
; CHECK-P7: # %bb.0: # %entry
; CHECK-P7-NEXT: sldi r4, r4, 2
-; CHECK-P7-NEXT: addi r5, r1, -16
; CHECK-P7-NEXT: xxlxor v3, v3, v3
-; CHECK-P7-NEXT: lwzx r3, r3, r4
-; CHECK-P7-NEXT: std r3, -16(r1)
-; CHECK-P7-NEXT: lxvd2x vs0, 0, r5
-; CHECK-P7-NEXT: xxswapd v2, vs0
+; CHECK-P7-NEXT: lfiwzx f0, r3, r4
+; CHECK-P7-NEXT: xxspltd v2, f0, 0
; CHECK-P7-NEXT: vmrglb v2, v3, v2
; CHECK-P7-NEXT: blr
entry:
@@ -831,7 +824,7 @@ entry:
define dso_local void @testByteSplat() #0 {
; CHECK-P8-LABEL: testByteSplat:
; CHECK-P8: # %bb.0: # %entry
-; CHECK-P8-NEXT: lbz r3, 0(r3)
+; CHECK-P8-NEXT: lbzx r3, 0, r3
; CHECK-P8-NEXT: mtvsrd v2, r3
; CHECK-P8-NEXT: vspltb v2, v2, 7
; CHECK-P8-NEXT: stvx v2, 0, r3
@@ -863,10 +856,9 @@ define dso_local void @testByteSplat() #0 {
;
; CHECK-P7-LABEL: testByteSplat:
; CHECK-P7: # %bb.0: # %entry
-; CHECK-P7-NEXT: lbz r3, 0(r3)
-; CHECK-P7-NEXT: stb r3, -16(r1)
-; CHECK-P7-NEXT: addi r3, r1, -16
-; CHECK-P7-NEXT: lvx v2, 0, r3
+; CHECK-P7-NEXT: lvsr v2, 0, r3
+; CHECK-P7-NEXT: lvx v3, 0, r3
+; CHECK-P7-NEXT: vperm v2, v3, v3, v2
; CHECK-P7-NEXT: vspltb v2, v2, 15
; CHECK-P7-NEXT: stvx v2, 0, r3
; CHECK-P7-NEXT: blr
diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
index dca65cdb2612f..0f3f1109186a5 100644
--- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll
+++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
@@ -59,11 +59,9 @@ define dso_local void @test2(<4 x float>* nocapture %c, float* nocapture readonl
;
; P7-LABEL: test2:
; P7: # %bb.0: # %entry
-; P7-NEXT: lwz r4, 12(r4)
-; P7-NEXT: addi r5, r1, -16
-; P7-NEXT: stw r4, -16(r1)
-; P7-NEXT: lxvw4x vs0, 0, r5
-; P7-NEXT: xxspltw vs0, vs0, 0
+; P7-NEXT: addi r4, r4, 12
+; P7-NEXT: lfiwzx f0, 0, r4
+; P7-NEXT: xxspltw vs0, vs0, 1
; P7-NEXT: stxvw4x vs0, 0, r3
; P7-NEXT: blr
entry:
@@ -94,11 +92,9 @@ define dso_local void @test3(<4 x i32>* nocapture %c, i32* nocapture readonly %a
;
; P7-LABEL: test3:
; P7: # %bb.0: # %entry
-; P7-NEXT: lwz r4, 12(r4)
-; P7-NEXT: addi r5, r1, -16
-; P7-NEXT: stw r4, -16(r1)
-; P7-NEXT: lxvw4x vs0, 0, r5
-; P7-NEXT: xxspltw vs0, vs0, 0
+; P7-NEXT: addi r4, r4, 12
+; P7-NEXT: lfiwzx f0, 0, r4
+; P7-NEXT: xxspltw vs0, vs0, 1
; P7-NEXT: stxvw4x vs0, 0, r3
; P7-NEXT: blr
entry:
@@ -110,6 +106,7 @@ entry:
ret void
}
+
; v2i64
define dso_local void @test4(<2 x i64>* nocapture %c, i64* nocapture readonly %a) local_unnamed_addr {
; P9-LABEL: test4:
@@ -146,24 +143,21 @@ define void @test5(<2 x i64>* %a, i32* %in) {
; P9-LABEL: test5:
; P9: # %bb.0: # %entry
; P9-NEXT: lfiwax f0, 0, r4
-; P9-NEXT: xxspltd vs0, vs0, 0
+; P9-NEXT: xxspltd vs0, f0, 0
; P9-NEXT: stxv vs0, 0(r3)
; P9-NEXT: blr
;
; P8-LABEL: test5:
; P8: # %bb.0: # %entry
; P8-NEXT: lfiwax f0, 0, r4
-; P8-NEXT: xxspltd vs0, vs0, 0
+; P8-NEXT: xxspltd vs0, f0, 0
; P8-NEXT: stxvd2x vs0, 0, r3
; P8-NEXT: blr
;
; P7-LABEL: test5:
; P7: # %bb.0: # %entry
-; P7-NEXT: lwa r4, 0(r4)
-; P7-NEXT: addi r5, r1, -16
-; P7-NEXT: std r4, -8(r1)
-; P7-NEXT: std r4, -16(r1)
-; P7-NEXT: lxvd2x vs0, 0, r5
+; P7-NEXT: lfiwax f0, 0, r4
+; P7-NEXT: xxspltd vs0, f0, 0
; P7-NEXT: stxvd2x vs0, 0, r3
; P7-NEXT: blr
entry:
@@ -180,24 +174,21 @@ define void @test6(<2 x i64>* %a, i32* %in) {
; P9-LABEL: test6:
; P9: # %bb.0: # %entry
; P9-NEXT: lfiwzx f0, 0, r4
-; P9-NEXT: xxspltd vs0, vs0, 0
+; P9-NEXT: xxspltd vs0, f0, 0
; P9-NEXT: stxv vs0, 0(r3)
; P9-NEXT: blr
;
; P8-LABEL: test6:
; P8: # %bb.0: # %entry
; P8-NEXT: lfiwzx f0, 0, r4
-; P8-NEXT: xxspltd vs0, vs0, 0
+; P8-NEXT: xxspltd vs0, f0, 0
; P8-NEXT: stxvd2x vs0, 0, r3
; P8-NEXT: blr
;
; P7-LABEL: test6:
; P7: # %bb.0: # %entry
-; P7-NEXT: lwz r4, 0(r4)
-; P7-NEXT: addi r5, r1, -16
-; P7-NEXT: std r4, -8(r1)
-; P7-NEXT: std r4, -16(r1)
-; P7-NEXT: lxvd2x vs0, 0, r5
+; P7-NEXT: lfiwzx f0, 0, r4
+; P7-NEXT: xxspltd vs0, f0, 0
; P7-NEXT: stxvd2x vs0, 0, r3
; P7-NEXT: blr
entry:
@@ -220,7 +211,7 @@ define void @test7(<8 x i16>* %a, i16* %in) {
;
; P8-LABEL: test7:
; P8: # %bb.0: # %entry
-; P8-NEXT: lhz r4, 0(r4)
+; P8-NEXT: lhzx r4, 0, r4
; P8-NEXT: mtvsrd v2, r4
; P8-NEXT: vsplth v2, v2, 3
; P8-NEXT: stvx v2, 0, r3
@@ -228,10 +219,11 @@ define void @test7(<8 x i16>* %a, i16* %in) {
;
; P7-LABEL: test7:
; P7: # %bb.0: # %entry
-; P7-NEXT: lhz r4, 0(r4)
-; P7-NEXT: addi r5, r1, -16
-; P7-NEXT: sth r4, -16(r1)
-; P7-NEXT: lxvw4x v2, 0, r5
+; P7-NEXT: li r5, 1
+; P7-NEXT: lvx v2, 0, r4
+; P7-NEXT: lvsl v4, 0, r4
+; P7-NEXT: lvx v3, r5, r4
+; P7-NEXT: vperm v2, v2, v3, v4
; P7-NEXT: vsplth v2, v2, 0
; P7-NEXT: stxvw4x v2, 0, r3
; P7-NEXT: blr
@@ -254,7 +246,7 @@ define void @test8(<16 x i8>* %a, i8* %in) {
;
; P8-LABEL: test8:
; P8: # %bb.0: # %entry
-; P8-NEXT: lbz r4, 0(r4)
+; P8-NEXT: lbzx r4, 0, r4
; P8-NEXT: mtvsrd v2, r4
; P8-NEXT: vspltb v2, v2, 7
; P8-NEXT: stvx v2, 0, r3
@@ -262,10 +254,9 @@ define void @test8(<16 x i8>* %a, i8* %in) {
;
; P7-LABEL: test8:
; P7: # %bb.0: # %entry
-; P7-NEXT: lbz r4, 0(r4)
-; P7-NEXT: addi r5, r1, -16
-; P7-NEXT: stb r4, -16(r1)
-; P7-NEXT: lxvw4x v2, 0, r5
+; P7-NEXT: lvsl v2, 0, r4
+; P7-NEXT: lvx v3, 0, r4
+; P7-NEXT: vperm v2, v3, v3, v2
; P7-NEXT: vspltb v2, v2, 0
; P7-NEXT: stxvw4x v2, 0, r3
; P7-NEXT: blr
diff --git a/llvm/test/CodeGen/PowerPC/scalar_vector_test_3.ll b/llvm/test/CodeGen/PowerPC/scalar_vector_test_3.ll
index 77d0116c1b7d9..e2291ea2a9fb3 100644
--- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_3.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_3.ll
@@ -204,25 +204,25 @@ define <2 x i64> @s2v_test6(i32* nocapture readonly %ptr) {
; P9LE-LABEL: s2v_test6:
; P9LE: # %bb.0: # %entry
; P9LE-NEXT: lfiwax f0, 0, r3
-; P9LE-NEXT: xxspltd v2, vs0, 0
+; P9LE-NEXT: xxspltd v2, f0, 0
; P9LE-NEXT: blr
;
; P9BE-LABEL: s2v_test6:
; P9BE: # %bb.0: # %entry
; P9BE-NEXT: lfiwax f0, 0, r3
-; P9BE-NEXT: xxspltd v2, vs0, 0
+; P9BE-NEXT: xxspltd v2, f0, 0
; P9BE-NEXT: blr
;
; P8LE-LABEL: s2v_test6:
; P8LE: # %bb.0: # %entry
; P8LE-NEXT: lfiwax f0, 0, r3
-; P8LE-NEXT: xxspltd v2, vs0, 0
+; P8LE-NEXT: xxspltd v2, f0, 0
; P8LE-NEXT: blr
;
; P8BE-LABEL: s2v_test6:
; P8BE: # %bb.0: # %entry
; P8BE-NEXT: lfiwax f0, 0, r3
-; P8BE-NEXT: xxspltd v2, vs0, 0
+; P8BE-NEXT: xxspltd v2, f0, 0
; P8BE-NEXT: blr
@@ -240,25 +240,25 @@ define <2 x i64> @s2v_test7(i32* nocapture readonly %ptr) {
; P9LE-LABEL: s2v_test7:
; P9LE: # %bb.0: # %entry
; P9LE-NEXT: lfiwax f0, 0, r3
-; P9LE-NEXT: xxspltd v2, vs0, 0
+; P9LE-NEXT: xxspltd v2, f0, 0
; P9LE-NEXT: blr
;
; P9BE-LABEL: s2v_test7:
; P9BE: # %bb.0: # %entry
; P9BE-NEXT: lfiwax f0, 0, r3
-; P9BE-NEXT: xxspltd v2, vs0, 0
+; P9BE-NEXT: xxspltd v2, f0, 0
; P9BE-NEXT: blr
;
; P8LE-LABEL: s2v_test7:
; P8LE: # %bb.0: # %entry
; P8LE-NEXT: lfiwax f0, 0, r3
-; P8LE-NEXT: xxspltd v2, vs0, 0
+; P8LE-NEXT: xxspltd v2, f0, 0
; P8LE-NEXT: blr
;
; P8BE-LABEL: s2v_test7:
; P8BE: # %bb.0: # %entry
; P8BE-NEXT: lfiwax f0, 0, r3
-; P8BE-NEXT: xxspltd v2, vs0, 0
+; P8BE-NEXT: xxspltd v2, f0, 0
; P8BE-NEXT: blr