[llvm] 71be020 - [SelectionDAG][PowerPC] Memset reuse vector element for tail store

Ting Wang via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 5 22:54:50 PDT 2023


Author: Ting Wang
Date: 2023-09-06T01:52:38-04:00
New Revision: 71be020dda2c97c2733e45f4b1003d1c135b3b43

URL: https://github.com/llvm/llvm-project/commit/71be020dda2c97c2733e45f4b1003d1c135b3b43
DIFF: https://github.com/llvm/llvm-project/commit/71be020dda2c97c2733e45f4b1003d1c135b3b43.diff

LOG: [SelectionDAG][PowerPC] Memset reuse vector element for tail store

On PPC there are instructions that store an element from a vector (e.g.
stxsdx/stxsiwx), and these instructions can be leveraged to avoid materializing
the tail constant in memset and constant splat array initialization.

This patch tries to explore these opportunities.

Reviewed By: shchenz

Differential Revision: https://reviews.llvm.org/D138883

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/lib/Target/PowerPC/PPCISelLowering.h
    llvm/lib/Target/PowerPC/PPCInstrP10.td
    llvm/test/CodeGen/PowerPC/memset-tail.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index fe0bce8df329747..12b280d5b1a0bcd 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -845,6 +845,15 @@ class TargetLoweringBase {
     return false;
   }
 
+  /// Return true if the target shall perform extract vector element and store
+  /// given that the vector is known to be splat of constant.
+  /// \p Index[out] gives the index of the vector element to be extracted when
+  /// this is true.
+  virtual bool shallExtractConstSplatVectorElementToStore(
+      Type *VectorTy, unsigned ElemSizeInBits, unsigned &Index) const {
+    return false;
+  }
+
   /// Return true if inserting a scalar into a variable element of an undef
   /// vector is more efficiently handled by splatting the scalar instead.
   virtual bool shouldSplatInsEltVarIndex(EVT) const {

diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 12cb4a5f7f83e7b..30bee510e1e78bb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7757,13 +7757,28 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
     }
 
     // If this store is smaller than the largest store see whether we can get
-    // the smaller value for free with a truncate.
+    // the smaller value for free with a truncate or extract vector element and
+    // then store.
     SDValue Value = MemSetValue;
     if (VT.bitsLT(LargestVT)) {
+      unsigned Index;
+      unsigned NElts = LargestVT.getSizeInBits() / VT.getSizeInBits();
+      EVT SVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), NElts);
       if (!LargestVT.isVector() && !VT.isVector() &&
           TLI.isTruncateFree(LargestVT, VT))
         Value = DAG.getNode(ISD::TRUNCATE, dl, VT, MemSetValue);
-      else
+      else if (LargestVT.isVector() && !VT.isVector() &&
+               TLI.shallExtractConstSplatVectorElementToStore(
+                   LargestVT.getTypeForEVT(*DAG.getContext()),
+                   VT.getSizeInBits(), Index) &&
+               TLI.isTypeLegal(SVT) &&
+               LargestVT.getSizeInBits() == SVT.getSizeInBits()) {
+        // Target which can combine store(extractelement VectorTy, Idx) can get
+        // the smaller value for free.
+        SDValue TailValue = DAG.getNode(ISD::BITCAST, dl, SVT, MemSetValue);
+        Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, TailValue,
+                            DAG.getVectorIdxConstant(Index, dl));
+      } else
         Value = getMemsetValue(Src, VT, DAG, dl);
     }
     assert(Value.getValueType() == VT && "Value with wrong type.");

diff  --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 33b2867b5dd8b88..6bc89891c0dc44d 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1635,6 +1635,27 @@ bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
   return VT.isScalarInteger();
 }
 
+bool PPCTargetLowering::shallExtractConstSplatVectorElementToStore(
+    Type *VectorTy, unsigned ElemSizeInBits, unsigned &Index) const {
+  if (!Subtarget.isPPC64() || !Subtarget.hasVSX())
+    return false;
+
+  if (auto *VTy = dyn_cast<VectorType>(VectorTy)) {
+    if (VTy->getScalarType()->isIntegerTy()) {
+      // ElemSizeInBits 8/16 can fit in immediate field, not needed here.
+      if (ElemSizeInBits == 32) {
+        Index = Subtarget.isLittleEndian() ? 2 : 1;
+        return true;
+      }
+      if (ElemSizeInBits == 64) {
+        Index = Subtarget.isLittleEndian() ? 1 : 0;
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((PPCISD::NodeType)Opcode) {
   case PPCISD::FIRST_NUMBER:    break;
@@ -17086,10 +17107,20 @@ EVT PPCTargetLowering::getOptimalMemOpType(
   if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
     // We should use Altivec/VSX loads and stores when available. For unaligned
     // addresses, unaligned VSX loads are only fast starting with the P8.
-    if (Subtarget.hasAltivec() && Op.size() >= 16 &&
-        (Op.isAligned(Align(16)) ||
-         ((Op.isMemset() && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
-      return MVT::v4i32;
+    if (Subtarget.hasAltivec() && Op.size() >= 16) {
+      if (Op.isMemset() && Subtarget.hasVSX()) {
+        uint64_t TailSize = Op.size() % 16;
+        // For memset lowering, EXTRACT_VECTOR_ELT tries to return constant
+        // element if vector element type matches tail store. For tail size
+        // 3/4, the tail store is i32, v4i32 cannot be used, need a legal one.
+        if (TailSize > 2 && TailSize <= 4) {
+          return MVT::v8i16;
+        }
+        return MVT::v4i32;
+      }
+      if (Op.isAligned(Align(16)) || Subtarget.hasP8Vector())
+        return MVT::v4i32;
+    }
   }
 
   if (Subtarget.isPPC64()) {

diff  --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 6ef2d607ff9d3bc..8cbefbdb917359e 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -791,6 +791,11 @@ namespace llvm {
       return true;
     }
 
+    bool
+    shallExtractConstSplatVectorElementToStore(Type *VectorTy,
+                                               unsigned ElemSizeInBits,
+                                               unsigned &Index) const override;
+
     bool isCtlzFast() const override {
       return true;
     }

diff  --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index fdfb762eec13e82..a5429b38dfbe2d6 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -2031,8 +2031,15 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, IsLittleEndian] in {
             (v8i16 (COPY_TO_REGCLASS (LXVRHX ForceXForm:$src), VSRC))>;
   def : Pat<(v16i8 (scalar_to_vector (i32 (extloadi8 ForceXForm:$src)))),
             (v16i8 (COPY_TO_REGCLASS (LXVRBX ForceXForm:$src), VSRC))>;
+  def : Pat<(store (i64 (extractelt v2i64:$A, 1)), ForceXForm:$src),
+            (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), ForceXForm:$src)>;
  }
 
+let Predicates = [IsISA3_1, IsBigEndian] in {
+  def : Pat<(store (i64 (extractelt v2i64:$A, 0)), ForceXForm:$src),
+            (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), ForceXForm:$src)>;
+}
+
 // FIXME: The swap is overkill when the shift amount is a constant.
 // We should just fix the constant in the DAG.
 let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in {

diff  --git a/llvm/test/CodeGen/PowerPC/memset-tail.ll b/llvm/test/CodeGen/PowerPC/memset-tail.ll
index 4c94d543569122e..ae14768219a9287 100644
--- a/llvm/test/CodeGen/PowerPC/memset-tail.ll
+++ b/llvm/test/CodeGen/PowerPC/memset-tail.ll
@@ -169,59 +169,45 @@ define dso_local void @memsetTailV1B8(ptr nocapture noundef writeonly %p) local_
 ; P8-BE-LABEL: memsetTailV1B8:
 ; P8-BE:       # %bb.0: # %entry
 ; P8-BE-NEXT:    vspltisb 2, 15
-; P8-BE-NEXT:    lis 4, 3855
-; P8-BE-NEXT:    ori 4, 4, 3855
-; P8-BE-NEXT:    rldimi 4, 4, 32, 0
+; P8-BE-NEXT:    li 4, 16
+; P8-BE-NEXT:    stxsdx 34, 3, 4
 ; P8-BE-NEXT:    stxvw4x 34, 0, 3
-; P8-BE-NEXT:    std 4, 16(3)
 ; P8-BE-NEXT:    blr
 ;
 ; P9-BE-LABEL: memsetTailV1B8:
 ; P9-BE:       # %bb.0: # %entry
-; P9-BE-NEXT:    lis 4, 3855
 ; P9-BE-NEXT:    xxspltib 0, 15
-; P9-BE-NEXT:    ori 4, 4, 3855
 ; P9-BE-NEXT:    stxv 0, 0(3)
-; P9-BE-NEXT:    rldimi 4, 4, 32, 0
-; P9-BE-NEXT:    std 4, 16(3)
+; P9-BE-NEXT:    stfd 0, 16(3)
 ; P9-BE-NEXT:    blr
 ;
 ; P10-BE-LABEL: memsetTailV1B8:
 ; P10-BE:       # %bb.0: # %entry
-; P10-BE-NEXT:    pli 4, 252645135
-; P10-BE-NEXT:    rldimi 4, 4, 32, 0
-; P10-BE-NEXT:    std 4, 16(3)
 ; P10-BE-NEXT:    xxspltib 0, 15
 ; P10-BE-NEXT:    stxv 0, 0(3)
+; P10-BE-NEXT:    stfd 0, 16(3)
 ; P10-BE-NEXT:    blr
 ;
 ; P8-LE-LABEL: memsetTailV1B8:
 ; P8-LE:       # %bb.0: # %entry
-; P8-LE-NEXT:    lis 4, 3855
 ; P8-LE-NEXT:    vspltisb 2, 15
-; P8-LE-NEXT:    ori 4, 4, 3855
-; P8-LE-NEXT:    rldimi 4, 4, 32, 0
-; P8-LE-NEXT:    std 4, 16(3)
+; P8-LE-NEXT:    li 4, 16
+; P8-LE-NEXT:    stxsdx 34, 3, 4
 ; P8-LE-NEXT:    stxvd2x 34, 0, 3
 ; P8-LE-NEXT:    blr
 ;
 ; P9-LE-LABEL: memsetTailV1B8:
 ; P9-LE:       # %bb.0: # %entry
-; P9-LE-NEXT:    lis 4, 3855
 ; P9-LE-NEXT:    xxspltib 0, 15
-; P9-LE-NEXT:    ori 4, 4, 3855
 ; P9-LE-NEXT:    stxv 0, 0(3)
-; P9-LE-NEXT:    rldimi 4, 4, 32, 0
-; P9-LE-NEXT:    std 4, 16(3)
+; P9-LE-NEXT:    stfd 0, 16(3)
 ; P9-LE-NEXT:    blr
 ;
 ; P10-LE-LABEL: memsetTailV1B8:
 ; P10-LE:       # %bb.0: # %entry
-; P10-LE-NEXT:    pli 4, 252645135
-; P10-LE-NEXT:    rldimi 4, 4, 32, 0
-; P10-LE-NEXT:    std 4, 16(3)
 ; P10-LE-NEXT:    xxspltib 0, 15
 ; P10-LE-NEXT:    stxv 0, 0(3)
+; P10-LE-NEXT:    stfd 0, 16(3)
 ; P10-LE-NEXT:    blr
 entry:
   tail call void @llvm.memset.p0.i64(ptr %p, i8 15, i64 24, i1 false)
@@ -231,63 +217,45 @@ entry:
 define dso_local void @memsetTailV1B7(ptr nocapture noundef writeonly %p) local_unnamed_addr {
 ; P8-BE-LABEL: memsetTailV1B7:
 ; P8-BE:       # %bb.0: # %entry
-; P8-BE-NEXT:    lis 4, 3855
 ; P8-BE-NEXT:    vspltisb 2, 15
-; P8-BE-NEXT:    li 5, 15
-; P8-BE-NEXT:    ori 4, 4, 3855
-; P8-BE-NEXT:    rldimi 4, 4, 32, 0
-; P8-BE-NEXT:    stdx 4, 3, 5
+; P8-BE-NEXT:    li 4, 15
+; P8-BE-NEXT:    stxsdx 34, 3, 4
 ; P8-BE-NEXT:    stxvw4x 34, 0, 3
 ; P8-BE-NEXT:    blr
 ;
 ; P9-BE-LABEL: memsetTailV1B7:
 ; P9-BE:       # %bb.0: # %entry
-; P9-BE-NEXT:    lis 4, 3855
-; P9-BE-NEXT:    li 5, 15
-; P9-BE-NEXT:    ori 4, 4, 3855
-; P9-BE-NEXT:    rldimi 4, 4, 32, 0
-; P9-BE-NEXT:    stdx 4, 3, 5
 ; P9-BE-NEXT:    xxspltib 0, 15
+; P9-BE-NEXT:    stfd 0, 15(3)
 ; P9-BE-NEXT:    stxv 0, 0(3)
 ; P9-BE-NEXT:    blr
 ;
 ; P10-BE-LABEL: memsetTailV1B7:
 ; P10-BE:       # %bb.0: # %entry
-; P10-BE-NEXT:    pli 4, 252645135
-; P10-BE-NEXT:    rldimi 4, 4, 32, 0
-; P10-BE-NEXT:    pstd 4, 15(3), 0
 ; P10-BE-NEXT:    xxspltib 0, 15
+; P10-BE-NEXT:    stfd 0, 15(3)
 ; P10-BE-NEXT:    stxv 0, 0(3)
 ; P10-BE-NEXT:    blr
 ;
 ; P8-LE-LABEL: memsetTailV1B7:
 ; P8-LE:       # %bb.0: # %entry
-; P8-LE-NEXT:    lis 4, 3855
 ; P8-LE-NEXT:    vspltisb 2, 15
-; P8-LE-NEXT:    li 5, 15
-; P8-LE-NEXT:    ori 4, 4, 3855
-; P8-LE-NEXT:    rldimi 4, 4, 32, 0
-; P8-LE-NEXT:    stdx 4, 3, 5
+; P8-LE-NEXT:    li 4, 15
+; P8-LE-NEXT:    stxsdx 34, 3, 4
 ; P8-LE-NEXT:    stxvd2x 34, 0, 3
 ; P8-LE-NEXT:    blr
 ;
 ; P9-LE-LABEL: memsetTailV1B7:
 ; P9-LE:       # %bb.0: # %entry
-; P9-LE-NEXT:    lis 4, 3855
-; P9-LE-NEXT:    li 5, 15
-; P9-LE-NEXT:    ori 4, 4, 3855
-; P9-LE-NEXT:    rldimi 4, 4, 32, 0
-; P9-LE-NEXT:    stdx 4, 3, 5
 ; P9-LE-NEXT:    xxspltib 0, 15
+; P9-LE-NEXT:    stfd 0, 15(3)
 ; P9-LE-NEXT:    stxv 0, 0(3)
 ; P9-LE-NEXT:    blr
 ;
 ; P10-LE-LABEL: memsetTailV1B7:
 ; P10-LE:       # %bb.0: # %entry
-; P10-LE-NEXT:    pli 4, 252645135
-; P10-LE-NEXT:    rldimi 4, 4, 32, 0
-; P10-LE-NEXT:    pstd 4, 15(3), 0
 ; P10-LE-NEXT:    xxspltib 0, 15
+; P10-LE-NEXT:    stfd 0, 15(3)
 ; P10-LE-NEXT:    stxv 0, 0(3)
 ; P10-LE-NEXT:    blr
 entry:
@@ -299,52 +267,48 @@ define dso_local void @memsetTailV1B4(ptr nocapture noundef writeonly %p) local_
 ; P8-BE-LABEL: memsetTailV1B4:
 ; P8-BE:       # %bb.0: # %entry
 ; P8-BE-NEXT:    vspltisb 2, 15
-; P8-BE-NEXT:    lis 4, 3855
-; P8-BE-NEXT:    ori 4, 4, 3855
-; P8-BE-NEXT:    stw 4, 16(3)
+; P8-BE-NEXT:    li 4, 16
+; P8-BE-NEXT:    stxsiwx 34, 3, 4
 ; P8-BE-NEXT:    stxvw4x 34, 0, 3
 ; P8-BE-NEXT:    blr
 ;
 ; P9-BE-LABEL: memsetTailV1B4:
 ; P9-BE:       # %bb.0: # %entry
-; P9-BE-NEXT:    lis 4, 3855
-; P9-BE-NEXT:    ori 4, 4, 3855
-; P9-BE-NEXT:    stw 4, 16(3)
 ; P9-BE-NEXT:    xxspltib 0, 15
+; P9-BE-NEXT:    li 4, 16
+; P9-BE-NEXT:    stfiwx 0, 3, 4
 ; P9-BE-NEXT:    stxv 0, 0(3)
 ; P9-BE-NEXT:    blr
 ;
 ; P10-BE-LABEL: memsetTailV1B4:
 ; P10-BE:       # %bb.0: # %entry
-; P10-BE-NEXT:    pli 4, 252645135
-; P10-BE-NEXT:    stw 4, 16(3)
 ; P10-BE-NEXT:    xxspltib 0, 15
+; P10-BE-NEXT:    li 4, 16
+; P10-BE-NEXT:    stfiwx 0, 3, 4
 ; P10-BE-NEXT:    stxv 0, 0(3)
 ; P10-BE-NEXT:    blr
 ;
 ; P8-LE-LABEL: memsetTailV1B4:
 ; P8-LE:       # %bb.0: # %entry
 ; P8-LE-NEXT:    vspltisb 2, 15
-; P8-LE-NEXT:    lis 4, 3855
-; P8-LE-NEXT:    ori 4, 4, 3855
-; P8-LE-NEXT:    stw 4, 16(3)
+; P8-LE-NEXT:    li 4, 16
+; P8-LE-NEXT:    stxsiwx 34, 3, 4
 ; P8-LE-NEXT:    stxvd2x 34, 0, 3
 ; P8-LE-NEXT:    blr
 ;
 ; P9-LE-LABEL: memsetTailV1B4:
 ; P9-LE:       # %bb.0: # %entry
-; P9-LE-NEXT:    lis 4, 3855
-; P9-LE-NEXT:    ori 4, 4, 3855
-; P9-LE-NEXT:    stw 4, 16(3)
 ; P9-LE-NEXT:    xxspltib 0, 15
+; P9-LE-NEXT:    li 4, 16
+; P9-LE-NEXT:    stfiwx 0, 3, 4
 ; P9-LE-NEXT:    stxv 0, 0(3)
 ; P9-LE-NEXT:    blr
 ;
 ; P10-LE-LABEL: memsetTailV1B4:
 ; P10-LE:       # %bb.0: # %entry
-; P10-LE-NEXT:    pli 4, 252645135
-; P10-LE-NEXT:    stw 4, 16(3)
 ; P10-LE-NEXT:    xxspltib 0, 15
+; P10-LE-NEXT:    li 4, 16
+; P10-LE-NEXT:    stfiwx 0, 3, 4
 ; P10-LE-NEXT:    stxv 0, 0(3)
 ; P10-LE-NEXT:    blr
 entry:
@@ -356,52 +320,48 @@ define dso_local void @memsetTailV1B3(ptr nocapture noundef writeonly %p) local_
 ; P8-BE-LABEL: memsetTailV1B3:
 ; P8-BE:       # %bb.0: # %entry
 ; P8-BE-NEXT:    vspltisb 2, 15
-; P8-BE-NEXT:    lis 4, 3855
-; P8-BE-NEXT:    ori 4, 4, 3855
+; P8-BE-NEXT:    li 4, 15
+; P8-BE-NEXT:    stxsiwx 34, 3, 4
 ; P8-BE-NEXT:    stxvw4x 34, 0, 3
-; P8-BE-NEXT:    stw 4, 15(3)
 ; P8-BE-NEXT:    blr
 ;
 ; P9-BE-LABEL: memsetTailV1B3:
 ; P9-BE:       # %bb.0: # %entry
-; P9-BE-NEXT:    lis 4, 3855
-; P9-BE-NEXT:    ori 4, 4, 3855
-; P9-BE-NEXT:    stw 4, 15(3)
 ; P9-BE-NEXT:    xxspltib 0, 15
+; P9-BE-NEXT:    li 4, 15
+; P9-BE-NEXT:    stfiwx 0, 3, 4
 ; P9-BE-NEXT:    stxv 0, 0(3)
 ; P9-BE-NEXT:    blr
 ;
 ; P10-BE-LABEL: memsetTailV1B3:
 ; P10-BE:       # %bb.0: # %entry
-; P10-BE-NEXT:    pli 4, 252645135
-; P10-BE-NEXT:    stw 4, 15(3)
 ; P10-BE-NEXT:    xxspltib 0, 15
+; P10-BE-NEXT:    li 4, 15
+; P10-BE-NEXT:    stfiwx 0, 3, 4
 ; P10-BE-NEXT:    stxv 0, 0(3)
 ; P10-BE-NEXT:    blr
 ;
 ; P8-LE-LABEL: memsetTailV1B3:
 ; P8-LE:       # %bb.0: # %entry
 ; P8-LE-NEXT:    vspltisb 2, 15
-; P8-LE-NEXT:    lis 4, 3855
-; P8-LE-NEXT:    ori 4, 4, 3855
-; P8-LE-NEXT:    stw 4, 15(3)
+; P8-LE-NEXT:    li 4, 15
+; P8-LE-NEXT:    stxsiwx 34, 3, 4
 ; P8-LE-NEXT:    stxvd2x 34, 0, 3
 ; P8-LE-NEXT:    blr
 ;
 ; P9-LE-LABEL: memsetTailV1B3:
 ; P9-LE:       # %bb.0: # %entry
-; P9-LE-NEXT:    lis 4, 3855
-; P9-LE-NEXT:    ori 4, 4, 3855
-; P9-LE-NEXT:    stw 4, 15(3)
 ; P9-LE-NEXT:    xxspltib 0, 15
+; P9-LE-NEXT:    li 4, 15
+; P9-LE-NEXT:    stfiwx 0, 3, 4
 ; P9-LE-NEXT:    stxv 0, 0(3)
 ; P9-LE-NEXT:    blr
 ;
 ; P10-LE-LABEL: memsetTailV1B3:
 ; P10-LE:       # %bb.0: # %entry
-; P10-LE-NEXT:    pli 4, 252645135
-; P10-LE-NEXT:    stw 4, 15(3)
 ; P10-LE-NEXT:    xxspltib 0, 15
+; P10-LE-NEXT:    li 4, 15
+; P10-LE-NEXT:    stfiwx 0, 3, 4
 ; P10-LE-NEXT:    stxv 0, 0(3)
 ; P10-LE-NEXT:    blr
 entry:
@@ -682,30 +642,22 @@ define dso_local void @memset2TailV1B8(ptr nocapture noundef writeonly %p) local
 ; P8-BE:       # %bb.0: # %entry
 ; P8-BE-NEXT:    ld 4, L..C3(2) # %const.0
 ; P8-BE-NEXT:    lxvw4x 0, 0, 4
-; P8-BE-NEXT:    lis 4, -23131
-; P8-BE-NEXT:    ori 4, 4, 42405
-; P8-BE-NEXT:    rldimi 4, 4, 32, 0
+; P8-BE-NEXT:    stfd 0, 16(3)
 ; P8-BE-NEXT:    stxvw4x 0, 0, 3
-; P8-BE-NEXT:    std 4, 16(3)
 ; P8-BE-NEXT:    blr
 ;
 ; P9-BE-LABEL: memset2TailV1B8:
 ; P9-BE:       # %bb.0: # %entry
-; P9-BE-NEXT:    lis 4, -23131
 ; P9-BE-NEXT:    xxspltib 0, 165
-; P9-BE-NEXT:    ori 4, 4, 42405
 ; P9-BE-NEXT:    stxv 0, 0(3)
-; P9-BE-NEXT:    rldimi 4, 4, 32, 0
-; P9-BE-NEXT:    std 4, 16(3)
+; P9-BE-NEXT:    stfd 0, 16(3)
 ; P9-BE-NEXT:    blr
 ;
 ; P10-BE-LABEL: memset2TailV1B8:
 ; P10-BE:       # %bb.0: # %entry
-; P10-BE-NEXT:    pli 4, 2779096485
-; P10-BE-NEXT:    rldimi 4, 4, 32, 0
-; P10-BE-NEXT:    std 4, 16(3)
 ; P10-BE-NEXT:    xxspltib 0, 165
 ; P10-BE-NEXT:    stxv 0, 0(3)
+; P10-BE-NEXT:    stfd 0, 16(3)
 ; P10-BE-NEXT:    blr
 ;
 ; P8-LE-LABEL: memset2TailV1B8:
@@ -713,30 +665,22 @@ define dso_local void @memset2TailV1B8(ptr nocapture noundef writeonly %p) local
 ; P8-LE-NEXT:    addis 4, 2, .LCPI12_0@toc@ha
 ; P8-LE-NEXT:    addi 4, 4, .LCPI12_0@toc@l
 ; P8-LE-NEXT:    lxvd2x 0, 0, 4
-; P8-LE-NEXT:    lis 4, -23131
-; P8-LE-NEXT:    ori 4, 4, 42405
-; P8-LE-NEXT:    rldimi 4, 4, 32, 0
-; P8-LE-NEXT:    std 4, 16(3)
+; P8-LE-NEXT:    stfd 0, 16(3)
 ; P8-LE-NEXT:    stxvd2x 0, 0, 3
 ; P8-LE-NEXT:    blr
 ;
 ; P9-LE-LABEL: memset2TailV1B8:
 ; P9-LE:       # %bb.0: # %entry
-; P9-LE-NEXT:    lis 4, -23131
 ; P9-LE-NEXT:    xxspltib 0, 165
-; P9-LE-NEXT:    ori 4, 4, 42405
 ; P9-LE-NEXT:    stxv 0, 0(3)
-; P9-LE-NEXT:    rldimi 4, 4, 32, 0
-; P9-LE-NEXT:    std 4, 16(3)
+; P9-LE-NEXT:    stfd 0, 16(3)
 ; P9-LE-NEXT:    blr
 ;
 ; P10-LE-LABEL: memset2TailV1B8:
 ; P10-LE:       # %bb.0: # %entry
-; P10-LE-NEXT:    pli 4, 2779096485
-; P10-LE-NEXT:    rldimi 4, 4, 32, 0
-; P10-LE-NEXT:    std 4, 16(3)
 ; P10-LE-NEXT:    xxspltib 0, 165
 ; P10-LE-NEXT:    stxv 0, 0(3)
+; P10-LE-NEXT:    stfd 0, 16(3)
 ; P10-LE-NEXT:    blr
 entry:
   tail call void @llvm.memset.p0.i64(ptr %p, i8 165, i64 24, i1 false)
@@ -747,65 +691,45 @@ define dso_local void @memset2TailV1B7(ptr nocapture noundef writeonly %p) local
 ; P8-BE-LABEL: memset2TailV1B7:
 ; P8-BE:       # %bb.0: # %entry
 ; P8-BE-NEXT:    ld 4, L..C4(2) # %const.0
-; P8-BE-NEXT:    lis 5, -23131
 ; P8-BE-NEXT:    lxvw4x 0, 0, 4
-; P8-BE-NEXT:    ori 4, 5, 42405
-; P8-BE-NEXT:    li 5, 15
-; P8-BE-NEXT:    rldimi 4, 4, 32, 0
-; P8-BE-NEXT:    stdx 4, 3, 5
+; P8-BE-NEXT:    stfd 0, 15(3)
 ; P8-BE-NEXT:    stxvw4x 0, 0, 3
 ; P8-BE-NEXT:    blr
 ;
 ; P9-BE-LABEL: memset2TailV1B7:
 ; P9-BE:       # %bb.0: # %entry
-; P9-BE-NEXT:    lis 4, -23131
-; P9-BE-NEXT:    li 5, 15
-; P9-BE-NEXT:    ori 4, 4, 42405
-; P9-BE-NEXT:    rldimi 4, 4, 32, 0
-; P9-BE-NEXT:    stdx 4, 3, 5
 ; P9-BE-NEXT:    xxspltib 0, 165
+; P9-BE-NEXT:    stfd 0, 15(3)
 ; P9-BE-NEXT:    stxv 0, 0(3)
 ; P9-BE-NEXT:    blr
 ;
 ; P10-BE-LABEL: memset2TailV1B7:
 ; P10-BE:       # %bb.0: # %entry
-; P10-BE-NEXT:    pli 4, 2779096485
-; P10-BE-NEXT:    rldimi 4, 4, 32, 0
-; P10-BE-NEXT:    pstd 4, 15(3), 0
 ; P10-BE-NEXT:    xxspltib 0, 165
+; P10-BE-NEXT:    stfd 0, 15(3)
 ; P10-BE-NEXT:    stxv 0, 0(3)
 ; P10-BE-NEXT:    blr
 ;
 ; P8-LE-LABEL: memset2TailV1B7:
 ; P8-LE:       # %bb.0: # %entry
 ; P8-LE-NEXT:    addis 4, 2, .LCPI13_0@toc@ha
-; P8-LE-NEXT:    lis 5, -23131
 ; P8-LE-NEXT:    addi 4, 4, .LCPI13_0@toc@l
 ; P8-LE-NEXT:    lxvd2x 0, 0, 4
-; P8-LE-NEXT:    ori 4, 5, 42405
-; P8-LE-NEXT:    li 5, 15
-; P8-LE-NEXT:    rldimi 4, 4, 32, 0
-; P8-LE-NEXT:    stdx 4, 3, 5
+; P8-LE-NEXT:    stfd 0, 15(3)
 ; P8-LE-NEXT:    stxvd2x 0, 0, 3
 ; P8-LE-NEXT:    blr
 ;
 ; P9-LE-LABEL: memset2TailV1B7:
 ; P9-LE:       # %bb.0: # %entry
-; P9-LE-NEXT:    lis 4, -23131
-; P9-LE-NEXT:    li 5, 15
-; P9-LE-NEXT:    ori 4, 4, 42405
-; P9-LE-NEXT:    rldimi 4, 4, 32, 0
-; P9-LE-NEXT:    stdx 4, 3, 5
 ; P9-LE-NEXT:    xxspltib 0, 165
+; P9-LE-NEXT:    stfd 0, 15(3)
 ; P9-LE-NEXT:    stxv 0, 0(3)
 ; P9-LE-NEXT:    blr
 ;
 ; P10-LE-LABEL: memset2TailV1B7:
 ; P10-LE:       # %bb.0: # %entry
-; P10-LE-NEXT:    pli 4, 2779096485
-; P10-LE-NEXT:    rldimi 4, 4, 32, 0
-; P10-LE-NEXT:    pstd 4, 15(3), 0
 ; P10-LE-NEXT:    xxspltib 0, 165
+; P10-LE-NEXT:    stfd 0, 15(3)
 ; P10-LE-NEXT:    stxv 0, 0(3)
 ; P10-LE-NEXT:    blr
 entry:
@@ -818,26 +742,24 @@ define dso_local void @memset2TailV1B4(ptr nocapture noundef writeonly %p) local
 ; P8-BE:       # %bb.0: # %entry
 ; P8-BE-NEXT:    ld 4, L..C5(2) # %const.0
 ; P8-BE-NEXT:    lxvw4x 0, 0, 4
-; P8-BE-NEXT:    lis 4, -23131
-; P8-BE-NEXT:    ori 4, 4, 42405
-; P8-BE-NEXT:    stw 4, 16(3)
+; P8-BE-NEXT:    li 4, 16
+; P8-BE-NEXT:    stfiwx 0, 3, 4
 ; P8-BE-NEXT:    stxvw4x 0, 0, 3
 ; P8-BE-NEXT:    blr
 ;
 ; P9-BE-LABEL: memset2TailV1B4:
 ; P9-BE:       # %bb.0: # %entry
-; P9-BE-NEXT:    lis 4, -23131
-; P9-BE-NEXT:    ori 4, 4, 42405
-; P9-BE-NEXT:    stw 4, 16(3)
 ; P9-BE-NEXT:    xxspltib 0, 165
+; P9-BE-NEXT:    li 4, 16
+; P9-BE-NEXT:    stfiwx 0, 3, 4
 ; P9-BE-NEXT:    stxv 0, 0(3)
 ; P9-BE-NEXT:    blr
 ;
 ; P10-BE-LABEL: memset2TailV1B4:
 ; P10-BE:       # %bb.0: # %entry
-; P10-BE-NEXT:    pli 4, -1515870811
-; P10-BE-NEXT:    stw 4, 16(3)
 ; P10-BE-NEXT:    xxspltib 0, 165
+; P10-BE-NEXT:    li 4, 16
+; P10-BE-NEXT:    stfiwx 0, 3, 4
 ; P10-BE-NEXT:    stxv 0, 0(3)
 ; P10-BE-NEXT:    blr
 ;
@@ -846,26 +768,24 @@ define dso_local void @memset2TailV1B4(ptr nocapture noundef writeonly %p) local
 ; P8-LE-NEXT:    addis 4, 2, .LCPI14_0@toc@ha
 ; P8-LE-NEXT:    addi 4, 4, .LCPI14_0@toc@l
 ; P8-LE-NEXT:    lxvd2x 0, 0, 4
-; P8-LE-NEXT:    lis 4, -23131
-; P8-LE-NEXT:    ori 4, 4, 42405
-; P8-LE-NEXT:    stw 4, 16(3)
+; P8-LE-NEXT:    li 4, 16
+; P8-LE-NEXT:    stfiwx 0, 3, 4
 ; P8-LE-NEXT:    stxvd2x 0, 0, 3
 ; P8-LE-NEXT:    blr
 ;
 ; P9-LE-LABEL: memset2TailV1B4:
 ; P9-LE:       # %bb.0: # %entry
-; P9-LE-NEXT:    lis 4, -23131
-; P9-LE-NEXT:    ori 4, 4, 42405
-; P9-LE-NEXT:    stw 4, 16(3)
 ; P9-LE-NEXT:    xxspltib 0, 165
+; P9-LE-NEXT:    li 4, 16
+; P9-LE-NEXT:    stfiwx 0, 3, 4
 ; P9-LE-NEXT:    stxv 0, 0(3)
 ; P9-LE-NEXT:    blr
 ;
 ; P10-LE-LABEL: memset2TailV1B4:
 ; P10-LE:       # %bb.0: # %entry
-; P10-LE-NEXT:    pli 4, -1515870811
-; P10-LE-NEXT:    stw 4, 16(3)
 ; P10-LE-NEXT:    xxspltib 0, 165
+; P10-LE-NEXT:    li 4, 16
+; P10-LE-NEXT:    stfiwx 0, 3, 4
 ; P10-LE-NEXT:    stxv 0, 0(3)
 ; P10-LE-NEXT:    blr
 entry:
@@ -878,26 +798,24 @@ define dso_local void @memset2TailV1B3(ptr nocapture noundef writeonly %p) local
 ; P8-BE:       # %bb.0: # %entry
 ; P8-BE-NEXT:    ld 4, L..C6(2) # %const.0
 ; P8-BE-NEXT:    lxvw4x 0, 0, 4
-; P8-BE-NEXT:    lis 4, -23131
-; P8-BE-NEXT:    ori 4, 4, 42405
-; P8-BE-NEXT:    stw 4, 15(3)
+; P8-BE-NEXT:    li 4, 15
+; P8-BE-NEXT:    stfiwx 0, 3, 4
 ; P8-BE-NEXT:    stxvw4x 0, 0, 3
 ; P8-BE-NEXT:    blr
 ;
 ; P9-BE-LABEL: memset2TailV1B3:
 ; P9-BE:       # %bb.0: # %entry
-; P9-BE-NEXT:    lis 4, -23131
-; P9-BE-NEXT:    ori 4, 4, 42405
-; P9-BE-NEXT:    stw 4, 15(3)
 ; P9-BE-NEXT:    xxspltib 0, 165
+; P9-BE-NEXT:    li 4, 15
+; P9-BE-NEXT:    stfiwx 0, 3, 4
 ; P9-BE-NEXT:    stxv 0, 0(3)
 ; P9-BE-NEXT:    blr
 ;
 ; P10-BE-LABEL: memset2TailV1B3:
 ; P10-BE:       # %bb.0: # %entry
-; P10-BE-NEXT:    pli 4, -1515870811
-; P10-BE-NEXT:    stw 4, 15(3)
 ; P10-BE-NEXT:    xxspltib 0, 165
+; P10-BE-NEXT:    li 4, 15
+; P10-BE-NEXT:    stfiwx 0, 3, 4
 ; P10-BE-NEXT:    stxv 0, 0(3)
 ; P10-BE-NEXT:    blr
 ;
@@ -906,26 +824,24 @@ define dso_local void @memset2TailV1B3(ptr nocapture noundef writeonly %p) local
 ; P8-LE-NEXT:    addis 4, 2, .LCPI15_0@toc@ha
 ; P8-LE-NEXT:    addi 4, 4, .LCPI15_0@toc@l
 ; P8-LE-NEXT:    lxvd2x 0, 0, 4
-; P8-LE-NEXT:    lis 4, -23131
-; P8-LE-NEXT:    ori 4, 4, 42405
-; P8-LE-NEXT:    stw 4, 15(3)
+; P8-LE-NEXT:    li 4, 15
+; P8-LE-NEXT:    stfiwx 0, 3, 4
 ; P8-LE-NEXT:    stxvd2x 0, 0, 3
 ; P8-LE-NEXT:    blr
 ;
 ; P9-LE-LABEL: memset2TailV1B3:
 ; P9-LE:       # %bb.0: # %entry
-; P9-LE-NEXT:    lis 4, -23131
-; P9-LE-NEXT:    ori 4, 4, 42405
-; P9-LE-NEXT:    stw 4, 15(3)
 ; P9-LE-NEXT:    xxspltib 0, 165
+; P9-LE-NEXT:    li 4, 15
+; P9-LE-NEXT:    stfiwx 0, 3, 4
 ; P9-LE-NEXT:    stxv 0, 0(3)
 ; P9-LE-NEXT:    blr
 ;
 ; P10-LE-LABEL: memset2TailV1B3:
 ; P10-LE:       # %bb.0: # %entry
-; P10-LE-NEXT:    pli 4, -1515870811
-; P10-LE-NEXT:    stw 4, 15(3)
 ; P10-LE-NEXT:    xxspltib 0, 165
+; P10-LE-NEXT:    li 4, 15
+; P10-LE-NEXT:    stfiwx 0, 3, 4
 ; P10-LE-NEXT:    stxv 0, 0(3)
 ; P10-LE-NEXT:    blr
 entry:


        


More information about the llvm-commits mailing list