[llvm] 8d9c13f - Revert "[PowerPC] Implement instruction clustering for stores"

Qiu Chaofan via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 8 02:28:19 PDT 2020


Author: Qiu Chaofan
Date: 2020-09-08T17:24:08+08:00
New Revision: 8d9c13f37d2081c11186718ae8b5aef8b507d152

URL: https://github.com/llvm/llvm-project/commit/8d9c13f37d2081c11186718ae8b5aef8b507d152
DIFF: https://github.com/llvm/llvm-project/commit/8d9c13f37d2081c11186718ae8b5aef8b507d152.diff

LOG: Revert "[PowerPC] Implement instruction clustering for stores"

This reverts commit 3c0b3250230b3847a2a47dfeacfdb794c2285f02, (along
with ea795304 and bb39eb9e) since it breaks test with UB sanitizer.

Added: 
    

Modified: 
    llvm/lib/Target/PowerPC/PPC.td
    llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
    llvm/lib/Target/PowerPC/PPCInstrInfo.h
    llvm/lib/Target/PowerPC/PPCSubtarget.cpp
    llvm/lib/Target/PowerPC/PPCSubtarget.h
    llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
    llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll

Removed: 
    llvm/test/CodeGen/PowerPC/fusion-load-store.ll


################################################################################
diff  --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index 1b38a6f1d13d9..a617715d4bd86 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -174,9 +174,6 @@ def FeatureAddisLoadFusion : SubtargetFeature<"fuse-addis-load",
                                               "HasAddisLoadFusion", "true",
                                               "Power8 Addis-Load fusion",
                                               [FeatureFusion]>;
-def FeatureStoreFusion : SubtargetFeature<"fuse-store", "HasStoreFusion", "true",
-                                          "Target supports store clustering",
-                                          [FeatureFusion]>;
 def FeatureUnalignedFloats :
   SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess",
                    "true", "CPU does not trap on unaligned FP access">;
@@ -348,12 +345,10 @@ def ProcessorFeatures {
   // Power10
   // For P10 CPU we assume that all of the existing features from Power9
   // still exist with the exception of those we know are Power9 specific.
-  list<SubtargetFeature> FusionFeatures = [FeatureStoreFusion];
   list<SubtargetFeature> P10AdditionalFeatures =
-    !listconcat(FusionFeatures, [
-       DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs,
-       FeaturePCRelativeMemops, FeatureP10Vector, FeatureMMA,
-       FeaturePairedVectorMemops]);
+    [DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs,
+     FeaturePCRelativeMemops, FeatureP10Vector, FeatureMMA,
+     FeaturePairedVectorMemops];
   list<SubtargetFeature> P10SpecificFeatures = [];
   list<SubtargetFeature> P10InheritableFeatures =
     !listconcat(P9InheritableFeatures, P10AdditionalFeatures);

diff  --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 8cb8c82e62833..2423bca42e805 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -2222,111 +2222,6 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
   return true;
 }
 
-bool PPCInstrInfo::getMemOperandsWithOffsetWidth(
-    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
-    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
-    const TargetRegisterInfo *TRI) const {
-  const MachineOperand *BaseOp;
-  if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI))
-    return false;
-  BaseOps.push_back(BaseOp);
-  return true;
-}
-
-static bool isLdStSafeToCluster(const MachineInstr &LdSt,
-                                const TargetRegisterInfo *TRI) {
-  // If this is a volatile load/store, don't mess with it.
-  if (LdSt.hasOrderedMemoryRef())
-    return false;
-
-  if (LdSt.getOperand(2).isFI())
-    return true;
-
-  assert(LdSt.getOperand(2).isReg() && "Expected a reg operand.");
-  // Can't cluster if the instruction modifies the base register
-  // or it is update form. e.g. ld r2,3(r2)
-  if (LdSt.modifiesRegister(LdSt.getOperand(2).getReg(), TRI))
-    return false;
-
-  return true;
-}
-
-// Only cluster instruction pair that have the same opcode, and they are
-// clusterable according to PowerPC specification.
-static bool isClusterableLdStOpcPair(unsigned FirstOpc, unsigned SecondOpc,
-                                     const PPCSubtarget &Subtarget) {
-  switch (FirstOpc) {
-  default:
-    return false;
-  case PPC::STD:
-  case PPC::STFD:
-  case PPC::STXSD:
-  case PPC::DFSTOREf64:
-    return FirstOpc == SecondOpc;
-  // PowerPC backend has opcode STW/STW8 for instruction "stw" to deal with
-  // 32bit and 64bit instruction selection. They are clusterable pair though
-// they are different opcode.
-  case PPC::STW:
-  case PPC::STW8:
-    return SecondOpc == PPC::STW || SecondOpc == PPC::STW8;
-  }
-}
-
-bool PPCInstrInfo::shouldClusterMemOps(
-    ArrayRef<const MachineOperand *> BaseOps1,
-    ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
-    unsigned NumBytes) const {
-
-  assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
-  const MachineOperand &BaseOp1 = *BaseOps1.front();
-  const MachineOperand &BaseOp2 = *BaseOps2.front();
-  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
-         "Only base registers and frame indices are supported.");
-
-  // The NumLoads means the number of loads that has been clustered.
-  // Don't cluster memory op if there are already two ops clustered at least.
-  if (NumLoads > 2)
-    return false;
-
-  // Cluster the load/store only when they have the same base
-  // register or FI.
-  if ((BaseOp1.isReg() != BaseOp2.isReg()) ||
-      (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) ||
-      (BaseOp1.isFI() && BaseOp1.getIndex() != BaseOp2.getIndex()))
-    return false;
-
-  // Check if the load/store are clusterable according to the PowerPC
-  // specification.
-  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
-  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
-  unsigned FirstOpc = FirstLdSt.getOpcode();
-  unsigned SecondOpc = SecondLdSt.getOpcode();
-  const TargetRegisterInfo *TRI = &getRegisterInfo();
-  // Cluster the load/store only when they have the same opcode, and they are
-  // clusterable opcode according to PowerPC specification.
-  if (!isClusterableLdStOpcPair(FirstOpc, SecondOpc, Subtarget))
-    return false;
-
-  // Can't cluster load/store that have ordered or volatile memory reference.
-  if (!isLdStSafeToCluster(FirstLdSt, TRI) ||
-      !isLdStSafeToCluster(SecondLdSt, TRI))
-    return false;
-
-  int64_t Offset1 = 0, Offset2 = 0;
-  unsigned Width1 = 0, Width2 = 0;
-  const MachineOperand *Base1 = nullptr, *Base2 = nullptr;
-  if (!getMemOperandWithOffsetWidth(FirstLdSt, Base1, Offset1, Width1, TRI) ||
-      !getMemOperandWithOffsetWidth(SecondLdSt, Base2, Offset2, Width2, TRI) ||
-      Width1 != Width2)
-    return false;
-
-  assert(Base1 == &BaseOp1 && Base2 == &BaseOp2 &&
-         "getMemOperandWithOffsetWidth return incorrect base op");
-  // The caller should already have ordered FirstMemOp/SecondMemOp by offset.
-  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
-  return Offset1 + Width1 == Offset2;
-}
-
 /// GetInstSize - Return the number of bytes of code the specified
 /// instruction may be.  This returns the maximum number of bytes.
 ///
@@ -4769,8 +4664,7 @@ bool PPCInstrInfo::getMemOperandWithOffsetWidth(
     return false;
 
   // Handle only loads/stores with base register followed by immediate offset.
-  if (!LdSt.getOperand(1).isImm() ||
-      (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()))
+  if (LdSt.getNumExplicitOperands() != 3)
     return false;
   if (!LdSt.getOperand(1).isImm() ||
       (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()))

diff  --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index 2f867b16aa24f..75e8224892f4c 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -494,19 +494,6 @@ class PPCInstrInfo : public PPCGenInstrInfo {
                                     int64_t &Offset, unsigned &Width,
                                     const TargetRegisterInfo *TRI) const;
 
-  /// Get the base operand and byte offset of an instruction that reads/writes
-  /// memory.
-  bool getMemOperandsWithOffsetWidth(
-      const MachineInstr &MI, SmallVectorImpl<const MachineOperand *> &BaseOps,
-      int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
-      const TargetRegisterInfo *TRI) const override;
-
-  /// Returns true if the two given memory operations should be scheduled
-  /// adjacent.
-  bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
-                           ArrayRef<const MachineOperand *> BaseOps2,
-                           unsigned NumLoads, unsigned NumBytes) const override;
-
   /// Return true if two MIs access different memory addresses and false
   /// otherwise
   bool

diff  --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 05922dbb38fc6..8021cfa4a18c6 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -108,7 +108,6 @@ void PPCSubtarget::initializeEnvironment() {
   HasHTM = false;
   HasFloat128 = false;
   HasFusion = false;
-  HasStoreFusion = false;
   HasAddiLoadFusion = false;
   HasAddisLoadFusion = false;
   IsISA3_0 = false;

diff  --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index 0a134bb83ed2f..76b43dfc7a723 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -137,7 +137,6 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
   bool HasHTM;
   bool HasFloat128;
   bool HasFusion;
-  bool HasStoreFusion;
   bool HasAddiLoadFusion;
   bool HasAddisLoadFusion;
   bool IsISA3_0;
@@ -309,7 +308,6 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
   bool isISA3_1() const { return IsISA3_1; }
   bool useLongCalls() const { return UseLongCalls; }
   bool hasFusion() const { return HasFusion; }
-  bool hasStoreFusion() const { return HasStoreFusion; }
   bool hasAddiLoadFusion() const { return HasAddiLoadFusion; }
   bool hasAddisLoadFusion() const { return HasAddisLoadFusion; }
   bool needsSwapsForVSXMemOps() const {

diff  --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index c5671d6c73e05..ea9b37de6ff39 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -271,8 +271,6 @@ static ScheduleDAGInstrs *createPPCMachineScheduler(MachineSchedContext *C) {
                           std::make_unique<GenericScheduler>(C));
   // add DAG Mutations here.
   DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI));
-  if (ST.hasStoreFusion())
-    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   if (ST.hasFusion())
     DAG->addMutation(createPowerPCMacroFusionDAGMutation());
 
@@ -287,8 +285,6 @@ static ScheduleDAGInstrs *createPPCPostMachineScheduler(
                       std::make_unique<PPCPostRASchedStrategy>(C) :
                       std::make_unique<PostGenericScheduler>(C), true);
   // add DAG Mutations here.
-  if (ST.hasStoreFusion())
-    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   if (ST.hasFusion())
     DAG->addMutation(createPowerPCMacroFusionDAGMutation());
   return DAG;

diff  --git a/llvm/test/CodeGen/PowerPC/fusion-load-store.ll b/llvm/test/CodeGen/PowerPC/fusion-load-store.ll
deleted file mode 100644
index 75b2eca2168c0..0000000000000
--- a/llvm/test/CodeGen/PowerPC/fusion-load-store.ll
+++ /dev/null
@@ -1,268 +0,0 @@
-; Test if several consecutive loads/stores can be clustered(fused) by scheduler. The
-; scheduler will print "Cluster ld/st SU(x) - SU(y)" if SU(x) and SU(y) are fused.
-
-; REQUIRES: asserts
-; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 \
-; RUN:   -mattr=-paired-vector-memops,-pcrelative-memops -verify-misched \
-; RUN:   -debug-only=machine-scheduler 2>&1 | FileCheck %s
-
-define i64 @store_i64(i64* nocapture %P, i64 %v) {
-entry:
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_i64:%bb.0
-; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
-; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]])
-; CHECK: SU([[SU2]]): STD %[[REG:[0-9]+]]:g8rc, 24
-; CHECK: SU([[SU3]]): STD %[[REG]]:g8rc, 16
-; CHECK: SU([[SU4]]): STD %[[REG]]:g8rc, 8
-; CHECK: SU([[SU5]]): STD %[[REG]]:g8rc, 32
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_i64:%bb.0
-; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]])
-; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
-; CHECK: SU([[SU0]]): STD renamable $x[[REG:[0-9]+]], 16
-; CHECK: SU([[SU1]]): STD renamable $x[[REG]], 8
-; CHECK: SU([[SU2]]): STD renamable $x[[REG]], 24
-; CHECK: SU([[SU3]]): STD renamable $x[[REG]], 32
-  %arrayidx = getelementptr inbounds i64, i64* %P, i64 3
-  store i64 %v, i64* %arrayidx
-  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2
-  store i64 %v, i64* %arrayidx1
-  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1
-  store i64 %v, i64* %arrayidx2
-  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4
-  store i64 %v, i64* %arrayidx3
-  ret i64 %v
-}
-
-define i32 @store_i32(i32* nocapture %P, i32 %v) {
-entry:
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_i32:%bb.0
-; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
-; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]])
-; CHECK: SU([[SU2]]): STW %[[REG:[0-9]+]].sub_32:g8rc, 52
-; CHECK: SU([[SU3]]): STW %[[REG]].sub_32:g8rc, 48
-; CHECK: SU([[SU4]]): STW %[[REG]].sub_32:g8rc, 44
-; CHECK: SU([[SU5]]): STW %[[REG]].sub_32:g8rc, 56
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_i32:%bb.0
-; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]])
-; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
-; CHECK: SU([[SU0]]): STW renamable $r[[REG:[0-9]+]], 48
-; CHECK: SU([[SU1]]): STW renamable $r[[REG]], 44
-; CHECK: SU([[SU2]]): STW renamable $r[[REG]], 52
-; CHECK: SU([[SU3]]): STW renamable $r[[REG]], 56
-  %arrayidx = getelementptr inbounds i32, i32* %P, i32 13
-  store i32 %v, i32* %arrayidx
-  %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 12
-  store i32 %v, i32* %arrayidx1
-  %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 11
-  store i32 %v, i32* %arrayidx2
-  %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 14
-  store i32 %v, i32* %arrayidx3
-  ret i32 %v
-}
-
-define void @store_i64_neg(i64* nocapture %P, i64 %v) #0 {
-entry:
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_i64_neg:%bb.0
-; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]])
-; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
-; CHECK: SU([[SU2]]): STD %[[REG:[0-9]+]]:g8rc, -24
-; CHECK: SU([[SU3]]): STD %[[REG]]:g8rc, -8
-; CHECK: SU([[SU4]]): STD %[[REG]]:g8rc, -16
-; CHECK: SU([[SU5]]): STD %[[REG]]:g8rc, -32
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_i64_neg:%bb.0
-; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
-; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]])
-; CHECK: SU([[SU0]]): STD renamable $x[[REG:[0-9]+]], -8
-; CHECK: SU([[SU1]]): STD renamable $x[[REG]], -16
-; CHECK: SU([[SU2]]): STD renamable $x[[REG]], -24
-; CHECK: SU([[SU3]]): STD renamable $x[[REG]], -32
-  %arrayidx = getelementptr inbounds i64, i64* %P, i64 -3
-  store i64 %v, i64* %arrayidx
-  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 -1
-  store i64 %v, i64* %arrayidx1
-  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 -2
-  store i64 %v, i64* %arrayidx2
-  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 -4
-  store i64 %v, i64* %arrayidx3
-  ret void
-}
-
-define void @store_i32_neg(i32* nocapture %P, i32 %v) #0 {
-entry:
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_i32_neg:%bb.0
-; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]])
-; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
-; CHECK: SU([[SU2]]): STW %[[REG:[0-9]+]].sub_32:g8rc, -12
-; CHECK: SU([[SU3]]): STW %[[REG]].sub_32:g8rc, -4
-; CHECK: SU([[SU4]]): STW %[[REG]].sub_32:g8rc, -8
-; CHECK: SU([[SU5]]): STW %[[REG]].sub_32:g8rc, -16
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_i32_neg:%bb.0
-; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
-; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]])
-; CHECK:SU([[SU0]]): STW renamable $r[[REG:[0-9]+]], -4
-; CHECK:SU([[SU1]]): STW renamable $r[[REG]], -8
-; CHECK:SU([[SU2]]): STW renamable $r[[REG]], -12
-; CHECK:SU([[SU3]]): STW renamable $r[[REG]], -16
-  %arrayidx = getelementptr inbounds i32, i32* %P, i32 -3
-  store i32 %v, i32* %arrayidx
-  %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 -1
-  store i32 %v, i32* %arrayidx1
-  %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 -2
-  store i32 %v, i32* %arrayidx2
-  %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 -4
-  store i32 %v, i32* %arrayidx3
-  ret void
-}
-
-define void @store_double(double* nocapture %P, double %v)  {
-entry:
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_double:%bb.0
-; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
-; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]])
-; CHECK: SU([[SU2]]): DFSTOREf64 %[[REG:[0-9]+]]:vsfrc, 24
-; CHECK: SU([[SU3]]): DFSTOREf64 %[[REG]]:vsfrc, 8
-; CHECK: SU([[SU4]]): DFSTOREf64 %[[REG]]:vsfrc, 16
-; CHECK: SU([[SU5]]): DFSTOREf64 %[[REG]]:vsfrc, 32
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_double:%bb.0
-; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]])
-; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
-; CHECK: SU([[SU0]]): STFD renamable $f[[REG:[0-9]+]], 8
-; CHECK: SU([[SU1]]): STFD renamable $f[[REG]], 16
-; CHECK: SU([[SU2]]): STFD renamable $f[[REG]], 24
-; CHECK: SU([[SU3]]): STFD renamable $f[[REG]], 32
-  %arrayidx = getelementptr inbounds double, double* %P, i64 3
-  store double %v, double* %arrayidx
-  %arrayidx1 = getelementptr inbounds double, double* %P, i64 1
-  store double %v, double* %arrayidx1
-  %arrayidx2 = getelementptr inbounds double, double* %P, i64 2
-  store double %v, double* %arrayidx2
-  %arrayidx3 = getelementptr inbounds double, double* %P, i64 4
-  store double %v, double* %arrayidx3
-  ret void
-}
-
-define void @store_float(float* nocapture %P, float %v)  {
-entry:
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_float:%bb.0
-; CHECK-NOT: Cluster ld/st
-; CHECK-NOT: Cluster ld/st
-; CHECK: SU([[SU2]]): DFSTOREf32 %[[REG:[0-9]+]]:vssrc, 12
-; CHECK: SU([[SU3]]): DFSTOREf32 %[[REG]]:vssrc, 4
-; CHECK: SU([[SU4]]): DFSTOREf32 %[[REG]]:vssrc, 8
-; CHECK: SU([[SU5]]): DFSTOREf32 %[[REG]]:vssrc, 16
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_float:%bb.0
-; CHECK-NOT: Cluster ld/st
-; CHECK-NOT: Cluster ld/st
-; CHECK: SU([[SU0]]): STFS renamable $f[[REG:[0-9]+]], 12
-; CHECK: SU([[SU1]]): STFS renamable $f[[REG]], 4
-; CHECK: SU([[SU2]]): STFS renamable $f[[REG]], 8
-; CHECK: SU([[SU3]]): STFS renamable $f[[REG]], 16
-  %arrayidx = getelementptr inbounds float, float* %P, i64 3
-  store float %v, float* %arrayidx
-  %arrayidx1 = getelementptr inbounds float, float* %P, i64 1
-  store float %v, float* %arrayidx1
-  %arrayidx2 = getelementptr inbounds float, float* %P, i64 2
-  store float %v, float* %arrayidx2
-  %arrayidx3 = getelementptr inbounds float, float* %P, i64 4
-  store float %v, float* %arrayidx3
-  ret void
-}
-
-; Cannot fuse the store/load if there is volatile in between
-define i64 @store_volatile(i64* nocapture %P, i64 %v) {
-entry:
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_volatile:%bb.0
-; CHECK-NOT: Cluster ld/st
-; CHECK: SU([[SU2]]): STD %[[REG:[0-9]+]]:g8rc, 24
-; CHECK: SU([[SU3]]): STD %[[REG]]:g8rc, 16
-; CHECK: SU([[SU4]]): STD %[[REG]]:g8rc, 8
-; CHECK: SU([[SU5]]): STD %[[REG]]:g8rc, 32
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_volatile:%bb.0
-; CHECK-NOT: Cluster ld/st
-; CHECK: SU([[SU0]]): STD renamable $x[[REG:[0-9]+]], 24
-; CHECK: SU([[SU1]]): STD renamable $x[[REG]], 16
-; CHECK: SU([[SU2]]): STD renamable $x[[REG]], 8
-; CHECK: SU([[SU3]]): STD renamable $x[[REG]], 32
-  %arrayidx = getelementptr inbounds i64, i64* %P, i64 3
-  store volatile i64 %v, i64* %arrayidx
-  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2
-  store volatile i64 %v, i64* %arrayidx1
-  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1
-  store volatile i64 %v, i64* %arrayidx2
-  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4
-  store volatile i64 %v, i64* %arrayidx3
-  ret i64 %v
-}
-
- at p = common local_unnamed_addr global [100 x i32] zeroinitializer, align 4
-
-define void @store_i32_stw_stw8(i32 signext %m, i32 signext %n)  {
-entry:
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_i32_stw_stw8:%bb.0
-; CHECK: Cluster ld/st SU([[SU5:[0-9]+]]) - SU([[SU8:[0-9]+]])
-; CHECK: SU([[SU5]]): STW8 %{{[0-9]+}}:g8rc, 24
-; CHECK: SU([[SU8]]): STW %{{[0-9]+}}:gprc, 20
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_i32_stw_stw8:%bb.0
-; CHECK: Cluster ld/st SU([[SU5:[0-9]+]]) - SU([[SU6:[0-9]+]])
-; CHECK: SU([[SU5]]): STW8 renamable $x{{[0-9]+}}, 24
-; CHECK: SU([[SU6]]): STW renamable $r{{[0-9]+}}, 20
-  store i32 9, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 6), align 4
-  store i32 %n, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 7), align 4
-  %add = add nsw i32 %n, %m
-  store i32 %add, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 5), align 4
-  ret void
-}
-
-define void @store_i32_stw8(i32 signext %m, i32 signext %n)  {
-entry:
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_i32_stw8:%bb.0
-; CHECK: Cluster ld/st SU([[SU4:[0-9]+]]) - SU([[SU5:[0-9]+]])
-; CHECK: SU([[SU4]]): STW8 %{{[0-9]+}}:g8rc, 24
-; CHECK: SU([[SU5]]): STW8 %{{[0-9]+}}:g8rc, 28
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_i32_stw8:%bb.0
-; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
-; CHECK: SU([[SU3]]): STW8 renamable $x{{[0-9]+}}, 24
-; CHECK: SU([[SU4]]): STW8 renamable $x{{[0-9]+}}, 28
-  store i32 9, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 6), align 4
-  store i32 %n, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 7), align 4
-  ret void
-}
-
-declare void @bar(i64*)
-
-define void @store_frame_index(i32 %a, i32 %b) {
-entry:
-; CHECK: ********** MI Scheduling **********
-; CHECK-LABEL: store_frame_index:%bb.0
-; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
-; CHECK: SU([[SU2]]): STD %{{[0-9]+}}:g8rc, 0, %stack.0.buf
-; CHECK: SU([[SU3]]): STD %{{[0-9]+}}:g8rc, 8, %stack.0.buf
-  %buf = alloca [8 x i64], align 8
-  %0 = bitcast [8 x i64]* %buf to i8*
-  %conv = zext i32 %a to i64
-  %arrayidx = getelementptr inbounds [8 x i64], [8 x i64]* %buf, i64 0, i64 0
-  store i64 %conv, i64* %arrayidx, align 8
-  %conv1 = zext i32 %b to i64
-  %arrayidx2 = getelementptr inbounds [8 x i64], [8 x i64]* %buf, i64 0, i64 1
-  store i64 %conv1, i64* %arrayidx2, align 8
-  call void @bar(i64* nonnull %arrayidx)
-  ret void
-}

diff  --git a/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll b/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll
index 1623889200848..9141fdc735a0e 100644
--- a/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll
+++ b/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll
@@ -104,7 +104,6 @@ define dso_local signext i32 @X2IsCallerSaved(i32 signext %a, i32 signext %b, i3
 ; CHECK-P9-NOT:    .localentry
 ; CHECK-ALL:       # %bb.0: # %entry
 ; CHECK-S-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
-; CHECK-S-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-S-NEXT:    add r11, r4, r3
 ; CHECK-S-NEXT:    sub r29, r8, r9
 ; CHECK-S-NEXT:    add r9, r10, r9
@@ -120,6 +119,7 @@ define dso_local signext i32 @X2IsCallerSaved(i32 signext %a, i32 signext %b, i3
 ; CHECK-S-NEXT:    mullw r3, r3, r7
 ; CHECK-S-NEXT:    sub r2, r6, r7
 ; CHECK-S-NEXT:    mullw r3, r3, r8
+; CHECK-S-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-S-NEXT:    add r30, r8, r7
 ; CHECK-S-NEXT:    mullw r3, r3, r2
 ; CHECK-S-NEXT:    mullw r3, r3, r30


        


More information about the llvm-commits mailing list