[llvm] 9b5e2b5 - [PowerPC] Implement basic macro fusion in Power10

Qiu Chaofan via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 8 01:24:23 PST 2021


Author: Qiu Chaofan
Date: 2021-11-08T17:23:56+08:00
New Revision: 9b5e2b5261f6cb1e7913e452e5374e6f6ce68e44

URL: https://github.com/llvm/llvm-project/commit/9b5e2b5261f6cb1e7913e452e5374e6f6ce68e44
DIFF: https://github.com/llvm/llvm-project/commit/9b5e2b5261f6cb1e7913e452e5374e6f6ce68e44.diff

LOG: [PowerPC] Implement basic macro fusion in Power10

Including basic fusion types around arithmetic and logical instructions.

Reviewed By: jsji

Differential Revision: https://reviews.llvm.org/D111693

Added: 
    llvm/test/CodeGen/PowerPC/macro-fusion.mir

Modified: 
    llvm/lib/Target/PowerPC/PPC.td
    llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
    llvm/lib/Target/PowerPC/PPCMacroFusion.def
    llvm/lib/Target/PowerPC/PPCSubtarget.cpp
    llvm/lib/Target/PowerPC/PPCSubtarget.h
    llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
    llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll
    llvm/test/CodeGen/PowerPC/p10-spill-creq.ll
    llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index 8e983acb450b..a1ff20bb3612 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -187,6 +187,22 @@ def FeatureAddisLoadFusion : SubtargetFeature<"fuse-addis-load",
 def FeatureStoreFusion : SubtargetFeature<"fuse-store", "HasStoreFusion", "true",
                                           "Target supports store clustering",
                                           [FeatureFusion]>;
+def FeatureArithAddFusion :
+  SubtargetFeature<"fuse-arith-add", "HasArithAddFusion", "true",
+                   "Target supports Arithmetic Operations with Add fusion",
+                   [FeatureFusion]>;
+def FeatureAddLogicalFusion :
+  SubtargetFeature<"fuse-add-logical", "HasAddLogicalFusion", "true",
+                   "Target supports Add with Logical Operations fusion",
+                   [FeatureFusion]>;
+def FeatureLogicalAddFusion :
+  SubtargetFeature<"fuse-logical-add", "HasLogicalAddFusion", "true",
+                   "Target supports Logical with Add Operations fusion",
+                   [FeatureFusion]>;
+def FeatureLogicalFusion :
+  SubtargetFeature<"fuse-logical", "HasLogicalFusion", "true",
+                   "Target supports Logical Operations fusion",
+                   [FeatureFusion]>;
 def FeatureUnalignedFloats :
   SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess",
                    "true", "CPU does not trap on unaligned FP access">;
@@ -375,7 +391,10 @@ def ProcessorFeatures {
   // Power10
   // For P10 CPU we assume that all of the existing features from Power9
   // still exist with the exception of those we know are Power9 specific.
-  list<SubtargetFeature> FusionFeatures = [FeatureStoreFusion];
+  list<SubtargetFeature> FusionFeatures = [
+    FeatureStoreFusion, FeatureAddLogicalFusion, FeatureLogicalAddFusion,
+    FeatureLogicalFusion, FeatureArithAddFusion
+  ];
   list<SubtargetFeature> P10AdditionalFeatures =
     !listconcat(FusionFeatures, [
        DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs,

diff  --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
index d12c6d9cd406..bdff5109c1e1 100644
--- a/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
@@ -75,6 +75,19 @@ static bool matchingRegOps(const MachineInstr &FirstMI,
   return Op1.getReg() == Op2.getReg();
 }
 
+static bool matchingImmOps(const MachineInstr &MI,
+                           int MIOpIndex,
+                           int64_t Expect,
+                           unsigned ExtendFrom = 64) {
+  const MachineOperand &Op = MI.getOperand(MIOpIndex);
+  if (!Op.isImm())
+    return false;
+  int64_t Imm = Op.getImm();
+  if (ExtendFrom < 64)
+    Imm = SignExtend64(Imm, ExtendFrom);
+  return Imm == Expect;
+}
+
 // Return true if the FirstMI meets the constraints of SecondMI according to
 // fusion specification.
 static bool checkOpConstraints(FusionFeature::FusionKind Kd,
@@ -116,7 +129,7 @@ static bool checkOpConstraints(FusionFeature::FusionKind Kd,
     if (((Imm & 0xFFF0) != 0) && ((Imm & 0xFFF0) != 0xFFF0))
       return false;
 
-    // If si = 1111111111110000 and the msb of the d/ds field of the load equals 
+    // If si = 1111111111110000 and the msb of the d/ds field of the load equals
     // 1, then fusion does not occur.
     if ((Imm & 0xFFF0) == 0xFFF0) {
       const MachineOperand &D = SecondMI.getOperand(1);
@@ -132,6 +145,10 @@ static bool checkOpConstraints(FusionFeature::FusionKind Kd,
     }
     return true;
   }
+
+  case FusionFeature::FK_SldiAdd:
+    return (matchingImmOps(FirstMI, 2, 3) && matchingImmOps(FirstMI, 3, 60)) ||
+           (matchingImmOps(FirstMI, 2, 6) && matchingImmOps(FirstMI, 3, 57));
   }
 
   llvm_unreachable("All the cases should have been handled");

diff  --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.def b/llvm/lib/Target/PowerPC/PPCMacroFusion.def
index c7e4e7c22e0a..469a24800423 100644
--- a/llvm/lib/Target/PowerPC/PPCMacroFusion.def
+++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.def
@@ -41,5 +41,42 @@ FUSION_FEATURE(AddisLoad, hasAddisLoadFusion, 2, \
                FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8), \
                FUSION_OP_SET(LD, LBZ, LBZ8, LHZ, LHZ8, LWZ, LWZ8))
 
+// Power10 User Manual Section 19.1.5.4, Fusion
+// {add, mulld} - add
+FUSION_FEATURE(ArithAdd, hasArithAddFusion, -1,
+               FUSION_OP_SET(ADD4, ADD8, MULLD), FUSION_OP_SET(ADD4, ADD8))
+
+// {add, subf} - {and, nand, nor, or}
+FUSION_FEATURE(ArithLogical, hasAddLogicalFusion, -1,
+               FUSION_OP_SET(ADD4, ADD8, SUBF, SUBF8),
+               FUSION_OP_SET(AND, AND8, OR, OR8, NAND, NAND8, NOR, NOR8))
+
+// {and, andc, eqv, nand, nor, or, orc, xor} - {add, subf}
+FUSION_FEATURE(LogicalArith, hasLogicalAddFusion, -1,
+               FUSION_OP_SET(AND, ANDC, EQV, NAND, NOR, OR, ORC, XOR, AND8,
+                             ANDC8, EQV8, NAND8, NOR8, OR8, ORC8, XOR8),
+               FUSION_OP_SET(ADD4, ADD8, SUBF, SUBF8))
+
+// Either of {and, andc, eqv, nand, nor, or, orc, xor}
+FUSION_FEATURE(Logical, hasLogicalFusion, -1,
+               FUSION_OP_SET(AND, ANDC, EQV, NAND, NOR, OR, ORC, XOR, AND8,
+                             ANDC8, EQV8, NAND8, NOR8, OR8, ORC8, XOR8),
+               FUSION_OP_SET(AND, ANDC, EQV, NAND, NOR, OR, ORC, XOR, AND8,
+                             ANDC8, EQV8, NAND8, NOR8, OR8, ORC8, XOR8))
+
+// vaddudm - vaddudm
+FUSION_FEATURE(VecAdd, hasArithAddFusion, -1, FUSION_OP_SET(VADDUDM),
+               FUSION_OP_SET(VADDUDM))
+
+// Either of {vand, vandc, veqv, vnand, vnor, vor, vorc, vxor}
+FUSION_FEATURE(VecLogical, hasLogicalFusion, -1,
+               FUSION_OP_SET(VAND, VANDC, VEQV, VNAND, VNOR, VOR, VORC, VXOR),
+               FUSION_OP_SET(VAND, VANDC, VEQV, VNAND, VNOR, VOR, VORC, VXOR))
+
+// sldi rx, ra, {3, 6} - {add, subf}
+// sldi rx, ra, n is an alias of rldicr rx, ra, n, 63-n
+FUSION_FEATURE(SldiAdd, hasArithAddFusion, -1, FUSION_OP_SET(RLDICR, RLDICR_32),
+               FUSION_OP_SET(ADD4, ADD8, SUBF, SUBF8))
+
 #undef FUSION_FEATURE
 #undef FUSION_OP_SET

diff  --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index b9684f61795a..dfc29dbb10f1 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -127,6 +127,10 @@ void PPCSubtarget::initializeEnvironment() {
   HasStoreFusion = false;
   HasAddiLoadFusion = false;
   HasAddisLoadFusion = false;
+  HasArithAddFusion = false;
+  HasAddLogicalFusion = false;
+  HasLogicalAddFusion = false;
+  HasLogicalFusion = false;
   IsISA2_06 = false;
   IsISA2_07 = false;
   IsISA3_0 = false;

diff  --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index e2502cae7681..783ea121ccb8 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -147,6 +147,10 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
   bool HasStoreFusion;
   bool HasAddiLoadFusion;
   bool HasAddisLoadFusion;
+  bool HasArithAddFusion;
+  bool HasAddLogicalFusion;
+  bool HasLogicalAddFusion;
+  bool HasLogicalFusion;
   bool IsISA2_06;
   bool IsISA2_07;
   bool IsISA3_0;
@@ -332,6 +336,10 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
   bool hasStoreFusion() const { return HasStoreFusion; }
   bool hasAddiLoadFusion() const { return HasAddiLoadFusion; }
   bool hasAddisLoadFusion() const { return HasAddisLoadFusion; }
+  bool hasArithAddFusion() const { return HasArithAddFusion; }
+  bool hasAddLogicalFusion() const { return HasAddLogicalFusion; }
+  bool hasLogicalAddFusion() const { return HasLogicalAddFusion; }
+  bool hasLogicalFusion() const { return HasLogicalFusion; }
   bool needsSwapsForVSXMemOps() const {
     return hasVSX() && isLittleEndian() && !hasP9Vector();
   }

diff  --git a/llvm/test/CodeGen/PowerPC/macro-fusion.mir b/llvm/test/CodeGen/PowerPC/macro-fusion.mir
new file mode 100644
index 000000000000..16391a2ab8fa
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/macro-fusion.mir
@@ -0,0 +1,95 @@
+# REQUIRES: asserts
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 -x=mir < %s \
+# RUN:   -debug-only=machine-scheduler -start-before=postmisched 2>&1 \
+# RUN:   | FileCheck %s
+
+# CHECK: add_mulld:%bb.0
+# CHECK: Macro fuse: SU(0) - SU(1) /  MULLD - ADD8
+---
+name: add_mulld
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x3, $x4, $x5
+    renamable $x4 = MULLD $x3, $x4
+    renamable $x3 = ADD8 killed renamable $x4, $x5
+    BLR8 implicit $lr8, implicit $rm, implicit $x3
+...
+
+# CHECK: add_and:%bb.0
+# CHECK: Macro fuse: SU(0) - SU(1) /  ADD8 - AND8
+---
+name: add_and
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x3, $x4, $x5
+    renamable $x4 = ADD8 $x3, $x4
+    renamable $x3 = AND8 killed renamable $x4, $x5
+    BLR8 implicit $lr8, implicit $rm, implicit $x3
+...
+
+# CHECK: xor_subf:%bb.0
+# CHECK: Macro fuse: SU(0) - SU(1) /  XOR8 - SUBF8
+---
+name: xor_subf
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x3, $x4, $x5
+    renamable $x4 = XOR8 $x3, $x4
+    renamable $x3 = SUBF8 killed renamable $x4, $x5
+    BLR8 implicit $lr8, implicit $rm, implicit $x3
+...
+
+# CHECK: or_nand:%bb.0
+# CHECK: Macro fuse: SU(0) - SU(1) /  OR8 - NAND8
+---
+name: or_nand
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x3, $x4, $x5
+    renamable $x4 = OR8 $x3, $x4
+    renamable $x3 = NAND8 killed renamable $x4, $x5
+    BLR8 implicit $lr8, implicit $rm, implicit $x3
+...
+
+# CHECK: vand_vand:%bb.0
+# CHECK: Macro fuse: SU(0) - SU(1) /  VAND - VAND
+---
+name: vand_vand
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $v2, $v3, $v4
+    renamable $v2 = VAND $v3, $v2
+    renamable $v2 = VAND killed renamable $v2, $v4
+    BLR8 implicit $lr8, implicit $rm
+...
+
+# CHECK: vadd_vadd:%bb.0
+# CHECK: Macro fuse: SU(0) - SU(1) /  VADDUDM - VADDUDM
+---
+name: vadd_vadd
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $v2, $v3, $v4
+    renamable $v2 = VADDUDM $v3, $v2
+    renamable $v2 = VADDUDM killed renamable $v2, $v4
+    BLR8 implicit $lr8, implicit $rm
+...
+
+# CHECK: sldi_add:%bb.0
+# CHECK: Macro fuse: SU(0) - SU(1) /  RLDICR - ADD8
+---
+name: sldi_add
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x3, $x4, $x5
+    renamable $x4 = RLDICR $x3, 3, 60
+    renamable $x3 = ADD8 killed renamable $x4, $x5
+    BLR8 implicit $lr8, implicit $rm, implicit $x3
+...

diff  --git a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
index ee97843beac2..5c8c84be0702 100644
--- a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
@@ -130,37 +130,37 @@ entry:
 define dso_local void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) {
 ; LE-PAIRED-LABEL: testXLdSt:
 ; LE-PAIRED:       # %bb.0: # %entry
-; LE-PAIRED-NEXT:    sldi r3, r3, 6
 ; LE-PAIRED-NEXT:    paddi r5, 0, f@PCREL, 1
+; LE-PAIRED-NEXT:    sldi r3, r3, 6
 ; LE-PAIRED-NEXT:    add r6, r5, r3
 ; LE-PAIRED-NEXT:    lxv vs1, 32(r6)
 ; LE-PAIRED-NEXT:    lxv vs0, 48(r6)
 ; LE-PAIRED-NEXT:    lxvx vs3, r5, r3
 ; LE-PAIRED-NEXT:    lxv vs2, 16(r6)
 ; LE-PAIRED-NEXT:    sldi r3, r4, 6
+; LE-PAIRED-NEXT:    add r4, r5, r3
 ; LE-PAIRED-NEXT:    stxvx vs3, r5, r3
-; LE-PAIRED-NEXT:    add r3, r5, r3
-; LE-PAIRED-NEXT:    stxv vs0, 48(r3)
-; LE-PAIRED-NEXT:    stxv vs1, 32(r3)
-; LE-PAIRED-NEXT:    stxv vs2, 16(r3)
+; LE-PAIRED-NEXT:    stxv vs0, 48(r4)
+; LE-PAIRED-NEXT:    stxv vs1, 32(r4)
+; LE-PAIRED-NEXT:    stxv vs2, 16(r4)
 ; LE-PAIRED-NEXT:    blr
 ;
 ; BE-PAIRED-LABEL: testXLdSt:
 ; BE-PAIRED:       # %bb.0: # %entry
 ; BE-PAIRED-NEXT:    addis r5, r2, f@toc@ha
-; BE-PAIRED-NEXT:    sldi r3, r3, 6
 ; BE-PAIRED-NEXT:    addi r5, r5, f@toc@l
+; BE-PAIRED-NEXT:    sldi r3, r3, 6
 ; BE-PAIRED-NEXT:    add r6, r5, r3
 ; BE-PAIRED-NEXT:    lxvx vs0, r5, r3
 ; BE-PAIRED-NEXT:    sldi r3, r4, 6
+; BE-PAIRED-NEXT:    add r4, r5, r3
 ; BE-PAIRED-NEXT:    lxv vs1, 16(r6)
 ; BE-PAIRED-NEXT:    lxv vs3, 48(r6)
 ; BE-PAIRED-NEXT:    lxv vs2, 32(r6)
 ; BE-PAIRED-NEXT:    stxvx vs0, r5, r3
-; BE-PAIRED-NEXT:    add r3, r5, r3
-; BE-PAIRED-NEXT:    stxv vs1, 16(r3)
-; BE-PAIRED-NEXT:    stxv vs3, 48(r3)
-; BE-PAIRED-NEXT:    stxv vs2, 32(r3)
+; BE-PAIRED-NEXT:    stxv vs1, 16(r4)
+; BE-PAIRED-NEXT:    stxv vs3, 48(r4)
+; BE-PAIRED-NEXT:    stxv vs2, 32(r4)
 ; BE-PAIRED-NEXT:    blr
 ;
 ; LE-PWR9-LABEL: testXLdSt:

diff  --git a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll
index 0d9662dc1242..d5ef4e64d0a8 100644
--- a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll
+++ b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll
@@ -10,8 +10,8 @@ target triple = "powerpc64le-unknown-linux-gnu"
 define void @foo(i32* %.m, i32* %.n, [0 x %_elem_type_of_a]* %.a, [0 x %_elem_type_of_x]* %.x, i32* %.l, <2 x double>* %.vy01, <2 x double>* %.vy02, <2 x double>* %.vy03, <2 x double>* %.vy04, <2 x double>* %.vy05, <2 x double>* %.vy06, <2 x double>* %.vy07, <2 x double>* %.vy08, <2 x double>* %.vy09, <2 x double>* %.vy0a, <2 x double>* %.vy0b, <2 x double>* %.vy0c, <2 x double>* %.vy21, <2 x double>* %.vy22, <2 x double>* %.vy23, <2 x double>* %.vy24, <2 x double>* %.vy25, <2 x double>* %.vy26, <2 x double>* %.vy27, <2 x double>* %.vy28, <2 x double>* %.vy29, <2 x double>* %.vy2a, <2 x double>* %.vy2b, <2 x double>* %.vy2c) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    stdu 1, -592(1)
-; CHECK-NEXT:    .cfi_def_cfa_offset 592
+; CHECK-NEXT:    stdu 1, -608(1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 608
 ; CHECK-NEXT:    .cfi_offset r14, -192
 ; CHECK-NEXT:    .cfi_offset r15, -184
 ; CHECK-NEXT:    .cfi_offset r16, -176
@@ -48,193 +48,194 @@ define void @foo(i32* %.m, i32* %.n, [0 x %_elem_type_of_a]* %.a, [0 x %_elem_ty
 ; CHECK-NEXT:    .cfi_offset v29, -240
 ; CHECK-NEXT:    .cfi_offset v30, -224
 ; CHECK-NEXT:    .cfi_offset v31, -208
-; CHECK-NEXT:    lwz 4, 0(4)
-; CHECK-NEXT:    std 14, 400(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 15, 408(1) # 8-byte Folded Spill
-; CHECK-NEXT:    cmpwi 4, 1
-; CHECK-NEXT:    std 16, 416(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 17, 424(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 18, 432(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 19, 440(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 20, 448(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 21, 456(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 22, 464(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 23, 472(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 24, 480(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 25, 488(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 26, 496(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 27, 504(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 28, 512(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 29, 520(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 30, 528(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 31, 536(1) # 8-byte Folded Spill
-; CHECK-NEXT:    stfd 26, 544(1) # 8-byte Folded Spill
-; CHECK-NEXT:    stfd 27, 552(1) # 8-byte Folded Spill
-; CHECK-NEXT:    stfd 28, 560(1) # 8-byte Folded Spill
-; CHECK-NEXT:    stfd 29, 568(1) # 8-byte Folded Spill
-; CHECK-NEXT:    stfd 30, 576(1) # 8-byte Folded Spill
-; CHECK-NEXT:    stfd 31, 584(1) # 8-byte Folded Spill
-; CHECK-NEXT:    stxv 52, 208(1) # 16-byte Folded Spill
-; CHECK-NEXT:    stxv 53, 224(1) # 16-byte Folded Spill
-; CHECK-NEXT:    stxv 54, 240(1) # 16-byte Folded Spill
-; CHECK-NEXT:    stxv 55, 256(1) # 16-byte Folded Spill
-; CHECK-NEXT:    stxv 56, 272(1) # 16-byte Folded Spill
-; CHECK-NEXT:    stxv 57, 288(1) # 16-byte Folded Spill
-; CHECK-NEXT:    stxv 58, 304(1) # 16-byte Folded Spill
-; CHECK-NEXT:    stxv 59, 320(1) # 16-byte Folded Spill
-; CHECK-NEXT:    stxv 60, 336(1) # 16-byte Folded Spill
-; CHECK-NEXT:    stxv 61, 352(1) # 16-byte Folded Spill
-; CHECK-NEXT:    stxv 62, 368(1) # 16-byte Folded Spill
-; CHECK-NEXT:    stxv 63, 384(1) # 16-byte Folded Spill
+; CHECK-NEXT:    lwz 0, 0(4)
+; CHECK-NEXT:    std 14, 416(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 15, 424(1) # 8-byte Folded Spill
+; CHECK-NEXT:    cmpwi 0, 1
+; CHECK-NEXT:    std 16, 432(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 17, 440(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 18, 448(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 19, 456(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 20, 464(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 21, 472(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 22, 480(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 23, 488(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 24, 496(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 25, 504(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 26, 512(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 27, 520(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 28, 528(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 29, 536(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 30, 544(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 31, 552(1) # 8-byte Folded Spill
+; CHECK-NEXT:    stfd 26, 560(1) # 8-byte Folded Spill
+; CHECK-NEXT:    stfd 27, 568(1) # 8-byte Folded Spill
+; CHECK-NEXT:    stfd 28, 576(1) # 8-byte Folded Spill
+; CHECK-NEXT:    stfd 29, 584(1) # 8-byte Folded Spill
+; CHECK-NEXT:    stfd 30, 592(1) # 8-byte Folded Spill
+; CHECK-NEXT:    stfd 31, 600(1) # 8-byte Folded Spill
+; CHECK-NEXT:    stxv 52, 224(1) # 16-byte Folded Spill
+; CHECK-NEXT:    stxv 53, 240(1) # 16-byte Folded Spill
+; CHECK-NEXT:    stxv 54, 256(1) # 16-byte Folded Spill
+; CHECK-NEXT:    stxv 55, 272(1) # 16-byte Folded Spill
+; CHECK-NEXT:    stxv 56, 288(1) # 16-byte Folded Spill
+; CHECK-NEXT:    stxv 57, 304(1) # 16-byte Folded Spill
+; CHECK-NEXT:    stxv 58, 320(1) # 16-byte Folded Spill
+; CHECK-NEXT:    stxv 59, 336(1) # 16-byte Folded Spill
+; CHECK-NEXT:    stxv 60, 352(1) # 16-byte Folded Spill
+; CHECK-NEXT:    stxv 61, 368(1) # 16-byte Folded Spill
+; CHECK-NEXT:    stxv 62, 384(1) # 16-byte Folded Spill
+; CHECK-NEXT:    stxv 63, 400(1) # 16-byte Folded Spill
 ; CHECK-NEXT:    blt 0, .LBB0_7
 ; CHECK-NEXT:  # %bb.1: # %_loop_1_do_.lr.ph
-; CHECK-NEXT:    mr 22, 5
-; CHECK-NEXT:    lwz 5, 0(3)
-; CHECK-NEXT:    cmpwi 5, 1
+; CHECK-NEXT:    lwz 3, 0(3)
+; CHECK-NEXT:    cmpwi 3, 1
 ; CHECK-NEXT:    blt 0, .LBB0_7
 ; CHECK-NEXT:  # %bb.2: # %_loop_1_do_.preheader
-; CHECK-NEXT:    mr 14, 6
-; CHECK-NEXT:    ld 6, 712(1)
-; CHECK-NEXT:    lwa 3, 0(7)
-; CHECK-NEXT:    addi 5, 5, 1
+; CHECK-NEXT:    mr 23, 5
+; CHECK-NEXT:    ld 5, 704(1)
+; CHECK-NEXT:    addi 3, 3, 1
+; CHECK-NEXT:    ld 4, 728(1)
+; CHECK-NEXT:    mr 11, 10
+; CHECK-NEXT:    mr 10, 6
 ; CHECK-NEXT:    std 8, 40(1) # 8-byte Folded Spill
 ; CHECK-NEXT:    std 9, 48(1) # 8-byte Folded Spill
-; CHECK-NEXT:    mr 11, 10
-; CHECK-NEXT:    cmpldi 5, 9
+; CHECK-NEXT:    lwa 7, 0(7)
+; CHECK-NEXT:    ld 29, 840(1)
+; CHECK-NEXT:    cmpldi 3, 9
+; CHECK-NEXT:    ld 27, 832(1)
+; CHECK-NEXT:    ld 28, 856(1)
+; CHECK-NEXT:    std 5, 112(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 4, 120(1) # 8-byte Folded Spill
+; CHECK-NEXT:    lxv 1, 0(5)
+; CHECK-NEXT:    li 5, 9
+; CHECK-NEXT:    ld 30, 848(1)
+; CHECK-NEXT:    lxv 0, 0(4)
+; CHECK-NEXT:    sldi 4, 7, 3
+; CHECK-NEXT:    add 4, 4, 23
+; CHECK-NEXT:    sldi 16, 7, 2
+; CHECK-NEXT:    sldi 15, 7, 1
+; CHECK-NEXT:    ld 17, 760(1)
+; CHECK-NEXT:    std 27, 192(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 29, 200(1) # 8-byte Folded Spill
+; CHECK-NEXT:    lxv 6, 0(29)
+; CHECK-NEXT:    ld 26, 824(1)
+; CHECK-NEXT:    ld 25, 816(1)
+; CHECK-NEXT:    ld 24, 808(1)
+; CHECK-NEXT:    std 30, 208(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 28, 216(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 25, 176(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 26, 184(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 24, 168(1) # 8-byte Folded Spill
+; CHECK-NEXT:    iselgt 3, 3, 5
+; CHECK-NEXT:    ld 5, 752(1)
+; CHECK-NEXT:    addi 14, 4, 32
+; CHECK-NEXT:    sldi 4, 7, 4
+; CHECK-NEXT:    add 29, 7, 15
+; CHECK-NEXT:    ld 22, 800(1)
+; CHECK-NEXT:    ld 21, 792(1)
+; CHECK-NEXT:    ld 20, 784(1)
+; CHECK-NEXT:    std 22, 160(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 20, 144(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 21, 152(1) # 8-byte Folded Spill
+; CHECK-NEXT:    addi 6, 3, -2
+; CHECK-NEXT:    add 3, 7, 16
+; CHECK-NEXT:    add 4, 4, 23
+; CHECK-NEXT:    ld 19, 776(1)
+; CHECK-NEXT:    ld 18, 768(1)
 ; CHECK-NEXT:    lxv 4, 0(8)
-; CHECK-NEXT:    ld 8, 696(1)
-; CHECK-NEXT:    ld 10, 736(1)
-; CHECK-NEXT:    ld 28, 824(1)
-; CHECK-NEXT:    std 6, 88(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 10, 96(1) # 8-byte Folded Spill
-; CHECK-NEXT:    lxv 0, 0(6)
-; CHECK-NEXT:    li 6, 9
-; CHECK-NEXT:    ld 7, 688(1)
-; CHECK-NEXT:    ld 27, 840(1)
-; CHECK-NEXT:    ld 29, 832(1)
-; CHECK-NEXT:    ld 26, 816(1)
-; CHECK-NEXT:    ld 25, 808(1)
-; CHECK-NEXT:    ld 24, 800(1)
-; CHECK-NEXT:    ld 23, 792(1)
-; CHECK-NEXT:    std 8, 32(1) # 8-byte Folded Spill
-; CHECK-NEXT:    sldi 0, 3, 1
-; CHECK-NEXT:    sldi 31, 3, 2
-; CHECK-NEXT:    std 28, 184(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 29, 192(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 25, 168(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 26, 176(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 23, 152(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 24, 160(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 27, 200(1) # 8-byte Folded Spill
-; CHECK-NEXT:    iselgt 5, 5, 6
+; CHECK-NEXT:    lxv 2, 0(11)
+; CHECK-NEXT:    std 18, 128(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 19, 136(1) # 8-byte Folded Spill
+; CHECK-NEXT:    addi 12, 4, 32
+; CHECK-NEXT:    rldicl 2, 6, 61, 3
 ; CHECK-NEXT:    sldi 6, 3, 3
-; CHECK-NEXT:    ld 21, 784(1)
-; CHECK-NEXT:    ld 20, 776(1)
-; CHECK-NEXT:    ld 19, 768(1)
-; CHECK-NEXT:    ld 18, 760(1)
-; CHECK-NEXT:    std 18, 120(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 19, 128(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 20, 136(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 21, 144(1) # 8-byte Folded Spill
-; CHECK-NEXT:    add 2, 6, 22
-; CHECK-NEXT:    ld 17, 752(1)
-; CHECK-NEXT:    ld 16, 744(1)
+; CHECK-NEXT:    ld 4, 736(1)
+; CHECK-NEXT:    ld 31, 720(1)
+; CHECK-NEXT:    std 11, 56(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 31, 64(1) # 8-byte Folded Spill
+; CHECK-NEXT:    add 11, 23, 6
+; CHECK-NEXT:    ld 6, 744(1)
+; CHECK-NEXT:    ld 8, 712(1)
+; CHECK-NEXT:    std 5, 96(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 17, 104(1) # 8-byte Folded Spill
+; CHECK-NEXT:    lxv 39, 0(5)
+; CHECK-NEXT:    sldi 5, 7, 5
+; CHECK-NEXT:    lxv 5, 0(30)
+; CHECK-NEXT:    lxv 7, 0(28)
 ; CHECK-NEXT:    lxv 3, 0(9)
-; CHECK-NEXT:    ld 6, 728(1)
-; CHECK-NEXT:    addi 5, 5, -2
-; CHECK-NEXT:    std 7, 80(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 6, 72(1) # 8-byte Folded Spill
-; CHECK-NEXT:    ld 15, 720(1)
-; CHECK-NEXT:    ld 9, 704(1)
+; CHECK-NEXT:    addi 2, 2, 1
+; CHECK-NEXT:    add 30, 23, 5
+; CHECK-NEXT:    sldi 5, 29, 3
+; CHECK-NEXT:    add 28, 23, 5
+; CHECK-NEXT:    ld 5, 864(1)
 ; CHECK-NEXT:    lxv 43, 0(8)
-; CHECK-NEXT:    ld 8, 848(1)
-; CHECK-NEXT:    std 11, 56(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 15, 64(1) # 8-byte Folded Spill
-; CHECK-NEXT:    lxv 2, 0(11)
-; CHECK-NEXT:    sldi 11, 3, 4
-; CHECK-NEXT:    rldicl 5, 5, 61, 3
-; CHECK-NEXT:    lxv 1, 0(7)
-; CHECK-NEXT:    add 7, 3, 31
-; CHECK-NEXT:    add 12, 11, 22
-; CHECK-NEXT:    addi 11, 2, 32
-; CHECK-NEXT:    addi 2, 5, 1
-; CHECK-NEXT:    lxv 6, 0(28)
-; CHECK-NEXT:    sldi 5, 3, 5
-; CHECK-NEXT:    add 28, 3, 0
-; CHECK-NEXT:    lxv 42, 0(9)
-; CHECK-NEXT:    lxv 41, 0(15)
+; CHECK-NEXT:    lxv 42, 0(31)
+; CHECK-NEXT:    lxv 38, 0(17)
+; CHECK-NEXT:    std 4, 72(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 6, 80(1) # 8-byte Folded Spill
+; CHECK-NEXT:    lxv 41, 0(4)
 ; CHECK-NEXT:    lxv 40, 0(6)
-; CHECK-NEXT:    lxv 39, 0(10)
-; CHECK-NEXT:    lxv 38, 0(16)
-; CHECK-NEXT:    sldi 30, 7, 3
-; CHECK-NEXT:    addi 12, 12, 32
-; CHECK-NEXT:    add 30, 22, 30
-; CHECK-NEXT:    std 16, 104(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 17, 112(1) # 8-byte Folded Spill
-; CHECK-NEXT:    lxv 33, 0(17)
-; CHECK-NEXT:    lxv 32, 0(18)
-; CHECK-NEXT:    lxv 37, 0(19)
-; CHECK-NEXT:    lxv 36, 0(20)
-; CHECK-NEXT:    lxv 13, 0(21)
-; CHECK-NEXT:    lxv 12, 0(23)
-; CHECK-NEXT:    li 23, 0
-; CHECK-NEXT:    lxv 11, 0(24)
-; CHECK-NEXT:    li 24, 1
-; CHECK-NEXT:    lxv 9, 0(25)
-; CHECK-NEXT:    mulli 25, 3, 6
-; CHECK-NEXT:    lxv 8, 0(26)
-; CHECK-NEXT:    mulli 26, 3, 48
-; CHECK-NEXT:    lxv 5, 0(29)
-; CHECK-NEXT:    add 29, 22, 5
-; CHECK-NEXT:    sldi 5, 28, 3
-; CHECK-NEXT:    lxv 7, 0(27)
-; CHECK-NEXT:    add 27, 22, 5
-; CHECK-NEXT:    mr 5, 22
-; CHECK-NEXT:    lxv 10, 0(8)
+; CHECK-NEXT:    lxv 33, 0(18)
+; CHECK-NEXT:    lxv 32, 0(19)
+; CHECK-NEXT:    std 5, 88(1) # 8-byte Folded Spill
+; CHECK-NEXT:    lxv 37, 0(20)
+; CHECK-NEXT:    lxv 36, 0(21)
+; CHECK-NEXT:    lxv 13, 0(22)
+; CHECK-NEXT:    lxv 12, 0(24)
+; CHECK-NEXT:    lxv 11, 0(25)
+; CHECK-NEXT:    lxv 9, 0(26)
+; CHECK-NEXT:    lxv 8, 0(27)
+; CHECK-NEXT:    lxv 10, 0(5)
+; CHECK-NEXT:    mulli 27, 7, 48
+; CHECK-NEXT:    mulli 26, 7, 6
+; CHECK-NEXT:    li 25, 1
+; CHECK-NEXT:    li 24, 0
+; CHECK-NEXT:    mr 5, 23
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_3: # %_loop_2_do_.lr.ph
 ; CHECK-NEXT:    # =>This Loop Header: Depth=1
 ; CHECK-NEXT:    # Child Loop BB0_4 Depth 2
-; CHECK-NEXT:    maddld 6, 25, 23, 7
+; CHECK-NEXT:    maddld 6, 26, 24, 3
 ; CHECK-NEXT:    mtctr 2
 ; CHECK-NEXT:    sldi 6, 6, 3
-; CHECK-NEXT:    add 21, 22, 6
-; CHECK-NEXT:    maddld 6, 25, 23, 31
+; CHECK-NEXT:    add 22, 23, 6
+; CHECK-NEXT:    maddld 6, 26, 24, 16
 ; CHECK-NEXT:    sldi 6, 6, 3
-; CHECK-NEXT:    add 20, 22, 6
-; CHECK-NEXT:    maddld 6, 25, 23, 28
+; CHECK-NEXT:    add 21, 23, 6
+; CHECK-NEXT:    maddld 6, 26, 24, 29
 ; CHECK-NEXT:    sldi 6, 6, 3
-; CHECK-NEXT:    add 19, 22, 6
-; CHECK-NEXT:    maddld 6, 25, 23, 0
+; CHECK-NEXT:    add 20, 23, 6
+; CHECK-NEXT:    maddld 6, 26, 24, 15
 ; CHECK-NEXT:    sldi 6, 6, 3
-; CHECK-NEXT:    add 18, 22, 6
-; CHECK-NEXT:    maddld 6, 25, 23, 3
+; CHECK-NEXT:    add 19, 23, 6
+; CHECK-NEXT:    maddld 6, 26, 24, 7
 ; CHECK-NEXT:    sldi 6, 6, 3
-; CHECK-NEXT:    add 17, 22, 6
-; CHECK-NEXT:    mulld 6, 25, 23
+; CHECK-NEXT:    add 18, 23, 6
+; CHECK-NEXT:    mulld 6, 26, 24
 ; CHECK-NEXT:    sldi 6, 6, 3
-; CHECK-NEXT:    add 16, 22, 6
-; CHECK-NEXT:    mr 6, 14
+; CHECK-NEXT:    add 17, 23, 6
+; CHECK-NEXT:    mr 6, 10
 ; CHECK-NEXT:    .p2align 5
 ; CHECK-NEXT:  .LBB0_4: # %_loop_2_do_
 ; CHECK-NEXT:    # Parent Loop BB0_3 Depth=1
 ; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    lxvp 34, 0(6)
-; CHECK-NEXT:    lxvp 44, 0(16)
+; CHECK-NEXT:    lxvp 44, 0(17)
 ; CHECK-NEXT:    xvmaddadp 4, 45, 35
-; CHECK-NEXT:    lxvp 46, 0(17)
+; CHECK-NEXT:    lxvp 46, 0(18)
 ; CHECK-NEXT:    xvmaddadp 3, 47, 35
-; CHECK-NEXT:    lxvp 48, 0(18)
-; CHECK-NEXT:    lxvp 50, 0(19)
-; CHECK-NEXT:    lxvp 62, 0(20)
-; CHECK-NEXT:    lxvp 60, 0(21)
+; CHECK-NEXT:    lxvp 48, 0(19)
+; CHECK-NEXT:    lxvp 50, 0(20)
+; CHECK-NEXT:    lxvp 62, 0(21)
+; CHECK-NEXT:    lxvp 60, 0(22)
 ; CHECK-NEXT:    lxvp 58, 32(6)
-; CHECK-NEXT:    lxvp 56, 32(16)
-; CHECK-NEXT:    lxvp 54, 32(17)
-; CHECK-NEXT:    lxvp 52, 32(18)
-; CHECK-NEXT:    lxvp 30, 32(19)
-; CHECK-NEXT:    lxvp 28, 32(20)
-; CHECK-NEXT:    lxvp 26, 32(21)
+; CHECK-NEXT:    lxvp 56, 32(17)
+; CHECK-NEXT:    lxvp 54, 32(18)
+; CHECK-NEXT:    lxvp 52, 32(19)
+; CHECK-NEXT:    lxvp 30, 32(20)
+; CHECK-NEXT:    lxvp 28, 32(21)
+; CHECK-NEXT:    lxvp 26, 32(22)
 ; CHECK-NEXT:    xvmaddadp 2, 49, 35
 ; CHECK-NEXT:    xvmaddadp 1, 51, 35
 ; CHECK-NEXT:    xvmaddadp 43, 63, 35
@@ -258,24 +259,24 @@ define void @foo(i32* %.m, i32* %.n, [0 x %_elem_type_of_a]* %.a, [0 x %_elem_ty
 ; CHECK-NEXT:    xvmaddadp 7, 28, 58
 ; CHECK-NEXT:    xvmaddadp 10, 26, 58
 ; CHECK-NEXT:    addi 6, 6, 64
-; CHECK-NEXT:    addi 16, 16, 64
 ; CHECK-NEXT:    addi 17, 17, 64
 ; CHECK-NEXT:    addi 18, 18, 64
 ; CHECK-NEXT:    addi 19, 19, 64
 ; CHECK-NEXT:    addi 20, 20, 64
 ; CHECK-NEXT:    addi 21, 21, 64
+; CHECK-NEXT:    addi 22, 22, 64
 ; CHECK-NEXT:    bdnz .LBB0_4
 ; CHECK-NEXT:  # %bb.5: # %_loop_2_endl_
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    addi 24, 24, 6
-; CHECK-NEXT:    add 5, 5, 26
-; CHECK-NEXT:    add 11, 11, 26
-; CHECK-NEXT:    add 30, 30, 26
-; CHECK-NEXT:    add 12, 12, 26
-; CHECK-NEXT:    add 29, 29, 26
-; CHECK-NEXT:    add 27, 27, 26
-; CHECK-NEXT:    addi 23, 23, 1
-; CHECK-NEXT:    cmpld 24, 4
+; CHECK-NEXT:    addi 25, 25, 6
+; CHECK-NEXT:    add 5, 5, 27
+; CHECK-NEXT:    add 14, 14, 27
+; CHECK-NEXT:    add 11, 11, 27
+; CHECK-NEXT:    add 12, 12, 27
+; CHECK-NEXT:    add 30, 30, 27
+; CHECK-NEXT:    add 28, 28, 27
+; CHECK-NEXT:    addi 24, 24, 1
+; CHECK-NEXT:    cmpld 25, 0
 ; CHECK-NEXT:    ble 0, .LBB0_3
 ; CHECK-NEXT:  # %bb.6: # %_loop_1_loopHeader_._return_bb_crit_edge.loopexit
 ; CHECK-NEXT:    ld 3, 40(1) # 8-byte Folded Reload
@@ -284,84 +285,85 @@ define void @foo(i32* %.m, i32* %.n, [0 x %_elem_type_of_a]* %.a, [0 x %_elem_ty
 ; CHECK-NEXT:    stxv 3, 0(3)
 ; CHECK-NEXT:    ld 3, 56(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    stxv 2, 0(3)
-; CHECK-NEXT:    ld 3, 80(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 3, 112(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    stxv 1, 0(3)
-; CHECK-NEXT:    ld 3, 32(1) # 8-byte Folded Reload
-; CHECK-NEXT:    stxv 43, 0(3)
-; CHECK-NEXT:    ld 3, 88(1) # 8-byte Folded Reload
-; CHECK-NEXT:    stxv 42, 0(9)
-; CHECK-NEXT:    stxv 0, 0(3)
 ; CHECK-NEXT:    ld 3, 64(1) # 8-byte Folded Reload
-; CHECK-NEXT:    stxv 41, 0(3)
+; CHECK-NEXT:    stxv 43, 0(8)
+; CHECK-NEXT:    stxv 42, 0(3)
+; CHECK-NEXT:    ld 3, 120(1) # 8-byte Folded Reload
+; CHECK-NEXT:    stxv 0, 0(3)
 ; CHECK-NEXT:    ld 3, 72(1) # 8-byte Folded Reload
+; CHECK-NEXT:    stxv 41, 0(3)
+; CHECK-NEXT:    ld 3, 80(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    stxv 40, 0(3)
 ; CHECK-NEXT:    ld 3, 96(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    stxv 39, 0(3)
 ; CHECK-NEXT:    ld 3, 104(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    stxv 38, 0(3)
-; CHECK-NEXT:    ld 3, 112(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 3, 128(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    stxv 33, 0(3)
-; CHECK-NEXT:    ld 3, 120(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 3, 136(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    stxv 32, 0(3)
-; CHECK-NEXT:    ld 3, 128(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 3, 144(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    stxv 37, 0(3)
-; CHECK-NEXT:    ld 3, 136(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 3, 152(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    stxv 36, 0(3)
-; CHECK-NEXT:    ld 3, 144(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 3, 160(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    stxv 13, 0(3)
-; CHECK-NEXT:    ld 3, 152(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 3, 168(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    stxv 12, 0(3)
-; CHECK-NEXT:    ld 3, 160(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 3, 176(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    stxv 11, 0(3)
-; CHECK-NEXT:    ld 3, 168(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 3, 184(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    stxv 9, 0(3)
-; CHECK-NEXT:    ld 3, 176(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 3, 192(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    stxv 8, 0(3)
-; CHECK-NEXT:    ld 3, 184(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 3, 200(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    stxv 6, 0(3)
-; CHECK-NEXT:    ld 3, 192(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 3, 208(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    stxv 5, 0(3)
-; CHECK-NEXT:    ld 3, 200(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 3, 216(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    stxv 7, 0(3)
-; CHECK-NEXT:    stxv 10, 0(8)
+; CHECK-NEXT:    ld 3, 88(1) # 8-byte Folded Reload
+; CHECK-NEXT:    stxv 10, 0(3)
 ; CHECK-NEXT:  .LBB0_7: # %_return_bb
-; CHECK-NEXT:    lxv 63, 384(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 62, 368(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 61, 352(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 60, 336(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 59, 320(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 58, 304(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 57, 288(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 56, 272(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 55, 256(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 54, 240(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 53, 224(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 52, 208(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lfd 31, 584(1) # 8-byte Folded Reload
-; CHECK-NEXT:    lfd 30, 576(1) # 8-byte Folded Reload
-; CHECK-NEXT:    lfd 29, 568(1) # 8-byte Folded Reload
-; CHECK-NEXT:    lfd 28, 560(1) # 8-byte Folded Reload
-; CHECK-NEXT:    lfd 27, 552(1) # 8-byte Folded Reload
-; CHECK-NEXT:    lfd 26, 544(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 31, 536(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 30, 528(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 29, 520(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 28, 512(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 27, 504(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 26, 496(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 25, 488(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 24, 480(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 23, 472(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 22, 464(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 21, 456(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 20, 448(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 19, 440(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 18, 432(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 17, 424(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 16, 416(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 15, 408(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 14, 400(1) # 8-byte Folded Reload
-; CHECK-NEXT:    addi 1, 1, 592
+; CHECK-NEXT:    lxv 63, 400(1) # 16-byte Folded Reload
+; CHECK-NEXT:    lxv 62, 384(1) # 16-byte Folded Reload
+; CHECK-NEXT:    lxv 61, 368(1) # 16-byte Folded Reload
+; CHECK-NEXT:    lxv 60, 352(1) # 16-byte Folded Reload
+; CHECK-NEXT:    lxv 59, 336(1) # 16-byte Folded Reload
+; CHECK-NEXT:    lxv 58, 320(1) # 16-byte Folded Reload
+; CHECK-NEXT:    lxv 57, 304(1) # 16-byte Folded Reload
+; CHECK-NEXT:    lxv 56, 288(1) # 16-byte Folded Reload
+; CHECK-NEXT:    lxv 55, 272(1) # 16-byte Folded Reload
+; CHECK-NEXT:    lxv 54, 256(1) # 16-byte Folded Reload
+; CHECK-NEXT:    lxv 53, 240(1) # 16-byte Folded Reload
+; CHECK-NEXT:    lxv 52, 224(1) # 16-byte Folded Reload
+; CHECK-NEXT:    lfd 31, 600(1) # 8-byte Folded Reload
+; CHECK-NEXT:    lfd 30, 592(1) # 8-byte Folded Reload
+; CHECK-NEXT:    lfd 29, 584(1) # 8-byte Folded Reload
+; CHECK-NEXT:    lfd 28, 576(1) # 8-byte Folded Reload
+; CHECK-NEXT:    lfd 27, 568(1) # 8-byte Folded Reload
+; CHECK-NEXT:    lfd 26, 560(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 31, 552(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 30, 544(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 29, 536(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 28, 528(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 27, 520(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 26, 512(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 25, 504(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 24, 496(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 23, 488(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 22, 480(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 21, 472(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 20, 464(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 19, 456(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 18, 448(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 17, 440(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 16, 432(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 15, 424(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 14, 416(1) # 8-byte Folded Reload
+; CHECK-NEXT:    addi 1, 1, 608
 ; CHECK-NEXT:    blr
 entry:
   %_val_l_ = load i32, i32* %.l, align 4

diff  --git a/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll b/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll
index 27b7d2d47ebe..e3d54cc38dd8 100644
--- a/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll
@@ -158,15 +158,15 @@ define dso_local double @P10_Spill_CR_EQ(%2* %arg) local_unnamed_addr #0 {
 ; CHECK-NEXT:    mfocrf r8, 4
 ; CHECK-NEXT:    rlwimi r8, r9, 9, 23, 23
 ; CHECK-NEXT:    lwz r9, -4(r1)
-; CHECK-NEXT:    add r5, r7, r5
 ; CHECK-NEXT:    mtocrf 4, r8
 ; CHECK-NEXT:    isel r3, 0, r3, 4*cr5+lt
 ; CHECK-NEXT:    setbc r8, 4*cr5+un
 ; CHECK-NEXT:    isel r6, 0, r6, 4*cr5+gt
-; CHECK-NEXT:    isel r4, 0, r4, 4*cr5+eq
+; CHECK-NEXT:    add r5, r7, r5
+; CHECK-NEXT:    add r5, r8, r5
 ; CHECK-NEXT:    mtocrf 128, r9
 ; CHECK-NEXT:    lwz r9, -8(r1)
-; CHECK-NEXT:    add r5, r8, r5
+; CHECK-NEXT:    isel r4, 0, r4, 4*cr5+eq
 ; CHECK-NEXT:    iseleq r3, 0, r3
 ; CHECK-NEXT:    mtfprd f0, r5
 ; CHECK-NEXT:    xscvsxddp f0, f0
@@ -174,13 +174,13 @@ define dso_local double @P10_Spill_CR_EQ(%2* %arg) local_unnamed_addr #0 {
 ; CHECK-NEXT:    lwz r9, -12(r1)
 ; CHECK-NEXT:    lwz r12, 8(r1)
 ; CHECK-NEXT:    iseleq r6, 0, r6
-; CHECK-NEXT:    add r3, r6, r3
 ; CHECK-NEXT:    xsmuldp f0, f0, f2
 ; CHECK-NEXT:    mtocrf 128, r9
 ; CHECK-NEXT:    mtocrf 32, r12
 ; CHECK-NEXT:    mtocrf 16, r12
 ; CHECK-NEXT:    mtocrf 8, r12
 ; CHECK-NEXT:    iseleq r4, 0, r4
+; CHECK-NEXT:    add r3, r6, r3
 ; CHECK-NEXT:    add r3, r4, r3
 ; CHECK-NEXT:    mtfprd f1, r3
 ; CHECK-NEXT:    xscvsxddp f1, f1

diff  --git a/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll b/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll
index 37cf078f53bf..c65a202e2108 100644
--- a/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll
+++ b/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll
@@ -45,12 +45,12 @@ define dso_local signext i32 @AsmClobberX2WithTOC(i32 signext %a, i32 signext %b
 ; CHECK-LARGE:     ld r2, .Lfunc_toc2-.Lfunc_gep2(r12)
 ; CHECK-LARGE:     add r2, r2, r12
 ; CHECK-S:         .localentry     AsmClobberX2WithTOC
-; CHECK-S:         add r3, r4, r3
-; CHECK-S-NEXT:    #APP
+; CHECK-S:         #APP
 ; CHECK-S-NEXT:    li r2, 0
 ; CHECK-S-NEXT:    #NO_APP
-; CHECK-S-NEXT:    plwz r4, global_int@PCREL(0), 1
-; CHECK-S-NEXT:    add r3, r3, r4
+; CHECK-S-NEXT:    plwz r5, global_int@PCREL(0), 1
+; CHECK-S-NEXT:    add r3, r4, r3
+; CHECK-S-NEXT:    add r3, r3, r5
 ; CHECK-S-NEXT:    extsw r3, r3
 ; CHECK-S-NEXT:    blr
 entry:


        


More information about the llvm-commits mailing list