[llvm] [PowerPC] improve P10 store forwarding on P7 scalar to vector (PR #102330)

via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 7 10:40:59 PDT 2024


https://github.com/RolandF77 updated https://github.com/llvm/llvm-project/pull/102330

>From a6a70508836addfa5ca59ff2e510da6d96629c42 Mon Sep 17 00:00:00 2001
From: Roland Froese <froese at ca.ibm.com>
Date: Wed, 7 Aug 2024 16:43:56 +0000
Subject: [PATCH 1/2] mitigate P7 scalar to vector LHS

---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |  40 +-
 llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll |   8 +-
 .../build-vector-from-load-and-zeros.ll       |  52 ++-
 llvm/test/CodeGen/PowerPC/load-and-splat.ll   |   1 +
 llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll   | 350 +++++++++++-------
 .../CodeGen/PowerPC/test-vector-insert.ll     |  16 +-
 llvm/test/CodeGen/PowerPC/vec-trunc2.ll       |  76 ++--
 llvm/test/CodeGen/PowerPC/vsx.ll              |  16 +-
 .../CodeGen/PowerPC/widen-vec-correctly-be.ll |   4 +-
 9 files changed, 361 insertions(+), 202 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 1686ec572c855..d1f29a79c9668 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -105,6 +105,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "ppc-lowering"
 
+static cl::opt<bool> DisableP10StoreForward("disable-p10-store-forward",
+    cl::desc("disable P10 store forward-friendly conversion"), cl::Hidden,
+    cl::init(false));
+
 static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
 cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
 
@@ -985,6 +989,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
 
     setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
     setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
+    // LE is P8+/64-bit so direct moves are supported and these operations
+    // are legal. The custom transformation requires 64-bit since we need a
+    // pair of stores that will cover a 128-bit load for P10.
+    if (!DisableP10StoreForward && isPPC64 && !Subtarget.isLittleEndian()) {
+      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Custom);
+      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
+      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
+    }
 
     setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
@@ -11479,8 +11491,34 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
 
+  SDValue Val = Op.getOperand(0);
+  EVT ValVT = Val.getValueType();
+  // P10 hardware store forwarding requires that a single store contains all
+  // the data for the load. P10 is able to merge a pair of adjacent stores. Try
+  // to avoid load hit store on P10 when running binaries compiled for older
+  // processors by generating two mergeable scalar stores to forward with the
+  // vector load.
+  if (!DisableP10StoreForward && Subtarget.isPPC64() &&
+      !Subtarget.isLittleEndian() && ValVT.isInteger() &&
+      ValVT.getSizeInBits() <= 64) {
+    Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, Val);
+    EVT ShiftAmountTy = getShiftAmountTy(MVT::i64, DAG.getDataLayout());
+    SDValue ShiftBy =
+        DAG.getConstant(64 - Op.getValueType().getScalarSizeInBits(),
+                        dl, ShiftAmountTy);
+    Val = DAG.getNode(ISD::SHL, dl, MVT::i64, Val, ShiftBy);
+    SDValue Plus8 = DAG.getNode(ISD::ADD, dl, PtrVT, FIdx,
+                                DAG.getConstant(8, dl, PtrVT));
+    SDValue Store2 = DAG.getStore(DAG.getEntryNode(), dl, Val, Plus8,
+                                  MachinePointerInfo());
+    SDValue Store = DAG.getStore(Store2, dl, Val, FIdx,
+                                 MachinePointerInfo());
+    return DAG.getLoad(Op.getValueType(), dl, Store, FIdx,
+                       MachinePointerInfo());
+  }
+
   // Store the input value into Value#0 of the stack slot.
-  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Val, FIdx,
                                MachinePointerInfo());
   // Load it out.
   return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());
diff --git a/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll b/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
index 05edf92d72498..f5515e8ba19bd 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
@@ -14,7 +14,9 @@ define void @test_aix_splatimm(i32 %arg, i32 %arg1, i32 %arg2) {
 ; CHECK-AIX-NEXT:    slwi 3, 3, 8
 ; CHECK-AIX-NEXT:    neg 3, 3
 ; CHECK-AIX-NEXT:    lwz 6, 0(3)
-; CHECK-AIX-NEXT:    sth 3, -16(1)
+; CHECK-AIX-NEXT:    sldi 3, 3, 48
+; CHECK-AIX-NEXT:    std 3, -16(1)
+; CHECK-AIX-NEXT:    std 3, -8(1)
 ; CHECK-AIX-NEXT:    addi 3, 1, -16
 ; CHECK-AIX-NEXT:    lxvw4x 34, 0, 3
 ; CHECK-AIX-NEXT:    srwi 3, 4, 16
@@ -24,9 +26,11 @@ define void @test_aix_splatimm(i32 %arg, i32 %arg1, i32 %arg2) {
 ; CHECK-AIX-NEXT:    mullw 3, 3, 4
 ; CHECK-AIX-NEXT:    li 4, 0
 ; CHECK-AIX-NEXT:    neg 3, 3
+; CHECK-AIX-NEXT:    sldi 3, 3, 48
 ; CHECK-AIX-NEXT:    vsplth 2, 2, 0
 ; CHECK-AIX-NEXT:    stxvw4x 34, 0, 4
-; CHECK-AIX-NEXT:    sth 3, -32(1)
+; CHECK-AIX-NEXT:    std 3, -32(1)
+; CHECK-AIX-NEXT:    std 3, -24(1)
 ; CHECK-AIX-NEXT:    addi 3, 1, -32
 ; CHECK-AIX-NEXT:    lxvw4x 34, 0, 3
 ; CHECK-AIX-NEXT:    vsplth 2, 2, 0
diff --git a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
index cc32a76b22c28..6d35a7281de6b 100644
--- a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
+++ b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
@@ -338,17 +338,16 @@ define <4 x i32> @build_v4i32_load_0(ptr nocapture noundef readonly %p) {
 ; PWR7-BE-LABEL: build_v4i32_load_0:
 ; PWR7-BE:       # %bb.0: # %entry
 ; PWR7-BE-NEXT:    lwz 3, 0(3)
-; PWR7-BE-NEXT:    li 4, 0
-; PWR7-BE-NEXT:    stw 4, -16(1)
-; PWR7-BE-NEXT:    stw 3, -32(1)
+; PWR7-BE-NEXT:    xxlxor 36, 36, 36
+; PWR7-BE-NEXT:    sldi 3, 3, 32
+; PWR7-BE-NEXT:    std 3, -32(1)
+; PWR7-BE-NEXT:    std 3, -24(1)
 ; PWR7-BE-NEXT:    addis 3, 2, .LCPI8_0@toc@ha
 ; PWR7-BE-NEXT:    addi 3, 3, .LCPI8_0@toc@l
 ; PWR7-BE-NEXT:    lxvw4x 34, 0, 3
-; PWR7-BE-NEXT:    addi 3, 1, -16
-; PWR7-BE-NEXT:    lxvw4x 35, 0, 3
 ; PWR7-BE-NEXT:    addi 3, 1, -32
-; PWR7-BE-NEXT:    lxvw4x 36, 0, 3
-; PWR7-BE-NEXT:    vperm 2, 4, 3, 2
+; PWR7-BE-NEXT:    lxvw4x 35, 0, 3
+; PWR7-BE-NEXT:    vperm 2, 3, 4, 2
 ; PWR7-BE-NEXT:    blr
 ;
 ; PWR8-BE-LABEL: build_v4i32_load_0:
@@ -402,17 +401,16 @@ define <4 x i32> @build_v4i32_load_1(ptr nocapture noundef readonly %p) {
 ; PWR7-BE-LABEL: build_v4i32_load_1:
 ; PWR7-BE:       # %bb.0: # %entry
 ; PWR7-BE-NEXT:    lwz 3, 0(3)
-; PWR7-BE-NEXT:    li 4, 0
-; PWR7-BE-NEXT:    stw 4, -32(1)
-; PWR7-BE-NEXT:    stw 3, -16(1)
+; PWR7-BE-NEXT:    xxlxor 36, 36, 36
+; PWR7-BE-NEXT:    sldi 3, 3, 32
+; PWR7-BE-NEXT:    std 3, -16(1)
+; PWR7-BE-NEXT:    std 3, -8(1)
 ; PWR7-BE-NEXT:    addis 3, 2, .LCPI9_0@toc@ha
 ; PWR7-BE-NEXT:    addi 3, 3, .LCPI9_0@toc@l
 ; PWR7-BE-NEXT:    lxvw4x 34, 0, 3
-; PWR7-BE-NEXT:    addi 3, 1, -32
-; PWR7-BE-NEXT:    lxvw4x 35, 0, 3
 ; PWR7-BE-NEXT:    addi 3, 1, -16
-; PWR7-BE-NEXT:    lxvw4x 36, 0, 3
-; PWR7-BE-NEXT:    vperm 2, 3, 4, 2
+; PWR7-BE-NEXT:    lxvw4x 35, 0, 3
+; PWR7-BE-NEXT:    vperm 2, 4, 3, 2
 ; PWR7-BE-NEXT:    blr
 ;
 ; PWR8-BE-LABEL: build_v4i32_load_1:
@@ -466,17 +464,16 @@ define <4 x i32> @build_v4i32_load_2(ptr nocapture noundef readonly %p) {
 ; PWR7-BE-LABEL: build_v4i32_load_2:
 ; PWR7-BE:       # %bb.0: # %entry
 ; PWR7-BE-NEXT:    lwz 3, 0(3)
-; PWR7-BE-NEXT:    li 4, 0
-; PWR7-BE-NEXT:    stw 4, -32(1)
-; PWR7-BE-NEXT:    stw 3, -16(1)
+; PWR7-BE-NEXT:    xxlxor 36, 36, 36
+; PWR7-BE-NEXT:    sldi 3, 3, 32
+; PWR7-BE-NEXT:    std 3, -16(1)
+; PWR7-BE-NEXT:    std 3, -8(1)
 ; PWR7-BE-NEXT:    addis 3, 2, .LCPI10_0@toc@ha
 ; PWR7-BE-NEXT:    addi 3, 3, .LCPI10_0@toc@l
 ; PWR7-BE-NEXT:    lxvw4x 34, 0, 3
-; PWR7-BE-NEXT:    addi 3, 1, -32
-; PWR7-BE-NEXT:    lxvw4x 35, 0, 3
 ; PWR7-BE-NEXT:    addi 3, 1, -16
-; PWR7-BE-NEXT:    lxvw4x 36, 0, 3
-; PWR7-BE-NEXT:    vperm 2, 3, 4, 2
+; PWR7-BE-NEXT:    lxvw4x 35, 0, 3
+; PWR7-BE-NEXT:    vperm 2, 4, 3, 2
 ; PWR7-BE-NEXT:    blr
 ;
 ; PWR8-BE-LABEL: build_v4i32_load_2:
@@ -530,17 +527,16 @@ define <4 x i32> @build_v4i32_load_3(ptr nocapture noundef readonly %p) {
 ; PWR7-BE-LABEL: build_v4i32_load_3:
 ; PWR7-BE:       # %bb.0: # %entry
 ; PWR7-BE-NEXT:    lwz 3, 0(3)
-; PWR7-BE-NEXT:    li 4, 0
-; PWR7-BE-NEXT:    stw 4, -32(1)
-; PWR7-BE-NEXT:    stw 3, -16(1)
+; PWR7-BE-NEXT:    xxlxor 36, 36, 36
+; PWR7-BE-NEXT:    sldi 3, 3, 32
+; PWR7-BE-NEXT:    std 3, -16(1)
+; PWR7-BE-NEXT:    std 3, -8(1)
 ; PWR7-BE-NEXT:    addis 3, 2, .LCPI11_0@toc@ha
 ; PWR7-BE-NEXT:    addi 3, 3, .LCPI11_0@toc@l
 ; PWR7-BE-NEXT:    lxvw4x 34, 0, 3
-; PWR7-BE-NEXT:    addi 3, 1, -32
-; PWR7-BE-NEXT:    lxvw4x 35, 0, 3
 ; PWR7-BE-NEXT:    addi 3, 1, -16
-; PWR7-BE-NEXT:    lxvw4x 36, 0, 3
-; PWR7-BE-NEXT:    vperm 2, 3, 4, 2
+; PWR7-BE-NEXT:    lxvw4x 35, 0, 3
+; PWR7-BE-NEXT:    vperm 2, 4, 3, 2
 ; PWR7-BE-NEXT:    blr
 ;
 ; PWR8-BE-LABEL: build_v4i32_load_3:
diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
index 6d2f3b3abc42d..bc68ad2a67bf5 100644
--- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll
+++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
@@ -591,6 +591,7 @@ define <16 x i8> @adjusted_lxvwsx(ptr %s, ptr %t) {
 ; P7:       # %bb.0: # %entry
 ; P7-NEXT:    ld r3, 0(r3)
 ; P7-NEXT:    std r3, -16(r1)
+; P7-NEXT:    std r3, -8(r1)
 ; P7-NEXT:    addi r3, r1, -16
 ; P7-NEXT:    lxvw4x vs0, 0, r3
 ; P7-NEXT:    xxspltw v2, vs0, 1
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
index 7a6640fea2d1e..bf8c5c96ccbde 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
@@ -833,8 +833,18 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
 ;
 ; CHECK-PWR7-LABEL: sub_absv_8_ext:
 ; CHECK-PWR7:       # %bb.0: # %entry
-; CHECK-PWR7-NEXT:    stdu r1, -400(r1)
-; CHECK-PWR7-NEXT:    .cfi_def_cfa_offset 400
+; CHECK-PWR7-NEXT:    stdu r1, -512(r1)
+; CHECK-PWR7-NEXT:    .cfi_def_cfa_offset 512
+; CHECK-PWR7-NEXT:    .cfi_offset r14, -144
+; CHECK-PWR7-NEXT:    .cfi_offset r15, -136
+; CHECK-PWR7-NEXT:    .cfi_offset r16, -128
+; CHECK-PWR7-NEXT:    .cfi_offset r17, -120
+; CHECK-PWR7-NEXT:    .cfi_offset r18, -112
+; CHECK-PWR7-NEXT:    .cfi_offset r19, -104
+; CHECK-PWR7-NEXT:    .cfi_offset r20, -96
+; CHECK-PWR7-NEXT:    .cfi_offset r21, -88
+; CHECK-PWR7-NEXT:    .cfi_offset r22, -80
+; CHECK-PWR7-NEXT:    .cfi_offset r23, -72
 ; CHECK-PWR7-NEXT:    .cfi_offset r24, -64
 ; CHECK-PWR7-NEXT:    .cfi_offset r25, -56
 ; CHECK-PWR7-NEXT:    .cfi_offset r26, -48
@@ -842,184 +852,244 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
 ; CHECK-PWR7-NEXT:    .cfi_offset r28, -32
 ; CHECK-PWR7-NEXT:    .cfi_offset r29, -24
 ; CHECK-PWR7-NEXT:    .cfi_offset r30, -16
-; CHECK-PWR7-NEXT:    addi r3, r1, 304
-; CHECK-PWR7-NEXT:    std r24, 336(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r25, 344(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r26, 352(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r27, 360(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r28, 368(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r29, 376(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r30, 384(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    stxvw4x v2, 0, r3
+; CHECK-PWR7-NEXT:    .cfi_offset r31, -8
+; CHECK-PWR7-NEXT:    .cfi_offset r2, -152
 ; CHECK-PWR7-NEXT:    addi r3, r1, 320
-; CHECK-PWR7-NEXT:    lbz r4, 304(r1)
-; CHECK-PWR7-NEXT:    stxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    lbz r5, 305(r1)
-; CHECK-PWR7-NEXT:    lbz r6, 321(r1)
-; CHECK-PWR7-NEXT:    lbz r7, 306(r1)
-; CHECK-PWR7-NEXT:    lbz r8, 322(r1)
-; CHECK-PWR7-NEXT:    lbz r9, 307(r1)
-; CHECK-PWR7-NEXT:    lbz r10, 323(r1)
-; CHECK-PWR7-NEXT:    lbz r0, 309(r1)
-; CHECK-PWR7-NEXT:    lbz r30, 325(r1)
-; CHECK-PWR7-NEXT:    lbz r29, 310(r1)
-; CHECK-PWR7-NEXT:    lbz r28, 326(r1)
-; CHECK-PWR7-NEXT:    lbz r11, 308(r1)
-; CHECK-PWR7-NEXT:    lbz r12, 324(r1)
-; CHECK-PWR7-NEXT:    lbz r27, 311(r1)
-; CHECK-PWR7-NEXT:    lbz r26, 327(r1)
-; CHECK-PWR7-NEXT:    lbz r25, 312(r1)
-; CHECK-PWR7-NEXT:    sub r5, r5, r6
-; CHECK-PWR7-NEXT:    sub r6, r7, r8
-; CHECK-PWR7-NEXT:    sub r7, r9, r10
-; CHECK-PWR7-NEXT:    sub r9, r0, r30
-; CHECK-PWR7-NEXT:    sub r10, r29, r28
-; CHECK-PWR7-NEXT:    sub r8, r11, r12
-; CHECK-PWR7-NEXT:    srawi r0, r5, 31
-; CHECK-PWR7-NEXT:    srawi r30, r6, 31
-; CHECK-PWR7-NEXT:    srawi r29, r7, 31
-; CHECK-PWR7-NEXT:    srawi r28, r8, 31
-; CHECK-PWR7-NEXT:    sub r11, r27, r26
-; CHECK-PWR7-NEXT:    srawi r27, r9, 31
-; CHECK-PWR7-NEXT:    lbz r24, 328(r1)
-; CHECK-PWR7-NEXT:    xor r5, r5, r0
-; CHECK-PWR7-NEXT:    xor r6, r6, r30
-; CHECK-PWR7-NEXT:    xor r7, r7, r29
-; CHECK-PWR7-NEXT:    xor r8, r8, r28
-; CHECK-PWR7-NEXT:    xor r9, r9, r27
-; CHECK-PWR7-NEXT:    srawi r26, r10, 31
-; CHECK-PWR7-NEXT:    sub r5, r5, r0
-; CHECK-PWR7-NEXT:    sub r6, r6, r30
-; CHECK-PWR7-NEXT:    lbz r0, 313(r1)
-; CHECK-PWR7-NEXT:    lbz r30, 329(r1)
-; CHECK-PWR7-NEXT:    sub r7, r7, r29
-; CHECK-PWR7-NEXT:    lbz r29, 330(r1)
-; CHECK-PWR7-NEXT:    sub r8, r8, r28
-; CHECK-PWR7-NEXT:    lbz r28, 331(r1)
-; CHECK-PWR7-NEXT:    sub r9, r9, r27
-; CHECK-PWR7-NEXT:    lbz r27, 332(r1)
-; CHECK-PWR7-NEXT:    xor r10, r10, r26
-; CHECK-PWR7-NEXT:    sub r10, r10, r26
-; CHECK-PWR7-NEXT:    lbz r26, 333(r1)
-; CHECK-PWR7-NEXT:    sub r12, r25, r24
-; CHECK-PWR7-NEXT:    srawi r25, r11, 31
+; CHECK-PWR7-NEXT:    std r14, 368(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r15, 376(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r16, 384(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r17, 392(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r18, 400(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r19, 408(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r20, 416(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r21, 424(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r22, 432(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r23, 440(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r24, 448(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r25, 456(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r26, 464(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r27, 472(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r28, 480(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r29, 488(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r30, 496(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r31, 504(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r2, 360(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    stxvw4x v2, 0, r3
 ; CHECK-PWR7-NEXT:    lbz r3, 320(r1)
+; CHECK-PWR7-NEXT:    addi r4, r1, 336
+; CHECK-PWR7-NEXT:    stw r3, 60(r1) # 4-byte Folded Spill
+; CHECK-PWR7-NEXT:    stxvw4x v3, 0, r4
+; CHECK-PWR7-NEXT:    lbz r15, 334(r1)
+; CHECK-PWR7-NEXT:    lbz r14, 350(r1)
+; CHECK-PWR7-NEXT:    lbz r31, 335(r1)
+; CHECK-PWR7-NEXT:    lbz r2, 351(r1)
+; CHECK-PWR7-NEXT:    sub r15, r15, r14
+; CHECK-PWR7-NEXT:    sub r14, r31, r2
+; CHECK-PWR7-NEXT:    srawi r2, r14, 31
+; CHECK-PWR7-NEXT:    xor r14, r14, r2
+; CHECK-PWR7-NEXT:    lbz r3, 333(r1)
+; CHECK-PWR7-NEXT:    lbz r19, 331(r1)
+; CHECK-PWR7-NEXT:    lbz r18, 347(r1)
+; CHECK-PWR7-NEXT:    sub r19, r19, r18
+; CHECK-PWR7-NEXT:    lbz r17, 332(r1)
+; CHECK-PWR7-NEXT:    lbz r16, 348(r1)
+; CHECK-PWR7-NEXT:    sub r17, r17, r16
+; CHECK-PWR7-NEXT:    lbz r23, 329(r1)
+; CHECK-PWR7-NEXT:    sub r14, r14, r2
+; CHECK-PWR7-NEXT:    lbz r2, 349(r1)
+; CHECK-PWR7-NEXT:    lbz r22, 345(r1)
+; CHECK-PWR7-NEXT:    lbz r4, 336(r1)
+; CHECK-PWR7-NEXT:    lbz r5, 321(r1)
+; CHECK-PWR7-NEXT:    lbz r6, 337(r1)
+; CHECK-PWR7-NEXT:    lbz r7, 322(r1)
+; CHECK-PWR7-NEXT:    lbz r8, 338(r1)
+; CHECK-PWR7-NEXT:    lbz r9, 323(r1)
+; CHECK-PWR7-NEXT:    lbz r10, 339(r1)
+; CHECK-PWR7-NEXT:    lbz r11, 324(r1)
+; CHECK-PWR7-NEXT:    lbz r12, 340(r1)
+; CHECK-PWR7-NEXT:    lbz r0, 325(r1)
+; CHECK-PWR7-NEXT:    lbz r30, 341(r1)
+; CHECK-PWR7-NEXT:    lbz r29, 326(r1)
+; CHECK-PWR7-NEXT:    lbz r28, 342(r1)
+; CHECK-PWR7-NEXT:    lbz r27, 327(r1)
+; CHECK-PWR7-NEXT:    lbz r26, 343(r1)
+; CHECK-PWR7-NEXT:    sub r3, r3, r2
+; CHECK-PWR7-NEXT:    lbz r25, 328(r1)
+; CHECK-PWR7-NEXT:    lbz r24, 344(r1)
+; CHECK-PWR7-NEXT:    lbz r21, 330(r1)
+; CHECK-PWR7-NEXT:    lbz r20, 346(r1)
+; CHECK-PWR7-NEXT:    sub r5, r5, r6
+; CHECK-PWR7-NEXT:    srawi r18, r3, 31
+; CHECK-PWR7-NEXT:    sub r7, r7, r8
+; CHECK-PWR7-NEXT:    sub r9, r9, r10
+; CHECK-PWR7-NEXT:    sub r11, r11, r12
 ; CHECK-PWR7-NEXT:    sub r0, r0, r30
-; CHECK-PWR7-NEXT:    xor r11, r11, r25
-; CHECK-PWR7-NEXT:    sub r11, r11, r25
-; CHECK-PWR7-NEXT:    lbz r25, 334(r1)
-; CHECK-PWR7-NEXT:    sub r4, r4, r3
-; CHECK-PWR7-NEXT:    srawi r30, r0, 31
-; CHECK-PWR7-NEXT:    srawi r24, r12, 31
-; CHECK-PWR7-NEXT:    xor r12, r12, r24
-; CHECK-PWR7-NEXT:    sub r12, r12, r24
-; CHECK-PWR7-NEXT:    lbz r24, 335(r1)
-; CHECK-PWR7-NEXT:    srawi r3, r4, 31
-; CHECK-PWR7-NEXT:    xor r4, r4, r3
-; CHECK-PWR7-NEXT:    xor r0, r0, r30
-; CHECK-PWR7-NEXT:    sub r3, r4, r3
-; CHECK-PWR7-NEXT:    stb r3, 48(r1)
-; CHECK-PWR7-NEXT:    addi r3, r1, 288
-; CHECK-PWR7-NEXT:    stb r12, 176(r1)
-; CHECK-PWR7-NEXT:    sub r0, r0, r30
-; CHECK-PWR7-NEXT:    lbz r30, 314(r1)
-; CHECK-PWR7-NEXT:    stb r11, 160(r1)
-; CHECK-PWR7-NEXT:    sub r30, r30, r29
-; CHECK-PWR7-NEXT:    stb r0, 192(r1)
-; CHECK-PWR7-NEXT:    stb r10, 144(r1)
-; CHECK-PWR7-NEXT:    stb r9, 128(r1)
-; CHECK-PWR7-NEXT:    stb r8, 112(r1)
-; CHECK-PWR7-NEXT:    stb r7, 96(r1)
-; CHECK-PWR7-NEXT:    stb r6, 80(r1)
-; CHECK-PWR7-NEXT:    srawi r29, r30, 31
-; CHECK-PWR7-NEXT:    stb r5, 64(r1)
-; CHECK-PWR7-NEXT:    xor r30, r30, r29
-; CHECK-PWR7-NEXT:    sub r30, r30, r29
-; CHECK-PWR7-NEXT:    lbz r29, 315(r1)
 ; CHECK-PWR7-NEXT:    sub r29, r29, r28
-; CHECK-PWR7-NEXT:    stb r30, 208(r1)
-; CHECK-PWR7-NEXT:    ld r30, 384(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    srawi r28, r29, 31
-; CHECK-PWR7-NEXT:    xor r29, r29, r28
-; CHECK-PWR7-NEXT:    sub r29, r29, r28
-; CHECK-PWR7-NEXT:    lbz r28, 316(r1)
-; CHECK-PWR7-NEXT:    sub r28, r28, r27
-; CHECK-PWR7-NEXT:    stb r29, 224(r1)
-; CHECK-PWR7-NEXT:    ld r29, 376(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    srawi r27, r28, 31
-; CHECK-PWR7-NEXT:    xor r28, r28, r27
-; CHECK-PWR7-NEXT:    sub r28, r28, r27
-; CHECK-PWR7-NEXT:    lbz r27, 317(r1)
 ; CHECK-PWR7-NEXT:    sub r27, r27, r26
-; CHECK-PWR7-NEXT:    stb r28, 240(r1)
-; CHECK-PWR7-NEXT:    ld r28, 368(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    srawi r26, r27, 31
-; CHECK-PWR7-NEXT:    xor r27, r27, r26
-; CHECK-PWR7-NEXT:    sub r27, r27, r26
-; CHECK-PWR7-NEXT:    lbz r26, 318(r1)
-; CHECK-PWR7-NEXT:    sub r26, r26, r25
-; CHECK-PWR7-NEXT:    stb r27, 256(r1)
-; CHECK-PWR7-NEXT:    ld r27, 360(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    srawi r25, r26, 31
-; CHECK-PWR7-NEXT:    xor r26, r26, r25
-; CHECK-PWR7-NEXT:    sub r26, r26, r25
-; CHECK-PWR7-NEXT:    lbz r25, 319(r1)
 ; CHECK-PWR7-NEXT:    sub r25, r25, r24
-; CHECK-PWR7-NEXT:    stb r26, 272(r1)
-; CHECK-PWR7-NEXT:    ld r26, 352(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    srawi r31, r15, 31
+; CHECK-PWR7-NEXT:    ld r2, 360(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    xor r3, r3, r18
+; CHECK-PWR7-NEXT:    srawi r6, r5, 31
+; CHECK-PWR7-NEXT:    srawi r8, r7, 31
+; CHECK-PWR7-NEXT:    srawi r10, r9, 31
+; CHECK-PWR7-NEXT:    srawi r12, r11, 31
+; CHECK-PWR7-NEXT:    srawi r30, r0, 31
+; CHECK-PWR7-NEXT:    sub r3, r3, r18
+; CHECK-PWR7-NEXT:    srawi r18, r19, 31
+; CHECK-PWR7-NEXT:    srawi r28, r29, 31
+; CHECK-PWR7-NEXT:    ld r16, 384(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    sldi r3, r3, 56
+; CHECK-PWR7-NEXT:    srawi r26, r27, 31
 ; CHECK-PWR7-NEXT:    srawi r24, r25, 31
+; CHECK-PWR7-NEXT:    xor r19, r19, r18
+; CHECK-PWR7-NEXT:    xor r15, r15, r31
+; CHECK-PWR7-NEXT:    xor r5, r5, r6
+; CHECK-PWR7-NEXT:    std r3, 272(r1)
+; CHECK-PWR7-NEXT:    std r3, 280(r1)
+; CHECK-PWR7-NEXT:    srawi r3, r17, 31
+; CHECK-PWR7-NEXT:    sub r19, r19, r18
+; CHECK-PWR7-NEXT:    xor r7, r7, r8
+; CHECK-PWR7-NEXT:    sub r15, r15, r31
+; CHECK-PWR7-NEXT:    xor r17, r17, r3
+; CHECK-PWR7-NEXT:    xor r9, r9, r10
+; CHECK-PWR7-NEXT:    xor r11, r11, r12
+; CHECK-PWR7-NEXT:    xor r0, r0, r30
+; CHECK-PWR7-NEXT:    xor r29, r29, r28
+; CHECK-PWR7-NEXT:    xor r27, r27, r26
+; CHECK-PWR7-NEXT:    sub r3, r17, r3
 ; CHECK-PWR7-NEXT:    xor r25, r25, r24
 ; CHECK-PWR7-NEXT:    sub r25, r25, r24
-; CHECK-PWR7-NEXT:    ld r24, 336(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    stb r25, 288(r1)
-; CHECK-PWR7-NEXT:    ld r25, 344(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    sub r27, r27, r26
+; CHECK-PWR7-NEXT:    sub r29, r29, r28
+; CHECK-PWR7-NEXT:    sldi r3, r3, 56
+; CHECK-PWR7-NEXT:    sub r0, r0, r30
+; CHECK-PWR7-NEXT:    sub r11, r11, r12
+; CHECK-PWR7-NEXT:    sub r9, r9, r10
+; CHECK-PWR7-NEXT:    sub r7, r7, r8
+; CHECK-PWR7-NEXT:    sub r5, r5, r6
+; CHECK-PWR7-NEXT:    sldi r14, r14, 56
+; CHECK-PWR7-NEXT:    sldi r15, r15, 56
+; CHECK-PWR7-NEXT:    ld r31, 504(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r3, 256(r1)
+; CHECK-PWR7-NEXT:    std r3, 264(r1)
+; CHECK-PWR7-NEXT:    sldi r3, r19, 56
+; CHECK-PWR7-NEXT:    sldi r25, r25, 56
+; CHECK-PWR7-NEXT:    sldi r27, r27, 56
+; CHECK-PWR7-NEXT:    std r3, 240(r1)
+; CHECK-PWR7-NEXT:    std r3, 248(r1)
+; CHECK-PWR7-NEXT:    sub r3, r23, r22
+; CHECK-PWR7-NEXT:    srawi r23, r3, 31
+; CHECK-PWR7-NEXT:    sub r22, r21, r20
+; CHECK-PWR7-NEXT:    srawi r21, r22, 31
+; CHECK-PWR7-NEXT:    sldi r29, r29, 56
+; CHECK-PWR7-NEXT:    sldi r0, r0, 56
+; CHECK-PWR7-NEXT:    sldi r11, r11, 56
+; CHECK-PWR7-NEXT:    xor r3, r3, r23
+; CHECK-PWR7-NEXT:    xor r22, r22, r21
+; CHECK-PWR7-NEXT:    sldi r9, r9, 56
+; CHECK-PWR7-NEXT:    sldi r7, r7, 56
+; CHECK-PWR7-NEXT:    sldi r5, r5, 56
+; CHECK-PWR7-NEXT:    ld r30, 496(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    ld r28, 480(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    sub r3, r3, r23
+; CHECK-PWR7-NEXT:    sub r22, r22, r21
+; CHECK-PWR7-NEXT:    std r14, 304(r1)
+; CHECK-PWR7-NEXT:    ld r26, 464(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    sldi r3, r3, 56
+; CHECK-PWR7-NEXT:    sldi r22, r22, 56
+; CHECK-PWR7-NEXT:    ld r24, 448(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    ld r23, 440(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r14, 312(r1)
+; CHECK-PWR7-NEXT:    std r15, 288(r1)
+; CHECK-PWR7-NEXT:    std r3, 208(r1)
+; CHECK-PWR7-NEXT:    std r3, 216(r1)
+; CHECK-PWR7-NEXT:    lwz r3, 60(r1) # 4-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r15, 296(r1)
+; CHECK-PWR7-NEXT:    ld r21, 424(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    ld r20, 416(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r22, 224(r1)
+; CHECK-PWR7-NEXT:    std r22, 232(r1)
+; CHECK-PWR7-NEXT:    sub r4, r3, r4
+; CHECK-PWR7-NEXT:    std r25, 192(r1)
+; CHECK-PWR7-NEXT:    ld r22, 432(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    ld r19, 408(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    srawi r3, r4, 31
+; CHECK-PWR7-NEXT:    std r25, 200(r1)
+; CHECK-PWR7-NEXT:    ld r25, 456(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r27, 176(r1)
+; CHECK-PWR7-NEXT:    std r27, 184(r1)
+; CHECK-PWR7-NEXT:    xor r4, r4, r3
+; CHECK-PWR7-NEXT:    std r29, 160(r1)
+; CHECK-PWR7-NEXT:    ld r27, 472(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r29, 168(r1)
+; CHECK-PWR7-NEXT:    std r0, 144(r1)
+; CHECK-PWR7-NEXT:    sub r3, r4, r3
+; CHECK-PWR7-NEXT:    std r0, 152(r1)
+; CHECK-PWR7-NEXT:    ld r29, 488(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    ld r18, 400(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    sldi r3, r3, 56
+; CHECK-PWR7-NEXT:    std r11, 128(r1)
+; CHECK-PWR7-NEXT:    ld r17, 392(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r11, 136(r1)
+; CHECK-PWR7-NEXT:    std r9, 112(r1)
+; CHECK-PWR7-NEXT:    std r3, 64(r1)
+; CHECK-PWR7-NEXT:    std r3, 72(r1)
+; CHECK-PWR7-NEXT:    addi r3, r1, 304
+; CHECK-PWR7-NEXT:    std r9, 120(r1)
+; CHECK-PWR7-NEXT:    ld r15, 376(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r7, 96(r1)
+; CHECK-PWR7-NEXT:    std r7, 104(r1)
+; CHECK-PWR7-NEXT:    std r5, 80(r1)
+; CHECK-PWR7-NEXT:    std r5, 88(r1)
 ; CHECK-PWR7-NEXT:    lxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 272
+; CHECK-PWR7-NEXT:    addi r3, r1, 288
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 256
+; CHECK-PWR7-NEXT:    addi r3, r1, 272
+; CHECK-PWR7-NEXT:    ld r14, 368(r1) # 8-byte Folded Reload
 ; CHECK-PWR7-NEXT:    vmrghb v2, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 240
+; CHECK-PWR7-NEXT:    addi r3, r1, 256
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 224
+; CHECK-PWR7-NEXT:    addi r3, r1, 240
 ; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
 ; CHECK-PWR7-NEXT:    vmrghh v2, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 208
+; CHECK-PWR7-NEXT:    addi r3, r1, 224
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 192
+; CHECK-PWR7-NEXT:    addi r3, r1, 208
 ; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 176
+; CHECK-PWR7-NEXT:    addi r3, r1, 192
 ; CHECK-PWR7-NEXT:    lxvw4x v5, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 160
+; CHECK-PWR7-NEXT:    addi r3, r1, 176
 ; CHECK-PWR7-NEXT:    vmrghb v4, v5, v4
 ; CHECK-PWR7-NEXT:    vmrghh v3, v4, v3
 ; CHECK-PWR7-NEXT:    xxmrghw vs0, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 144
+; CHECK-PWR7-NEXT:    addi r3, r1, 160
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 128
+; CHECK-PWR7-NEXT:    addi r3, r1, 144
 ; CHECK-PWR7-NEXT:    vmrghb v2, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 112
+; CHECK-PWR7-NEXT:    addi r3, r1, 128
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 96
 ; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
+; CHECK-PWR7-NEXT:    addi r3, r1, 112
 ; CHECK-PWR7-NEXT:    vmrghh v2, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 80
+; CHECK-PWR7-NEXT:    addi r3, r1, 96
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 64
+; CHECK-PWR7-NEXT:    addi r3, r1, 80
 ; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 48
+; CHECK-PWR7-NEXT:    addi r3, r1, 64
 ; CHECK-PWR7-NEXT:    lxvw4x v5, 0, r3
 ; CHECK-PWR7-NEXT:    vmrghb v4, v5, v4
 ; CHECK-PWR7-NEXT:    vmrghh v3, v4, v3
 ; CHECK-PWR7-NEXT:    xxmrghw vs1, v3, v2
 ; CHECK-PWR7-NEXT:    xxmrghd v2, vs1, vs0
-; CHECK-PWR7-NEXT:    addi r1, r1, 400
+; CHECK-PWR7-NEXT:    addi r1, r1, 512
 ; CHECK-PWR7-NEXT:    blr
 entry:
   %vecext = extractelement <16 x i8> %a, i32 0
diff --git a/llvm/test/CodeGen/PowerPC/test-vector-insert.ll b/llvm/test/CodeGen/PowerPC/test-vector-insert.ll
index 087f2244f0f7d..73b4ad8a507b8 100644
--- a/llvm/test/CodeGen/PowerPC/test-vector-insert.ll
+++ b/llvm/test/CodeGen/PowerPC/test-vector-insert.ll
@@ -60,7 +60,9 @@ define dso_local <4 x i32> @test(<4 x i32> %a, double %b) {
 ; CHECK-BE-P7-NEXT:    addi r3, r1, -4
 ; CHECK-BE-P7-NEXT:    stfiwx f0, 0, r3
 ; CHECK-BE-P7-NEXT:    lwz r3, -4(r1)
-; CHECK-BE-P7-NEXT:    stw r3, -32(r1)
+; CHECK-BE-P7-NEXT:    sldi r3, r3, 32
+; CHECK-BE-P7-NEXT:    std r3, -32(r1)
+; CHECK-BE-P7-NEXT:    std r3, -24(r1)
 ; CHECK-BE-P7-NEXT:    addis r3, r2, .LCPI0_0@toc@ha
 ; CHECK-BE-P7-NEXT:    addi r3, r3, .LCPI0_0@toc@l
 ; CHECK-BE-P7-NEXT:    lxvw4x v3, 0, r3
@@ -129,7 +131,9 @@ define dso_local <4 x i32> @test2(<4 x i32> %a, float %b) {
 ; CHECK-BE-P7-NEXT:    addi r3, r1, -4
 ; CHECK-BE-P7-NEXT:    stfiwx f0, 0, r3
 ; CHECK-BE-P7-NEXT:    lwz r3, -4(r1)
-; CHECK-BE-P7-NEXT:    stw r3, -32(r1)
+; CHECK-BE-P7-NEXT:    sldi r3, r3, 32
+; CHECK-BE-P7-NEXT:    std r3, -32(r1)
+; CHECK-BE-P7-NEXT:    std r3, -24(r1)
 ; CHECK-BE-P7-NEXT:    addis r3, r2, .LCPI1_0@toc@ha
 ; CHECK-BE-P7-NEXT:    addi r3, r3, .LCPI1_0@toc@l
 ; CHECK-BE-P7-NEXT:    lxvw4x v3, 0, r3
@@ -198,7 +202,9 @@ define dso_local <4 x i32> @test3(<4 x i32> %a, double %b) {
 ; CHECK-BE-P7-NEXT:    addi r3, r1, -4
 ; CHECK-BE-P7-NEXT:    stfiwx f0, 0, r3
 ; CHECK-BE-P7-NEXT:    lwz r3, -4(r1)
-; CHECK-BE-P7-NEXT:    stw r3, -32(r1)
+; CHECK-BE-P7-NEXT:    sldi r3, r3, 32
+; CHECK-BE-P7-NEXT:    std r3, -32(r1)
+; CHECK-BE-P7-NEXT:    std r3, -24(r1)
 ; CHECK-BE-P7-NEXT:    addis r3, r2, .LCPI2_0@toc@ha
 ; CHECK-BE-P7-NEXT:    addi r3, r3, .LCPI2_0@toc@l
 ; CHECK-BE-P7-NEXT:    lxvw4x v3, 0, r3
@@ -267,7 +273,9 @@ define dso_local <4 x i32> @test4(<4 x i32> %a, float %b) {
 ; CHECK-BE-P7-NEXT:    addi r3, r1, -4
 ; CHECK-BE-P7-NEXT:    stfiwx f0, 0, r3
 ; CHECK-BE-P7-NEXT:    lwz r3, -4(r1)
-; CHECK-BE-P7-NEXT:    stw r3, -32(r1)
+; CHECK-BE-P7-NEXT:    sldi r3, r3, 32
+; CHECK-BE-P7-NEXT:    std r3, -32(r1)
+; CHECK-BE-P7-NEXT:    std r3, -24(r1)
 ; CHECK-BE-P7-NEXT:    addis r3, r2, .LCPI3_0@toc@ha
 ; CHECK-BE-P7-NEXT:    addi r3, r3, .LCPI3_0@toc@l
 ; CHECK-BE-P7-NEXT:    lxvw4x v3, 0, r3
diff --git a/llvm/test/CodeGen/PowerPC/vec-trunc2.ll b/llvm/test/CodeGen/PowerPC/vec-trunc2.ll
index 7a09d5a5e8bb2..b81bc9f21423f 100644
--- a/llvm/test/CodeGen/PowerPC/vec-trunc2.ll
+++ b/llvm/test/CodeGen/PowerPC/vec-trunc2.ll
@@ -28,16 +28,31 @@ define dso_local <8 x i8> @test8x32(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5,
 ;
 ; CHECK-BE-LABEL: test8x32:
 ; CHECK-BE:       # %bb.0:
-; CHECK-BE-NEXT:    stw r10, -80(r1)
-; CHECK-BE-NEXT:    stw r9, -96(r1)
-; CHECK-BE-NEXT:    stw r8, -112(r1)
-; CHECK-BE-NEXT:    stw r7, -128(r1)
-; CHECK-BE-NEXT:    stw r6, -16(r1)
-; CHECK-BE-NEXT:    stw r5, -32(r1)
-; CHECK-BE-NEXT:    stw r4, -48(r1)
-; CHECK-BE-NEXT:    stw r3, -64(r1)
-; CHECK-BE-NEXT:    addi r3, r1, -80
-; CHECK-BE-NEXT:    lxvw4x vs0, 0, r3
+; CHECK-BE-NEXT:    sldi r10, r10, 32
+; CHECK-BE-NEXT:    sldi r9, r9, 32
+; CHECK-BE-NEXT:    sldi r8, r8, 32
+; CHECK-BE-NEXT:    sldi r7, r7, 32
+; CHECK-BE-NEXT:    sldi r6, r6, 32
+; CHECK-BE-NEXT:    sldi r5, r5, 32
+; CHECK-BE-NEXT:    sldi r4, r4, 32
+; CHECK-BE-NEXT:    sldi r3, r3, 32
+; CHECK-BE-NEXT:    addi r11, r1, -80
+; CHECK-BE-NEXT:    std r10, -80(r1)
+; CHECK-BE-NEXT:    std r10, -72(r1)
+; CHECK-BE-NEXT:    std r9, -96(r1)
+; CHECK-BE-NEXT:    std r9, -88(r1)
+; CHECK-BE-NEXT:    std r8, -112(r1)
+; CHECK-BE-NEXT:    std r8, -104(r1)
+; CHECK-BE-NEXT:    std r7, -128(r1)
+; CHECK-BE-NEXT:    std r7, -120(r1)
+; CHECK-BE-NEXT:    std r6, -16(r1)
+; CHECK-BE-NEXT:    std r6, -8(r1)
+; CHECK-BE-NEXT:    std r5, -32(r1)
+; CHECK-BE-NEXT:    std r5, -24(r1)
+; CHECK-BE-NEXT:    std r4, -48(r1)
+; CHECK-BE-NEXT:    std r4, -40(r1)
+; CHECK-BE-NEXT:    std r3, -64(r1)
+; CHECK-BE-NEXT:    std r3, -56(r1)
 ; CHECK-BE-NEXT:    addi r3, r1, -96
 ; CHECK-BE-NEXT:    lxvw4x vs1, 0, r3
 ; CHECK-BE-NEXT:    addi r3, r1, -112
@@ -51,15 +66,16 @@ define dso_local <8 x i8> @test8x32(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5,
 ; CHECK-BE-NEXT:    addi r3, r1, -48
 ; CHECK-BE-NEXT:    lxvw4x vs6, 0, r3
 ; CHECK-BE-NEXT:    addi r3, r1, -64
+; CHECK-BE-NEXT:    lxvw4x vs0, 0, r11
 ; CHECK-BE-NEXT:    lxvw4x vs7, 0, r3
 ; CHECK-BE-NEXT:    addis r3, r2, .LCPI0_0@toc@ha
+; CHECK-BE-NEXT:    addi r3, r3, .LCPI0_0@toc@l
 ; CHECK-BE-NEXT:    xxmrghw vs0, vs1, vs0
 ; CHECK-BE-NEXT:    xxmrghw vs1, vs3, vs2
 ; CHECK-BE-NEXT:    xxmrghw vs2, vs5, vs4
-; CHECK-BE-NEXT:    addi r3, r3, .LCPI0_0@toc@l
-; CHECK-BE-NEXT:    xxmrghd v3, vs1, vs0
 ; CHECK-BE-NEXT:    lxvw4x v2, 0, r3
 ; CHECK-BE-NEXT:    xxmrghw vs3, vs7, vs6
+; CHECK-BE-NEXT:    xxmrghd v3, vs1, vs0
 ; CHECK-BE-NEXT:    xxmrghd v4, vs3, vs2
 ; CHECK-BE-NEXT:    vperm v2, v4, v3, v2
 ; CHECK-BE-NEXT:    blr
@@ -136,16 +152,31 @@ define dso_local <8 x i16> @test8x24(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5
 ;
 ; CHECK-BE-LABEL: test8x24:
 ; CHECK-BE:       # %bb.0:
-; CHECK-BE-NEXT:    sth r10, -16(r1)
-; CHECK-BE-NEXT:    sth r9, -32(r1)
-; CHECK-BE-NEXT:    sth r8, -48(r1)
-; CHECK-BE-NEXT:    sth r7, -64(r1)
-; CHECK-BE-NEXT:    sth r6, -80(r1)
-; CHECK-BE-NEXT:    sth r5, -96(r1)
-; CHECK-BE-NEXT:    sth r4, -112(r1)
-; CHECK-BE-NEXT:    sth r3, -128(r1)
-; CHECK-BE-NEXT:    addi r3, r1, -16
-; CHECK-BE-NEXT:    lxvw4x v2, 0, r3
+; CHECK-BE-NEXT:    sldi r10, r10, 48
+; CHECK-BE-NEXT:    sldi r9, r9, 48
+; CHECK-BE-NEXT:    sldi r8, r8, 48
+; CHECK-BE-NEXT:    sldi r7, r7, 48
+; CHECK-BE-NEXT:    sldi r6, r6, 48
+; CHECK-BE-NEXT:    sldi r5, r5, 48
+; CHECK-BE-NEXT:    sldi r4, r4, 48
+; CHECK-BE-NEXT:    sldi r3, r3, 48
+; CHECK-BE-NEXT:    addi r11, r1, -16
+; CHECK-BE-NEXT:    std r10, -16(r1)
+; CHECK-BE-NEXT:    std r10, -8(r1)
+; CHECK-BE-NEXT:    std r9, -32(r1)
+; CHECK-BE-NEXT:    std r9, -24(r1)
+; CHECK-BE-NEXT:    std r8, -48(r1)
+; CHECK-BE-NEXT:    std r8, -40(r1)
+; CHECK-BE-NEXT:    std r7, -64(r1)
+; CHECK-BE-NEXT:    std r7, -56(r1)
+; CHECK-BE-NEXT:    std r6, -80(r1)
+; CHECK-BE-NEXT:    std r6, -72(r1)
+; CHECK-BE-NEXT:    std r5, -96(r1)
+; CHECK-BE-NEXT:    std r5, -88(r1)
+; CHECK-BE-NEXT:    std r4, -112(r1)
+; CHECK-BE-NEXT:    std r4, -104(r1)
+; CHECK-BE-NEXT:    std r3, -128(r1)
+; CHECK-BE-NEXT:    std r3, -120(r1)
 ; CHECK-BE-NEXT:    addi r3, r1, -32
 ; CHECK-BE-NEXT:    lxvw4x v3, 0, r3
 ; CHECK-BE-NEXT:    addi r3, r1, -48
@@ -159,6 +190,7 @@ define dso_local <8 x i16> @test8x24(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5
 ; CHECK-BE-NEXT:    addi r3, r1, -112
 ; CHECK-BE-NEXT:    lxvw4x v6, 0, r3
 ; CHECK-BE-NEXT:    addi r3, r1, -128
+; CHECK-BE-NEXT:    lxvw4x v2, 0, r11
 ; CHECK-BE-NEXT:    lxvw4x v7, 0, r3
 ; CHECK-BE-NEXT:    vmrghh v2, v3, v2
 ; CHECK-BE-NEXT:    vmrghh v3, v5, v4
diff --git a/llvm/test/CodeGen/PowerPC/vsx.ll b/llvm/test/CodeGen/PowerPC/vsx.ll
index d1d29a0f884c6..14b3d69f8c273 100644
--- a/llvm/test/CodeGen/PowerPC/vsx.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx.ll
@@ -2383,7 +2383,9 @@ define <2 x double> @test70(<2 x i8> %a) {
 define <2 x i32> @test80(i32 %v) {
 ; CHECK-LABEL: test80:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    stw r3, -16(r1)
+; CHECK-NEXT:    sldi r3, r3, 32
+; CHECK-NEXT:    std r3, -16(r1)
+; CHECK-NEXT:    std r3, -8(r1)
 ; CHECK-NEXT:    addi r3, r1, -16
 ; CHECK-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-NEXT:    addis r3, r2, .LCPI65_0@toc@ha
@@ -2395,7 +2397,9 @@ define <2 x i32> @test80(i32 %v) {
 ;
 ; CHECK-REG-LABEL: test80:
 ; CHECK-REG:       # %bb.0:
-; CHECK-REG-NEXT:    stw r3, -16(r1)
+; CHECK-REG-NEXT:    sldi r3, r3, 32
+; CHECK-REG-NEXT:    std r3, -16(r1)
+; CHECK-REG-NEXT:    std r3, -8(r1)
 ; CHECK-REG-NEXT:    addi r3, r1, -16
 ; CHECK-REG-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-REG-NEXT:    addis r3, r2, .LCPI65_0@toc@ha
@@ -2407,8 +2411,12 @@ define <2 x i32> @test80(i32 %v) {
 ;
 ; CHECK-FISL-LABEL: test80:
 ; CHECK-FISL:       # %bb.0:
-; CHECK-FISL-NEXT:    # kill: def $r3 killed $r3 killed $x3
-; CHECK-FISL-NEXT:    stw r3, -16(r1)
+; CHECK-FISL-NEXT:    mr r4, r3
+; CHECK-FISL-NEXT:    # implicit-def: $x3
+; CHECK-FISL-NEXT:    mr r3, r4
+; CHECK-FISL-NEXT:    sldi r3, r3, 32
+; CHECK-FISL-NEXT:    std r3, -8(r1)
+; CHECK-FISL-NEXT:    std r3, -16(r1)
 ; CHECK-FISL-NEXT:    addi r3, r1, -16
 ; CHECK-FISL-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-FISL-NEXT:    xxspltw v2, vs0, 0
diff --git a/llvm/test/CodeGen/PowerPC/widen-vec-correctly-be.ll b/llvm/test/CodeGen/PowerPC/widen-vec-correctly-be.ll
index c31b2faedd51d..069e734088a84 100644
--- a/llvm/test/CodeGen/PowerPC/widen-vec-correctly-be.ll
+++ b/llvm/test/CodeGen/PowerPC/widen-vec-correctly-be.ll
@@ -13,7 +13,9 @@ define void @test() local_unnamed_addr #0 align 2 {
 ; CHECK-BE-NEXT:    addi r3, r3, 1
 ; CHECK-BE-NEXT:    xxlxor vs1, vs1, vs1
 ; CHECK-BE-NEXT:    vsrw v2, v2, v2
-; CHECK-BE-NEXT:    sth r3, -32(r1)
+; CHECK-BE-NEXT:    sldi r3, r3, 48
+; CHECK-BE-NEXT:    std r3, -32(r1)
+; CHECK-BE-NEXT:    std r3, -24(r1)
 ; CHECK-BE-NEXT:    addi r3, r1, -32
 ; CHECK-BE-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-BE-NEXT:    addi r3, r1, -16

>From d58d8492763d692a4858899cb4485d0b6967f660 Mon Sep 17 00:00:00 2001
From: Roland Froese <froese at ca.ibm.com>
Date: Wed, 7 Aug 2024 17:50:31 +0000
Subject: [PATCH 2/2] fix formatting

---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 23 ++++++++++-----------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index d1f29a79c9668..3228a89ea5ce1 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -105,7 +105,8 @@ using namespace llvm;
 
 #define DEBUG_TYPE "ppc-lowering"
 
-static cl::opt<bool> DisableP10StoreForward("disable-p10-store-forward",
+static cl::opt<bool> DisableP10StoreForward(
+    "disable-p10-store-forward",
     cl::desc("disable P10 store forward-friendly conversion"), cl::Hidden,
     cl::init(false));
 
@@ -11503,23 +11504,21 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
       ValVT.getSizeInBits() <= 64) {
     Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, Val);
     EVT ShiftAmountTy = getShiftAmountTy(MVT::i64, DAG.getDataLayout());
-    SDValue ShiftBy =
-        DAG.getConstant(64 - Op.getValueType().getScalarSizeInBits(),
-                        dl, ShiftAmountTy);
+    SDValue ShiftBy = DAG.getConstant(
+        64 - Op.getValueType().getScalarSizeInBits(), dl, ShiftAmountTy);
     Val = DAG.getNode(ISD::SHL, dl, MVT::i64, Val, ShiftBy);
-    SDValue Plus8 = DAG.getNode(ISD::ADD, dl, PtrVT, FIdx,
-                                DAG.getConstant(8, dl, PtrVT));
-    SDValue Store2 = DAG.getStore(DAG.getEntryNode(), dl, Val, Plus8,
-                                  MachinePointerInfo());
-    SDValue Store = DAG.getStore(Store2, dl, Val, FIdx,
-                                 MachinePointerInfo());
+    SDValue Plus8 =
+        DAG.getNode(ISD::ADD, dl, PtrVT, FIdx, DAG.getConstant(8, dl, PtrVT));
+    SDValue Store2 =
+        DAG.getStore(DAG.getEntryNode(), dl, Val, Plus8, MachinePointerInfo());
+    SDValue Store = DAG.getStore(Store2, dl, Val, FIdx, MachinePointerInfo());
     return DAG.getLoad(Op.getValueType(), dl, Store, FIdx,
                        MachinePointerInfo());
   }
 
   // Store the input value into Value#0 of the stack slot.
-  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Val, FIdx,
-                               MachinePointerInfo());
+  SDValue Store =
+      DAG.getStore(DAG.getEntryNode(), dl, Val, FIdx, MachinePointerInfo());
   // Load it out.
   return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());
 }



More information about the llvm-commits mailing list