[llvm] [PowerPC] improve P10 store forwarding on P7 scalar to vector (PR #102330)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 7 10:40:59 PDT 2024
https://github.com/RolandF77 updated https://github.com/llvm/llvm-project/pull/102330
>From a6a70508836addfa5ca59ff2e510da6d96629c42 Mon Sep 17 00:00:00 2001
From: Roland Froese <froese at ca.ibm.com>
Date: Wed, 7 Aug 2024 16:43:56 +0000
Subject: [PATCH 1/2] mitigate P7 scalar to vector LHS
---
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 40 +-
llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll | 8 +-
.../build-vector-from-load-and-zeros.ll | 52 ++-
llvm/test/CodeGen/PowerPC/load-and-splat.ll | 1 +
llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll | 350 +++++++++++-------
.../CodeGen/PowerPC/test-vector-insert.ll | 16 +-
llvm/test/CodeGen/PowerPC/vec-trunc2.ll | 76 ++--
llvm/test/CodeGen/PowerPC/vsx.ll | 16 +-
.../CodeGen/PowerPC/widen-vec-correctly-be.ll | 4 +-
9 files changed, 361 insertions(+), 202 deletions(-)
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 1686ec572c855..d1f29a79c9668 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -105,6 +105,10 @@ using namespace llvm;
#define DEBUG_TYPE "ppc-lowering"
+static cl::opt<bool> DisableP10StoreForward("disable-p10-store-forward",
+ cl::desc("disable P10 store forward-friendly conversion"), cl::Hidden,
+ cl::init(false));
+
static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
@@ -985,6 +989,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
+ // LE is P8+/64-bit so direct moves are supported and these operations
+ // are legal. The custom transformation requires 64-bit since we need a
+ // pair of stores that will cover a 128-bit load for P10.
+ if (!DisableP10StoreForward && isPPC64 && !Subtarget.isLittleEndian()) {
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
+ }
setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
@@ -11479,8 +11491,34 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+ SDValue Val = Op.getOperand(0);
+ EVT ValVT = Val.getValueType();
+ // P10 hardware store forwarding requires that a single store contains all
+ // the data for the load. P10 is able to merge a pair of adjacent stores. Try
+ // to avoid load-hit-store on P10 when running binaries compiled for older
+ // processors by generating two mergeable scalar stores to forward with the
+ // vector load.
+ if (!DisableP10StoreForward && Subtarget.isPPC64() &&
+ !Subtarget.isLittleEndian() && ValVT.isInteger() &&
+ ValVT.getSizeInBits() <= 64) {
+ Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, Val);
+ EVT ShiftAmountTy = getShiftAmountTy(MVT::i64, DAG.getDataLayout());
+ SDValue ShiftBy =
+ DAG.getConstant(64 - Op.getValueType().getScalarSizeInBits(),
+ dl, ShiftAmountTy);
+ Val = DAG.getNode(ISD::SHL, dl, MVT::i64, Val, ShiftBy);
+ SDValue Plus8 = DAG.getNode(ISD::ADD, dl, PtrVT, FIdx,
+ DAG.getConstant(8, dl, PtrVT));
+ SDValue Store2 = DAG.getStore(DAG.getEntryNode(), dl, Val, Plus8,
+ MachinePointerInfo());
+ SDValue Store = DAG.getStore(Store2, dl, Val, FIdx,
+ MachinePointerInfo());
+ return DAG.getLoad(Op.getValueType(), dl, Store, FIdx,
+ MachinePointerInfo());
+ }
+
// Store the input value into Value#0 of the stack slot.
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Val, FIdx,
MachinePointerInfo());
// Load it out.
return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());
diff --git a/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll b/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
index 05edf92d72498..f5515e8ba19bd 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
@@ -14,7 +14,9 @@ define void @test_aix_splatimm(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-AIX-NEXT: slwi 3, 3, 8
; CHECK-AIX-NEXT: neg 3, 3
; CHECK-AIX-NEXT: lwz 6, 0(3)
-; CHECK-AIX-NEXT: sth 3, -16(1)
+; CHECK-AIX-NEXT: sldi 3, 3, 48
+; CHECK-AIX-NEXT: std 3, -16(1)
+; CHECK-AIX-NEXT: std 3, -8(1)
; CHECK-AIX-NEXT: addi 3, 1, -16
; CHECK-AIX-NEXT: lxvw4x 34, 0, 3
; CHECK-AIX-NEXT: srwi 3, 4, 16
@@ -24,9 +26,11 @@ define void @test_aix_splatimm(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-AIX-NEXT: mullw 3, 3, 4
; CHECK-AIX-NEXT: li 4, 0
; CHECK-AIX-NEXT: neg 3, 3
+; CHECK-AIX-NEXT: sldi 3, 3, 48
; CHECK-AIX-NEXT: vsplth 2, 2, 0
; CHECK-AIX-NEXT: stxvw4x 34, 0, 4
-; CHECK-AIX-NEXT: sth 3, -32(1)
+; CHECK-AIX-NEXT: std 3, -32(1)
+; CHECK-AIX-NEXT: std 3, -24(1)
; CHECK-AIX-NEXT: addi 3, 1, -32
; CHECK-AIX-NEXT: lxvw4x 34, 0, 3
; CHECK-AIX-NEXT: vsplth 2, 2, 0
diff --git a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
index cc32a76b22c28..6d35a7281de6b 100644
--- a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
+++ b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
@@ -338,17 +338,16 @@ define <4 x i32> @build_v4i32_load_0(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_0:
; PWR7-BE: # %bb.0: # %entry
; PWR7-BE-NEXT: lwz 3, 0(3)
-; PWR7-BE-NEXT: li 4, 0
-; PWR7-BE-NEXT: stw 4, -16(1)
-; PWR7-BE-NEXT: stw 3, -32(1)
+; PWR7-BE-NEXT: xxlxor 36, 36, 36
+; PWR7-BE-NEXT: sldi 3, 3, 32
+; PWR7-BE-NEXT: std 3, -32(1)
+; PWR7-BE-NEXT: std 3, -24(1)
; PWR7-BE-NEXT: addis 3, 2, .LCPI8_0 at toc@ha
; PWR7-BE-NEXT: addi 3, 3, .LCPI8_0 at toc@l
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
-; PWR7-BE-NEXT: addi 3, 1, -16
-; PWR7-BE-NEXT: lxvw4x 35, 0, 3
; PWR7-BE-NEXT: addi 3, 1, -32
-; PWR7-BE-NEXT: lxvw4x 36, 0, 3
-; PWR7-BE-NEXT: vperm 2, 4, 3, 2
+; PWR7-BE-NEXT: lxvw4x 35, 0, 3
+; PWR7-BE-NEXT: vperm 2, 3, 4, 2
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_0:
@@ -402,17 +401,16 @@ define <4 x i32> @build_v4i32_load_1(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_1:
; PWR7-BE: # %bb.0: # %entry
; PWR7-BE-NEXT: lwz 3, 0(3)
-; PWR7-BE-NEXT: li 4, 0
-; PWR7-BE-NEXT: stw 4, -32(1)
-; PWR7-BE-NEXT: stw 3, -16(1)
+; PWR7-BE-NEXT: xxlxor 36, 36, 36
+; PWR7-BE-NEXT: sldi 3, 3, 32
+; PWR7-BE-NEXT: std 3, -16(1)
+; PWR7-BE-NEXT: std 3, -8(1)
; PWR7-BE-NEXT: addis 3, 2, .LCPI9_0 at toc@ha
; PWR7-BE-NEXT: addi 3, 3, .LCPI9_0 at toc@l
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
-; PWR7-BE-NEXT: addi 3, 1, -32
-; PWR7-BE-NEXT: lxvw4x 35, 0, 3
; PWR7-BE-NEXT: addi 3, 1, -16
-; PWR7-BE-NEXT: lxvw4x 36, 0, 3
-; PWR7-BE-NEXT: vperm 2, 3, 4, 2
+; PWR7-BE-NEXT: lxvw4x 35, 0, 3
+; PWR7-BE-NEXT: vperm 2, 4, 3, 2
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_1:
@@ -466,17 +464,16 @@ define <4 x i32> @build_v4i32_load_2(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_2:
; PWR7-BE: # %bb.0: # %entry
; PWR7-BE-NEXT: lwz 3, 0(3)
-; PWR7-BE-NEXT: li 4, 0
-; PWR7-BE-NEXT: stw 4, -32(1)
-; PWR7-BE-NEXT: stw 3, -16(1)
+; PWR7-BE-NEXT: xxlxor 36, 36, 36
+; PWR7-BE-NEXT: sldi 3, 3, 32
+; PWR7-BE-NEXT: std 3, -16(1)
+; PWR7-BE-NEXT: std 3, -8(1)
; PWR7-BE-NEXT: addis 3, 2, .LCPI10_0 at toc@ha
; PWR7-BE-NEXT: addi 3, 3, .LCPI10_0 at toc@l
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
-; PWR7-BE-NEXT: addi 3, 1, -32
-; PWR7-BE-NEXT: lxvw4x 35, 0, 3
; PWR7-BE-NEXT: addi 3, 1, -16
-; PWR7-BE-NEXT: lxvw4x 36, 0, 3
-; PWR7-BE-NEXT: vperm 2, 3, 4, 2
+; PWR7-BE-NEXT: lxvw4x 35, 0, 3
+; PWR7-BE-NEXT: vperm 2, 4, 3, 2
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_2:
@@ -530,17 +527,16 @@ define <4 x i32> @build_v4i32_load_3(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_3:
; PWR7-BE: # %bb.0: # %entry
; PWR7-BE-NEXT: lwz 3, 0(3)
-; PWR7-BE-NEXT: li 4, 0
-; PWR7-BE-NEXT: stw 4, -32(1)
-; PWR7-BE-NEXT: stw 3, -16(1)
+; PWR7-BE-NEXT: xxlxor 36, 36, 36
+; PWR7-BE-NEXT: sldi 3, 3, 32
+; PWR7-BE-NEXT: std 3, -16(1)
+; PWR7-BE-NEXT: std 3, -8(1)
; PWR7-BE-NEXT: addis 3, 2, .LCPI11_0 at toc@ha
; PWR7-BE-NEXT: addi 3, 3, .LCPI11_0 at toc@l
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
-; PWR7-BE-NEXT: addi 3, 1, -32
-; PWR7-BE-NEXT: lxvw4x 35, 0, 3
; PWR7-BE-NEXT: addi 3, 1, -16
-; PWR7-BE-NEXT: lxvw4x 36, 0, 3
-; PWR7-BE-NEXT: vperm 2, 3, 4, 2
+; PWR7-BE-NEXT: lxvw4x 35, 0, 3
+; PWR7-BE-NEXT: vperm 2, 4, 3, 2
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_3:
diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
index 6d2f3b3abc42d..bc68ad2a67bf5 100644
--- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll
+++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
@@ -591,6 +591,7 @@ define <16 x i8> @adjusted_lxvwsx(ptr %s, ptr %t) {
; P7: # %bb.0: # %entry
; P7-NEXT: ld r3, 0(r3)
; P7-NEXT: std r3, -16(r1)
+; P7-NEXT: std r3, -8(r1)
; P7-NEXT: addi r3, r1, -16
; P7-NEXT: lxvw4x vs0, 0, r3
; P7-NEXT: xxspltw v2, vs0, 1
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
index 7a6640fea2d1e..bf8c5c96ccbde 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
@@ -833,8 +833,18 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
;
; CHECK-PWR7-LABEL: sub_absv_8_ext:
; CHECK-PWR7: # %bb.0: # %entry
-; CHECK-PWR7-NEXT: stdu r1, -400(r1)
-; CHECK-PWR7-NEXT: .cfi_def_cfa_offset 400
+; CHECK-PWR7-NEXT: stdu r1, -512(r1)
+; CHECK-PWR7-NEXT: .cfi_def_cfa_offset 512
+; CHECK-PWR7-NEXT: .cfi_offset r14, -144
+; CHECK-PWR7-NEXT: .cfi_offset r15, -136
+; CHECK-PWR7-NEXT: .cfi_offset r16, -128
+; CHECK-PWR7-NEXT: .cfi_offset r17, -120
+; CHECK-PWR7-NEXT: .cfi_offset r18, -112
+; CHECK-PWR7-NEXT: .cfi_offset r19, -104
+; CHECK-PWR7-NEXT: .cfi_offset r20, -96
+; CHECK-PWR7-NEXT: .cfi_offset r21, -88
+; CHECK-PWR7-NEXT: .cfi_offset r22, -80
+; CHECK-PWR7-NEXT: .cfi_offset r23, -72
; CHECK-PWR7-NEXT: .cfi_offset r24, -64
; CHECK-PWR7-NEXT: .cfi_offset r25, -56
; CHECK-PWR7-NEXT: .cfi_offset r26, -48
@@ -842,184 +852,244 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
; CHECK-PWR7-NEXT: .cfi_offset r28, -32
; CHECK-PWR7-NEXT: .cfi_offset r29, -24
; CHECK-PWR7-NEXT: .cfi_offset r30, -16
-; CHECK-PWR7-NEXT: addi r3, r1, 304
-; CHECK-PWR7-NEXT: std r24, 336(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r25, 344(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r26, 352(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r27, 360(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r28, 368(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r29, 376(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r30, 384(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: stxvw4x v2, 0, r3
+; CHECK-PWR7-NEXT: .cfi_offset r31, -8
+; CHECK-PWR7-NEXT: .cfi_offset r2, -152
; CHECK-PWR7-NEXT: addi r3, r1, 320
-; CHECK-PWR7-NEXT: lbz r4, 304(r1)
-; CHECK-PWR7-NEXT: stxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: lbz r5, 305(r1)
-; CHECK-PWR7-NEXT: lbz r6, 321(r1)
-; CHECK-PWR7-NEXT: lbz r7, 306(r1)
-; CHECK-PWR7-NEXT: lbz r8, 322(r1)
-; CHECK-PWR7-NEXT: lbz r9, 307(r1)
-; CHECK-PWR7-NEXT: lbz r10, 323(r1)
-; CHECK-PWR7-NEXT: lbz r0, 309(r1)
-; CHECK-PWR7-NEXT: lbz r30, 325(r1)
-; CHECK-PWR7-NEXT: lbz r29, 310(r1)
-; CHECK-PWR7-NEXT: lbz r28, 326(r1)
-; CHECK-PWR7-NEXT: lbz r11, 308(r1)
-; CHECK-PWR7-NEXT: lbz r12, 324(r1)
-; CHECK-PWR7-NEXT: lbz r27, 311(r1)
-; CHECK-PWR7-NEXT: lbz r26, 327(r1)
-; CHECK-PWR7-NEXT: lbz r25, 312(r1)
-; CHECK-PWR7-NEXT: sub r5, r5, r6
-; CHECK-PWR7-NEXT: sub r6, r7, r8
-; CHECK-PWR7-NEXT: sub r7, r9, r10
-; CHECK-PWR7-NEXT: sub r9, r0, r30
-; CHECK-PWR7-NEXT: sub r10, r29, r28
-; CHECK-PWR7-NEXT: sub r8, r11, r12
-; CHECK-PWR7-NEXT: srawi r0, r5, 31
-; CHECK-PWR7-NEXT: srawi r30, r6, 31
-; CHECK-PWR7-NEXT: srawi r29, r7, 31
-; CHECK-PWR7-NEXT: srawi r28, r8, 31
-; CHECK-PWR7-NEXT: sub r11, r27, r26
-; CHECK-PWR7-NEXT: srawi r27, r9, 31
-; CHECK-PWR7-NEXT: lbz r24, 328(r1)
-; CHECK-PWR7-NEXT: xor r5, r5, r0
-; CHECK-PWR7-NEXT: xor r6, r6, r30
-; CHECK-PWR7-NEXT: xor r7, r7, r29
-; CHECK-PWR7-NEXT: xor r8, r8, r28
-; CHECK-PWR7-NEXT: xor r9, r9, r27
-; CHECK-PWR7-NEXT: srawi r26, r10, 31
-; CHECK-PWR7-NEXT: sub r5, r5, r0
-; CHECK-PWR7-NEXT: sub r6, r6, r30
-; CHECK-PWR7-NEXT: lbz r0, 313(r1)
-; CHECK-PWR7-NEXT: lbz r30, 329(r1)
-; CHECK-PWR7-NEXT: sub r7, r7, r29
-; CHECK-PWR7-NEXT: lbz r29, 330(r1)
-; CHECK-PWR7-NEXT: sub r8, r8, r28
-; CHECK-PWR7-NEXT: lbz r28, 331(r1)
-; CHECK-PWR7-NEXT: sub r9, r9, r27
-; CHECK-PWR7-NEXT: lbz r27, 332(r1)
-; CHECK-PWR7-NEXT: xor r10, r10, r26
-; CHECK-PWR7-NEXT: sub r10, r10, r26
-; CHECK-PWR7-NEXT: lbz r26, 333(r1)
-; CHECK-PWR7-NEXT: sub r12, r25, r24
-; CHECK-PWR7-NEXT: srawi r25, r11, 31
+; CHECK-PWR7-NEXT: std r14, 368(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r15, 376(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r16, 384(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r17, 392(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r18, 400(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r19, 408(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r20, 416(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r21, 424(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r22, 432(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r23, 440(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r24, 448(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r25, 456(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r26, 464(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r27, 472(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r28, 480(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r29, 488(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r30, 496(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r31, 504(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r2, 360(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: stxvw4x v2, 0, r3
; CHECK-PWR7-NEXT: lbz r3, 320(r1)
+; CHECK-PWR7-NEXT: addi r4, r1, 336
+; CHECK-PWR7-NEXT: stw r3, 60(r1) # 4-byte Folded Spill
+; CHECK-PWR7-NEXT: stxvw4x v3, 0, r4
+; CHECK-PWR7-NEXT: lbz r15, 334(r1)
+; CHECK-PWR7-NEXT: lbz r14, 350(r1)
+; CHECK-PWR7-NEXT: lbz r31, 335(r1)
+; CHECK-PWR7-NEXT: lbz r2, 351(r1)
+; CHECK-PWR7-NEXT: sub r15, r15, r14
+; CHECK-PWR7-NEXT: sub r14, r31, r2
+; CHECK-PWR7-NEXT: srawi r2, r14, 31
+; CHECK-PWR7-NEXT: xor r14, r14, r2
+; CHECK-PWR7-NEXT: lbz r3, 333(r1)
+; CHECK-PWR7-NEXT: lbz r19, 331(r1)
+; CHECK-PWR7-NEXT: lbz r18, 347(r1)
+; CHECK-PWR7-NEXT: sub r19, r19, r18
+; CHECK-PWR7-NEXT: lbz r17, 332(r1)
+; CHECK-PWR7-NEXT: lbz r16, 348(r1)
+; CHECK-PWR7-NEXT: sub r17, r17, r16
+; CHECK-PWR7-NEXT: lbz r23, 329(r1)
+; CHECK-PWR7-NEXT: sub r14, r14, r2
+; CHECK-PWR7-NEXT: lbz r2, 349(r1)
+; CHECK-PWR7-NEXT: lbz r22, 345(r1)
+; CHECK-PWR7-NEXT: lbz r4, 336(r1)
+; CHECK-PWR7-NEXT: lbz r5, 321(r1)
+; CHECK-PWR7-NEXT: lbz r6, 337(r1)
+; CHECK-PWR7-NEXT: lbz r7, 322(r1)
+; CHECK-PWR7-NEXT: lbz r8, 338(r1)
+; CHECK-PWR7-NEXT: lbz r9, 323(r1)
+; CHECK-PWR7-NEXT: lbz r10, 339(r1)
+; CHECK-PWR7-NEXT: lbz r11, 324(r1)
+; CHECK-PWR7-NEXT: lbz r12, 340(r1)
+; CHECK-PWR7-NEXT: lbz r0, 325(r1)
+; CHECK-PWR7-NEXT: lbz r30, 341(r1)
+; CHECK-PWR7-NEXT: lbz r29, 326(r1)
+; CHECK-PWR7-NEXT: lbz r28, 342(r1)
+; CHECK-PWR7-NEXT: lbz r27, 327(r1)
+; CHECK-PWR7-NEXT: lbz r26, 343(r1)
+; CHECK-PWR7-NEXT: sub r3, r3, r2
+; CHECK-PWR7-NEXT: lbz r25, 328(r1)
+; CHECK-PWR7-NEXT: lbz r24, 344(r1)
+; CHECK-PWR7-NEXT: lbz r21, 330(r1)
+; CHECK-PWR7-NEXT: lbz r20, 346(r1)
+; CHECK-PWR7-NEXT: sub r5, r5, r6
+; CHECK-PWR7-NEXT: srawi r18, r3, 31
+; CHECK-PWR7-NEXT: sub r7, r7, r8
+; CHECK-PWR7-NEXT: sub r9, r9, r10
+; CHECK-PWR7-NEXT: sub r11, r11, r12
; CHECK-PWR7-NEXT: sub r0, r0, r30
-; CHECK-PWR7-NEXT: xor r11, r11, r25
-; CHECK-PWR7-NEXT: sub r11, r11, r25
-; CHECK-PWR7-NEXT: lbz r25, 334(r1)
-; CHECK-PWR7-NEXT: sub r4, r4, r3
-; CHECK-PWR7-NEXT: srawi r30, r0, 31
-; CHECK-PWR7-NEXT: srawi r24, r12, 31
-; CHECK-PWR7-NEXT: xor r12, r12, r24
-; CHECK-PWR7-NEXT: sub r12, r12, r24
-; CHECK-PWR7-NEXT: lbz r24, 335(r1)
-; CHECK-PWR7-NEXT: srawi r3, r4, 31
-; CHECK-PWR7-NEXT: xor r4, r4, r3
-; CHECK-PWR7-NEXT: xor r0, r0, r30
-; CHECK-PWR7-NEXT: sub r3, r4, r3
-; CHECK-PWR7-NEXT: stb r3, 48(r1)
-; CHECK-PWR7-NEXT: addi r3, r1, 288
-; CHECK-PWR7-NEXT: stb r12, 176(r1)
-; CHECK-PWR7-NEXT: sub r0, r0, r30
-; CHECK-PWR7-NEXT: lbz r30, 314(r1)
-; CHECK-PWR7-NEXT: stb r11, 160(r1)
-; CHECK-PWR7-NEXT: sub r30, r30, r29
-; CHECK-PWR7-NEXT: stb r0, 192(r1)
-; CHECK-PWR7-NEXT: stb r10, 144(r1)
-; CHECK-PWR7-NEXT: stb r9, 128(r1)
-; CHECK-PWR7-NEXT: stb r8, 112(r1)
-; CHECK-PWR7-NEXT: stb r7, 96(r1)
-; CHECK-PWR7-NEXT: stb r6, 80(r1)
-; CHECK-PWR7-NEXT: srawi r29, r30, 31
-; CHECK-PWR7-NEXT: stb r5, 64(r1)
-; CHECK-PWR7-NEXT: xor r30, r30, r29
-; CHECK-PWR7-NEXT: sub r30, r30, r29
-; CHECK-PWR7-NEXT: lbz r29, 315(r1)
; CHECK-PWR7-NEXT: sub r29, r29, r28
-; CHECK-PWR7-NEXT: stb r30, 208(r1)
-; CHECK-PWR7-NEXT: ld r30, 384(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r28, r29, 31
-; CHECK-PWR7-NEXT: xor r29, r29, r28
-; CHECK-PWR7-NEXT: sub r29, r29, r28
-; CHECK-PWR7-NEXT: lbz r28, 316(r1)
-; CHECK-PWR7-NEXT: sub r28, r28, r27
-; CHECK-PWR7-NEXT: stb r29, 224(r1)
-; CHECK-PWR7-NEXT: ld r29, 376(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r27, r28, 31
-; CHECK-PWR7-NEXT: xor r28, r28, r27
-; CHECK-PWR7-NEXT: sub r28, r28, r27
-; CHECK-PWR7-NEXT: lbz r27, 317(r1)
; CHECK-PWR7-NEXT: sub r27, r27, r26
-; CHECK-PWR7-NEXT: stb r28, 240(r1)
-; CHECK-PWR7-NEXT: ld r28, 368(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r26, r27, 31
-; CHECK-PWR7-NEXT: xor r27, r27, r26
-; CHECK-PWR7-NEXT: sub r27, r27, r26
-; CHECK-PWR7-NEXT: lbz r26, 318(r1)
-; CHECK-PWR7-NEXT: sub r26, r26, r25
-; CHECK-PWR7-NEXT: stb r27, 256(r1)
-; CHECK-PWR7-NEXT: ld r27, 360(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r25, r26, 31
-; CHECK-PWR7-NEXT: xor r26, r26, r25
-; CHECK-PWR7-NEXT: sub r26, r26, r25
-; CHECK-PWR7-NEXT: lbz r25, 319(r1)
; CHECK-PWR7-NEXT: sub r25, r25, r24
-; CHECK-PWR7-NEXT: stb r26, 272(r1)
-; CHECK-PWR7-NEXT: ld r26, 352(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: srawi r31, r15, 31
+; CHECK-PWR7-NEXT: ld r2, 360(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: xor r3, r3, r18
+; CHECK-PWR7-NEXT: srawi r6, r5, 31
+; CHECK-PWR7-NEXT: srawi r8, r7, 31
+; CHECK-PWR7-NEXT: srawi r10, r9, 31
+; CHECK-PWR7-NEXT: srawi r12, r11, 31
+; CHECK-PWR7-NEXT: srawi r30, r0, 31
+; CHECK-PWR7-NEXT: sub r3, r3, r18
+; CHECK-PWR7-NEXT: srawi r18, r19, 31
+; CHECK-PWR7-NEXT: srawi r28, r29, 31
+; CHECK-PWR7-NEXT: ld r16, 384(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: sldi r3, r3, 56
+; CHECK-PWR7-NEXT: srawi r26, r27, 31
; CHECK-PWR7-NEXT: srawi r24, r25, 31
+; CHECK-PWR7-NEXT: xor r19, r19, r18
+; CHECK-PWR7-NEXT: xor r15, r15, r31
+; CHECK-PWR7-NEXT: xor r5, r5, r6
+; CHECK-PWR7-NEXT: std r3, 272(r1)
+; CHECK-PWR7-NEXT: std r3, 280(r1)
+; CHECK-PWR7-NEXT: srawi r3, r17, 31
+; CHECK-PWR7-NEXT: sub r19, r19, r18
+; CHECK-PWR7-NEXT: xor r7, r7, r8
+; CHECK-PWR7-NEXT: sub r15, r15, r31
+; CHECK-PWR7-NEXT: xor r17, r17, r3
+; CHECK-PWR7-NEXT: xor r9, r9, r10
+; CHECK-PWR7-NEXT: xor r11, r11, r12
+; CHECK-PWR7-NEXT: xor r0, r0, r30
+; CHECK-PWR7-NEXT: xor r29, r29, r28
+; CHECK-PWR7-NEXT: xor r27, r27, r26
+; CHECK-PWR7-NEXT: sub r3, r17, r3
; CHECK-PWR7-NEXT: xor r25, r25, r24
; CHECK-PWR7-NEXT: sub r25, r25, r24
-; CHECK-PWR7-NEXT: ld r24, 336(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: stb r25, 288(r1)
-; CHECK-PWR7-NEXT: ld r25, 344(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: sub r27, r27, r26
+; CHECK-PWR7-NEXT: sub r29, r29, r28
+; CHECK-PWR7-NEXT: sldi r3, r3, 56
+; CHECK-PWR7-NEXT: sub r0, r0, r30
+; CHECK-PWR7-NEXT: sub r11, r11, r12
+; CHECK-PWR7-NEXT: sub r9, r9, r10
+; CHECK-PWR7-NEXT: sub r7, r7, r8
+; CHECK-PWR7-NEXT: sub r5, r5, r6
+; CHECK-PWR7-NEXT: sldi r14, r14, 56
+; CHECK-PWR7-NEXT: sldi r15, r15, 56
+; CHECK-PWR7-NEXT: ld r31, 504(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r3, 256(r1)
+; CHECK-PWR7-NEXT: std r3, 264(r1)
+; CHECK-PWR7-NEXT: sldi r3, r19, 56
+; CHECK-PWR7-NEXT: sldi r25, r25, 56
+; CHECK-PWR7-NEXT: sldi r27, r27, 56
+; CHECK-PWR7-NEXT: std r3, 240(r1)
+; CHECK-PWR7-NEXT: std r3, 248(r1)
+; CHECK-PWR7-NEXT: sub r3, r23, r22
+; CHECK-PWR7-NEXT: srawi r23, r3, 31
+; CHECK-PWR7-NEXT: sub r22, r21, r20
+; CHECK-PWR7-NEXT: srawi r21, r22, 31
+; CHECK-PWR7-NEXT: sldi r29, r29, 56
+; CHECK-PWR7-NEXT: sldi r0, r0, 56
+; CHECK-PWR7-NEXT: sldi r11, r11, 56
+; CHECK-PWR7-NEXT: xor r3, r3, r23
+; CHECK-PWR7-NEXT: xor r22, r22, r21
+; CHECK-PWR7-NEXT: sldi r9, r9, 56
+; CHECK-PWR7-NEXT: sldi r7, r7, 56
+; CHECK-PWR7-NEXT: sldi r5, r5, 56
+; CHECK-PWR7-NEXT: ld r30, 496(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r28, 480(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: sub r3, r3, r23
+; CHECK-PWR7-NEXT: sub r22, r22, r21
+; CHECK-PWR7-NEXT: std r14, 304(r1)
+; CHECK-PWR7-NEXT: ld r26, 464(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: sldi r3, r3, 56
+; CHECK-PWR7-NEXT: sldi r22, r22, 56
+; CHECK-PWR7-NEXT: ld r24, 448(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r23, 440(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r14, 312(r1)
+; CHECK-PWR7-NEXT: std r15, 288(r1)
+; CHECK-PWR7-NEXT: std r3, 208(r1)
+; CHECK-PWR7-NEXT: std r3, 216(r1)
+; CHECK-PWR7-NEXT: lwz r3, 60(r1) # 4-byte Folded Reload
+; CHECK-PWR7-NEXT: std r15, 296(r1)
+; CHECK-PWR7-NEXT: ld r21, 424(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r20, 416(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r22, 224(r1)
+; CHECK-PWR7-NEXT: std r22, 232(r1)
+; CHECK-PWR7-NEXT: sub r4, r3, r4
+; CHECK-PWR7-NEXT: std r25, 192(r1)
+; CHECK-PWR7-NEXT: ld r22, 432(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r19, 408(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: srawi r3, r4, 31
+; CHECK-PWR7-NEXT: std r25, 200(r1)
+; CHECK-PWR7-NEXT: ld r25, 456(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r27, 176(r1)
+; CHECK-PWR7-NEXT: std r27, 184(r1)
+; CHECK-PWR7-NEXT: xor r4, r4, r3
+; CHECK-PWR7-NEXT: std r29, 160(r1)
+; CHECK-PWR7-NEXT: ld r27, 472(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r29, 168(r1)
+; CHECK-PWR7-NEXT: std r0, 144(r1)
+; CHECK-PWR7-NEXT: sub r3, r4, r3
+; CHECK-PWR7-NEXT: std r0, 152(r1)
+; CHECK-PWR7-NEXT: ld r29, 488(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r18, 400(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: sldi r3, r3, 56
+; CHECK-PWR7-NEXT: std r11, 128(r1)
+; CHECK-PWR7-NEXT: ld r17, 392(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r11, 136(r1)
+; CHECK-PWR7-NEXT: std r9, 112(r1)
+; CHECK-PWR7-NEXT: std r3, 64(r1)
+; CHECK-PWR7-NEXT: std r3, 72(r1)
+; CHECK-PWR7-NEXT: addi r3, r1, 304
+; CHECK-PWR7-NEXT: std r9, 120(r1)
+; CHECK-PWR7-NEXT: ld r15, 376(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r7, 96(r1)
+; CHECK-PWR7-NEXT: std r7, 104(r1)
+; CHECK-PWR7-NEXT: std r5, 80(r1)
+; CHECK-PWR7-NEXT: std r5, 88(r1)
; CHECK-PWR7-NEXT: lxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 272
+; CHECK-PWR7-NEXT: addi r3, r1, 288
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 256
+; CHECK-PWR7-NEXT: addi r3, r1, 272
+; CHECK-PWR7-NEXT: ld r14, 368(r1) # 8-byte Folded Reload
; CHECK-PWR7-NEXT: vmrghb v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 240
+; CHECK-PWR7-NEXT: addi r3, r1, 256
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 224
+; CHECK-PWR7-NEXT: addi r3, r1, 240
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
; CHECK-PWR7-NEXT: vmrghh v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 208
+; CHECK-PWR7-NEXT: addi r3, r1, 224
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 192
+; CHECK-PWR7-NEXT: addi r3, r1, 208
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 176
+; CHECK-PWR7-NEXT: addi r3, r1, 192
; CHECK-PWR7-NEXT: lxvw4x v5, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 160
+; CHECK-PWR7-NEXT: addi r3, r1, 176
; CHECK-PWR7-NEXT: vmrghb v4, v5, v4
; CHECK-PWR7-NEXT: vmrghh v3, v4, v3
; CHECK-PWR7-NEXT: xxmrghw vs0, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 144
+; CHECK-PWR7-NEXT: addi r3, r1, 160
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 128
+; CHECK-PWR7-NEXT: addi r3, r1, 144
; CHECK-PWR7-NEXT: vmrghb v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 112
+; CHECK-PWR7-NEXT: addi r3, r1, 128
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 96
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
+; CHECK-PWR7-NEXT: addi r3, r1, 112
; CHECK-PWR7-NEXT: vmrghh v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 80
+; CHECK-PWR7-NEXT: addi r3, r1, 96
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 64
+; CHECK-PWR7-NEXT: addi r3, r1, 80
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 48
+; CHECK-PWR7-NEXT: addi r3, r1, 64
; CHECK-PWR7-NEXT: lxvw4x v5, 0, r3
; CHECK-PWR7-NEXT: vmrghb v4, v5, v4
; CHECK-PWR7-NEXT: vmrghh v3, v4, v3
; CHECK-PWR7-NEXT: xxmrghw vs1, v3, v2
; CHECK-PWR7-NEXT: xxmrghd v2, vs1, vs0
-; CHECK-PWR7-NEXT: addi r1, r1, 400
+; CHECK-PWR7-NEXT: addi r1, r1, 512
; CHECK-PWR7-NEXT: blr
entry:
%vecext = extractelement <16 x i8> %a, i32 0
diff --git a/llvm/test/CodeGen/PowerPC/test-vector-insert.ll b/llvm/test/CodeGen/PowerPC/test-vector-insert.ll
index 087f2244f0f7d..73b4ad8a507b8 100644
--- a/llvm/test/CodeGen/PowerPC/test-vector-insert.ll
+++ b/llvm/test/CodeGen/PowerPC/test-vector-insert.ll
@@ -60,7 +60,9 @@ define dso_local <4 x i32> @test(<4 x i32> %a, double %b) {
; CHECK-BE-P7-NEXT: addi r3, r1, -4
; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3
; CHECK-BE-P7-NEXT: lwz r3, -4(r1)
-; CHECK-BE-P7-NEXT: stw r3, -32(r1)
+; CHECK-BE-P7-NEXT: sldi r3, r3, 32
+; CHECK-BE-P7-NEXT: std r3, -32(r1)
+; CHECK-BE-P7-NEXT: std r3, -24(r1)
; CHECK-BE-P7-NEXT: addis r3, r2, .LCPI0_0 at toc@ha
; CHECK-BE-P7-NEXT: addi r3, r3, .LCPI0_0 at toc@l
; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3
@@ -129,7 +131,9 @@ define dso_local <4 x i32> @test2(<4 x i32> %a, float %b) {
; CHECK-BE-P7-NEXT: addi r3, r1, -4
; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3
; CHECK-BE-P7-NEXT: lwz r3, -4(r1)
-; CHECK-BE-P7-NEXT: stw r3, -32(r1)
+; CHECK-BE-P7-NEXT: sldi r3, r3, 32
+; CHECK-BE-P7-NEXT: std r3, -32(r1)
+; CHECK-BE-P7-NEXT: std r3, -24(r1)
; CHECK-BE-P7-NEXT: addis r3, r2, .LCPI1_0 at toc@ha
; CHECK-BE-P7-NEXT: addi r3, r3, .LCPI1_0 at toc@l
; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3
@@ -198,7 +202,9 @@ define dso_local <4 x i32> @test3(<4 x i32> %a, double %b) {
; CHECK-BE-P7-NEXT: addi r3, r1, -4
; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3
; CHECK-BE-P7-NEXT: lwz r3, -4(r1)
-; CHECK-BE-P7-NEXT: stw r3, -32(r1)
+; CHECK-BE-P7-NEXT: sldi r3, r3, 32
+; CHECK-BE-P7-NEXT: std r3, -32(r1)
+; CHECK-BE-P7-NEXT: std r3, -24(r1)
; CHECK-BE-P7-NEXT: addis r3, r2, .LCPI2_0 at toc@ha
; CHECK-BE-P7-NEXT: addi r3, r3, .LCPI2_0 at toc@l
; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3
@@ -267,7 +273,9 @@ define dso_local <4 x i32> @test4(<4 x i32> %a, float %b) {
; CHECK-BE-P7-NEXT: addi r3, r1, -4
; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3
; CHECK-BE-P7-NEXT: lwz r3, -4(r1)
-; CHECK-BE-P7-NEXT: stw r3, -32(r1)
+; CHECK-BE-P7-NEXT: sldi r3, r3, 32
+; CHECK-BE-P7-NEXT: std r3, -32(r1)
+; CHECK-BE-P7-NEXT: std r3, -24(r1)
; CHECK-BE-P7-NEXT: addis r3, r2, .LCPI3_0 at toc@ha
; CHECK-BE-P7-NEXT: addi r3, r3, .LCPI3_0 at toc@l
; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3
diff --git a/llvm/test/CodeGen/PowerPC/vec-trunc2.ll b/llvm/test/CodeGen/PowerPC/vec-trunc2.ll
index 7a09d5a5e8bb2..b81bc9f21423f 100644
--- a/llvm/test/CodeGen/PowerPC/vec-trunc2.ll
+++ b/llvm/test/CodeGen/PowerPC/vec-trunc2.ll
@@ -28,16 +28,31 @@ define dso_local <8 x i8> @test8x32(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5,
;
; CHECK-BE-LABEL: test8x32:
; CHECK-BE: # %bb.0:
-; CHECK-BE-NEXT: stw r10, -80(r1)
-; CHECK-BE-NEXT: stw r9, -96(r1)
-; CHECK-BE-NEXT: stw r8, -112(r1)
-; CHECK-BE-NEXT: stw r7, -128(r1)
-; CHECK-BE-NEXT: stw r6, -16(r1)
-; CHECK-BE-NEXT: stw r5, -32(r1)
-; CHECK-BE-NEXT: stw r4, -48(r1)
-; CHECK-BE-NEXT: stw r3, -64(r1)
-; CHECK-BE-NEXT: addi r3, r1, -80
-; CHECK-BE-NEXT: lxvw4x vs0, 0, r3
+; CHECK-BE-NEXT: sldi r10, r10, 32
+; CHECK-BE-NEXT: sldi r9, r9, 32
+; CHECK-BE-NEXT: sldi r8, r8, 32
+; CHECK-BE-NEXT: sldi r7, r7, 32
+; CHECK-BE-NEXT: sldi r6, r6, 32
+; CHECK-BE-NEXT: sldi r5, r5, 32
+; CHECK-BE-NEXT: sldi r4, r4, 32
+; CHECK-BE-NEXT: sldi r3, r3, 32
+; CHECK-BE-NEXT: addi r11, r1, -80
+; CHECK-BE-NEXT: std r10, -80(r1)
+; CHECK-BE-NEXT: std r10, -72(r1)
+; CHECK-BE-NEXT: std r9, -96(r1)
+; CHECK-BE-NEXT: std r9, -88(r1)
+; CHECK-BE-NEXT: std r8, -112(r1)
+; CHECK-BE-NEXT: std r8, -104(r1)
+; CHECK-BE-NEXT: std r7, -128(r1)
+; CHECK-BE-NEXT: std r7, -120(r1)
+; CHECK-BE-NEXT: std r6, -16(r1)
+; CHECK-BE-NEXT: std r6, -8(r1)
+; CHECK-BE-NEXT: std r5, -32(r1)
+; CHECK-BE-NEXT: std r5, -24(r1)
+; CHECK-BE-NEXT: std r4, -48(r1)
+; CHECK-BE-NEXT: std r4, -40(r1)
+; CHECK-BE-NEXT: std r3, -64(r1)
+; CHECK-BE-NEXT: std r3, -56(r1)
; CHECK-BE-NEXT: addi r3, r1, -96
; CHECK-BE-NEXT: lxvw4x vs1, 0, r3
; CHECK-BE-NEXT: addi r3, r1, -112
@@ -51,15 +66,16 @@ define dso_local <8 x i8> @test8x32(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5,
; CHECK-BE-NEXT: addi r3, r1, -48
; CHECK-BE-NEXT: lxvw4x vs6, 0, r3
; CHECK-BE-NEXT: addi r3, r1, -64
+; CHECK-BE-NEXT: lxvw4x vs0, 0, r11
; CHECK-BE-NEXT: lxvw4x vs7, 0, r3
; CHECK-BE-NEXT: addis r3, r2, .LCPI0_0 at toc@ha
+; CHECK-BE-NEXT: addi r3, r3, .LCPI0_0 at toc@l
; CHECK-BE-NEXT: xxmrghw vs0, vs1, vs0
; CHECK-BE-NEXT: xxmrghw vs1, vs3, vs2
; CHECK-BE-NEXT: xxmrghw vs2, vs5, vs4
-; CHECK-BE-NEXT: addi r3, r3, .LCPI0_0 at toc@l
-; CHECK-BE-NEXT: xxmrghd v3, vs1, vs0
; CHECK-BE-NEXT: lxvw4x v2, 0, r3
; CHECK-BE-NEXT: xxmrghw vs3, vs7, vs6
+; CHECK-BE-NEXT: xxmrghd v3, vs1, vs0
; CHECK-BE-NEXT: xxmrghd v4, vs3, vs2
; CHECK-BE-NEXT: vperm v2, v4, v3, v2
; CHECK-BE-NEXT: blr
@@ -136,16 +152,31 @@ define dso_local <8 x i16> @test8x24(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5
;
; CHECK-BE-LABEL: test8x24:
; CHECK-BE: # %bb.0:
-; CHECK-BE-NEXT: sth r10, -16(r1)
-; CHECK-BE-NEXT: sth r9, -32(r1)
-; CHECK-BE-NEXT: sth r8, -48(r1)
-; CHECK-BE-NEXT: sth r7, -64(r1)
-; CHECK-BE-NEXT: sth r6, -80(r1)
-; CHECK-BE-NEXT: sth r5, -96(r1)
-; CHECK-BE-NEXT: sth r4, -112(r1)
-; CHECK-BE-NEXT: sth r3, -128(r1)
-; CHECK-BE-NEXT: addi r3, r1, -16
-; CHECK-BE-NEXT: lxvw4x v2, 0, r3
+; CHECK-BE-NEXT: sldi r10, r10, 48
+; CHECK-BE-NEXT: sldi r9, r9, 48
+; CHECK-BE-NEXT: sldi r8, r8, 48
+; CHECK-BE-NEXT: sldi r7, r7, 48
+; CHECK-BE-NEXT: sldi r6, r6, 48
+; CHECK-BE-NEXT: sldi r5, r5, 48
+; CHECK-BE-NEXT: sldi r4, r4, 48
+; CHECK-BE-NEXT: sldi r3, r3, 48
+; CHECK-BE-NEXT: addi r11, r1, -16
+; CHECK-BE-NEXT: std r10, -16(r1)
+; CHECK-BE-NEXT: std r10, -8(r1)
+; CHECK-BE-NEXT: std r9, -32(r1)
+; CHECK-BE-NEXT: std r9, -24(r1)
+; CHECK-BE-NEXT: std r8, -48(r1)
+; CHECK-BE-NEXT: std r8, -40(r1)
+; CHECK-BE-NEXT: std r7, -64(r1)
+; CHECK-BE-NEXT: std r7, -56(r1)
+; CHECK-BE-NEXT: std r6, -80(r1)
+; CHECK-BE-NEXT: std r6, -72(r1)
+; CHECK-BE-NEXT: std r5, -96(r1)
+; CHECK-BE-NEXT: std r5, -88(r1)
+; CHECK-BE-NEXT: std r4, -112(r1)
+; CHECK-BE-NEXT: std r4, -104(r1)
+; CHECK-BE-NEXT: std r3, -128(r1)
+; CHECK-BE-NEXT: std r3, -120(r1)
; CHECK-BE-NEXT: addi r3, r1, -32
; CHECK-BE-NEXT: lxvw4x v3, 0, r3
; CHECK-BE-NEXT: addi r3, r1, -48
@@ -159,6 +190,7 @@ define dso_local <8 x i16> @test8x24(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5
; CHECK-BE-NEXT: addi r3, r1, -112
; CHECK-BE-NEXT: lxvw4x v6, 0, r3
; CHECK-BE-NEXT: addi r3, r1, -128
+; CHECK-BE-NEXT: lxvw4x v2, 0, r11
; CHECK-BE-NEXT: lxvw4x v7, 0, r3
; CHECK-BE-NEXT: vmrghh v2, v3, v2
; CHECK-BE-NEXT: vmrghh v3, v5, v4
diff --git a/llvm/test/CodeGen/PowerPC/vsx.ll b/llvm/test/CodeGen/PowerPC/vsx.ll
index d1d29a0f884c6..14b3d69f8c273 100644
--- a/llvm/test/CodeGen/PowerPC/vsx.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx.ll
@@ -2383,7 +2383,9 @@ define <2 x double> @test70(<2 x i8> %a) {
define <2 x i32> @test80(i32 %v) {
; CHECK-LABEL: test80:
; CHECK: # %bb.0:
-; CHECK-NEXT: stw r3, -16(r1)
+; CHECK-NEXT: sldi r3, r3, 32
+; CHECK-NEXT: std r3, -16(r1)
+; CHECK-NEXT: std r3, -8(r1)
; CHECK-NEXT: addi r3, r1, -16
; CHECK-NEXT: lxvw4x vs0, 0, r3
; CHECK-NEXT: addis r3, r2, .LCPI65_0@toc@ha
@@ -2395,7 +2397,9 @@ define <2 x i32> @test80(i32 %v) {
;
; CHECK-REG-LABEL: test80:
; CHECK-REG: # %bb.0:
-; CHECK-REG-NEXT: stw r3, -16(r1)
+; CHECK-REG-NEXT: sldi r3, r3, 32
+; CHECK-REG-NEXT: std r3, -16(r1)
+; CHECK-REG-NEXT: std r3, -8(r1)
; CHECK-REG-NEXT: addi r3, r1, -16
; CHECK-REG-NEXT: lxvw4x vs0, 0, r3
; CHECK-REG-NEXT: addis r3, r2, .LCPI65_0@toc@ha
@@ -2407,8 +2411,12 @@ define <2 x i32> @test80(i32 %v) {
;
; CHECK-FISL-LABEL: test80:
; CHECK-FISL: # %bb.0:
-; CHECK-FISL-NEXT: # kill: def $r3 killed $r3 killed $x3
-; CHECK-FISL-NEXT: stw r3, -16(r1)
+; CHECK-FISL-NEXT: mr r4, r3
+; CHECK-FISL-NEXT: # implicit-def: $x3
+; CHECK-FISL-NEXT: mr r3, r4
+; CHECK-FISL-NEXT: sldi r3, r3, 32
+; CHECK-FISL-NEXT: std r3, -8(r1)
+; CHECK-FISL-NEXT: std r3, -16(r1)
; CHECK-FISL-NEXT: addi r3, r1, -16
; CHECK-FISL-NEXT: lxvw4x vs0, 0, r3
; CHECK-FISL-NEXT: xxspltw v2, vs0, 0
diff --git a/llvm/test/CodeGen/PowerPC/widen-vec-correctly-be.ll b/llvm/test/CodeGen/PowerPC/widen-vec-correctly-be.ll
index c31b2faedd51d..069e734088a84 100644
--- a/llvm/test/CodeGen/PowerPC/widen-vec-correctly-be.ll
+++ b/llvm/test/CodeGen/PowerPC/widen-vec-correctly-be.ll
@@ -13,7 +13,9 @@ define void @test() local_unnamed_addr #0 align 2 {
; CHECK-BE-NEXT: addi r3, r3, 1
; CHECK-BE-NEXT: xxlxor vs1, vs1, vs1
; CHECK-BE-NEXT: vsrw v2, v2, v2
-; CHECK-BE-NEXT: sth r3, -32(r1)
+; CHECK-BE-NEXT: sldi r3, r3, 48
+; CHECK-BE-NEXT: std r3, -32(r1)
+; CHECK-BE-NEXT: std r3, -24(r1)
; CHECK-BE-NEXT: addi r3, r1, -32
; CHECK-BE-NEXT: lxvw4x vs0, 0, r3
; CHECK-BE-NEXT: addi r3, r1, -16
>From d58d8492763d692a4858899cb4485d0b6967f660 Mon Sep 17 00:00:00 2001
From: Roland Froese <froese at ca.ibm.com>
Date: Wed, 7 Aug 2024 17:50:31 +0000
Subject: [PATCH 2/2] fix formatting
---
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 23 ++++++++++-----------
1 file changed, 11 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index d1f29a79c9668..3228a89ea5ce1 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -105,7 +105,8 @@ using namespace llvm;
#define DEBUG_TYPE "ppc-lowering"
-static cl::opt<bool> DisableP10StoreForward("disable-p10-store-forward",
+static cl::opt<bool> DisableP10StoreForward(
+ "disable-p10-store-forward",
cl::desc("disable P10 store forward-friendly conversion"), cl::Hidden,
cl::init(false));
@@ -11503,23 +11504,21 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
ValVT.getSizeInBits() <= 64) {
Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, Val);
EVT ShiftAmountTy = getShiftAmountTy(MVT::i64, DAG.getDataLayout());
- SDValue ShiftBy =
- DAG.getConstant(64 - Op.getValueType().getScalarSizeInBits(),
- dl, ShiftAmountTy);
+ SDValue ShiftBy = DAG.getConstant(
+ 64 - Op.getValueType().getScalarSizeInBits(), dl, ShiftAmountTy);
Val = DAG.getNode(ISD::SHL, dl, MVT::i64, Val, ShiftBy);
- SDValue Plus8 = DAG.getNode(ISD::ADD, dl, PtrVT, FIdx,
- DAG.getConstant(8, dl, PtrVT));
- SDValue Store2 = DAG.getStore(DAG.getEntryNode(), dl, Val, Plus8,
- MachinePointerInfo());
- SDValue Store = DAG.getStore(Store2, dl, Val, FIdx,
- MachinePointerInfo());
+ SDValue Plus8 =
+ DAG.getNode(ISD::ADD, dl, PtrVT, FIdx, DAG.getConstant(8, dl, PtrVT));
+ SDValue Store2 =
+ DAG.getStore(DAG.getEntryNode(), dl, Val, Plus8, MachinePointerInfo());
+ SDValue Store = DAG.getStore(Store2, dl, Val, FIdx, MachinePointerInfo());
return DAG.getLoad(Op.getValueType(), dl, Store, FIdx,
MachinePointerInfo());
}
// Store the input value into Value#0 of the stack slot.
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Val, FIdx,
- MachinePointerInfo());
+ SDValue Store =
+ DAG.getStore(DAG.getEntryNode(), dl, Val, FIdx, MachinePointerInfo());
// Load it out.
return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());
}
More information about the llvm-commits
mailing list