[llvm] [PowerPC] improve P10 store forwarding on P7 scalar to vector (PR #102330)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 7 09:44:06 PDT 2024
https://github.com/RolandF77 created https://github.com/llvm/llvm-project/pull/102330
Commercial applications are often built for the oldest supported processor, even though they will usually be run on newer processors. Try to make code built for P7 that contains scalar-to-vector operations run better on P10 by making a small change to the P7 scalar-to-vector lowering so that P10 hardware store forwarding can apply.
From a6a70508836addfa5ca59ff2e510da6d96629c42 Mon Sep 17 00:00:00 2001
From: Roland Froese <froese at ca.ibm.com>
Date: Wed, 7 Aug 2024 16:43:56 +0000
Subject: [PATCH] mitigate P7 scalar to vector LHS
---
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 40 +-
llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll | 8 +-
.../build-vector-from-load-and-zeros.ll | 52 ++-
llvm/test/CodeGen/PowerPC/load-and-splat.ll | 1 +
llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll | 350 +++++++++++-------
.../CodeGen/PowerPC/test-vector-insert.ll | 16 +-
llvm/test/CodeGen/PowerPC/vec-trunc2.ll | 76 ++--
llvm/test/CodeGen/PowerPC/vsx.ll | 16 +-
.../CodeGen/PowerPC/widen-vec-correctly-be.ll | 4 +-
9 files changed, 361 insertions(+), 202 deletions(-)
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 1686ec572c8553..d1f29a79c96689 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -105,6 +105,10 @@ using namespace llvm;
#define DEBUG_TYPE "ppc-lowering"
+static cl::opt<bool> DisableP10StoreForward("disable-p10-store-forward",
+ cl::desc("disable P10 store forward-friendly conversion"), cl::Hidden,
+ cl::init(false));
+
static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
@@ -985,6 +989,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
+ // LE is P8+/64-bit so direct moves are supported and these operations
+ // are legal. The custom transformation requires 64-bit since we need a
+ // pair of stores that will cover a 128-bit load for P10.
+ if (!DisableP10StoreForward && isPPC64 && !Subtarget.isLittleEndian()) {
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
+ }
setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
@@ -11479,8 +11491,34 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+ SDValue Val = Op.getOperand(0);
+ EVT ValVT = Val.getValueType();
+ // P10 hardware store forwarding requires that a single store contains all
+ // the data for the load. P10 is able to merge a pair of adjacent stores. Try
+ // to avoid load hit store on P10 when running binaries compiled for older
+ // processors by generating two mergeable scalar stores to forward with the
+ // vector load.
+ if (!DisableP10StoreForward && Subtarget.isPPC64() &&
+ !Subtarget.isLittleEndian() && ValVT.isInteger() &&
+ ValVT.getSizeInBits() <= 64) {
+ Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, Val);
+ EVT ShiftAmountTy = getShiftAmountTy(MVT::i64, DAG.getDataLayout());
+ SDValue ShiftBy =
+ DAG.getConstant(64 - Op.getValueType().getScalarSizeInBits(),
+ dl, ShiftAmountTy);
+ Val = DAG.getNode(ISD::SHL, dl, MVT::i64, Val, ShiftBy);
+ SDValue Plus8 = DAG.getNode(ISD::ADD, dl, PtrVT, FIdx,
+ DAG.getConstant(8, dl, PtrVT));
+ SDValue Store2 = DAG.getStore(DAG.getEntryNode(), dl, Val, Plus8,
+ MachinePointerInfo());
+ SDValue Store = DAG.getStore(Store2, dl, Val, FIdx,
+ MachinePointerInfo());
+ return DAG.getLoad(Op.getValueType(), dl, Store, FIdx,
+ MachinePointerInfo());
+ }
+
// Store the input value into Value#0 of the stack slot.
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Val, FIdx,
MachinePointerInfo());
// Load it out.
return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());
diff --git a/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll b/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
index 05edf92d724988..f5515e8ba19bd5 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
@@ -14,7 +14,9 @@ define void @test_aix_splatimm(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-AIX-NEXT: slwi 3, 3, 8
; CHECK-AIX-NEXT: neg 3, 3
; CHECK-AIX-NEXT: lwz 6, 0(3)
-; CHECK-AIX-NEXT: sth 3, -16(1)
+; CHECK-AIX-NEXT: sldi 3, 3, 48
+; CHECK-AIX-NEXT: std 3, -16(1)
+; CHECK-AIX-NEXT: std 3, -8(1)
; CHECK-AIX-NEXT: addi 3, 1, -16
; CHECK-AIX-NEXT: lxvw4x 34, 0, 3
; CHECK-AIX-NEXT: srwi 3, 4, 16
@@ -24,9 +26,11 @@ define void @test_aix_splatimm(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-AIX-NEXT: mullw 3, 3, 4
; CHECK-AIX-NEXT: li 4, 0
; CHECK-AIX-NEXT: neg 3, 3
+; CHECK-AIX-NEXT: sldi 3, 3, 48
; CHECK-AIX-NEXT: vsplth 2, 2, 0
; CHECK-AIX-NEXT: stxvw4x 34, 0, 4
-; CHECK-AIX-NEXT: sth 3, -32(1)
+; CHECK-AIX-NEXT: std 3, -32(1)
+; CHECK-AIX-NEXT: std 3, -24(1)
; CHECK-AIX-NEXT: addi 3, 1, -32
; CHECK-AIX-NEXT: lxvw4x 34, 0, 3
; CHECK-AIX-NEXT: vsplth 2, 2, 0
diff --git a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
index cc32a76b22c287..6d35a7281de6b4 100644
--- a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
+++ b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
@@ -338,17 +338,16 @@ define <4 x i32> @build_v4i32_load_0(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_0:
; PWR7-BE: # %bb.0: # %entry
; PWR7-BE-NEXT: lwz 3, 0(3)
-; PWR7-BE-NEXT: li 4, 0
-; PWR7-BE-NEXT: stw 4, -16(1)
-; PWR7-BE-NEXT: stw 3, -32(1)
+; PWR7-BE-NEXT: xxlxor 36, 36, 36
+; PWR7-BE-NEXT: sldi 3, 3, 32
+; PWR7-BE-NEXT: std 3, -32(1)
+; PWR7-BE-NEXT: std 3, -24(1)
; PWR7-BE-NEXT: addis 3, 2, .LCPI8_0@toc@ha
; PWR7-BE-NEXT: addi 3, 3, .LCPI8_0@toc@l
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
-; PWR7-BE-NEXT: addi 3, 1, -16
-; PWR7-BE-NEXT: lxvw4x 35, 0, 3
; PWR7-BE-NEXT: addi 3, 1, -32
-; PWR7-BE-NEXT: lxvw4x 36, 0, 3
-; PWR7-BE-NEXT: vperm 2, 4, 3, 2
+; PWR7-BE-NEXT: lxvw4x 35, 0, 3
+; PWR7-BE-NEXT: vperm 2, 3, 4, 2
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_0:
@@ -402,17 +401,16 @@ define <4 x i32> @build_v4i32_load_1(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_1:
; PWR7-BE: # %bb.0: # %entry
; PWR7-BE-NEXT: lwz 3, 0(3)
-; PWR7-BE-NEXT: li 4, 0
-; PWR7-BE-NEXT: stw 4, -32(1)
-; PWR7-BE-NEXT: stw 3, -16(1)
+; PWR7-BE-NEXT: xxlxor 36, 36, 36
+; PWR7-BE-NEXT: sldi 3, 3, 32
+; PWR7-BE-NEXT: std 3, -16(1)
+; PWR7-BE-NEXT: std 3, -8(1)
; PWR7-BE-NEXT: addis 3, 2, .LCPI9_0@toc@ha
; PWR7-BE-NEXT: addi 3, 3, .LCPI9_0@toc@l
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
-; PWR7-BE-NEXT: addi 3, 1, -32
-; PWR7-BE-NEXT: lxvw4x 35, 0, 3
; PWR7-BE-NEXT: addi 3, 1, -16
-; PWR7-BE-NEXT: lxvw4x 36, 0, 3
-; PWR7-BE-NEXT: vperm 2, 3, 4, 2
+; PWR7-BE-NEXT: lxvw4x 35, 0, 3
+; PWR7-BE-NEXT: vperm 2, 4, 3, 2
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_1:
@@ -466,17 +464,16 @@ define <4 x i32> @build_v4i32_load_2(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_2:
; PWR7-BE: # %bb.0: # %entry
; PWR7-BE-NEXT: lwz 3, 0(3)
-; PWR7-BE-NEXT: li 4, 0
-; PWR7-BE-NEXT: stw 4, -32(1)
-; PWR7-BE-NEXT: stw 3, -16(1)
+; PWR7-BE-NEXT: xxlxor 36, 36, 36
+; PWR7-BE-NEXT: sldi 3, 3, 32
+; PWR7-BE-NEXT: std 3, -16(1)
+; PWR7-BE-NEXT: std 3, -8(1)
; PWR7-BE-NEXT: addis 3, 2, .LCPI10_0@toc@ha
; PWR7-BE-NEXT: addi 3, 3, .LCPI10_0@toc@l
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
-; PWR7-BE-NEXT: addi 3, 1, -32
-; PWR7-BE-NEXT: lxvw4x 35, 0, 3
; PWR7-BE-NEXT: addi 3, 1, -16
-; PWR7-BE-NEXT: lxvw4x 36, 0, 3
-; PWR7-BE-NEXT: vperm 2, 3, 4, 2
+; PWR7-BE-NEXT: lxvw4x 35, 0, 3
+; PWR7-BE-NEXT: vperm 2, 4, 3, 2
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_2:
@@ -530,17 +527,16 @@ define <4 x i32> @build_v4i32_load_3(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_3:
; PWR7-BE: # %bb.0: # %entry
; PWR7-BE-NEXT: lwz 3, 0(3)
-; PWR7-BE-NEXT: li 4, 0
-; PWR7-BE-NEXT: stw 4, -32(1)
-; PWR7-BE-NEXT: stw 3, -16(1)
+; PWR7-BE-NEXT: xxlxor 36, 36, 36
+; PWR7-BE-NEXT: sldi 3, 3, 32
+; PWR7-BE-NEXT: std 3, -16(1)
+; PWR7-BE-NEXT: std 3, -8(1)
; PWR7-BE-NEXT: addis 3, 2, .LCPI11_0@toc@ha
; PWR7-BE-NEXT: addi 3, 3, .LCPI11_0@toc@l
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
-; PWR7-BE-NEXT: addi 3, 1, -32
-; PWR7-BE-NEXT: lxvw4x 35, 0, 3
; PWR7-BE-NEXT: addi 3, 1, -16
-; PWR7-BE-NEXT: lxvw4x 36, 0, 3
-; PWR7-BE-NEXT: vperm 2, 3, 4, 2
+; PWR7-BE-NEXT: lxvw4x 35, 0, 3
+; PWR7-BE-NEXT: vperm 2, 4, 3, 2
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_3:
diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
index 6d2f3b3abc42dc..bc68ad2a67bf5d 100644
--- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll
+++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
@@ -591,6 +591,7 @@ define <16 x i8> @adjusted_lxvwsx(ptr %s, ptr %t) {
; P7: # %bb.0: # %entry
; P7-NEXT: ld r3, 0(r3)
; P7-NEXT: std r3, -16(r1)
+; P7-NEXT: std r3, -8(r1)
; P7-NEXT: addi r3, r1, -16
; P7-NEXT: lxvw4x vs0, 0, r3
; P7-NEXT: xxspltw v2, vs0, 1
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
index 7a6640fea2d1e4..bf8c5c96ccbde4 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
@@ -833,8 +833,18 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
;
; CHECK-PWR7-LABEL: sub_absv_8_ext:
; CHECK-PWR7: # %bb.0: # %entry
-; CHECK-PWR7-NEXT: stdu r1, -400(r1)
-; CHECK-PWR7-NEXT: .cfi_def_cfa_offset 400
+; CHECK-PWR7-NEXT: stdu r1, -512(r1)
+; CHECK-PWR7-NEXT: .cfi_def_cfa_offset 512
+; CHECK-PWR7-NEXT: .cfi_offset r14, -144
+; CHECK-PWR7-NEXT: .cfi_offset r15, -136
+; CHECK-PWR7-NEXT: .cfi_offset r16, -128
+; CHECK-PWR7-NEXT: .cfi_offset r17, -120
+; CHECK-PWR7-NEXT: .cfi_offset r18, -112
+; CHECK-PWR7-NEXT: .cfi_offset r19, -104
+; CHECK-PWR7-NEXT: .cfi_offset r20, -96
+; CHECK-PWR7-NEXT: .cfi_offset r21, -88
+; CHECK-PWR7-NEXT: .cfi_offset r22, -80
+; CHECK-PWR7-NEXT: .cfi_offset r23, -72
; CHECK-PWR7-NEXT: .cfi_offset r24, -64
; CHECK-PWR7-NEXT: .cfi_offset r25, -56
; CHECK-PWR7-NEXT: .cfi_offset r26, -48
@@ -842,184 +852,244 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
; CHECK-PWR7-NEXT: .cfi_offset r28, -32
; CHECK-PWR7-NEXT: .cfi_offset r29, -24
; CHECK-PWR7-NEXT: .cfi_offset r30, -16
-; CHECK-PWR7-NEXT: addi r3, r1, 304
-; CHECK-PWR7-NEXT: std r24, 336(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r25, 344(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r26, 352(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r27, 360(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r28, 368(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r29, 376(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r30, 384(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: stxvw4x v2, 0, r3
+; CHECK-PWR7-NEXT: .cfi_offset r31, -8
+; CHECK-PWR7-NEXT: .cfi_offset r2, -152
; CHECK-PWR7-NEXT: addi r3, r1, 320
-; CHECK-PWR7-NEXT: lbz r4, 304(r1)
-; CHECK-PWR7-NEXT: stxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: lbz r5, 305(r1)
-; CHECK-PWR7-NEXT: lbz r6, 321(r1)
-; CHECK-PWR7-NEXT: lbz r7, 306(r1)
-; CHECK-PWR7-NEXT: lbz r8, 322(r1)
-; CHECK-PWR7-NEXT: lbz r9, 307(r1)
-; CHECK-PWR7-NEXT: lbz r10, 323(r1)
-; CHECK-PWR7-NEXT: lbz r0, 309(r1)
-; CHECK-PWR7-NEXT: lbz r30, 325(r1)
-; CHECK-PWR7-NEXT: lbz r29, 310(r1)
-; CHECK-PWR7-NEXT: lbz r28, 326(r1)
-; CHECK-PWR7-NEXT: lbz r11, 308(r1)
-; CHECK-PWR7-NEXT: lbz r12, 324(r1)
-; CHECK-PWR7-NEXT: lbz r27, 311(r1)
-; CHECK-PWR7-NEXT: lbz r26, 327(r1)
-; CHECK-PWR7-NEXT: lbz r25, 312(r1)
-; CHECK-PWR7-NEXT: sub r5, r5, r6
-; CHECK-PWR7-NEXT: sub r6, r7, r8
-; CHECK-PWR7-NEXT: sub r7, r9, r10
-; CHECK-PWR7-NEXT: sub r9, r0, r30
-; CHECK-PWR7-NEXT: sub r10, r29, r28
-; CHECK-PWR7-NEXT: sub r8, r11, r12
-; CHECK-PWR7-NEXT: srawi r0, r5, 31
-; CHECK-PWR7-NEXT: srawi r30, r6, 31
-; CHECK-PWR7-NEXT: srawi r29, r7, 31
-; CHECK-PWR7-NEXT: srawi r28, r8, 31
-; CHECK-PWR7-NEXT: sub r11, r27, r26
-; CHECK-PWR7-NEXT: srawi r27, r9, 31
-; CHECK-PWR7-NEXT: lbz r24, 328(r1)
-; CHECK-PWR7-NEXT: xor r5, r5, r0
-; CHECK-PWR7-NEXT: xor r6, r6, r30
-; CHECK-PWR7-NEXT: xor r7, r7, r29
-; CHECK-PWR7-NEXT: xor r8, r8, r28
-; CHECK-PWR7-NEXT: xor r9, r9, r27
-; CHECK-PWR7-NEXT: srawi r26, r10, 31
-; CHECK-PWR7-NEXT: sub r5, r5, r0
-; CHECK-PWR7-NEXT: sub r6, r6, r30
-; CHECK-PWR7-NEXT: lbz r0, 313(r1)
-; CHECK-PWR7-NEXT: lbz r30, 329(r1)
-; CHECK-PWR7-NEXT: sub r7, r7, r29
-; CHECK-PWR7-NEXT: lbz r29, 330(r1)
-; CHECK-PWR7-NEXT: sub r8, r8, r28
-; CHECK-PWR7-NEXT: lbz r28, 331(r1)
-; CHECK-PWR7-NEXT: sub r9, r9, r27
-; CHECK-PWR7-NEXT: lbz r27, 332(r1)
-; CHECK-PWR7-NEXT: xor r10, r10, r26
-; CHECK-PWR7-NEXT: sub r10, r10, r26
-; CHECK-PWR7-NEXT: lbz r26, 333(r1)
-; CHECK-PWR7-NEXT: sub r12, r25, r24
-; CHECK-PWR7-NEXT: srawi r25, r11, 31
+; CHECK-PWR7-NEXT: std r14, 368(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r15, 376(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r16, 384(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r17, 392(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r18, 400(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r19, 408(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r20, 416(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r21, 424(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r22, 432(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r23, 440(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r24, 448(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r25, 456(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r26, 464(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r27, 472(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r28, 480(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r29, 488(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r30, 496(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r31, 504(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r2, 360(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: stxvw4x v2, 0, r3
; CHECK-PWR7-NEXT: lbz r3, 320(r1)
+; CHECK-PWR7-NEXT: addi r4, r1, 336
+; CHECK-PWR7-NEXT: stw r3, 60(r1) # 4-byte Folded Spill
+; CHECK-PWR7-NEXT: stxvw4x v3, 0, r4
+; CHECK-PWR7-NEXT: lbz r15, 334(r1)
+; CHECK-PWR7-NEXT: lbz r14, 350(r1)
+; CHECK-PWR7-NEXT: lbz r31, 335(r1)
+; CHECK-PWR7-NEXT: lbz r2, 351(r1)
+; CHECK-PWR7-NEXT: sub r15, r15, r14
+; CHECK-PWR7-NEXT: sub r14, r31, r2
+; CHECK-PWR7-NEXT: srawi r2, r14, 31
+; CHECK-PWR7-NEXT: xor r14, r14, r2
+; CHECK-PWR7-NEXT: lbz r3, 333(r1)
+; CHECK-PWR7-NEXT: lbz r19, 331(r1)
+; CHECK-PWR7-NEXT: lbz r18, 347(r1)
+; CHECK-PWR7-NEXT: sub r19, r19, r18
+; CHECK-PWR7-NEXT: lbz r17, 332(r1)
+; CHECK-PWR7-NEXT: lbz r16, 348(r1)
+; CHECK-PWR7-NEXT: sub r17, r17, r16
+; CHECK-PWR7-NEXT: lbz r23, 329(r1)
+; CHECK-PWR7-NEXT: sub r14, r14, r2
+; CHECK-PWR7-NEXT: lbz r2, 349(r1)
+; CHECK-PWR7-NEXT: lbz r22, 345(r1)
+; CHECK-PWR7-NEXT: lbz r4, 336(r1)
+; CHECK-PWR7-NEXT: lbz r5, 321(r1)
+; CHECK-PWR7-NEXT: lbz r6, 337(r1)
+; CHECK-PWR7-NEXT: lbz r7, 322(r1)
+; CHECK-PWR7-NEXT: lbz r8, 338(r1)
+; CHECK-PWR7-NEXT: lbz r9, 323(r1)
+; CHECK-PWR7-NEXT: lbz r10, 339(r1)
+; CHECK-PWR7-NEXT: lbz r11, 324(r1)
+; CHECK-PWR7-NEXT: lbz r12, 340(r1)
+; CHECK-PWR7-NEXT: lbz r0, 325(r1)
+; CHECK-PWR7-NEXT: lbz r30, 341(r1)
+; CHECK-PWR7-NEXT: lbz r29, 326(r1)
+; CHECK-PWR7-NEXT: lbz r28, 342(r1)
+; CHECK-PWR7-NEXT: lbz r27, 327(r1)
+; CHECK-PWR7-NEXT: lbz r26, 343(r1)
+; CHECK-PWR7-NEXT: sub r3, r3, r2
+; CHECK-PWR7-NEXT: lbz r25, 328(r1)
+; CHECK-PWR7-NEXT: lbz r24, 344(r1)
+; CHECK-PWR7-NEXT: lbz r21, 330(r1)
+; CHECK-PWR7-NEXT: lbz r20, 346(r1)
+; CHECK-PWR7-NEXT: sub r5, r5, r6
+; CHECK-PWR7-NEXT: srawi r18, r3, 31
+; CHECK-PWR7-NEXT: sub r7, r7, r8
+; CHECK-PWR7-NEXT: sub r9, r9, r10
+; CHECK-PWR7-NEXT: sub r11, r11, r12
; CHECK-PWR7-NEXT: sub r0, r0, r30
-; CHECK-PWR7-NEXT: xor r11, r11, r25
-; CHECK-PWR7-NEXT: sub r11, r11, r25
-; CHECK-PWR7-NEXT: lbz r25, 334(r1)
-; CHECK-PWR7-NEXT: sub r4, r4, r3
-; CHECK-PWR7-NEXT: srawi r30, r0, 31
-; CHECK-PWR7-NEXT: srawi r24, r12, 31
-; CHECK-PWR7-NEXT: xor r12, r12, r24
-; CHECK-PWR7-NEXT: sub r12, r12, r24
-; CHECK-PWR7-NEXT: lbz r24, 335(r1)
-; CHECK-PWR7-NEXT: srawi r3, r4, 31
-; CHECK-PWR7-NEXT: xor r4, r4, r3
-; CHECK-PWR7-NEXT: xor r0, r0, r30
-; CHECK-PWR7-NEXT: sub r3, r4, r3
-; CHECK-PWR7-NEXT: stb r3, 48(r1)
-; CHECK-PWR7-NEXT: addi r3, r1, 288
-; CHECK-PWR7-NEXT: stb r12, 176(r1)
-; CHECK-PWR7-NEXT: sub r0, r0, r30
-; CHECK-PWR7-NEXT: lbz r30, 314(r1)
-; CHECK-PWR7-NEXT: stb r11, 160(r1)
-; CHECK-PWR7-NEXT: sub r30, r30, r29
-; CHECK-PWR7-NEXT: stb r0, 192(r1)
-; CHECK-PWR7-NEXT: stb r10, 144(r1)
-; CHECK-PWR7-NEXT: stb r9, 128(r1)
-; CHECK-PWR7-NEXT: stb r8, 112(r1)
-; CHECK-PWR7-NEXT: stb r7, 96(r1)
-; CHECK-PWR7-NEXT: stb r6, 80(r1)
-; CHECK-PWR7-NEXT: srawi r29, r30, 31
-; CHECK-PWR7-NEXT: stb r5, 64(r1)
-; CHECK-PWR7-NEXT: xor r30, r30, r29
-; CHECK-PWR7-NEXT: sub r30, r30, r29
-; CHECK-PWR7-NEXT: lbz r29, 315(r1)
; CHECK-PWR7-NEXT: sub r29, r29, r28
-; CHECK-PWR7-NEXT: stb r30, 208(r1)
-; CHECK-PWR7-NEXT: ld r30, 384(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r28, r29, 31
-; CHECK-PWR7-NEXT: xor r29, r29, r28
-; CHECK-PWR7-NEXT: sub r29, r29, r28
-; CHECK-PWR7-NEXT: lbz r28, 316(r1)
-; CHECK-PWR7-NEXT: sub r28, r28, r27
-; CHECK-PWR7-NEXT: stb r29, 224(r1)
-; CHECK-PWR7-NEXT: ld r29, 376(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r27, r28, 31
-; CHECK-PWR7-NEXT: xor r28, r28, r27
-; CHECK-PWR7-NEXT: sub r28, r28, r27
-; CHECK-PWR7-NEXT: lbz r27, 317(r1)
; CHECK-PWR7-NEXT: sub r27, r27, r26
-; CHECK-PWR7-NEXT: stb r28, 240(r1)
-; CHECK-PWR7-NEXT: ld r28, 368(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r26, r27, 31
-; CHECK-PWR7-NEXT: xor r27, r27, r26
-; CHECK-PWR7-NEXT: sub r27, r27, r26
-; CHECK-PWR7-NEXT: lbz r26, 318(r1)
-; CHECK-PWR7-NEXT: sub r26, r26, r25
-; CHECK-PWR7-NEXT: stb r27, 256(r1)
-; CHECK-PWR7-NEXT: ld r27, 360(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r25, r26, 31
-; CHECK-PWR7-NEXT: xor r26, r26, r25
-; CHECK-PWR7-NEXT: sub r26, r26, r25
-; CHECK-PWR7-NEXT: lbz r25, 319(r1)
; CHECK-PWR7-NEXT: sub r25, r25, r24
-; CHECK-PWR7-NEXT: stb r26, 272(r1)
-; CHECK-PWR7-NEXT: ld r26, 352(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: srawi r31, r15, 31
+; CHECK-PWR7-NEXT: ld r2, 360(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: xor r3, r3, r18
+; CHECK-PWR7-NEXT: srawi r6, r5, 31
+; CHECK-PWR7-NEXT: srawi r8, r7, 31
+; CHECK-PWR7-NEXT: srawi r10, r9, 31
+; CHECK-PWR7-NEXT: srawi r12, r11, 31
+; CHECK-PWR7-NEXT: srawi r30, r0, 31
+; CHECK-PWR7-NEXT: sub r3, r3, r18
+; CHECK-PWR7-NEXT: srawi r18, r19, 31
+; CHECK-PWR7-NEXT: srawi r28, r29, 31
+; CHECK-PWR7-NEXT: ld r16, 384(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: sldi r3, r3, 56
+; CHECK-PWR7-NEXT: srawi r26, r27, 31
; CHECK-PWR7-NEXT: srawi r24, r25, 31
+; CHECK-PWR7-NEXT: xor r19, r19, r18
+; CHECK-PWR7-NEXT: xor r15, r15, r31
+; CHECK-PWR7-NEXT: xor r5, r5, r6
+; CHECK-PWR7-NEXT: std r3, 272(r1)
+; CHECK-PWR7-NEXT: std r3, 280(r1)
+; CHECK-PWR7-NEXT: srawi r3, r17, 31
+; CHECK-PWR7-NEXT: sub r19, r19, r18
+; CHECK-PWR7-NEXT: xor r7, r7, r8
+; CHECK-PWR7-NEXT: sub r15, r15, r31
+; CHECK-PWR7-NEXT: xor r17, r17, r3
+; CHECK-PWR7-NEXT: xor r9, r9, r10
+; CHECK-PWR7-NEXT: xor r11, r11, r12
+; CHECK-PWR7-NEXT: xor r0, r0, r30
+; CHECK-PWR7-NEXT: xor r29, r29, r28
+; CHECK-PWR7-NEXT: xor r27, r27, r26
+; CHECK-PWR7-NEXT: sub r3, r17, r3
; CHECK-PWR7-NEXT: xor r25, r25, r24
; CHECK-PWR7-NEXT: sub r25, r25, r24
-; CHECK-PWR7-NEXT: ld r24, 336(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: stb r25, 288(r1)
-; CHECK-PWR7-NEXT: ld r25, 344(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: sub r27, r27, r26
+; CHECK-PWR7-NEXT: sub r29, r29, r28
+; CHECK-PWR7-NEXT: sldi r3, r3, 56
+; CHECK-PWR7-NEXT: sub r0, r0, r30
+; CHECK-PWR7-NEXT: sub r11, r11, r12
+; CHECK-PWR7-NEXT: sub r9, r9, r10
+; CHECK-PWR7-NEXT: sub r7, r7, r8
+; CHECK-PWR7-NEXT: sub r5, r5, r6
+; CHECK-PWR7-NEXT: sldi r14, r14, 56
+; CHECK-PWR7-NEXT: sldi r15, r15, 56
+; CHECK-PWR7-NEXT: ld r31, 504(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r3, 256(r1)
+; CHECK-PWR7-NEXT: std r3, 264(r1)
+; CHECK-PWR7-NEXT: sldi r3, r19, 56
+; CHECK-PWR7-NEXT: sldi r25, r25, 56
+; CHECK-PWR7-NEXT: sldi r27, r27, 56
+; CHECK-PWR7-NEXT: std r3, 240(r1)
+; CHECK-PWR7-NEXT: std r3, 248(r1)
+; CHECK-PWR7-NEXT: sub r3, r23, r22
+; CHECK-PWR7-NEXT: srawi r23, r3, 31
+; CHECK-PWR7-NEXT: sub r22, r21, r20
+; CHECK-PWR7-NEXT: srawi r21, r22, 31
+; CHECK-PWR7-NEXT: sldi r29, r29, 56
+; CHECK-PWR7-NEXT: sldi r0, r0, 56
+; CHECK-PWR7-NEXT: sldi r11, r11, 56
+; CHECK-PWR7-NEXT: xor r3, r3, r23
+; CHECK-PWR7-NEXT: xor r22, r22, r21
+; CHECK-PWR7-NEXT: sldi r9, r9, 56
+; CHECK-PWR7-NEXT: sldi r7, r7, 56
+; CHECK-PWR7-NEXT: sldi r5, r5, 56
+; CHECK-PWR7-NEXT: ld r30, 496(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r28, 480(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: sub r3, r3, r23
+; CHECK-PWR7-NEXT: sub r22, r22, r21
+; CHECK-PWR7-NEXT: std r14, 304(r1)
+; CHECK-PWR7-NEXT: ld r26, 464(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: sldi r3, r3, 56
+; CHECK-PWR7-NEXT: sldi r22, r22, 56
+; CHECK-PWR7-NEXT: ld r24, 448(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r23, 440(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r14, 312(r1)
+; CHECK-PWR7-NEXT: std r15, 288(r1)
+; CHECK-PWR7-NEXT: std r3, 208(r1)
+; CHECK-PWR7-NEXT: std r3, 216(r1)
+; CHECK-PWR7-NEXT: lwz r3, 60(r1) # 4-byte Folded Reload
+; CHECK-PWR7-NEXT: std r15, 296(r1)
+; CHECK-PWR7-NEXT: ld r21, 424(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r20, 416(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r22, 224(r1)
+; CHECK-PWR7-NEXT: std r22, 232(r1)
+; CHECK-PWR7-NEXT: sub r4, r3, r4
+; CHECK-PWR7-NEXT: std r25, 192(r1)
+; CHECK-PWR7-NEXT: ld r22, 432(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r19, 408(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: srawi r3, r4, 31
+; CHECK-PWR7-NEXT: std r25, 200(r1)
+; CHECK-PWR7-NEXT: ld r25, 456(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r27, 176(r1)
+; CHECK-PWR7-NEXT: std r27, 184(r1)
+; CHECK-PWR7-NEXT: xor r4, r4, r3
+; CHECK-PWR7-NEXT: std r29, 160(r1)
+; CHECK-PWR7-NEXT: ld r27, 472(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r29, 168(r1)
+; CHECK-PWR7-NEXT: std r0, 144(r1)
+; CHECK-PWR7-NEXT: sub r3, r4, r3
+; CHECK-PWR7-NEXT: std r0, 152(r1)
+; CHECK-PWR7-NEXT: ld r29, 488(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r18, 400(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: sldi r3, r3, 56
+; CHECK-PWR7-NEXT: std r11, 128(r1)
+; CHECK-PWR7-NEXT: ld r17, 392(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r11, 136(r1)
+; CHECK-PWR7-NEXT: std r9, 112(r1)
+; CHECK-PWR7-NEXT: std r3, 64(r1)
+; CHECK-PWR7-NEXT: std r3, 72(r1)
+; CHECK-PWR7-NEXT: addi r3, r1, 304
+; CHECK-PWR7-NEXT: std r9, 120(r1)
+; CHECK-PWR7-NEXT: ld r15, 376(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r7, 96(r1)
+; CHECK-PWR7-NEXT: std r7, 104(r1)
+; CHECK-PWR7-NEXT: std r5, 80(r1)
+; CHECK-PWR7-NEXT: std r5, 88(r1)
; CHECK-PWR7-NEXT: lxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 272
+; CHECK-PWR7-NEXT: addi r3, r1, 288
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 256
+; CHECK-PWR7-NEXT: addi r3, r1, 272
+; CHECK-PWR7-NEXT: ld r14, 368(r1) # 8-byte Folded Reload
; CHECK-PWR7-NEXT: vmrghb v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 240
+; CHECK-PWR7-NEXT: addi r3, r1, 256
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 224
+; CHECK-PWR7-NEXT: addi r3, r1, 240
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
; CHECK-PWR7-NEXT: vmrghh v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 208
+; CHECK-PWR7-NEXT: addi r3, r1, 224
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 192
+; CHECK-PWR7-NEXT: addi r3, r1, 208
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 176
+; CHECK-PWR7-NEXT: addi r3, r1, 192
; CHECK-PWR7-NEXT: lxvw4x v5, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 160
+; CHECK-PWR7-NEXT: addi r3, r1, 176
; CHECK-PWR7-NEXT: vmrghb v4, v5, v4
; CHECK-PWR7-NEXT: vmrghh v3, v4, v3
; CHECK-PWR7-NEXT: xxmrghw vs0, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 144
+; CHECK-PWR7-NEXT: addi r3, r1, 160
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 128
+; CHECK-PWR7-NEXT: addi r3, r1, 144
; CHECK-PWR7-NEXT: vmrghb v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 112
+; CHECK-PWR7-NEXT: addi r3, r1, 128
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 96
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
+; CHECK-PWR7-NEXT: addi r3, r1, 112
; CHECK-PWR7-NEXT: vmrghh v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 80
+; CHECK-PWR7-NEXT: addi r3, r1, 96
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 64
+; CHECK-PWR7-NEXT: addi r3, r1, 80
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 48
+; CHECK-PWR7-NEXT: addi r3, r1, 64
; CHECK-PWR7-NEXT: lxvw4x v5, 0, r3
; CHECK-PWR7-NEXT: vmrghb v4, v5, v4
; CHECK-PWR7-NEXT: vmrghh v3, v4, v3
; CHECK-PWR7-NEXT: xxmrghw vs1, v3, v2
; CHECK-PWR7-NEXT: xxmrghd v2, vs1, vs0
-; CHECK-PWR7-NEXT: addi r1, r1, 400
+; CHECK-PWR7-NEXT: addi r1, r1, 512
; CHECK-PWR7-NEXT: blr
entry:
%vecext = extractelement <16 x i8> %a, i32 0
diff --git a/llvm/test/CodeGen/PowerPC/test-vector-insert.ll b/llvm/test/CodeGen/PowerPC/test-vector-insert.ll
index 087f2244f0f7d0..73b4ad8a507b82 100644
--- a/llvm/test/CodeGen/PowerPC/test-vector-insert.ll
+++ b/llvm/test/CodeGen/PowerPC/test-vector-insert.ll
@@ -60,7 +60,9 @@ define dso_local <4 x i32> @test(<4 x i32> %a, double %b) {
; CHECK-BE-P7-NEXT: addi r3, r1, -4
; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3
; CHECK-BE-P7-NEXT: lwz r3, -4(r1)
-; CHECK-BE-P7-NEXT: stw r3, -32(r1)
+; CHECK-BE-P7-NEXT: sldi r3, r3, 32
+; CHECK-BE-P7-NEXT: std r3, -32(r1)
+; CHECK-BE-P7-NEXT: std r3, -24(r1)
; CHECK-BE-P7-NEXT: addis r3, r2, .LCPI0_0@toc@ha
; CHECK-BE-P7-NEXT: addi r3, r3, .LCPI0_0@toc@l
; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3
@@ -129,7 +131,9 @@ define dso_local <4 x i32> @test2(<4 x i32> %a, float %b) {
; CHECK-BE-P7-NEXT: addi r3, r1, -4
; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3
; CHECK-BE-P7-NEXT: lwz r3, -4(r1)
-; CHECK-BE-P7-NEXT: stw r3, -32(r1)
+; CHECK-BE-P7-NEXT: sldi r3, r3, 32
+; CHECK-BE-P7-NEXT: std r3, -32(r1)
+; CHECK-BE-P7-NEXT: std r3, -24(r1)
; CHECK-BE-P7-NEXT: addis r3, r2, .LCPI1_0@toc@ha
; CHECK-BE-P7-NEXT: addi r3, r3, .LCPI1_0@toc@l
; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3
@@ -198,7 +202,9 @@ define dso_local <4 x i32> @test3(<4 x i32> %a, double %b) {
; CHECK-BE-P7-NEXT: addi r3, r1, -4
; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3
; CHECK-BE-P7-NEXT: lwz r3, -4(r1)
-; CHECK-BE-P7-NEXT: stw r3, -32(r1)
+; CHECK-BE-P7-NEXT: sldi r3, r3, 32
+; CHECK-BE-P7-NEXT: std r3, -32(r1)
+; CHECK-BE-P7-NEXT: std r3, -24(r1)
; CHECK-BE-P7-NEXT: addis r3, r2, .LCPI2_0@toc@ha
; CHECK-BE-P7-NEXT: addi r3, r3, .LCPI2_0@toc@l
; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3
@@ -267,7 +273,9 @@ define dso_local <4 x i32> @test4(<4 x i32> %a, float %b) {
; CHECK-BE-P7-NEXT: addi r3, r1, -4
; CHECK-BE-P7-NEXT: stfiwx f0, 0, r3
; CHECK-BE-P7-NEXT: lwz r3, -4(r1)
-; CHECK-BE-P7-NEXT: stw r3, -32(r1)
+; CHECK-BE-P7-NEXT: sldi r3, r3, 32
+; CHECK-BE-P7-NEXT: std r3, -32(r1)
+; CHECK-BE-P7-NEXT: std r3, -24(r1)
; CHECK-BE-P7-NEXT: addis r3, r2, .LCPI3_0@toc@ha
; CHECK-BE-P7-NEXT: addi r3, r3, .LCPI3_0@toc@l
; CHECK-BE-P7-NEXT: lxvw4x v3, 0, r3
diff --git a/llvm/test/CodeGen/PowerPC/vec-trunc2.ll b/llvm/test/CodeGen/PowerPC/vec-trunc2.ll
index 7a09d5a5e8bb2e..b81bc9f21423f3 100644
--- a/llvm/test/CodeGen/PowerPC/vec-trunc2.ll
+++ b/llvm/test/CodeGen/PowerPC/vec-trunc2.ll
@@ -28,16 +28,31 @@ define dso_local <8 x i8> @test8x32(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5,
;
; CHECK-BE-LABEL: test8x32:
; CHECK-BE: # %bb.0:
-; CHECK-BE-NEXT: stw r10, -80(r1)
-; CHECK-BE-NEXT: stw r9, -96(r1)
-; CHECK-BE-NEXT: stw r8, -112(r1)
-; CHECK-BE-NEXT: stw r7, -128(r1)
-; CHECK-BE-NEXT: stw r6, -16(r1)
-; CHECK-BE-NEXT: stw r5, -32(r1)
-; CHECK-BE-NEXT: stw r4, -48(r1)
-; CHECK-BE-NEXT: stw r3, -64(r1)
-; CHECK-BE-NEXT: addi r3, r1, -80
-; CHECK-BE-NEXT: lxvw4x vs0, 0, r3
+; CHECK-BE-NEXT: sldi r10, r10, 32
+; CHECK-BE-NEXT: sldi r9, r9, 32
+; CHECK-BE-NEXT: sldi r8, r8, 32
+; CHECK-BE-NEXT: sldi r7, r7, 32
+; CHECK-BE-NEXT: sldi r6, r6, 32
+; CHECK-BE-NEXT: sldi r5, r5, 32
+; CHECK-BE-NEXT: sldi r4, r4, 32
+; CHECK-BE-NEXT: sldi r3, r3, 32
+; CHECK-BE-NEXT: addi r11, r1, -80
+; CHECK-BE-NEXT: std r10, -80(r1)
+; CHECK-BE-NEXT: std r10, -72(r1)
+; CHECK-BE-NEXT: std r9, -96(r1)
+; CHECK-BE-NEXT: std r9, -88(r1)
+; CHECK-BE-NEXT: std r8, -112(r1)
+; CHECK-BE-NEXT: std r8, -104(r1)
+; CHECK-BE-NEXT: std r7, -128(r1)
+; CHECK-BE-NEXT: std r7, -120(r1)
+; CHECK-BE-NEXT: std r6, -16(r1)
+; CHECK-BE-NEXT: std r6, -8(r1)
+; CHECK-BE-NEXT: std r5, -32(r1)
+; CHECK-BE-NEXT: std r5, -24(r1)
+; CHECK-BE-NEXT: std r4, -48(r1)
+; CHECK-BE-NEXT: std r4, -40(r1)
+; CHECK-BE-NEXT: std r3, -64(r1)
+; CHECK-BE-NEXT: std r3, -56(r1)
; CHECK-BE-NEXT: addi r3, r1, -96
; CHECK-BE-NEXT: lxvw4x vs1, 0, r3
; CHECK-BE-NEXT: addi r3, r1, -112
@@ -51,15 +66,16 @@ define dso_local <8 x i8> @test8x32(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5,
; CHECK-BE-NEXT: addi r3, r1, -48
; CHECK-BE-NEXT: lxvw4x vs6, 0, r3
; CHECK-BE-NEXT: addi r3, r1, -64
+; CHECK-BE-NEXT: lxvw4x vs0, 0, r11
; CHECK-BE-NEXT: lxvw4x vs7, 0, r3
; CHECK-BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha
+; CHECK-BE-NEXT: addi r3, r3, .LCPI0_0@toc@l
; CHECK-BE-NEXT: xxmrghw vs0, vs1, vs0
; CHECK-BE-NEXT: xxmrghw vs1, vs3, vs2
; CHECK-BE-NEXT: xxmrghw vs2, vs5, vs4
-; CHECK-BE-NEXT: addi r3, r3, .LCPI0_0@toc@l
-; CHECK-BE-NEXT: xxmrghd v3, vs1, vs0
; CHECK-BE-NEXT: lxvw4x v2, 0, r3
; CHECK-BE-NEXT: xxmrghw vs3, vs7, vs6
+; CHECK-BE-NEXT: xxmrghd v3, vs1, vs0
; CHECK-BE-NEXT: xxmrghd v4, vs3, vs2
; CHECK-BE-NEXT: vperm v2, v4, v3, v2
; CHECK-BE-NEXT: blr
@@ -136,16 +152,31 @@ define dso_local <8 x i16> @test8x24(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5
;
; CHECK-BE-LABEL: test8x24:
; CHECK-BE: # %bb.0:
-; CHECK-BE-NEXT: sth r10, -16(r1)
-; CHECK-BE-NEXT: sth r9, -32(r1)
-; CHECK-BE-NEXT: sth r8, -48(r1)
-; CHECK-BE-NEXT: sth r7, -64(r1)
-; CHECK-BE-NEXT: sth r6, -80(r1)
-; CHECK-BE-NEXT: sth r5, -96(r1)
-; CHECK-BE-NEXT: sth r4, -112(r1)
-; CHECK-BE-NEXT: sth r3, -128(r1)
-; CHECK-BE-NEXT: addi r3, r1, -16
-; CHECK-BE-NEXT: lxvw4x v2, 0, r3
+; CHECK-BE-NEXT: sldi r10, r10, 48
+; CHECK-BE-NEXT: sldi r9, r9, 48
+; CHECK-BE-NEXT: sldi r8, r8, 48
+; CHECK-BE-NEXT: sldi r7, r7, 48
+; CHECK-BE-NEXT: sldi r6, r6, 48
+; CHECK-BE-NEXT: sldi r5, r5, 48
+; CHECK-BE-NEXT: sldi r4, r4, 48
+; CHECK-BE-NEXT: sldi r3, r3, 48
+; CHECK-BE-NEXT: addi r11, r1, -16
+; CHECK-BE-NEXT: std r10, -16(r1)
+; CHECK-BE-NEXT: std r10, -8(r1)
+; CHECK-BE-NEXT: std r9, -32(r1)
+; CHECK-BE-NEXT: std r9, -24(r1)
+; CHECK-BE-NEXT: std r8, -48(r1)
+; CHECK-BE-NEXT: std r8, -40(r1)
+; CHECK-BE-NEXT: std r7, -64(r1)
+; CHECK-BE-NEXT: std r7, -56(r1)
+; CHECK-BE-NEXT: std r6, -80(r1)
+; CHECK-BE-NEXT: std r6, -72(r1)
+; CHECK-BE-NEXT: std r5, -96(r1)
+; CHECK-BE-NEXT: std r5, -88(r1)
+; CHECK-BE-NEXT: std r4, -112(r1)
+; CHECK-BE-NEXT: std r4, -104(r1)
+; CHECK-BE-NEXT: std r3, -128(r1)
+; CHECK-BE-NEXT: std r3, -120(r1)
; CHECK-BE-NEXT: addi r3, r1, -32
; CHECK-BE-NEXT: lxvw4x v3, 0, r3
; CHECK-BE-NEXT: addi r3, r1, -48
@@ -159,6 +190,7 @@ define dso_local <8 x i16> @test8x24(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5
; CHECK-BE-NEXT: addi r3, r1, -112
; CHECK-BE-NEXT: lxvw4x v6, 0, r3
; CHECK-BE-NEXT: addi r3, r1, -128
+; CHECK-BE-NEXT: lxvw4x v2, 0, r11
; CHECK-BE-NEXT: lxvw4x v7, 0, r3
; CHECK-BE-NEXT: vmrghh v2, v3, v2
; CHECK-BE-NEXT: vmrghh v3, v5, v4
diff --git a/llvm/test/CodeGen/PowerPC/vsx.ll b/llvm/test/CodeGen/PowerPC/vsx.ll
index d1d29a0f884c62..14b3d69f8c2730 100644
--- a/llvm/test/CodeGen/PowerPC/vsx.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx.ll
@@ -2383,7 +2383,9 @@ define <2 x double> @test70(<2 x i8> %a) {
define <2 x i32> @test80(i32 %v) {
; CHECK-LABEL: test80:
; CHECK: # %bb.0:
-; CHECK-NEXT: stw r3, -16(r1)
+; CHECK-NEXT: sldi r3, r3, 32
+; CHECK-NEXT: std r3, -16(r1)
+; CHECK-NEXT: std r3, -8(r1)
; CHECK-NEXT: addi r3, r1, -16
; CHECK-NEXT: lxvw4x vs0, 0, r3
; CHECK-NEXT: addis r3, r2, .LCPI65_0@toc@ha
@@ -2395,7 +2397,9 @@ define <2 x i32> @test80(i32 %v) {
;
; CHECK-REG-LABEL: test80:
; CHECK-REG: # %bb.0:
-; CHECK-REG-NEXT: stw r3, -16(r1)
+; CHECK-REG-NEXT: sldi r3, r3, 32
+; CHECK-REG-NEXT: std r3, -16(r1)
+; CHECK-REG-NEXT: std r3, -8(r1)
; CHECK-REG-NEXT: addi r3, r1, -16
; CHECK-REG-NEXT: lxvw4x vs0, 0, r3
; CHECK-REG-NEXT: addis r3, r2, .LCPI65_0@toc@ha
@@ -2407,8 +2411,12 @@ define <2 x i32> @test80(i32 %v) {
;
; CHECK-FISL-LABEL: test80:
; CHECK-FISL: # %bb.0:
-; CHECK-FISL-NEXT: # kill: def $r3 killed $r3 killed $x3
-; CHECK-FISL-NEXT: stw r3, -16(r1)
+; CHECK-FISL-NEXT: mr r4, r3
+; CHECK-FISL-NEXT: # implicit-def: $x3
+; CHECK-FISL-NEXT: mr r3, r4
+; CHECK-FISL-NEXT: sldi r3, r3, 32
+; CHECK-FISL-NEXT: std r3, -8(r1)
+; CHECK-FISL-NEXT: std r3, -16(r1)
; CHECK-FISL-NEXT: addi r3, r1, -16
; CHECK-FISL-NEXT: lxvw4x vs0, 0, r3
; CHECK-FISL-NEXT: xxspltw v2, vs0, 0
diff --git a/llvm/test/CodeGen/PowerPC/widen-vec-correctly-be.ll b/llvm/test/CodeGen/PowerPC/widen-vec-correctly-be.ll
index c31b2faedd51dc..069e734088a845 100644
--- a/llvm/test/CodeGen/PowerPC/widen-vec-correctly-be.ll
+++ b/llvm/test/CodeGen/PowerPC/widen-vec-correctly-be.ll
@@ -13,7 +13,9 @@ define void @test() local_unnamed_addr #0 align 2 {
; CHECK-BE-NEXT: addi r3, r3, 1
; CHECK-BE-NEXT: xxlxor vs1, vs1, vs1
; CHECK-BE-NEXT: vsrw v2, v2, v2
-; CHECK-BE-NEXT: sth r3, -32(r1)
+; CHECK-BE-NEXT: sldi r3, r3, 48
+; CHECK-BE-NEXT: std r3, -32(r1)
+; CHECK-BE-NEXT: std r3, -24(r1)
; CHECK-BE-NEXT: addi r3, r1, -32
; CHECK-BE-NEXT: lxvw4x vs0, 0, r3
; CHECK-BE-NEXT: addi r3, r1, -16
More information about the llvm-commits
mailing list