[llvm] [PowerPC] improve P10 store forwarding on P7 scalar to vector (PR #102330)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 7 09:44:36 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-powerpc
Author: None (RolandF77)
<details>
<summary>Changes</summary>
Commercial applications are often built for the oldest supported processor, even though they usually run on newer processors. This patch helps code that was built for P7 and contains scalar-to-vector operations run better on P10: it makes a small change to the P7 scalar-to-vector code so that P10 hardware store forwarding can apply.
---
Patch is 35.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/102330.diff
9 Files Affected:
- (modified) llvm/lib/Target/PowerPC/PPCISelLowering.cpp (+39-1)
- (modified) llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll (+6-2)
- (modified) llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll (+24-28)
- (modified) llvm/test/CodeGen/PowerPC/load-and-splat.ll (+1)
- (modified) llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll (+210-140)
- (modified) llvm/test/CodeGen/PowerPC/test-vector-insert.ll (+12-4)
- (modified) llvm/test/CodeGen/PowerPC/vec-trunc2.ll (+54-22)
- (modified) llvm/test/CodeGen/PowerPC/vsx.ll (+12-4)
- (modified) llvm/test/CodeGen/PowerPC/widen-vec-correctly-be.ll (+3-1)
``````````diff
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 1686ec572c855..d1f29a79c9668 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -105,6 +105,10 @@ using namespace llvm;
#define DEBUG_TYPE "ppc-lowering"
+static cl::opt<bool> DisableP10StoreForward("disable-p10-store-forward",
+ cl::desc("disable P10 store forward-friendly conversion"), cl::Hidden,
+ cl::init(false));
+
static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
@@ -985,6 +989,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
+ // LE is P8+/64-bit so direct moves are supported and these operations
+ // are legal. The custom transformation requires 64-bit since we need a
+ // pair of stores that will cover a 128-bit load for P10.
+ if (!DisableP10StoreForward && isPPC64 && !Subtarget.isLittleEndian()) {
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
+ }
setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
@@ -11479,8 +11491,34 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+ SDValue Val = Op.getOperand(0);
+ EVT ValVT = Val.getValueType();
+ // P10 hardware store forwarding requires that a single store contains all
+ // the data for the load. P10 is able to merge a pair of adjacent stores. Try
+ // to avoid load hit store on P10 when running binaries compiled for older
+ // processors by generating two mergeable scalar stores to forward with the
+ // vector load.
+ if (!DisableP10StoreForward && Subtarget.isPPC64() &&
+ !Subtarget.isLittleEndian() && ValVT.isInteger() &&
+ ValVT.getSizeInBits() <= 64) {
+ Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, Val);
+ EVT ShiftAmountTy = getShiftAmountTy(MVT::i64, DAG.getDataLayout());
+ SDValue ShiftBy =
+ DAG.getConstant(64 - Op.getValueType().getScalarSizeInBits(),
+ dl, ShiftAmountTy);
+ Val = DAG.getNode(ISD::SHL, dl, MVT::i64, Val, ShiftBy);
+ SDValue Plus8 = DAG.getNode(ISD::ADD, dl, PtrVT, FIdx,
+ DAG.getConstant(8, dl, PtrVT));
+ SDValue Store2 = DAG.getStore(DAG.getEntryNode(), dl, Val, Plus8,
+ MachinePointerInfo());
+ SDValue Store = DAG.getStore(Store2, dl, Val, FIdx,
+ MachinePointerInfo());
+ return DAG.getLoad(Op.getValueType(), dl, Store, FIdx,
+ MachinePointerInfo());
+ }
+
// Store the input value into Value#0 of the stack slot.
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Val, FIdx,
MachinePointerInfo());
// Load it out.
return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());
diff --git a/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll b/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
index 05edf92d72498..f5515e8ba19bd 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
@@ -14,7 +14,9 @@ define void @test_aix_splatimm(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-AIX-NEXT: slwi 3, 3, 8
; CHECK-AIX-NEXT: neg 3, 3
; CHECK-AIX-NEXT: lwz 6, 0(3)
-; CHECK-AIX-NEXT: sth 3, -16(1)
+; CHECK-AIX-NEXT: sldi 3, 3, 48
+; CHECK-AIX-NEXT: std 3, -16(1)
+; CHECK-AIX-NEXT: std 3, -8(1)
; CHECK-AIX-NEXT: addi 3, 1, -16
; CHECK-AIX-NEXT: lxvw4x 34, 0, 3
; CHECK-AIX-NEXT: srwi 3, 4, 16
@@ -24,9 +26,11 @@ define void @test_aix_splatimm(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-AIX-NEXT: mullw 3, 3, 4
; CHECK-AIX-NEXT: li 4, 0
; CHECK-AIX-NEXT: neg 3, 3
+; CHECK-AIX-NEXT: sldi 3, 3, 48
; CHECK-AIX-NEXT: vsplth 2, 2, 0
; CHECK-AIX-NEXT: stxvw4x 34, 0, 4
-; CHECK-AIX-NEXT: sth 3, -32(1)
+; CHECK-AIX-NEXT: std 3, -32(1)
+; CHECK-AIX-NEXT: std 3, -24(1)
; CHECK-AIX-NEXT: addi 3, 1, -32
; CHECK-AIX-NEXT: lxvw4x 34, 0, 3
; CHECK-AIX-NEXT: vsplth 2, 2, 0
diff --git a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
index cc32a76b22c28..6d35a7281de6b 100644
--- a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
+++ b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
@@ -338,17 +338,16 @@ define <4 x i32> @build_v4i32_load_0(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_0:
; PWR7-BE: # %bb.0: # %entry
; PWR7-BE-NEXT: lwz 3, 0(3)
-; PWR7-BE-NEXT: li 4, 0
-; PWR7-BE-NEXT: stw 4, -16(1)
-; PWR7-BE-NEXT: stw 3, -32(1)
+; PWR7-BE-NEXT: xxlxor 36, 36, 36
+; PWR7-BE-NEXT: sldi 3, 3, 32
+; PWR7-BE-NEXT: std 3, -32(1)
+; PWR7-BE-NEXT: std 3, -24(1)
; PWR7-BE-NEXT: addis 3, 2, .LCPI8_0@toc@ha
; PWR7-BE-NEXT: addi 3, 3, .LCPI8_0@toc@l
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
-; PWR7-BE-NEXT: addi 3, 1, -16
-; PWR7-BE-NEXT: lxvw4x 35, 0, 3
; PWR7-BE-NEXT: addi 3, 1, -32
-; PWR7-BE-NEXT: lxvw4x 36, 0, 3
-; PWR7-BE-NEXT: vperm 2, 4, 3, 2
+; PWR7-BE-NEXT: lxvw4x 35, 0, 3
+; PWR7-BE-NEXT: vperm 2, 3, 4, 2
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_0:
@@ -402,17 +401,16 @@ define <4 x i32> @build_v4i32_load_1(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_1:
; PWR7-BE: # %bb.0: # %entry
; PWR7-BE-NEXT: lwz 3, 0(3)
-; PWR7-BE-NEXT: li 4, 0
-; PWR7-BE-NEXT: stw 4, -32(1)
-; PWR7-BE-NEXT: stw 3, -16(1)
+; PWR7-BE-NEXT: xxlxor 36, 36, 36
+; PWR7-BE-NEXT: sldi 3, 3, 32
+; PWR7-BE-NEXT: std 3, -16(1)
+; PWR7-BE-NEXT: std 3, -8(1)
; PWR7-BE-NEXT: addis 3, 2, .LCPI9_0@toc@ha
; PWR7-BE-NEXT: addi 3, 3, .LCPI9_0@toc@l
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
-; PWR7-BE-NEXT: addi 3, 1, -32
-; PWR7-BE-NEXT: lxvw4x 35, 0, 3
; PWR7-BE-NEXT: addi 3, 1, -16
-; PWR7-BE-NEXT: lxvw4x 36, 0, 3
-; PWR7-BE-NEXT: vperm 2, 3, 4, 2
+; PWR7-BE-NEXT: lxvw4x 35, 0, 3
+; PWR7-BE-NEXT: vperm 2, 4, 3, 2
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_1:
@@ -466,17 +464,16 @@ define <4 x i32> @build_v4i32_load_2(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_2:
; PWR7-BE: # %bb.0: # %entry
; PWR7-BE-NEXT: lwz 3, 0(3)
-; PWR7-BE-NEXT: li 4, 0
-; PWR7-BE-NEXT: stw 4, -32(1)
-; PWR7-BE-NEXT: stw 3, -16(1)
+; PWR7-BE-NEXT: xxlxor 36, 36, 36
+; PWR7-BE-NEXT: sldi 3, 3, 32
+; PWR7-BE-NEXT: std 3, -16(1)
+; PWR7-BE-NEXT: std 3, -8(1)
; PWR7-BE-NEXT: addis 3, 2, .LCPI10_0@toc@ha
; PWR7-BE-NEXT: addi 3, 3, .LCPI10_0@toc@l
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
-; PWR7-BE-NEXT: addi 3, 1, -32
-; PWR7-BE-NEXT: lxvw4x 35, 0, 3
; PWR7-BE-NEXT: addi 3, 1, -16
-; PWR7-BE-NEXT: lxvw4x 36, 0, 3
-; PWR7-BE-NEXT: vperm 2, 3, 4, 2
+; PWR7-BE-NEXT: lxvw4x 35, 0, 3
+; PWR7-BE-NEXT: vperm 2, 4, 3, 2
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_2:
@@ -530,17 +527,16 @@ define <4 x i32> @build_v4i32_load_3(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_3:
; PWR7-BE: # %bb.0: # %entry
; PWR7-BE-NEXT: lwz 3, 0(3)
-; PWR7-BE-NEXT: li 4, 0
-; PWR7-BE-NEXT: stw 4, -32(1)
-; PWR7-BE-NEXT: stw 3, -16(1)
+; PWR7-BE-NEXT: xxlxor 36, 36, 36
+; PWR7-BE-NEXT: sldi 3, 3, 32
+; PWR7-BE-NEXT: std 3, -16(1)
+; PWR7-BE-NEXT: std 3, -8(1)
; PWR7-BE-NEXT: addis 3, 2, .LCPI11_0@toc@ha
; PWR7-BE-NEXT: addi 3, 3, .LCPI11_0@toc@l
; PWR7-BE-NEXT: lxvw4x 34, 0, 3
-; PWR7-BE-NEXT: addi 3, 1, -32
-; PWR7-BE-NEXT: lxvw4x 35, 0, 3
; PWR7-BE-NEXT: addi 3, 1, -16
-; PWR7-BE-NEXT: lxvw4x 36, 0, 3
-; PWR7-BE-NEXT: vperm 2, 3, 4, 2
+; PWR7-BE-NEXT: lxvw4x 35, 0, 3
+; PWR7-BE-NEXT: vperm 2, 4, 3, 2
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_3:
diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
index 6d2f3b3abc42d..bc68ad2a67bf5 100644
--- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll
+++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
@@ -591,6 +591,7 @@ define <16 x i8> @adjusted_lxvwsx(ptr %s, ptr %t) {
; P7: # %bb.0: # %entry
; P7-NEXT: ld r3, 0(r3)
; P7-NEXT: std r3, -16(r1)
+; P7-NEXT: std r3, -8(r1)
; P7-NEXT: addi r3, r1, -16
; P7-NEXT: lxvw4x vs0, 0, r3
; P7-NEXT: xxspltw v2, vs0, 1
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
index 7a6640fea2d1e..bf8c5c96ccbde 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
@@ -833,8 +833,18 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
;
; CHECK-PWR7-LABEL: sub_absv_8_ext:
; CHECK-PWR7: # %bb.0: # %entry
-; CHECK-PWR7-NEXT: stdu r1, -400(r1)
-; CHECK-PWR7-NEXT: .cfi_def_cfa_offset 400
+; CHECK-PWR7-NEXT: stdu r1, -512(r1)
+; CHECK-PWR7-NEXT: .cfi_def_cfa_offset 512
+; CHECK-PWR7-NEXT: .cfi_offset r14, -144
+; CHECK-PWR7-NEXT: .cfi_offset r15, -136
+; CHECK-PWR7-NEXT: .cfi_offset r16, -128
+; CHECK-PWR7-NEXT: .cfi_offset r17, -120
+; CHECK-PWR7-NEXT: .cfi_offset r18, -112
+; CHECK-PWR7-NEXT: .cfi_offset r19, -104
+; CHECK-PWR7-NEXT: .cfi_offset r20, -96
+; CHECK-PWR7-NEXT: .cfi_offset r21, -88
+; CHECK-PWR7-NEXT: .cfi_offset r22, -80
+; CHECK-PWR7-NEXT: .cfi_offset r23, -72
; CHECK-PWR7-NEXT: .cfi_offset r24, -64
; CHECK-PWR7-NEXT: .cfi_offset r25, -56
; CHECK-PWR7-NEXT: .cfi_offset r26, -48
@@ -842,184 +852,244 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
; CHECK-PWR7-NEXT: .cfi_offset r28, -32
; CHECK-PWR7-NEXT: .cfi_offset r29, -24
; CHECK-PWR7-NEXT: .cfi_offset r30, -16
-; CHECK-PWR7-NEXT: addi r3, r1, 304
-; CHECK-PWR7-NEXT: std r24, 336(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r25, 344(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r26, 352(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r27, 360(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r28, 368(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r29, 376(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r30, 384(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: stxvw4x v2, 0, r3
+; CHECK-PWR7-NEXT: .cfi_offset r31, -8
+; CHECK-PWR7-NEXT: .cfi_offset r2, -152
; CHECK-PWR7-NEXT: addi r3, r1, 320
-; CHECK-PWR7-NEXT: lbz r4, 304(r1)
-; CHECK-PWR7-NEXT: stxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: lbz r5, 305(r1)
-; CHECK-PWR7-NEXT: lbz r6, 321(r1)
-; CHECK-PWR7-NEXT: lbz r7, 306(r1)
-; CHECK-PWR7-NEXT: lbz r8, 322(r1)
-; CHECK-PWR7-NEXT: lbz r9, 307(r1)
-; CHECK-PWR7-NEXT: lbz r10, 323(r1)
-; CHECK-PWR7-NEXT: lbz r0, 309(r1)
-; CHECK-PWR7-NEXT: lbz r30, 325(r1)
-; CHECK-PWR7-NEXT: lbz r29, 310(r1)
-; CHECK-PWR7-NEXT: lbz r28, 326(r1)
-; CHECK-PWR7-NEXT: lbz r11, 308(r1)
-; CHECK-PWR7-NEXT: lbz r12, 324(r1)
-; CHECK-PWR7-NEXT: lbz r27, 311(r1)
-; CHECK-PWR7-NEXT: lbz r26, 327(r1)
-; CHECK-PWR7-NEXT: lbz r25, 312(r1)
-; CHECK-PWR7-NEXT: sub r5, r5, r6
-; CHECK-PWR7-NEXT: sub r6, r7, r8
-; CHECK-PWR7-NEXT: sub r7, r9, r10
-; CHECK-PWR7-NEXT: sub r9, r0, r30
-; CHECK-PWR7-NEXT: sub r10, r29, r28
-; CHECK-PWR7-NEXT: sub r8, r11, r12
-; CHECK-PWR7-NEXT: srawi r0, r5, 31
-; CHECK-PWR7-NEXT: srawi r30, r6, 31
-; CHECK-PWR7-NEXT: srawi r29, r7, 31
-; CHECK-PWR7-NEXT: srawi r28, r8, 31
-; CHECK-PWR7-NEXT: sub r11, r27, r26
-; CHECK-PWR7-NEXT: srawi r27, r9, 31
-; CHECK-PWR7-NEXT: lbz r24, 328(r1)
-; CHECK-PWR7-NEXT: xor r5, r5, r0
-; CHECK-PWR7-NEXT: xor r6, r6, r30
-; CHECK-PWR7-NEXT: xor r7, r7, r29
-; CHECK-PWR7-NEXT: xor r8, r8, r28
-; CHECK-PWR7-NEXT: xor r9, r9, r27
-; CHECK-PWR7-NEXT: srawi r26, r10, 31
-; CHECK-PWR7-NEXT: sub r5, r5, r0
-; CHECK-PWR7-NEXT: sub r6, r6, r30
-; CHECK-PWR7-NEXT: lbz r0, 313(r1)
-; CHECK-PWR7-NEXT: lbz r30, 329(r1)
-; CHECK-PWR7-NEXT: sub r7, r7, r29
-; CHECK-PWR7-NEXT: lbz r29, 330(r1)
-; CHECK-PWR7-NEXT: sub r8, r8, r28
-; CHECK-PWR7-NEXT: lbz r28, 331(r1)
-; CHECK-PWR7-NEXT: sub r9, r9, r27
-; CHECK-PWR7-NEXT: lbz r27, 332(r1)
-; CHECK-PWR7-NEXT: xor r10, r10, r26
-; CHECK-PWR7-NEXT: sub r10, r10, r26
-; CHECK-PWR7-NEXT: lbz r26, 333(r1)
-; CHECK-PWR7-NEXT: sub r12, r25, r24
-; CHECK-PWR7-NEXT: srawi r25, r11, 31
+; CHECK-PWR7-NEXT: std r14, 368(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r15, 376(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r16, 384(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r17, 392(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r18, 400(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r19, 408(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r20, 416(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r21, 424(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r22, 432(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r23, 440(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r24, 448(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r25, 456(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r26, 464(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r27, 472(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r28, 480(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r29, 488(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r30, 496(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r31, 504(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r2, 360(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: stxvw4x v2, 0, r3
; CHECK-PWR7-NEXT: lbz r3, 320(r1)
+; CHECK-PWR7-NEXT: addi r4, r1, 336
+; CHECK-PWR7-NEXT: stw r3, 60(r1) # 4-byte Folded Spill
+; CHECK-PWR7-NEXT: stxvw4x v3, 0, r4
+; CHECK-PWR7-NEXT: lbz r15, 334(r1)
+; CHECK-PWR7-NEXT: lbz r14, 350(r1)
+; CHECK-PWR7-NEXT: lbz r31, 335(r1)
+; CHECK-PWR7-NEXT: lbz r2, 351(r1)
+; CHECK-PWR7-NEXT: sub r15, r15, r14
+; CHECK-PWR7-NEXT: sub r14, r31, r2
+; CHECK-PWR7-NEXT: srawi r2, r14, 31
+; CHECK-PWR7-NEXT: xor r14, r14, r2
+; CHECK-PWR7-NEXT: lbz r3, 333(r1)
+; CHECK-PWR7-NEXT: lbz r19, 331(r1)
+; CHECK-PWR7-NEXT: lbz r18, 347(r1)
+; CHECK-PWR7-NEXT: sub r19, r19, r18
+; CHECK-PWR7-NEXT: lbz r17, 332(r1)
+; CHECK-PWR7-NEXT: lbz r16, 348(r1)
+; CHECK-PWR7-NEXT: sub r17, r17, r16
+; CHECK-PWR7-NEXT: lbz r23, 329(r1)
+; CHECK-PWR7-NEXT: sub r14, r14, r2
+; CHECK-PWR7-NEXT: lbz r2, 349(r1)
+; CHECK-PWR7-NEXT: lbz r22, 345(r1)
+; CHECK-PWR7-NEXT: lbz r4, 336(r1)
+; CHECK-PWR7-NEXT: lbz r5, 321(r1)
+; CHECK-PWR7-NEXT: lbz r6, 337(r1)
+; CHECK-PWR7-NEXT: lbz r7, 322(r1)
+; CHECK-PWR7-NEXT: lbz r8, 338(r1)
+; CHECK-PWR7-NEXT: lbz r9, 323(r1)
+; CHECK-PWR7-NEXT: lbz r10, 339(r1)
+; CHECK-PWR7-NEXT: lbz r11, 324(r1)
+; CHECK-PWR7-NEXT: lbz r12, 340(r1)
+; CHECK-PWR7-NEXT: lbz r0, 325(r1)
+; CHECK-PWR7-NEXT: lbz r30, 341(r1)
+; CHECK-PWR7-NEXT: lbz r29, 326(r1)
+; CHECK-PWR7-NEXT: lbz r28, 342(r1)
+; CHECK-PWR7-NEXT: lbz r27, 327(r1)
+; CHECK-PWR7-NEXT: lbz r26, 343(r1)
+; CHECK-PWR7-NEXT: sub r3, r3, r2
+; CHECK-PWR7-NEXT: lbz r25, 328(r1)
+; CHECK-PWR7-NEXT: lbz r24, 344(r1)
+; CHECK-PWR7-NEXT: lbz r21, 330(r1)
+; CHECK-PWR7-NEXT: lbz r20, 346(r1)
+; CHECK-PWR7-NEXT: sub r5, r5, r6
+; CHECK-PWR7-NEXT: srawi r18, r3, 31
+; CHECK-PWR7-NEXT: sub r7, r7, r8
+; CHECK-PWR7-NEXT: sub r9, r9, r10
+; CHECK-PWR7-NEXT: sub r11, r11, r12
; CHECK-PWR7-NEXT: sub r0, r0, r30
-; CHECK-PWR7-NEXT: xor r11, r11, r25
-; CHECK-PWR7-NEXT: sub r11, r11, r25
-; CHECK-PWR7-NEXT: lbz r25, 334(r1)
-; CHECK-PWR7-NEXT: sub r4, r4, r3
-; CHECK-PWR7-NEXT: srawi r30, r0, 31
-; CHECK-PWR7-NEXT: srawi r24, r12, 31
-; CHECK-PWR7-NEXT: xor r12, r12, r24
-; CHECK-PWR7-NEXT: sub r12, r12, r24
-; CHECK-PWR7-NEXT: lbz r24, 335(r1)
-; CHECK-PWR7-NEXT: srawi r3, r4, 31
-; CHECK-PWR7-NEXT: xor r4, r4, r3
-; CHECK-PWR7-NEXT: xor r0, r0, r30
-; CHECK-PWR7-NEXT: sub r3, r4, r3
-; CHECK-PWR7-NEXT: stb r3, 48(r1)
-; CHECK-PWR7-NEXT: addi r3, r1, 288
-; CHECK-PWR7-NEXT: stb r12, 176(r1)
-; CHECK-PWR7-NEXT: sub r0, r0, r30
-; CHECK-PWR7-NEXT: lbz r30, 314(r1)
-; CHECK-PWR7-NEXT: stb r11, 160(r1)
-; CHECK-PWR7-NEXT: sub r30, r30, r29
-; CHECK-PWR7-NEXT: stb r0, 192(r1)
-; CHECK-PWR7-NEXT: stb r10, 144(r1)
-; CHECK-PWR7-NEXT: stb r9, 128(r1)
-; CHECK-PWR7-NEXT: stb r8, 112(r1)
-; CHECK-PWR7-NEXT: stb r7, 96(r1)
-; CHECK-PWR7-NEXT: stb r6, 80(r1)
-; CHECK-PWR7-NEXT: srawi r29, r30, 31
-; CHECK-PWR7-NEXT: stb r5, 64(r1)
-; CHECK-PWR7-NEXT: xor r30, r30, r29
-; CHECK-PWR7-NEXT: sub r30, r30, r29
-; CHECK-PWR7-NEXT: lbz r29, 315(r1)
; CHECK-PWR7-NEXT: sub r29, r29, r28
-; CHECK-PWR7-NEXT: stb r30, 208(r1)
-; CHECK-PWR7-NEXT: ld r30, 384(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r28, r29, 31
-; CHECK-PWR7-NEXT: xor r29, r29, r28
-; CHECK-PWR7-NEXT: sub r29, r29, r28
-; CHECK-PWR7-NEXT: lbz r28, 316(r1)
-; CHECK-PWR7-NEXT: sub r28, r28, r27
-; CHECK-PWR7-NEXT: stb r29, 224(r1)
-; CHECK-PWR7-NEXT: ld r29, 376(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r27, r28, 31
-; CHECK-PWR7-NEXT: xor r28, r28, r27
-; CHECK-PWR7-NEXT: sub r28, r28, r27
-; CHECK-PWR7-NEXT: lbz r27, 317(r1)
; CHECK-PWR7-NEXT: sub r27, r27, r26
-; CHECK-PWR7-NEXT: stb r28, 240(r1)
-; CHECK-PWR7-NEXT: ld r28, 368(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r26, r27, 31
-; CHECK-PWR7-NEXT: xor r27, r27, r26
-; CHECK-PWR7-NEXT: sub r27, r27, r26
-; CHECK-PWR7-NEXT: lbz r26, 318(r1)
-; CHECK-PWR7-NEXT: sub r26, r26, r25
-; CHECK-PWR7-NEXT: stb r27, 256(r1)
-; CHECK-PWR7-NEXT: ld r27, 360(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r25, r26, 31
-; CHECK-PWR7-NEXT: xor r26, r26, r25
-; CHECK-PWR7-NEXT: sub r26, r26, r25
-; CHECK-PWR7-NEXT: lbz r25, 319(r1)
; CHECK-PWR7-NEXT: sub r25, r25, r24
-; CHECK-PWR7-NEXT: stb r26, 272(r1)
-; CHECK-PWR7-NEXT: ld r26, 352(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: srawi r31, r15, 31
+; CHECK-PWR7-NEXT: ld r2, 360(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: xor r3, r3, r18
+; CHECK-PWR7-NEXT: srawi r6, r5, 31
+; CHECK-PWR7-NEXT: srawi r8, r7, 31
+; CHECK-PWR7-NEXT: srawi r10, r9, 31
+; CHECK-PWR7-NEXT: srawi r12, r11, 31
+; CHECK-PWR7-NEXT: srawi r30, r0, 31
+; CHECK-PWR7-NEXT: sub r3, r3, r18
+; CHECK-PWR7-NEXT: srawi r18, r19, 31
+; CHECK-PWR7-NEXT: srawi r28, r29, 31
+; CHECK-PWR7-NEXT: ld r16, 384(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: sldi r3, r3, 56
+; CHECK-PWR7-NEXT: srawi r26, r27, 31
; CHECK-PWR7-NEXT: srawi r24, r25, 31
+; CHECK-PWR7-NEXT: xor r19, r19, r18
+; CHECK-PWR7-NEXT: xor r15, r15, r31
+; CHECK-PWR7-NEXT: xor r5, r5, r6
+; CHECK-PWR7-NEXT: std r3, 272(r1)
+; CHECK-PWR7-NEXT: std r3, 280(r1)
+; CHECK-PWR7-NEXT: srawi r3, r17, 31
+; CHECK-PWR7-NEXT: sub r19, r19, r18
+; CHECK-PWR7-NEXT: xor r7, r7, r8
+; CHECK-PWR7-NEXT: sub r15, r15, r31
+; CHECK-PWR7-NEXT: xor r17, r17, r3
+; CHECK-PWR7-NEXT: xor r9, r9, r10
+; CHECK-PWR7-NEXT: xor r11, r11, r12
+; CHECK-PWR7-NEXT: xor r0, r0, r30
+; CHECK-PWR7-NEXT: xor r29, r29, r28
+; CHECK-PWR7-NEXT: xor r27, r27, r26
+; CHECK-PWR7-NEXT: sub r3, r17, r3
; CHECK-PWR7-NEXT: xor r25, r25, r24
; CHECK-PWR7-NEXT: sub r25, r25, r24
-; CHECK-PWR7-NEXT: ld r24, 336(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: stb r25, 288(r1)
-; CHECK-PWR7-NEXT: ld r25, 344(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: sub r27, ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/102330
More information about the llvm-commits
mailing list