[llvm] 63cd184 - [PowerPC] use lvx + splat directly for aligned splat load
Chen Zheng via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 7 18:02:28 PST 2021
Author: Chen Zheng
Date: 2021-12-08T02:02:18Z
New Revision: 63cd1842a7f3bdcb0d10b0a007242634ce16db88
URL: https://github.com/llvm/llvm-project/commit/63cd1842a7f3bdcb0d10b0a007242634ce16db88
DIFF: https://github.com/llvm/llvm-project/commit/63cd1842a7f3bdcb0d10b0a007242634ce16db88.diff
LOG: [PowerPC] use lvx + splat directly for aligned splat load
Reviewed By: nemanjai
Differential Revision: https://reviews.llvm.org/D114062
Added:
Modified:
llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
llvm/lib/Target/PowerPC/PPCInstrVSX.td
llvm/test/CodeGen/PowerPC/load-and-splat.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index a2664bcff4ab0..ba74af5ef5f78 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -4464,9 +4464,10 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const {
LoadSDNode *LDN = dyn_cast<LoadSDNode>(N);
StoreSDNode *STN = dyn_cast<StoreSDNode>(N);
+ MemIntrinsicSDNode *MIN = dyn_cast<MemIntrinsicSDNode>(N);
SDValue AddrOp;
- if (LDN)
- AddrOp = LDN->getOperand(1);
+ if (LDN || (MIN && MIN->getOpcode() == PPCISD::LD_SPLAT))
+ AddrOp = N->getOperand(1);
else if (STN)
AddrOp = STN->getOperand(2);
@@ -5973,6 +5974,15 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
if (Type != MVT::v16i8 && Type != MVT::v8i16)
break;
+ // If the alignment for the load is 16 or bigger, we don't need the
+ // permuted mask to get the required value. The value must be element 0
+ // on a big endian target, or element 7/15 (v8i16/v16i8) on a little
+ // endian target, of the result vsx register of the lvx instruction.
+ // The instruction is then selected by the patterns in the .td file.
+ if (cast<MemIntrinsicSDNode>(N)->getAlign() >= Align(16) &&
+ isOffsetMultipleOf(N, 16))
+ break;
+
SDValue ZeroReg =
CurDAG->getRegister(Subtarget->isPPC64() ? PPC::ZERO8 : PPC::ZERO,
Subtarget->isPPC64() ? MVT::i64 : MVT::i32);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index d92a10c5b2081..110f7d79fbc55 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -158,6 +158,11 @@ def HasP9Vector : Predicate<"Subtarget->hasP9Vector()">;
def NoP9Altivec : Predicate<"!Subtarget->hasP9Altivec()">;
def NoP10Vector: Predicate<"!Subtarget->hasP10Vector()">;
+def PPCldsplatAlign16 : PatFrag<(ops node:$ptr), (PPCldsplat node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getAlign() >= Align(16) &&
+ isOffsetMultipleOf(N, 16);
+}]>;
+
//--------------------- VSX-specific instruction formats ---------------------//
// By default, all VSX instructions are to be selected over their Altivec
// counter parts and they do not have unmodeled sideeffects.
@@ -3180,6 +3185,12 @@ defm : ScalToVecWPermute<
v2f64, (f64 (load ForceXForm:$src)),
(XXPERMDIs (XFLOADf64 ForceXForm:$src), 2),
(SUBREG_TO_REG (i64 1), (XFLOADf64 ForceXForm:$src), sub_64)>;
+
+// Splat loads.
+def : Pat<(v8i16 (PPCldsplatAlign16 ForceXForm:$A)),
+ (v8i16 (VSPLTH 7, (LVX ForceXForm:$A)))>;
+def : Pat<(v16i8 (PPCldsplatAlign16 ForceXForm:$A)),
+ (v16i8 (VSPLTB 15, (LVX ForceXForm:$A)))>;
} // HasVSX, NoP9Vector, IsLittleEndian
let Predicates = [HasVSX, NoP9Vector, IsBigEndian] in {
@@ -3187,6 +3198,12 @@ let Predicates = [HasVSX, NoP9Vector, IsBigEndian] in {
(LXVD2X ForceXForm:$src)>;
def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, ForceXForm:$dst),
(STXVD2X $rS, ForceXForm:$dst)>;
+
+ // Splat loads.
+ def : Pat<(v8i16 (PPCldsplatAlign16 ForceXForm:$A)),
+ (v8i16 (VSPLTH 0, (LVX ForceXForm:$A)))>;
+ def : Pat<(v16i8 (PPCldsplatAlign16 ForceXForm:$A)),
+ (v16i8 (VSPLTB 0, (LVX ForceXForm:$A)))>;
} // HasVSX, NoP9Vector, IsBigEndian
// Any VSX subtarget that only has loads and stores that load in big endian
diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
index 83ee78e63716b..ce44cdc18aae1 100644
--- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll
+++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
@@ -1065,18 +1065,13 @@ define <8 x i16> @test_aligned_v8i16_1(i16* %Ptr) {
;
; P8-LABEL: test_aligned_v8i16_1:
; P8: # %bb.0: # %entry
-; P8-NEXT: lhzx r3, 0, r3
-; P8-NEXT: mtvsrwz v2, r3
-; P8-NEXT: vsplth v2, v2, 3
+; P8-NEXT: lvx v2, 0, r3
+; P8-NEXT: vsplth v2, v2, 7
; P8-NEXT: blr
;
; P7-LABEL: test_aligned_v8i16_1:
; P7: # %bb.0: # %entry
-; P7-NEXT: li r4, 1
; P7-NEXT: lvx v2, 0, r3
-; P7-NEXT: lvsl v4, 0, r3
-; P7-NEXT: lvx v3, r4, r3
-; P7-NEXT: vperm v2, v2, v3, v4
; P7-NEXT: vsplth v2, v2, 0
; P7-NEXT: blr
;
@@ -1088,18 +1083,13 @@ define <8 x i16> @test_aligned_v8i16_1(i16* %Ptr) {
;
; P8-AIX32-LABEL: test_aligned_v8i16_1:
; P8-AIX32: # %bb.0: # %entry
-; P8-AIX32-NEXT: lhzx r3, 0, r3
-; P8-AIX32-NEXT: mtvsrwz v2, r3
-; P8-AIX32-NEXT: vsplth v2, v2, 3
+; P8-AIX32-NEXT: lvx v2, 0, r3
+; P8-AIX32-NEXT: vsplth v2, v2, 0
; P8-AIX32-NEXT: blr
;
; P7-AIX32-LABEL: test_aligned_v8i16_1:
; P7-AIX32: # %bb.0: # %entry
-; P7-AIX32-NEXT: li r4, 1
; P7-AIX32-NEXT: lvx v2, 0, r3
-; P7-AIX32-NEXT: lvsl v4, 0, r3
-; P7-AIX32-NEXT: lvx v3, r4, r3
-; P7-AIX32-NEXT: vperm v2, v2, v3, v4
; P7-AIX32-NEXT: vsplth v2, v2, 0
; P7-AIX32-NEXT: blr
entry:
@@ -1119,19 +1109,15 @@ define <8 x i16> @test_aligned_v8i16_2(i16* %Ptr) {
;
; P8-LABEL: test_aligned_v8i16_2:
; P8: # %bb.0: # %entry
-; P8-NEXT: lhz r3, 32(r3)
-; P8-NEXT: mtvsrwz v2, r3
-; P8-NEXT: vsplth v2, v2, 3
+; P8-NEXT: addi r3, r3, 32
+; P8-NEXT: lvx v2, 0, r3
+; P8-NEXT: vsplth v2, v2, 7
; P8-NEXT: blr
;
; P7-LABEL: test_aligned_v8i16_2:
; P7: # %bb.0: # %entry
-; P7-NEXT: li r4, 1
; P7-NEXT: addi r3, r3, 32
; P7-NEXT: lvx v2, 0, r3
-; P7-NEXT: lvx v3, r4, r3
-; P7-NEXT: lvsl v4, 0, r3
-; P7-NEXT: vperm v2, v2, v3, v4
; P7-NEXT: vsplth v2, v2, 0
; P7-NEXT: blr
;
@@ -1144,19 +1130,15 @@ define <8 x i16> @test_aligned_v8i16_2(i16* %Ptr) {
;
; P8-AIX32-LABEL: test_aligned_v8i16_2:
; P8-AIX32: # %bb.0: # %entry
-; P8-AIX32-NEXT: lhz r3, 32(r3)
-; P8-AIX32-NEXT: mtvsrwz v2, r3
-; P8-AIX32-NEXT: vsplth v2, v2, 3
+; P8-AIX32-NEXT: addi r3, r3, 32
+; P8-AIX32-NEXT: lvx v2, 0, r3
+; P8-AIX32-NEXT: vsplth v2, v2, 0
; P8-AIX32-NEXT: blr
;
; P7-AIX32-LABEL: test_aligned_v8i16_2:
; P7-AIX32: # %bb.0: # %entry
-; P7-AIX32-NEXT: li r4, 1
; P7-AIX32-NEXT: addi r3, r3, 32
; P7-AIX32-NEXT: lvx v2, 0, r3
-; P7-AIX32-NEXT: lvx v3, r4, r3
-; P7-AIX32-NEXT: lvsl v4, 0, r3
-; P7-AIX32-NEXT: vperm v2, v2, v3, v4
; P7-AIX32-NEXT: vsplth v2, v2, 0
; P7-AIX32-NEXT: blr
entry:
@@ -1176,16 +1158,13 @@ define <16 x i8> @test_aligned_v16i8_1(i8* %Ptr) {
;
; P8-LABEL: test_aligned_v16i8_1:
; P8: # %bb.0: # %entry
-; P8-NEXT: lbzx r3, 0, r3
-; P8-NEXT: mtvsrwz v2, r3
-; P8-NEXT: vspltb v2, v2, 7
+; P8-NEXT: lvx v2, 0, r3
+; P8-NEXT: vspltb v2, v2, 15
; P8-NEXT: blr
;
; P7-LABEL: test_aligned_v16i8_1:
; P7: # %bb.0: # %entry
-; P7-NEXT: lvsl v2, 0, r3
-; P7-NEXT: lvx v3, 0, r3
-; P7-NEXT: vperm v2, v3, v3, v2
+; P7-NEXT: lvx v2, 0, r3
; P7-NEXT: vspltb v2, v2, 0
; P7-NEXT: blr
;
@@ -1197,16 +1176,13 @@ define <16 x i8> @test_aligned_v16i8_1(i8* %Ptr) {
;
; P8-AIX32-LABEL: test_aligned_v16i8_1:
; P8-AIX32: # %bb.0: # %entry
-; P8-AIX32-NEXT: lbzx r3, 0, r3
-; P8-AIX32-NEXT: mtvsrwz v2, r3
-; P8-AIX32-NEXT: vspltb v2, v2, 7
+; P8-AIX32-NEXT: lvx v2, 0, r3
+; P8-AIX32-NEXT: vspltb v2, v2, 0
; P8-AIX32-NEXT: blr
;
; P7-AIX32-LABEL: test_aligned_v16i8_1:
; P7-AIX32: # %bb.0: # %entry
-; P7-AIX32-NEXT: lvsl v2, 0, r3
-; P7-AIX32-NEXT: lvx v3, 0, r3
-; P7-AIX32-NEXT: vperm v2, v3, v3, v2
+; P7-AIX32-NEXT: lvx v2, 0, r3
; P7-AIX32-NEXT: vspltb v2, v2, 0
; P7-AIX32-NEXT: blr
entry:
@@ -1226,17 +1202,15 @@ define <16 x i8> @test_aligned_v16i8_2(i8* %Ptr) {
;
; P8-LABEL: test_aligned_v16i8_2:
; P8: # %bb.0: # %entry
-; P8-NEXT: lbz r3, 16(r3)
-; P8-NEXT: mtvsrwz v2, r3
-; P8-NEXT: vspltb v2, v2, 7
+; P8-NEXT: addi r3, r3, 16
+; P8-NEXT: lvx v2, 0, r3
+; P8-NEXT: vspltb v2, v2, 15
; P8-NEXT: blr
;
; P7-LABEL: test_aligned_v16i8_2:
; P7: # %bb.0: # %entry
; P7-NEXT: addi r3, r3, 16
-; P7-NEXT: lvsl v2, 0, r3
-; P7-NEXT: lvx v3, 0, r3
-; P7-NEXT: vperm v2, v3, v3, v2
+; P7-NEXT: lvx v2, 0, r3
; P7-NEXT: vspltb v2, v2, 0
; P7-NEXT: blr
;
@@ -1249,17 +1223,15 @@ define <16 x i8> @test_aligned_v16i8_2(i8* %Ptr) {
;
; P8-AIX32-LABEL: test_aligned_v16i8_2:
; P8-AIX32: # %bb.0: # %entry
-; P8-AIX32-NEXT: lbz r3, 16(r3)
-; P8-AIX32-NEXT: mtvsrwz v2, r3
-; P8-AIX32-NEXT: vspltb v2, v2, 7
+; P8-AIX32-NEXT: addi r3, r3, 16
+; P8-AIX32-NEXT: lvx v2, 0, r3
+; P8-AIX32-NEXT: vspltb v2, v2, 0
; P8-AIX32-NEXT: blr
;
; P7-AIX32-LABEL: test_aligned_v16i8_2:
; P7-AIX32: # %bb.0: # %entry
; P7-AIX32-NEXT: addi r3, r3, 16
-; P7-AIX32-NEXT: lvsl v2, 0, r3
-; P7-AIX32-NEXT: lvx v3, 0, r3
-; P7-AIX32-NEXT: vperm v2, v3, v3, v2
+; P7-AIX32-NEXT: lvx v2, 0, r3
; P7-AIX32-NEXT: vspltb v2, v2, 0
; P7-AIX32-NEXT: blr
entry:
More information about the llvm-commits
mailing list