[llvm] 755e008 - [X86] Remove isel patterns for X86VBroadcast+trunc+extload. Replace with DAG combines.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 13 18:19:51 PDT 2020
Author: Craig Topper
Date: 2020-03-13T18:12:16-07:00
New Revision: 755e00876cd383785668e88c871c4e4ef9b648a9
URL: https://github.com/llvm/llvm-project/commit/755e00876cd383785668e88c871c4e4ef9b648a9
DIFF: https://github.com/llvm/llvm-project/commit/755e00876cd383785668e88c871c4e4ef9b648a9.diff
LOG: [X86] Remove isel patterns for X86VBroadcast+trunc+extload. Replace with DAG combines.
This is a little more complicated than I'd like it to be. We have to
manually match a trunc+srl+load pattern because generic DAG combine
won't do the load narrowing for us, due to isTypeDesirableForOp.
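To make the srl case concrete: on a little-endian target like x86, truncating
Val >> ShiftAmt to 16 bits reads the same bytes as a 16-bit load taken at byte
offset ShiftAmt / 8, when ShiftAmt is a multiple of 16. A minimal standalone
C++ sketch of that equivalence (illustration only, not LLVM code; the names
are made up):

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  int main() {
    uint64_t Val = 0x1122334455667788ULL;
    unsigned char Mem[8];
    std::memcpy(Mem, &Val, 8);                     // the original i64 load
    for (unsigned ShiftAmt = 0; ShiftAmt < 64; ShiftAmt += 16) {
      uint16_t Trunc = static_cast<uint16_t>(Val >> ShiftAmt);
      uint16_t Narrow;
      std::memcpy(&Narrow, Mem + ShiftAmt / 8, 2); // the narrowed i16 load
      assert(Trunc == Narrow);
    }
  }

This is why the combine below can fold the shift amount into the pointer
(Offset = ShiftAmt / 8) instead of emitting any shift at all.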
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86InstrAVX512.td
llvm/lib/Target/X86/X86InstrSSE.td
llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f71ec4840409..521dc578da98 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35231,22 +35231,74 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
// Due to isTypeDesirableForOp, we won't always shrink a load truncated to
// i16. So shrink it ourselves if we can make a broadcast_load.
if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
- Src.hasOneUse() && ISD::isNormalLoad(Src.getOperand(0).getNode()) &&
- Src.getOperand(0).hasOneUse()) {
+ Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
assert(Subtarget.hasAVX2() && "Expected AVX2");
- LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
- if (LN->isSimple()) {
- SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
- SDValue BcastLd =
- DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
- MVT::i16, LN->getPointerInfo(),
- LN->getAlignment(),
- LN->getMemOperand()->getFlags());
- DCI.CombineTo(N.getNode(), BcastLd);
- DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
- DCI.recursivelyDeleteUnusedNodes(LN);
- return N; // Return N so it doesn't get rechecked!
+ SDValue TruncIn = Src.getOperand(0);
+
+ // If this is a truncate of a non-extending load, we can just narrow it
+ // to use a broadcast_load.
+ if (ISD::isNormalLoad(TruncIn.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
+ // Unless it's volatile or atomic.
+ if (LN->isSimple()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
+ MVT::i16, LN->getPointerInfo(),
+ LN->getAlignment(),
+ LN->getMemOperand()->getFlags());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // If this is a truncate of an i16 extload, we can directly replace it.
+ if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
+ ISD::isEXTLoad(Src.getOperand(0).getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
+ if (LN->getMemoryVT().getSizeInBits() == 16) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // If this is a truncate of a load that has been shifted right, we can
+ // offset the pointer and use a narrower load.
+ if (TruncIn.getOpcode() == ISD::SRL &&
+ TruncIn.getOperand(0).hasOneUse() &&
+ isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
+ ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
+ unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
+ // Make sure the shift amount and the load size are divisible by 16.
+ // Don't do this if the load is volatile or atomic.
+ if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
+ LN->isSimple()) {
+ unsigned Offset = ShiftAmt / 8;
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(), Offset, DL);
+ SDValue Ops[] = { LN->getChain(), Ptr };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
+ MVT::i16,
+ LN->getPointerInfo().getWithOffset(Offset),
+ MinAlign(LN->getAlignment(), Offset),
+ LN->getMemOperand()->getFlags());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
}
}
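A side note on the alignment computation in the srl case above:
MinAlign(Alignment, Offset) yields the largest power of two that divides both
the original alignment and the byte offset, which is the conservative
alignment known for base + Offset. A self-contained sketch of that behavior
(reimplemented here for illustration; the real helper lives in
llvm/Support/MathExtras.h):

  #include <cassert>
  #include <cstdint>

  // Largest power of two dividing both A and B: lowest set bit of A | B.
  constexpr uint64_t minAlign(uint64_t A, uint64_t B) {
    return (A | B) & (1 + ~(A | B));
  }

  int main() {
    assert(minAlign(4, 2) == 2);  // i64 load align 4, offset 2 -> align 2
    assert(minAlign(8, 2) == 2);
    assert(minAlign(16, 8) == 8);
  }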
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 12c8f0bcdd85..8ed20cdabf7a 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -1423,53 +1423,6 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
AVX5128IBase, EVEX;
}
-let Predicates = [HasVLX, HasBWI] in {
- // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
- // This means we'll encounter truncated i32 loads; match that here.
- def : Pat<(v8i16 (X86VBroadcast
- (i16 (trunc (extloadi32i16 addr:$src))))),
- (VPBROADCASTWZ128rm addr:$src)>;
- def : Pat<(v8i16 (X86VBroadcast
- (i16 (trunc (zextloadi32i16 addr:$src))))),
- (VPBROADCASTWZ128rm addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast
- (i16 (trunc (extloadi32i16 addr:$src))))),
- (VPBROADCASTWZ256rm addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast
- (i16 (trunc (zextloadi32i16 addr:$src))))),
- (VPBROADCASTWZ256rm addr:$src)>;
-
- def : Pat<(v8i16 (X86VBroadcast
- (i16 (trunc (extloadi64i16 addr:$src))))),
- (VPBROADCASTWZ128rm addr:$src)>;
- def : Pat<(v8i16 (X86VBroadcast
- (i16 (trunc (zextloadi64i16 addr:$src))))),
- (VPBROADCASTWZ128rm addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast
- (i16 (trunc (extloadi64i16 addr:$src))))),
- (VPBROADCASTWZ256rm addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast
- (i16 (trunc (zextloadi64i16 addr:$src))))),
- (VPBROADCASTWZ256rm addr:$src)>;
-}
-let Predicates = [HasBWI] in {
- // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
- // This means we'll encounter truncated i32 loads; match that here.
- def : Pat<(v32i16 (X86VBroadcast
- (i16 (trunc (extloadi32i16 addr:$src))))),
- (VPBROADCASTWZrm addr:$src)>;
- def : Pat<(v32i16 (X86VBroadcast
- (i16 (trunc (zextloadi32i16 addr:$src))))),
- (VPBROADCASTWZrm addr:$src)>;
-
- def : Pat<(v32i16 (X86VBroadcast
- (i16 (trunc (extloadi64i16 addr:$src))))),
- (VPBROADCASTWZrm addr:$src)>;
- def : Pat<(v32i16 (X86VBroadcast
- (i16 (trunc (zextloadi64i16 addr:$src))))),
- (VPBROADCASTWZrm addr:$src)>;
-}
-
//===----------------------------------------------------------------------===//
// AVX-512 BROADCAST SUBVECTORS
//
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 7e6fa62e132e..393d1bfe9ffe 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -7514,36 +7514,6 @@ defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastl
defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
v2i64, v4i64, NoVLX>;
-let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
- // This means we'll encounter truncated i32 loads; match that here.
- def : Pat<(v8i16 (X86VBroadcast
- (i16 (trunc (extloadi32i16 addr:$src))))),
- (VPBROADCASTWrm addr:$src)>;
- def : Pat<(v8i16 (X86VBroadcast
- (i16 (trunc (zextloadi32i16 addr:$src))))),
- (VPBROADCASTWrm addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast
- (i16 (trunc (extloadi32i16 addr:$src))))),
- (VPBROADCASTWYrm addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast
- (i16 (trunc (zextloadi32i16 addr:$src))))),
- (VPBROADCASTWYrm addr:$src)>;
-
- def : Pat<(v8i16 (X86VBroadcast
- (i16 (trunc (extloadi64i16 addr:$src))))),
- (VPBROADCASTWrm addr:$src)>;
- def : Pat<(v8i16 (X86VBroadcast
- (i16 (trunc (zextloadi64i16 addr:$src))))),
- (VPBROADCASTWrm addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast
- (i16 (trunc (extloadi64i16 addr:$src))))),
- (VPBROADCASTWYrm addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast
- (i16 (trunc (zextloadi64i16 addr:$src))))),
- (VPBROADCASTWYrm addr:$src)>;
-}
-
let Predicates = [HasAVX2, NoVLX] in {
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
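The AVX512 and SSE pattern blocks removed above matched the same
trunc-of-extending-load shapes; the new DAG combine supersedes both. For
context, a hedged example of source that can produce that shape, assuming
clang with -mavx2 (illustrative only):

  #include <immintrin.h>
  #include <cstdint>

  // Scalar i16 is not a desirable type on x86, so the loaded value tends
  // to be carried as an i32 extending load and truncated back to i16 only
  // at the broadcast, giving (i16 (trunc (zextloadi32i16 addr))).
  __m128i broadcast_word(const uint16_t *p) {
    uint32_t w = *p;                               // zextloadi32i16
    return _mm_set1_epi16(static_cast<short>(w));  // trunc + X86VBroadcast
  }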
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 15e6101df3ae..eb4e2edf157a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -3280,20 +3280,10 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i64(i64* %ptr) {
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: insert_dup_elt1_mem_v8i16_i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movq (%rdi), %rax
-; AVX2-NEXT: shrq $16, %rax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_dup_elt1_mem_v8i16_i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: movq (%rdi), %rax
-; AVX512VL-NEXT: shrq $16, %rax
-; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v8i16_i64:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %xmm0
+; AVX2OR512VL-NEXT: retq
;
; XOPAVX1-LABEL: insert_dup_elt1_mem_v8i16_i64:
; XOPAVX1: # %bb.0:
@@ -3304,10 +3294,7 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i64(i64* %ptr) {
;
; XOPAVX2-LABEL: insert_dup_elt1_mem_v8i16_i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: movq (%rdi), %rax
-; XOPAVX2-NEXT: shrq $16, %rax
-; XOPAVX2-NEXT: vmovd %eax, %xmm0
-; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; XOPAVX2-NEXT: vpbroadcastw 2(%rdi), %xmm0
; XOPAVX2-NEXT: retq
%tmp = load i64, i64* %ptr, align 4
%tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
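The test change above shows the payoff of the new srl handling: the element at
byte offset 2 is now broadcast straight from memory. A hedged intrinsics
version of what this test exercises, assuming clang with -mavx2 (the function
name is made up):

  #include <immintrin.h>
  #include <cstdint>

  // Broadcast the second 16-bit element of a 64-bit load. The shift right
  // by 16 plus truncate selects bytes 2..3, so the body should now lower
  // to a single vpbroadcastw 2(%rdi) instead of a scalar
  // load+shrq+vmovd+vpbroadcastw sequence.
  __m128i broadcast_elt1(const uint64_t *p) {
    uint16_t w = static_cast<uint16_t>(*p >> 16); // trunc(srl(load, 16))
    return _mm_set1_epi16(static_cast<short>(w));
  }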
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 6f18c5e897db..8c12c0d2e9fc 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -7500,20 +7500,10 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i64(i64* %ptr) {
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: insert_dup_elt1_mem_v16i16_i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movq (%rdi), %rax
-; AVX2-NEXT: shrq $16, %rax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_dup_elt1_mem_v16i16_i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: movq (%rdi), %rax
-; AVX512VL-NEXT: shrq $16, %rax
-; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v16i16_i64:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %ymm0
+; AVX2OR512VL-NEXT: retq
;
; XOPAVX1-LABEL: insert_dup_elt1_mem_v16i16_i64:
; XOPAVX1: # %bb.0:
@@ -7525,10 +7515,7 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i64(i64* %ptr) {
;
; XOPAVX2-LABEL: insert_dup_elt1_mem_v16i16_i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: movq (%rdi), %rax
-; XOPAVX2-NEXT: shrq $16, %rax
-; XOPAVX2-NEXT: vmovd %eax, %xmm0
-; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; XOPAVX2-NEXT: vpbroadcastw 2(%rdi), %ymm0
; XOPAVX2-NEXT: retq
%tmp = load i64, i64* %ptr, align 4
%tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 6724ba2e7d5e..0e79116884f4 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -310,18 +310,13 @@ define <32 x i16> @insert_dup_mem_v16i16_i64(i64* %ptr) {
define <32 x i16> @insert_dup_elt1_mem_v16i16_i64(i64* %ptr) {
; KNL-LABEL: insert_dup_elt1_mem_v16i16_i64:
; KNL: ## %bb.0:
-; KNL-NEXT: movq (%rdi), %rax
-; KNL-NEXT: shrq $16, %rax
-; KNL-NEXT: vmovd %eax, %xmm0
-; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0
; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: insert_dup_elt1_mem_v16i16_i64:
; SKX: ## %bb.0:
-; SKX-NEXT: movq (%rdi), %rax
-; SKX-NEXT: shrq $16, %rax
-; SKX-NEXT: vpbroadcastw %eax, %zmm0
+; SKX-NEXT: vpbroadcastw 2(%rdi), %zmm0
; SKX-NEXT: retq
%tmp = load i64, i64* %ptr, align 4
%tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0