[llvm] r370294 - [X86] Add a DAG combine to combine INSERTPS and VBROADCAST of a scalar load. Remove corresponding isel patterns.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 28 22:48:48 PDT 2019
Author: ctopper
Date: Wed Aug 28 22:48:48 2019
New Revision: 370294
URL: http://llvm.org/viewvc/llvm-project?rev=370294&view=rev
Log:
[X86] Add a DAG combine to combine INSERTPS and VBROADCAST of a scalar load. Remove corresponding isel patterns.
We had an isel pattern to perform this, but it's better to
do it in DAG combine as a simplification. This also fixes the lack
of patterns for AVX512 targets.
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/lib/Target/X86/X86InstrSSE.td
llvm/trunk/test/CodeGen/X86/sse41.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=370294&r1=370293&r2=370294&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Aug 28 22:48:48 2019
@@ -33550,46 +33550,57 @@ static SDValue combineTargetShuffle(SDVa
// Attempt to merge insertps Op0 with an inner target shuffle node.
SmallVector<int, 8> TargetMask0;
SmallVector<SDValue, 2> Ops0;
- if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
- return SDValue();
+ if (setTargetShuffleZeroElements(Op0, TargetMask0, Ops0)) {
+ bool Updated = false;
+ bool UseInput00 = false;
+ bool UseInput01 = false;
+ for (int i = 0; i != 4; ++i) {
+ int M = TargetMask0[i];
+ if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
+ // No change if element is already zero or the inserted element.
+ continue;
+ } else if (isUndefOrZero(M)) {
+ // If the target mask is undef/zero then we must zero the element.
+ InsertPSMask |= (1u << i);
+ Updated = true;
+ continue;
+ }
+
+ // The input vector element must be inline.
+ if (M != i && M != (i + 4))
+ return SDValue();
+
+ // Determine which inputs of the target shuffle we're using.
+ UseInput00 |= (0 <= M && M < 4);
+ UseInput01 |= (4 <= M);
+ }
- bool Updated = false;
- bool UseInput00 = false;
- bool UseInput01 = false;
- for (int i = 0; i != 4; ++i) {
- int M = TargetMask0[i];
- if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
- // No change if element is already zero or the inserted element.
- continue;
- } else if (isUndefOrZero(M)) {
- // If the target mask is undef/zero then we must zero the element.
- InsertPSMask |= (1u << i);
+ // If we're not using both inputs of the target shuffle then use the
+ // referenced input directly.
+ if (UseInput00 && !UseInput01) {
+ Updated = true;
+ Op0 = Ops0[0];
+ } else if (!UseInput00 && UseInput01) {
Updated = true;
- continue;
+ Op0 = Ops0[1];
}
- // The input vector element must be inline.
- if (M != i && M != (i + 4))
- return SDValue();
-
- // Determine which inputs of the target shuffle we're using.
- UseInput00 |= (0 <= M && M < 4);
- UseInput01 |= (4 <= M);
- }
-
- // If we're not using both inputs of the target shuffle then use the
- // referenced input directly.
- if (UseInput00 && !UseInput01) {
- Updated = true;
- Op0 = Ops0[0];
- } else if (!UseInput00 && UseInput01) {
- Updated = true;
- Op0 = Ops0[1];
+ if (Updated)
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
- if (Updated)
- return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
- DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ // If we're inserting an element from a vbroadcast of a load, fold the
+ // load into the X86insertps instruction. We need to convert the scalar
+ // load to a vector and clear the source lane of the INSERTPS control.
+ if (Op1.getOpcode() == X86ISD::VBROADCAST && Op1.hasOneUse() &&
+ Op1.getOperand(0).hasOneUse() &&
+ !Op1.getOperand(0).getValueType().isVector() &&
+ ISD::isNormalLoad(Op1.getOperand(0).getNode()))
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
+ Op1.getOperand(0)),
+ DAG.getConstant(InsertPSMask & 0x3f, DL, MVT::i8));
return SDValue();
}
Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=370294&r1=370293&r2=370294&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Wed Aug 28 22:48:48 2019
@@ -5323,19 +5323,6 @@ let ExeDomain = SSEPackedSingle in {
defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}
-let Predicates = [UseAVX] in {
- // If we're inserting an element from a vbroadcast of a load, fold the
- // load into the X86insertps instruction.
- // FIXME: Why are these here? This looks like a demanded bits issue.
- // FIXME: Missing AVX512 equivalents.
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
- (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
- (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
- (X86VBroadcast (v4f32 (nonvolatile_load addr:$src2))), imm:$src3)),
- (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
-}
-
//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//
Modified: llvm/trunk/test/CodeGen/X86/sse41.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse41.ll?rev=370294&r1=370293&r2=370294&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse41.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse41.ll Wed Aug 28 22:48:48 2019
@@ -1559,9 +1559,8 @@ define <4 x float> @insertps_from_broadc
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
-; X86-AVX512-NEXT: vbroadcastss (%ecx,%eax,4), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0c,0x81]
-; X86-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-AVX512-NEXT: vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x04,0x81,0x30]
+; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0]
; X86-AVX512-NEXT: retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_broadcast_loadf32:
@@ -1578,9 +1577,8 @@ define <4 x float> @insertps_from_broadc
;
; X64-AVX512-LABEL: insertps_from_broadcast_loadf32:
; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0c,0xb7]
-; X64-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-AVX512-NEXT: vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x04,0xb7,0x30]
+; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0]
; X64-AVX512-NEXT: retq ## encoding: [0xc3]
%1 = getelementptr inbounds float, float* %fb, i64 %index
%2 = load float, float* %1, align 4
@@ -1611,9 +1609,8 @@ define <4 x float> @insertps_from_broadc
; X86-AVX512-LABEL: insertps_from_broadcast_loadv4f32:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT: vbroadcastss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x08]
-; X86-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-AVX512-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30]
+; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0]
; X86-AVX512-NEXT: retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_broadcast_loadv4f32:
@@ -1631,9 +1628,8 @@ define <4 x float> @insertps_from_broadc
;
; X64-AVX512-LABEL: insertps_from_broadcast_loadv4f32:
; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vbroadcastss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0f]
-; X64-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-AVX512-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30]
+; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0]
; X64-AVX512-NEXT: retq ## encoding: [0xc3]
%1 = load <4 x float>, <4 x float>* %b, align 4
%2 = extractelement <4 x float> %1, i32 0
More information about the llvm-commits
mailing list