[llvm] r364977 - [X86] Add a DAG combine for turning *_extend_vector_inreg+load into an appropriate extload if the load isn't volatile.
Topper, Craig via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 9 16:44:53 PDT 2019
Can you try this patch and see if it helps?
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 40e3070..2812b08 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5788,7 +5788,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
}
static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
- if (!Load)
+ if (!Load || !ISD::isNormalLoad(Load))
return nullptr;
SDValue Ptr = Load->getBasePtr();
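For context: ISD::isNormalLoad accepts only plain, unindexed loads, so this bail-out makes getTargetConstantFromNode refuse the extloads the new combine now creates — presumably the shuffle combiner had been decoding the raw constant-pool bytes for a load that no longer returns those bytes verbatim. A sketch of the check, paraphrased from SelectionDAGNodes.h:

// Paraphrased from llvm/include/llvm/CodeGen/SelectionDAGNodes.h: a
// "normal" load is neither extending nor indexed, i.e. the in-register
// value is exactly the bytes in memory. For an extload the in-register
// value is wider than the memory value, so decoding the constant-pool
// entry verbatim would yield the wrong vector constant.
inline bool isNormalLoad(const SDNode *N) {
  const LoadSDNode *Ld = dyn_cast<LoadSDNode>(N);
  return Ld && Ld->getExtensionType() == ISD::NON_EXTLOAD &&
         Ld->getAddressingMode() == ISD::UNINDEXED;
}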
From: Jordan Rupprecht <rupprecht at google.com>
Sent: Tuesday, July 9, 2019 4:28 PM
To: Topper, Craig <craig.topper at intel.com>
Cc: llvm-commits <llvm-commits at lists.llvm.org>
Subject: Re: [llvm] r364977 - [X86] Add a DAG combine for turning *_extend_vector_inreg+load into an appropriate extload if the load isn't volatile.
Sure:
.LCPI3_0:
- .quad 12884901889 # 0x300000001
- .quad 12884901889 # 0x300000001
-.LCPI3_1:
- .quad 1 # 0x1
- .quad 1 # 0x1
-.LCPI3_2:
- .quad 8 # 0x8
- .quad 8 # 0x8
-.LCPI3_6:
+ .byte 0 # 0x0
+ .byte 0 # 0x0
+ .byte 0 # 0x0
+ .byte 0 # 0x0
+ .byte 0 # 0x0
+ .byte 0 # 0x0
+ .byte 0 # 0x0
+ .byte 0 # 0x0
+ .byte 8 # 0x8
+ .byte 0 # 0x0
+ .byte 0 # 0x0
+ .byte 0 # 0x0
+ .byte 0 # 0x0
+ .byte 0 # 0x0
+ .byte 0 # 0x0
+ .byte 0 # 0x0
+.LCPI3_4:
.zero 16,205
.section .rodata.cst4,"aM",@progbits,4
.p2align 2
-.LCPI3_3:
+.LCPI3_1:
.long 1066192077 # float 1.10000002
-.LCPI3_4:
+.LCPI3_2:
.long 1067030938 # float 1.20000005
-.LCPI3_5:
+.LCPI3_3:
.long 1067869798 # float 1.29999995
.section .text,"ax", at progbits,unique,4
.globl _ZN35FloatArrayTest_SomeAllowedMask_Test8TestBodyEv
.p2align 5, 0x90
.type _ZN35FloatArrayTest_SomeAllowedMask_Test8TestBodyEv,@function
_ZN35FloatArrayTest_SomeAllowedMask_Test8TestBodyEv: # @_ZN35FloatArrayTest_SomeAllowedMask_Test8TestBodyEv
.cfi_startproc
# %bb.0:
On Tue, Jul 9, 2019 at 4:25 PM Topper, Craig <craig.topper at intel.com> wrote:
Can you also send the values for the 3 constant pool entries before and after the change: LCPI3_0, LCPI3_1, and LCPI3_2?
From: Jordan Rupprecht <rupprecht at google.com>
Sent: Tuesday, July 9, 2019 4:21 PM
To: Topper, Craig <craig.topper at intel.com>
Cc: llvm-commits <llvm-commits at lists.llvm.org>
Subject: Re: [llvm] r364977 - [X86] Add a DAG combine for turning *_extend_vector_inreg+load into an appropriate extload if the load isn't volatile.
I have the C++ code reduced to this:
#include <array>
#include <vector>

#include "gtest/gtest.h"

using FloatArray = std::array<float, 8>;

class Bitmap {
 public:
  Bitmap() : bitmap_(0) {}
  explicit Bitmap(int bitmap) : bitmap_(bitmap) {}

  bool Get(int index) const { return bitmap_ & (1 << index); }
  void Set(int index) { bitmap_ |= (1 << index); }

 private:
  int bitmap_;
};

const FloatArray MaskFloatArray(const FloatArray& input_array,
                                Bitmap mask_types) {
  FloatArray tmp_array;
  for (size_t i = 0; i < 8; ++i)
    tmp_array[i] = mask_types.Get(i) ? input_array[i] : 0;
  return tmp_array;
}

namespace {
Bitmap FromIndexesBitmap(const std::vector<int>& indexes) {
  Bitmap bm;
  for (size_t i = 0; i < indexes.size(); ++i) bm.Set(indexes[i]);
  return bm;
}
}  // namespace

TEST(FloatArrayTest, AllAllowedMask) {
  FloatArray input = {{0, 1.1, 1.2, 1.3}};
  const FloatArray output =
      MaskFloatArray(input, FromIndexesBitmap({0, 1, 2, 3, 4}));
  EXPECT_EQ(output, input);
}

TEST(FloatArrayTest, SomeAllowedMask) {
  FloatArray input = {{0, 1.1, 1.2, 1.3}};
  const FloatArray output = MaskFloatArray(input, FromIndexesBitmap({1, 3}));
  FloatArray expected_output = {{0, 1.1, 0, 1.3}};
  EXPECT_EQ(output, expected_output);
}
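(For anyone trying to reduce further, a hypothetical gtest-free driver for the same logic — it would replace the two TEST bodies and the gtest include above; whether it actually trips the miscompile depends on flags and inlining, and as noted below the failure has so far only reproduced inside the internal test setup:)

#include <cstdio>

// Hypothetical driver: checks the SomeAllowedMask expectation directly
// and prints any mismatching element.
int main() {
  FloatArray input = {{0, 1.1, 1.2, 1.3}};
  const FloatArray output = MaskFloatArray(input, FromIndexesBitmap({1, 3}));
  FloatArray expected = {{0, 1.1, 0, 1.3}};
  int rc = 0;
  for (size_t i = 0; i < 8; ++i)
    if (output[i] != expected[i]) {
      std::printf("mismatch at %zu: got %f want %f\n", i, output[i],
                  expected[i]);
      rc = 1;
    }
  return rc;
}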
With the test failure being the mismatch in SomeAllowedMask:
Expected equality of these values:
output
Which is: { 0, 0, 0, 1.3, 0, 0, 0, 0 }
expected_output
Which is: { 0, 1.1, 0, 1.3, 0, 0, 0, 0 }
Stack trace:
0x5650b1e08424: FloatArrayTest_SomeAllowedMask_Test::TestBody() @ ../sysdeps/x86_64/start.S:121
The major difference seems to be the generated assembly for the test body:
FloatArrayTest_SomeAllowedMask_Test::TestBody(): # @FloatArrayTest_SomeAllowedMask_Test::TestBody()
.cfi_startproc
# %bb.0:
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
pushq %rbx
subq $104, %rsp
.cfi_offset %rbx, -24
movl $8, %edi
callq operator new(unsigned long)@PLT
movabsq $12884901889, %rcx # imm = 0x300000001
- pmovzxdq .LCPI3_0(%rip), %xmm0 # xmm0 = mem[0],zero,mem[1],zero
- movdqa .LCPI3_1(%rip), %xmm1 # xmm1 = [1,1]
- psllq %xmm0, %xmm1
- pblendw $240, .LCPI3_2(%rip), %xmm1 # xmm1 = xmm1[0,1,2,3],mem[4,5,6,7]
movq %rcx, (%rax)
- pshufd $78, %xmm1, %xmm0 # xmm0 = xmm1[2,3,0,1]
- por %xmm1, %xmm0
+ movl $8, %ecx
+ movd %ecx, %xmm0
+ por .LCPI3_0(%rip), %xmm0
movd %xmm0, %ecx
movl $0, -64(%rbp)
pxor %xmm0, %xmm0
- pxor %xmm1, %xmm1
+ xorps %xmm1, %xmm1
testb $2, %cl
jne .LBB3_1
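(An editorial decode of the two sequences above, using only the constants shown in the listings, accounts for the exact failure — bit 1 of the mask is lost:)

// Editorial sketch (plain C++) of what each sequence computes for
// FromIndexesBitmap({1, 3}).
#include <cstdint>
#include <cstdio>

int main() {
  // Old code: pmovzxdq loads shift counts {1, 3} from .LCPI3_0
  // (0x300000001); psllq with an XMM count shifts both lanes of [1,1] by
  // the low count (1) -> [2,2]; pblendw $240 merges in [8,8] -> [2,8];
  // pshufd+por then OR the two lanes together.
  uint64_t old_mask = (1u << 1) | (1u << 3);  // 2 | 8 = 10: bits 1 and 3 set
  // New code: movd builds {8,0}; por with the rewritten .LCPI3_0 ({0,8})
  // gives {8,8}; movd extracts 8. Bit 1 is gone, so element 1 of the float
  // array is masked to 0 -- matching the failing output above.
  uint64_t new_mask = 8 | 8;  // 8: only bit 3 set
  std::printf("old=%llu new=%llu\n", (unsigned long long)old_mask,
              (unsigned long long)new_mask);
}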
Sadly I can't get this to reproduce outside of our internal testing setup. Is the above enough for you to make sense of the test failure cause?
On Tue, Jul 9, 2019 at 11:02 AM Jordan Rupprecht <rupprecht at google.com> wrote:
I'm still pruning down the list of what's required to trigger it, but the features we're building with are: "-msse4.2 -mpclmul -maes"
On Tue, Jul 9, 2019 at 10:56 AM Topper, Craig <craig.topper at intel.com> wrote:
Do you know what ISA features it's being compiled for? I think I shouldn't be doing this unless at least SSE4.1 is supported, but I don't know if that would cause a failure.
From: Jordan Rupprecht <rupprecht at google.com>
Sent: Monday, July 8, 2019 4:34 PM
To: Topper, Craig <craig.topper at intel.com>
Cc: llvm-commits <llvm-commits at lists.llvm.org>
Subject: Re: [llvm] r364977 - [X86] Add a DAG combine for turning *_extend_vector_inreg+load into an appropriate extload if the load isn't volatile.
FYI, we're seeing some rather strange test failures after this patch. I'm unable to get a reproducer, but it's in some utility code that is (at a very high level) implementing a vector<bool>-type thing, and it is now returning 0 for some elements that were previously set.
On Tue, Jul 2, 2019 at 4:20 PM Craig Topper via llvm-commits <llvm-commits at lists.llvm.org> wrote:
Author: ctopper
Date: Tue Jul 2 16:20:03 2019
New Revision: 364977
URL: http://llvm.org/viewvc/llvm-project?rev=364977&view=rev
Log:
[X86] Add a DAG combine for turning *_extend_vector_inreg+load into an appropriate extload if the load isn't volatile.
Remove the corresponding isel patterns that did the same thing without checking for volatile.
This fixes another variation of PR42079
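(In scalar C++ terms, the transform replaces a full-width load followed by in-register widening of the low elements with a narrower load that widens while loading — an editorial sketch, with an illustrative v16i8 -> v4i32 case:)

#include <cstdint>
#include <cstring>

// Before the combine: load all 16 bytes, then zero-extend the low four
// in registers (load + zero_extend_vector_inreg).
void extend_after_load(const uint8_t *p, uint32_t out[4]) {
  uint8_t v[16];
  std::memcpy(v, p, 16);                      // full-width vector load
  for (int i = 0; i < 4; ++i) out[i] = v[i];  // widen low elements
}

// After the combine: load only the four bytes that are used and widen as
// part of the load (a v4i8 -> v4i32 zextload). This changes the access
// width, which is not legal for volatile loads -- hence the check.
void ext_load(const uint8_t *p, uint32_t out[4]) {
  for (int i = 0; i < 4; ++i) out[i] = p[i];
}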
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/lib/Target/X86/X86InstrAVX512.td
llvm/trunk/lib/Target/X86/X86InstrSSE.td
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=364977&r1=364976&r2=364977&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Jul 2 16:20:03 2019
@@ -1876,6 +1876,7 @@ X86TargetLowering::X86TargetLowering(con
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
@@ -43914,16 +43915,35 @@ static SDValue combinePMULDQ(SDNode *N,
}
static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue In = N->getOperand(0);
+
+ // Try to merge vector loads and extend_inreg to an extload.
+ if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
+ In.hasOneUse()) {
+ auto *Ld = cast<LoadSDNode>(In);
+ if (!Ld->isVolatile()) {
+ MVT SVT = In.getSimpleValueType().getVectorElementType();
+ ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT,
+ VT.getVectorNumElements());
+ SDValue Load =
+ DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+ return Load;
+ }
+ }
+
// Disabling for widening legalization for now. We can enable if we find a
// case that needs it. Otherwise it can be deleted when we switch to
// widening legalization.
if (ExperimentalVectorWideningLegalization)
return SDValue();
- EVT VT = N->getValueType(0);
- SDValue In = N->getOperand(0);
-
// Combine (ext_invec (ext_invec X)) -> (ext_invec X)
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (In.getOpcode() == N->getOpcode() &&
@@ -43932,7 +43952,7 @@ static SDValue combineExtInVec(SDNode *N
// Attempt to combine as a shuffle.
// TODO: SSE41 support
- if (Subtarget.hasAVX()) {
+ if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) {
SDValue Op(N, 0);
if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
@@ -44010,7 +44030,9 @@ SDValue X86TargetLowering::PerformDAGCom
case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
case ISD::ANY_EXTEND_VECTOR_INREG:
- case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, Subtarget);
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI,
+ Subtarget);
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=364977&r1=364976&r2=364977&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Tue Jul 2 16:20:03 2019
@@ -9632,21 +9632,15 @@ multiclass AVX512_pmovx_patterns<string
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
- def : Pat<(v8i16 (InVecOp (loadv16i8 addr:$src))),
- (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
}
let Predicates = [HasVLX] in {
def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v4i32 addr:$src)))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (InVecOp (loadv16i8 addr:$src))),
- (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (loadv16i8 addr:$src))),
- (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
@@ -9654,15 +9648,11 @@ multiclass AVX512_pmovx_patterns<string
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (InVecOp (loadv8i16 addr:$src))),
- (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v4i32 addr:$src)))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (loadv8i16 addr:$src))),
- (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
@@ -9670,37 +9660,27 @@ multiclass AVX512_pmovx_patterns<string
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (loadv4i32 addr:$src))),
- (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
}
let Predicates = [HasVLX] in {
def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))),
- (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v4i32 addr:$src)))),
(!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))),
- (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))),
- (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
}
// 512-bit patterns
let Predicates = [HasAVX512] in {
def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
- def : Pat<(v8i64 (InVecOp (loadv16i8 addr:$src))),
- (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
}
}
Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=364977&r1=364976&r2=364977&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Tue Jul 2 16:20:03 2019
@@ -4947,8 +4947,6 @@ multiclass SS41I_pmovx_avx2_patterns<str
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))),
- (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
(!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
@@ -4957,15 +4955,11 @@ multiclass SS41I_pmovx_avx2_patterns<str
(!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))),
- (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))),
- (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
}
}
_______________________________________________
llvm-commits mailing list
llvm-commits at lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits