[llvm] 4e8c028 - [X86] Stop reduceMaskedLoadToScalarLoad/reduceMaskedStoreToScalarStore from creating scalar i64 load/stores in 32-bit mode
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sun Sep 20 13:47:30 PDT 2020
Author: Craig Topper
Date: 2020-09-20T13:46:59-07:00
New Revision: 4e8c028158b56d9c2142a62464e8e0686bde3584
URL: https://github.com/llvm/llvm-project/commit/4e8c028158b56d9c2142a62464e8e0686bde3584
DIFF: https://github.com/llvm/llvm-project/commit/4e8c028158b56d9c2142a62464e8e0686bde3584.diff
LOG: [X86] Stop reduceMaskedLoadToScalarLoad/reduceMaskedStoreToScalarStore from creating scalar i64 load/stores in 32-bit mode
If we emit a scalar i64 load/store, it will get type legalized to two i32 loads/stores in 32-bit mode.
Differential Revision: https://reviews.llvm.org/D87862
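
A minimal IR sketch of the pattern this combine targets, modeled on the mload_constmask_v2i64 test updated below (the function name here is illustrative):

define <2 x i64> @one_lane_masked_load(<2 x i64>* %addr, <2 x i64> %dst) {
  ; Only the second lane is loaded, so the masked load is reduced to a single
  ; scalar load of that element. On 32-bit x86 a scalar i64 load would be
  ; legalized to two i32 loads, so the element is loaded as f64 instead and
  ; bitcast back.
  %res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x i64> %dst)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>)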
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/masked_load.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ca149248a479..f0c66cc879c8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44499,7 +44499,8 @@ static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
assert(ML->isUnindexed() && "Unexpected indexed masked load!");
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
@@ -44516,14 +44517,25 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
SDLoc DL(ML);
EVT VT = ML->getValueType(0);
EVT EltVT = VT.getVectorElementType();
+
+ EVT CastVT = VT;
+ if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
+ EltVT = MVT::f64;
+ CastVT =
+ EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
+ }
+
SDValue Load =
DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
ML->getPointerInfo().getWithOffset(Offset),
Alignment, ML->getMemOperand()->getFlags());
+ SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
+
// Insert the loaded element into the appropriate place in the vector.
- SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
- ML->getPassThru(), Load, VecIndex);
+ SDValue Insert =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
+ Insert = DAG.getBitcast(VT, Insert);
return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}
@@ -44586,7 +44598,8 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
return SDValue();
if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
- if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
+ if (SDValue ScalarLoad =
+ reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
return ScalarLoad;
// TODO: Do some AVX512 subsets benefit from this transform?
@@ -44623,7 +44636,8 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
- SelectionDAG &DAG) {
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
@@ -44636,10 +44650,17 @@ static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
// Extract the one scalar element that is actually being stored.
SDLoc DL(MS);
- EVT VT = MS->getValue().getValueType();
+ SDValue Value = MS->getValue();
+ EVT VT = Value.getValueType();
EVT EltVT = VT.getVectorElementType();
- SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
- MS->getValue(), VecIndex);
+ if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
+ EltVT = MVT::f64;
+ EVT CastVT =
+ EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
+ Value = DAG.getBitcast(CastVT, Value);
+ }
+ SDValue Extract =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
// Store that element at the appropriate offset from the base pointer.
return DAG.getStore(MS->getChain(), DL, Extract, Addr,
@@ -44661,7 +44682,7 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
if (Mst->isTruncatingStore())
return SDValue();
- if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
+ if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
return ScalarStore;
// If the mask value has been legalized to a non-boolean vector, try to
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 30f4e9f56526..2d2fb44cdfbd 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -6504,8 +6504,7 @@ define <2 x i64> @mload_constmask_v2i64(<2 x i64>* %addr, <2 x i64> %dst) {
; X86-AVX512-LABEL: mload_constmask_v2i64:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: vpinsrd $2, 8(%eax), %xmm0, %xmm0
-; X86-AVX512-NEXT: vpinsrd $3, 12(%eax), %xmm0, %xmm0
+; X86-AVX512-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X86-AVX512-NEXT: retl
%res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x i64> %dst)
ret <2 x i64> %res
@@ -7109,10 +7108,9 @@ define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
; X86-AVX512-LABEL: load_one_mask_bit_set3:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX512-NEXT: vpinsrd $0, 16(%eax), %xmm1, %xmm1
-; X86-AVX512-NEXT: vpinsrd $1, 20(%eax), %xmm1, %xmm1
-; X86-AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX512-NEXT: vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX512-NEXT: retl
%res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
ret <4 x i64> %res
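
The store path is adjusted in the same way, although no masked-store test changes appear in this diff. A minimal sketch of the analogous store pattern, assuming the same typed-pointer intrinsic mangling as the load tests above (the function name is illustrative):

define void @one_lane_masked_store(<2 x i64>* %addr, <2 x i64> %val) {
  ; Only the second lane is stored, so the masked store is reduced to a single
  ; scalar store. On 32-bit x86 the element is extracted and stored as f64
  ; rather than i64 to avoid being split into two i32 stores.
  call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %addr, i32 4, <2 x i1> <i1 0, i1 1>)
  ret void
}
declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)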