[llvm] Revert "[NVPTX] Lower 16xi8 and 8xi8 stores efficiently (#73646)" (PR #74518)
Artem Belevich via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 5 12:31:15 PST 2023
https://github.com/Artem-B created https://github.com/llvm/llvm-project/pull/74518
This reverts commit 173fcf7da592acd284dc50749558fd36928861f0.
The optimization needs to be constrained to properly aligned loads/stores only. See: https://github.com/llvm/llvm-project/pull/73646#issuecomment-1841454559
From 1a8b5736af19a9dc1cff120cdbe82054a2041d91 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Tue, 5 Dec 2023 12:26:58 -0800
Subject: [PATCH] Revert "[NVPTX] Lower 16xi8 and 8xi8 stores efficiently (#73646)"
This reverts commit 173fcf7da592acd284dc50749558fd36928861f0.
Needs to constrain the optimization to properly aligned loads/stores only.
https://github.com/llvm/llvm-project/pull/73646#issuecomment-1841454559
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 53 ++------------------
llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 7 +--
llvm/test/CodeGen/NVPTX/vector-stores.ll | 16 ------
3 files changed, 7 insertions(+), 69 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b975825dae4b6..61285c6ba98df 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -508,7 +508,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);
- // Conversion to/from i8/i8x4 is always legal.
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
@@ -718,8 +717,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// We have some custom DAG combine patterns for these nodes
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
- ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::STORE,
- ISD::UREM, ISD::VSELECT});
+ ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
+ ISD::VSELECT});
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -2917,6 +2916,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
MemSD->getMemoryVT(), MemSD->getMemOperand());
+ // return DCI.CombineTo(N, NewSt, true);
return NewSt;
}
@@ -5557,51 +5557,6 @@ static SDValue PerformLOADCombine(SDNode *N,
DL);
}
-// Lower a v16i8 (or a v8i8) store into a StoreV4 (or StoreV2) operation with
-// i32 results instead of letting ReplaceLoadVector split it into smaller stores
-// during legalization. This is done at dag-combine1 time, so that vector
-// operations with i8 elements can be optimised away instead of being needlessly
-// split during legalization, which involves storing to the stack and loading it
-// back.
-static SDValue PerformSTORECombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- SelectionDAG &DAG = DCI.DAG;
- StoreSDNode *ST = cast<StoreSDNode>(N);
- EVT VT = ST->getValue().getValueType();
- if (VT != MVT::v16i8 && VT != MVT::v8i8)
- return SDValue();
-
- // Create a v4i32 vector store operation, effectively <4 x v4i8>.
- unsigned Opc = VT == MVT::v16i8 ? NVPTXISD::StoreV4 : NVPTXISD::StoreV2;
- EVT NewVT = VT == MVT::v16i8 ? MVT::v4i32 : MVT::v2i32;
- unsigned NumElts = NewVT.getVectorNumElements();
-
- // Create a vector of the type required by the new store: v16i8 -> v4i32.
- SDValue NewStoreValue = DCI.DAG.getBitcast(NewVT, ST->getValue());
-
- // Operands for the store.
- SmallVector<SDValue, 8> Ops;
- Ops.reserve(N->getNumOperands() + NumElts - 1);
- // Chain value.
- Ops.push_back(N->ops().front());
-
- SDLoc DL(N);
- SmallVector<SDValue> Elts(NumElts);
- // Break v4i32 (or v2i32) into four (or two) elements.
- for (unsigned I = 0; I < NumElts; ++I)
- Elts[I] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
- NewStoreValue.getValueType().getVectorElementType(),
- NewStoreValue, DAG.getIntPtrConstant(I, DL));
- Ops.append(Elts.begin(), Elts.end());
- // Any remaining operands.
- Ops.append(N->op_begin() + 2, N->op_end());
-
- SDValue NewStore = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
- Ops, NewVT, ST->getMemOperand());
- // Return the new chain.
- return NewStore.getValue(0);
-}
-
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -5623,8 +5578,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformSETCCCombine(N, DCI, STI.getSmVersion());
case ISD::LOAD:
return PerformLOADCombine(N, DCI);
- case ISD::STORE:
- return PerformSTORECombine(N, DCI);
case NVPTXISD::StoreRetval:
case NVPTXISD::StoreRetvalV2:
case NVPTXISD::StoreRetvalV4:
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 55cf6fb825762..1ec68b4a271ba 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -790,9 +790,10 @@ define void @test_ldst_v8i8(ptr %a, ptr %b) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v8i8_param_1];
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v8i8_param_0];
-; CHECK-NEXT: ld.u32 %r1, [%rd1+4];
-; CHECK-NEXT: ld.u32 %r2, [%rd1];
-; CHECK-NEXT: st.v2.u32 [%rd2], {%r2, %r1};
+; CHECK-NEXT: ld.u32 %r1, [%rd1];
+; CHECK-NEXT: ld.u32 %r2, [%rd1+4];
+; CHECK-NEXT: st.u32 [%rd2+4], %r2;
+; CHECK-NEXT: st.u32 [%rd2], %r1;
; CHECK-NEXT: ret;
%t1 = load <8 x i8>, ptr %a
store <8 x i8> %t1, ptr %b, align 16
diff --git a/llvm/test/CodeGen/NVPTX/vector-stores.ll b/llvm/test/CodeGen/NVPTX/vector-stores.ll
index 8248bdbc1ee1c..df14553a77205 100644
--- a/llvm/test/CodeGen/NVPTX/vector-stores.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-stores.ll
@@ -37,19 +37,3 @@ define void @v16i8(ptr %a, ptr %b) {
store <16 x i8> %v, ptr %b
ret void
}
-
-; CHECK-LABEL: .visible .func v16i8_store
-define void @v16i8_store(ptr %a, <16 x i8> %v) {
- ; CHECK: ld.param.u64 %rd1, [v16i8_store_param_0];
- ; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [v16i8_store_param_1];
- ; CHECK-NEXT: st.v4.u32 [%rd1], {%r1, %r2, %r3, %r4};
- store <16 x i8> %v, ptr %a
- ret void
-}
-
-; CHECK-LABEL: .visible .func v8i8_store
-define void @v8i8_store(ptr %a, <8 x i8> %v) {
- ; CHECK: st.v2.u32
- store <8 x i8> %v, ptr %a
- ret void
-}
More information about the llvm-commits mailing list