[llvm] 5db8d6f - [AMDGPU][CodeGen] Support (base | offset) SMEM loads.
Ivan Kosarev via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 5 06:22:20 PDT 2022
Author: Ivan Kosarev
Date: 2022-09-05T14:22:06+01:00
New Revision: 5db8d6fd2bd6db3cdc6290b538f054ca0833d763
URL: https://github.com/llvm/llvm-project/commit/5db8d6fd2bd6db3cdc6290b538f054ca0833d763
DIFF: https://github.com/llvm/llvm-project/commit/5db8d6fd2bd6db3cdc6290b538f054ca0833d763.diff
LOG: [AMDGPU][CodeGen] Support (base | offset) SMEM loads.
Prevents generation of unnecessary s_or_b32 instructions.
Reviewed By: foad
Differential Revision: https://reviews.llvm.org/D132552
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index 1bbdc39a7a5e1..4b71d11f0c3bd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -8,6 +8,7 @@
#include "AMDGPUGlobalISelUtils.h"
#include "GCNSubtarget.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/Constants.h"
#include "llvm/Support/LowLevelTypeImpl.h"
@@ -16,7 +17,8 @@ using namespace llvm;
using namespace MIPatternMatch;
std::pair<Register, unsigned>
-AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
+AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
+ GISelKnownBits *KnownBits) {
MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
if (!Def)
return std::make_pair(Reg, 0);
@@ -43,6 +45,11 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
return std::make_pair(Def->getOperand(1).getReg(), Offset);
}
+ Register Base;
+ if (KnownBits && mi_match(Reg, MRI, m_GOr(m_Reg(Base), m_ICst(Offset))) &&
+ KnownBits->maskedValueIsZero(Base, APInt(32, Offset)))
+ return std::make_pair(Base, Offset);
+
// Handle G_PTRTOINT (G_PTR_ADD base, const) case
if (Def->getOpcode() == TargetOpcode::G_PTRTOINT) {
MachineInstr *Base;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 5c600d059b7a7..9f7c00b9f0b3b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -17,13 +17,15 @@ namespace llvm {
class MachineRegisterInfo;
class GCNSubtarget;
+class GISelKnownBits;
class LLT;
namespace AMDGPU {
/// Returns base register and constant offset.
std::pair<Register, unsigned>
-getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg);
+getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
+ GISelKnownBits *KnownBits = nullptr);
bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask);
bool hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget, const LLT &Ty);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 5972e9093ae90..8425948be92b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1987,7 +1987,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
// wraparound, because s_load instructions perform the addition in 64 bits.
- if (Addr.getValueType() == MVT::i32 && !Addr->getFlags().hasNoUnsignedWrap())
+ if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
+ !Addr->getFlags().hasNoUnsignedWrap())
return false;
SDValue N0, N1;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index d85bf107f34e8..1fa7596200171 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1438,7 +1438,7 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
.addImm(0);
} else {
std::tie(BaseOffset, ImmOffset) =
- AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
+ AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KnownBits);
if (Readfirstlane) {
// We have the constant offset now, so put the readfirstlane back on the
@@ -2654,15 +2654,14 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
/// Return the register to use for the index value, and the subregister to use
/// for the indirectly accessed register.
static std::pair<Register, unsigned>
-computeIndirectRegIndex(MachineRegisterInfo &MRI,
- const SIRegisterInfo &TRI,
- const TargetRegisterClass *SuperRC,
- Register IdxReg,
- unsigned EltSize) {
+computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
+ const TargetRegisterClass *SuperRC, Register IdxReg,
+ unsigned EltSize, GISelKnownBits &KnownBits) {
Register IdxBaseReg;
int Offset;
- std::tie(IdxBaseReg, Offset) = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
+ std::tie(IdxBaseReg, Offset) =
+ AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
if (IdxBaseReg == AMDGPU::NoRegister) {
// This will happen if the index is a known constant. This should ordinarily
// be legalized out, but handle it as a register just in case.
@@ -2713,8 +2712,8 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
const bool Is64 = DstTy.getSizeInBits() == 64;
unsigned SubReg;
- std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
- DstTy.getSizeInBits() / 8);
+ std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
+ *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KnownBits);
if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
if (DstTy.getSizeInBits() != 32 && !Is64)
@@ -2795,7 +2794,7 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
unsigned SubReg;
std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
- ValSize / 8);
+ ValSize / 8, *KnownBits);
const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
STI.useVGPRIndexMode();
@@ -4918,7 +4917,7 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
Register SOffset;
unsigned Offset;
std::tie(SOffset, Offset) =
- AMDGPU::getBaseWithConstantOffset(*MRI, Root.getReg());
+ AMDGPU::getBaseWithConstantOffset(*MRI, Root.getReg(), KnownBits);
if (!SOffset)
return None;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
index b10980f0f0759..2d04e103f588d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
@@ -110,7 +110,6 @@ define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset(<4 x i32> inreg %ba
ret void
}
-; TODO: Select S_BUFFER_LOAD_DWORD_SGPR_IMM here.
; GCN-LABEL: name: test_buffer_load_sgpr_or_imm_offset
; SDAG-DAG: %[[BASE0:.*]]:sgpr_32 = COPY $sgpr0
; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1
@@ -118,18 +117,16 @@ define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset(<4 x i32> inreg %ba
; SDAG-DAG: %[[BASE3:.*]]:sgpr_32 = COPY $sgpr3
; SDAG-DAG: %[[INDEX:.*]]:sgpr_32 = COPY $sgpr4
; SDAG-DAG: %[[SHIFT:.*]]:sreg_32 = S_LSHL_B32 %[[INDEX]],
-; SDAG-DAG: %[[OR:.*]]:sreg_32 = S_OR_B32 killed %[[SHIFT]],
; SDAG-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
-; SDAG: S_BUFFER_LOAD_DWORD_SGPR killed %[[BASE]], killed %[[OR]], 0
+; SDAG: S_BUFFER_LOAD_DWORD_SGPR_IMM killed %[[BASE]], killed %[[SHIFT]], 5,
; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0
; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1
; GISEL-DAG: %[[BASE2:.*]]:sreg_32 = COPY $sgpr2
; GISEL-DAG: %[[BASE3:.*]]:sreg_32 = COPY $sgpr3
; GISEL-DAG: %[[INDEX:.*]]:sreg_32 = COPY $sgpr4
; GISEL-DAG: %[[SHIFT:.*]]:sreg_32 = S_LSHL_B32 %[[INDEX]],
-; GISEL-DAG: %[[OR:.*]]:sreg_32 = S_OR_B32 %[[SHIFT]],
; GISEL-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
-; GISEL: S_BUFFER_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[OR]], 0,
+; GISEL: S_BUFFER_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[SHIFT]], 5,
define amdgpu_cs void @test_buffer_load_sgpr_or_imm_offset(<4 x i32> inreg %base, i32 inreg %i, i32 addrspace(1)* inreg %out) {
%shift = shl i32 %i, 7
%off = or i32 %shift, 5
More information about the llvm-commits mailing list