[llvm] fd63e46 - AMDGPU/GlobalISel: Apply load bitcast to s.buffer.load intrinsic
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 3 05:54:36 PDT 2020
Author: Matt Arsenault
Date: 2020-08-03T08:54:29-04:00
New Revision: fd63e46941fc48d4cc777ef94e185637898d0adb
URL: https://github.com/llvm/llvm-project/commit/fd63e46941fc48d4cc777ef94e185637898d0adb
DIFF: https://github.com/llvm/llvm-project/commit/fd63e46941fc48d4cc777ef94e185637898d0adb.diff
LOG: AMDGPU/GlobalISel: Apply load bitcast to s.buffer.load intrinsic
Should also apply this to the non-scalar buffer loads.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index cc97e11707ab..b40870024cc4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -122,20 +122,23 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
};
}
+static LLT getBitcastRegisterType(const LLT Ty) {
+ const unsigned Size = Ty.getSizeInBits();
+
+ LLT CoercedTy;
+ if (Size <= 32) {
+ // <2 x s8> -> s16
+ // <4 x s8> -> s32
+ return LLT::scalar(Size);
+ }
+
+ return LLT::scalarOrVector(Size / 32, 32);
+}
+
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
- unsigned Size = Ty.getSizeInBits();
-
- LLT CoercedTy;
- if (Size <= 32) {
- // <2 x s8> -> s16
- // <4 x s8> -> s32
- CoercedTy = LLT::scalar(Size);
- } else
- CoercedTy = LLT::scalarOrVector(Size / 32, 32);
-
- return std::make_pair(TypeIdx, CoercedTy);
+ return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
};
}
@@ -335,6 +338,20 @@ static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
!loadStoreBitcastWorkaround(Ty);
}
+/// Return true if a load or store of the type should be lowered with a bitcast
+/// to a different type.
+static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
+ const unsigned MemSizeInBits) {
+ const unsigned Size = Ty.getSizeInBits();
+ if (Size != MemSizeInBits)
+ return Size <= 32 && Ty.isVector();
+
+ if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
+ return true;
+ return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
+ !isRegisterVectorElementType(Ty.getElementType());
+}
+
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const GCNTargetMachine &TM)
: ST(ST_) {
@@ -1048,16 +1065,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// 16-bit vector parts.
Actions.bitcastIf(
[=](const LegalityQuery &Query) -> bool {
- const LLT Ty = Query.Types[0];
- const unsigned Size = Ty.getSizeInBits();
-
- if (Size != Query.MMODescrs[0].SizeInBits)
- return Size <= 32 && Ty.isVector();
-
- if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
- return true;
- return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
- !isRegisterVectorElementType(Ty.getElementType());
+ return shouldBitcastLoadStoreType(ST, Query.Types[0],
+ Query.MMODescrs[0].SizeInBits);
}, bitcastToRegisterType(0));
Actions
@@ -4137,8 +4146,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
}
bool AMDGPULegalizerInfo::legalizeSBufferLoad(
- MachineInstr &MI, MachineIRBuilder &B,
- GISelChangeObserver &Observer) const {
+ LegalizerHelper &Helper, MachineInstr &MI) const {
+ MachineIRBuilder &B = Helper.MIRBuilder;
+ GISelChangeObserver &Observer = Helper.Observer;
+
Register Dst = MI.getOperand(0).getReg();
LLT Ty = B.getMRI()->getType(Dst);
unsigned Size = Ty.getSizeInBits();
@@ -4146,6 +4157,13 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
Observer.changingInstr(MI);
+ if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
+ Ty = getBitcastRegisterType(Ty);
+ Helper.bitcastDst(MI, Ty, 0);
+ Dst = MI.getOperand(0).getReg();
+ B.setInsertPt(B.getMBB(), MI);
+ }
+
// FIXME: We don't really need this intermediate instruction. The intrinsic
// should be fixed to have a memory operand. Since it's readnone, we're not
// allowed to add one.
@@ -4167,8 +4185,6 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
// always be legal. We may need to restore this to a 96-bit result if it turns
// out this needs to be converted to a vector load during RegBankSelect.
if (!isPowerOf2_32(Size)) {
- LegalizerHelper Helper(MF, *this, Observer, B);
-
if (Ty.isVector())
Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
else
@@ -4360,7 +4376,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return true;
}
case Intrinsic::amdgcn_s_buffer_load:
- return legalizeSBufferLoad(MI, B, Helper.Observer);
+ return legalizeSBufferLoad(Helper, MI);
case Intrinsic::amdgcn_raw_buffer_store:
case Intrinsic::amdgcn_struct_buffer_store:
return legalizeBufferStore(MI, MRI, B, false, false);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index fe4e17db48a6..332d675c1a88 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -167,9 +167,7 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
GISelChangeObserver &Observer,
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const;
- bool legalizeSBufferLoad(
- MachineInstr &MI, MachineIRBuilder &B,
- GISelChangeObserver &Observer) const;
+ bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const;
bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B,
bool IsInc) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
index 8860ca6ba5e5..9aee145ec1d1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
@@ -67,9 +67,10 @@ body: |
; GCN-LABEL: name: s_buffer_load_v6s16
; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<8 x s16>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
- ; GCN: [[EXTRACT:%[0-9]+]]:_(<6 x s16>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<8 x s16>), 0
- ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<6 x s16>)
+ ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
+ ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0
+ ; GCN: [[BITCAST:%[0-9]+]]:_(<6 x s16>) = G_BITCAST [[EXTRACT]](<3 x s32>)
+ ; GCN: S_ENDPGM 0, implicit [[BITCAST]](<6 x s16>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(s32) = G_CONSTANT i32 0
%2:_(<6 x s16>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
@@ -124,13 +125,83 @@ body: |
; GCN-LABEL: name: s_buffer_load_v12s8
; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<16 x s8>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
- ; GCN: [[EXTRACT:%[0-9]+]]:_(<12 x s8>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<16 x s8>), 0
- ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<12 x s8>)
+ ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
+ ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0
+ ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s32>)
+ ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+ ; GCN: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32)
+ ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GCN: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32)
+ ; GCN: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+ ; GCN: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C3]](s32)
+ ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+ ; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+ ; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+ ; GCN: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32)
+ ; GCN: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32)
+ ; GCN: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C3]](s32)
+ ; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+ ; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+ ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32)
+ ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32)
+ ; GCN: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32)
+ ; GCN: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32)
+ ; GCN: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C3]](s32)
+ ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+ ; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32)
+ ; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32)
+ ; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32)
+ ; GCN: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+ ; GCN: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C4]]
+ ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
+ ; GCN: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C4]]
+ ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32)
+ ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+ ; GCN: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+ ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32)
+ ; GCN: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C4]]
+ ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32)
+ ; GCN: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C4]]
+ ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
+ ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
+ ; GCN: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
+ ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY5]](s32)
+ ; GCN: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C4]]
+ ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY6]](s32)
+ ; GCN: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C4]]
+ ; GCN: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32)
+ ; GCN: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]]
+ ; GCN: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
+ ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY7]](s32)
+ ; GCN: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C4]]
+ ; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32)
+ ; GCN: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C4]]
+ ; GCN: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32)
+ ; GCN: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]]
+ ; GCN: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32)
+ ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
+ ; GCN: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C4]]
+ ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
+ ; GCN: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C4]]
+ ; GCN: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C2]](s32)
+ ; GCN: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]]
+ ; GCN: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32)
+ ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
+ ; GCN: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C4]]
+ ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
+ ; GCN: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C4]]
+ ; GCN: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C2]](s32)
+ ; GCN: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]]
+ ; GCN: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
+ ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
+ ; GCN: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<12 x s16>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(s32) = G_CONSTANT i32 0
%2:_(<12 x s8>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
- S_ENDPGM 0, implicit %2
+ %3:_(<12 x s16>) = G_ANYEXT %2
+ S_ENDPGM 0, implicit %3
...
More information about the llvm-commits
mailing list