[llvm] 4ed7c6e - [AMDGPU] Only match correct type for a16
Sebastian Neubauer via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 25 06:07:05 PST 2022
Author: Sebastian Neubauer
Date: 2022-01-25T14:59:16+01:00
New Revision: 4ed7c6eec97925281f4c2a02ec7030dab750ba34
URL: https://github.com/llvm/llvm-project/commit/4ed7c6eec97925281f4c2a02ec7030dab750ba34
DIFF: https://github.com/llvm/llvm-project/commit/4ed7c6eec97925281f4c2a02ec7030dab750ba34.diff
LOG: [AMDGPU] Only match correct type for a16
Addresses are floats when a sampler is present and unsigned integers
when no sampler is present.
Therefore, only zext instructions, not sext instructions, should match
in the unsampled case.
Also match integer constants that can be truncated to 16 bits without
losing information.
Differential Revision: https://reviews.llvm.org/D118043
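
As a minimal standalone illustration (plain C++, not part of the patch) of
why sext must not match when the address is an unsigned integer: truncating
a sign-extended value back to 16 bits changes the coordinate, while a
zero-extended value round-trips losslessly.

    #include <cstdint>
    #include <cstdio>

    int main() {
      int16_t S = -1;                        // bit pattern 0xFFFF
      uint32_t Zext = (uint16_t)S;           // zext: 65535, a valid unsigned coordinate
      uint32_t Sext = (uint32_t)(int32_t)S;  // sext: 4294967295 (0xFFFFFFFF)
      // The hardware reads the 32-bit address as unsigned, so truncating the
      // sext result to i16 would turn 4294967295 into 65535, a different
      // address. Only the zext form denotes the same coordinate at both widths.
      printf("zext: %u  sext: %u\n", Zext, Sext);
      return 0;
    }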
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index c3a326945557e..4f1d700bcd842 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -58,24 +58,37 @@ static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
// Check if a value can be converted to a 16-bit value without losing
// precision.
-static bool canSafelyConvertTo16Bit(Value &V) {
+// The value is expected to be either a float (IsFloat = true) or an unsigned
+// integer (IsFloat = false).
+static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
Type *VTy = V.getType();
if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
// The value is already 16-bit, so we don't want to convert to 16-bit again!
return false;
}
- if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
- // We need to check that if we cast the index down to a half, we do not lose
- // precision.
- APFloat FloatValue(ConstFloat->getValueAPF());
- bool LosesInfo = true;
- FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
- return !LosesInfo;
+ if (IsFloat) {
+ if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
+ // We need to check that if we cast the index down to a half, we do not
+ // lose precision.
+ APFloat FloatValue(ConstFloat->getValueAPF());
+ bool LosesInfo = true;
+ FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
+ &LosesInfo);
+ return !LosesInfo;
+ }
+ } else {
+ if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
+ // We need to check that if we cast the index down to an i16, we do not
+ // lose precision.
+ APInt IntValue(ConstInt->getValue());
+ return IntValue.getActiveBits() <= 16;
+ }
}
+
Value *CastSrc;
- if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
- match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
- match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
+ bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
+ : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
+ if (IsExt) {
Type *CastSrcTy = CastSrc->getType();
if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
return true;
@@ -203,6 +216,10 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
if (!ST->hasA16() && !ST->hasG16())
return None;
+ // Address is interpreted as float if the instruction has a sampler or as
+ // unsigned int if there is no sampler.
+ bool HasSampler =
+ AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
bool FloatCoord = false;
// true means derivatives can be converted to 16 bit, coordinates not
bool OnlyDerivatives = false;
@@ -211,7 +228,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
Value *Coord = II.getOperand(OperandIndex);
// If the values are not derived from 16-bit values, we cannot optimize.
- if (!canSafelyConvertTo16Bit(*Coord)) {
+ if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
if (OperandIndex < ImageDimIntr->CoordStart ||
ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
return None;
@@ -232,7 +249,9 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
// Check if there is a bias parameter and if it can be converted to f16
if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
- if (!canSafelyConvertTo16Bit(*Bias))
+ assert(HasSampler &&
+ "Only image instructions with a sampler can have a bias");
+ if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
OnlyDerivatives = true;
}
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 709d531b8cee6..894e0ef860646 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -3667,6 +3667,105 @@ define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_V2(<2 x float> addrspa
ret void
}
+define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_const(<2 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %slice) {
+; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_const(
+; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f16(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half 0xH3400, half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <2 x float> [[RES]], <2 x float> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT: ret void
+;
+ %dsdh32 = fpext half %dsdh to float
+ %dtdh32 = fpext half %dtdh to float
+ %dsdv32 = fpext half %dsdv to float
+ %dtdv32 = fpext half %dtdv to float
+ %s32 = fpext half %s to float
+ %slice32 = fpext half %slice to float
+ %res = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float 0.25, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ store <2 x float> %res, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_const_noopt(<2 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %slice) {
+; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_const_noopt(
+; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT: [[SLICE32:%.*]] = fpext half [[SLICE:%.*]] to float
+; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S32]], float 1.000000e+10, float [[SLICE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <2 x float> [[RES]], <2 x float> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT: ret void
+;
+ %dsdh32 = fpext half %dsdh to float
+ %dtdh32 = fpext half %dtdh to float
+ %dsdv32 = fpext half %dsdv to float
+ %dtdv32 = fpext half %dtdv to float
+ %s32 = fpext half %s to float
+ %slice32 = fpext half %slice to float
+ %res = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float 1.0e+10, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ store <2 x float> %res, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @image_load_a16_mip_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s) {
+; CHECK-LABEL: @image_load_a16_mip_1d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+ %s32 = zext i16 %s to i32
+ %res = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s32, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @image_load_a16_mip_1d_noopt(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s) {
+; CHECK-LABEL: @image_load_a16_mip_1d_noopt(
+; CHECK-NEXT: [[S32:%.*]] = sext i16 [[S:%.*]] to i32
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S32]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+ %s32 = sext i16 %s to i32
+ %res = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s32, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @image_load_a16_mip_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s, i16 %t) {
+; CHECK-LABEL: @image_load_a16_mip_2d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 [[S:%.*]], i16 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+ %s32 = zext i16 %s to i32
+ %t32 = zext i16 %t to i32
+ %res = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s32, i32 %t32, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @image_load_a16_mip_2d_const(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s) {
+; CHECK-LABEL: @image_load_a16_mip_2d_const(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 [[S:%.*]], i16 -1, <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+ %s32 = zext i16 %s to i32
+ %res = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s32, i32 65535, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @image_load_a16_mip_2d_const_noopt(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s) {
+; CHECK-LABEL: @image_load_a16_mip_2d_const_noopt(
+; CHECK-NEXT: [[S32:%.*]] = zext i16 [[S:%.*]] to i32
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 [[S32]], i32 65536, <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+ %s32 = zext i16 %s to i32
+ %res = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s32, i32 65536, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
; --------------------------------------------------------------------
; llvm.amdgcn.image.sample g16
; --------------------------------------------------------------------