[llvm] 0530fdb - [AMDGPU] Fix LOD bias in A16 combine
Sebastian Neubauer via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 21 03:09:31 PST 2022
Author: Sebastian Neubauer
Date: 2022-01-21T12:09:06+01:00
New Revision: 0530fdbbbb84ea3024a4a8f7156ff716f00ffd48
URL: https://github.com/llvm/llvm-project/commit/0530fdbbbb84ea3024a4a8f7156ff716f00ffd48
DIFF: https://github.com/llvm/llvm-project/commit/0530fdbbbb84ea3024a4a8f7156ff716f00ffd48.diff
LOG: [AMDGPU] Fix LOD bias in A16 combine
As with the codegen fix in D111754, the LOD bias needs to be converted to 16
bits. Fix this in the combine as well.
Differential Revision: https://reviews.llvm.org/D116038
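To illustrate the effect (a minimal before/after sketch, taken directly from the new tests below): when every address operand, including the bias, is an fpext'ed half, the combine can now shrink the whole call to the f16 variant:

  ; Before: bias and coordinate widened to float
  %bias32 = fpext half %bias to float
  %s32 = fpext half %s to float
  %res = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float %bias32, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)

  ; After: both the bias and the coordinate are shrunk back to half
  %res = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16(i32 15, half %bias, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)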
Added:
Modified:
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/MIMGInstructions.td
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 861545b445a33..c5d266eb57ecf 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -558,6 +558,9 @@ class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr
// {offset} {bias} {z-compare}
list<AMDGPUArg> ExtraAddrArgs = extra_addr;
+ bit Offset = false;
+ bit Bias = false;
+ bit ZCompare = false;
bit Gradients = false;
// Name of the {lod} or {clamp} argument that is appended to the coordinates,
@@ -571,6 +574,7 @@ defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod,
list<AMDGPUArg> extra_addr> {
def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>;
+ let Offset = true in
def NAME#lcmod#_o : AMDGPUSampleVariant<
ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>;
}
@@ -578,6 +582,7 @@ defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod,
list<AMDGPUArg> extra_addr> {
defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>;
+ let ZCompare = true in
defm NAME : AMDGPUSampleHelper_Offset<
"_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>;
}
@@ -591,6 +596,7 @@ defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
+ let Bias = true in
defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
"_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
let LodOrClamp = "lod" in
@@ -618,6 +624,9 @@ class AMDGPUDimProfile<string opmod,
list<LLVMType> RetTypes = [];
list<AMDGPUArg> DataArgs = [];
list<AMDGPUArg> ExtraAddrArgs = [];
+ bit Offset = false;
+ bit Bias = false;
+ bit ZCompare = false;
bit Gradients = false;
string LodClampMip = "";
@@ -652,6 +661,9 @@ class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod,
let RetTypes = base.RetTypes;
let DataArgs = base.DataArgs;
let ExtraAddrArgs = base.ExtraAddrArgs;
+ let Offset = base.Offset;
+ let Bias = base.Bias;
+ let ZCompare = base.ZCompare;
let Gradients = base.Gradients;
let LodClampMip = base.LodClampMip;
}
@@ -662,6 +674,9 @@ class AMDGPUDimSampleProfile<string opmod,
let IsSample = true;
let RetTypes = [llvm_any_ty];
let ExtraAddrArgs = sample.ExtraAddrArgs;
+ let Offset = sample.Offset;
+ let Bias = sample.Bias;
+ let ZCompare = sample.ZCompare;
let Gradients = sample.Gradients;
let LodClampMip = sample.LodOrClamp;
}
@@ -702,7 +717,10 @@ class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim>
class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
int NumDataArgs = !size(P_.DataArgs);
int NumDmaskArgs = !not(P_.IsAtomic);
- int NumExtraAddrArgs = !size(P_.ExtraAddrArgs);
+ int NumOffsetArgs = !if(P_.Offset, 1, 0);
+ int NumBiasArgs = !if(P_.Bias, 1, 0);
+ int NumZCompareArgs = !if(P_.ZCompare, 1, 0);
+ int NumExtraAddrArgs = !add(NumOffsetArgs, NumBiasArgs, NumZCompareArgs);
int NumVAddrArgs = !size(P_.AddrArgs);
int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0);
int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs));
@@ -710,6 +728,9 @@ class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
int NumSampArgs = !if(P_.IsSample, 2, 0);
int DmaskArgIndex = NumDataArgs;
int VAddrArgIndex = !add(DmaskArgIndex, NumDmaskArgs);
+ int OffsetArgIndex = VAddrArgIndex;
+ int BiasArgIndex = !add(VAddrArgIndex, NumOffsetArgs);
+ int ZCompareArgIndex = !add(BiasArgIndex, NumBiasArgs);
int GradientArgIndex = !add(VAddrArgIndex, NumExtraAddrArgs);
int CoordArgIndex = !add(GradientArgIndex, NumGradientArgs);
int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1);
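As a worked example of the new index computations (derived from the definitions above, matching the c.b.2d tests further down): @llvm.amdgcn.image.sample.c.b.2d has no offset, one bias and one zcompare argument, so:

  ; i32 15, float %bias, float %zcompare, float %s, float %t, ...
  ;     ^0        ^1           ^2              ^3        ^4
  ; DmaskArgIndex = 0, VAddrArgIndex = 1,
  ; BiasArgIndex = 1, ZCompareArgIndex = 2,
  ; GradientArgIndex = CoordArgIndex = 3 (no gradients)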
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index db84b87669241..5eb7cf89abb24 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -127,14 +127,20 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
FloatCoord = Coord->getType()->isFloatingPointTy();
}
- if (OnlyDerivatives) {
- if (!ST->hasG16())
- return None;
- } else {
- if (!ST->hasA16())
- OnlyDerivatives = true; // Only supports G16
+ if (!OnlyDerivatives && !ST->hasA16())
+ OnlyDerivatives = true; // Only supports G16
+
+ // Check if there is a bias parameter and if it can be converted to f16
+ if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
+ Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
+ if (!canSafelyConvertTo16Bit(*Bias))
+ OnlyDerivatives = true;
}
+ if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
+ ImageDimIntr->CoordStart))
+ return None;
+
Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
: Type::getInt16Ty(II.getContext());
@@ -143,8 +149,13 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
return None;
ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
- if (!OnlyDerivatives)
+ if (!OnlyDerivatives) {
ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
+
+ // Change the bias type
+ if (ImageDimIntr->NumBiasArgs != 0)
+ ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
+ }
Function *I =
Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);
@@ -158,6 +169,12 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
}
+ // Convert the bias
+ if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
+ Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
+ Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
+ }
+
CallInst *NewCall = IC.Builder.CreateCall(I, Args);
NewCall->takeName(&II);
NewCall->copyMetadata(II);
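Conversely (a sketch of the bail-out path, mirroring the *_b32_* tests below): if the bias is a plain float that canSafelyConvertTo16Bit rejects, OnlyDerivatives is set; and since the .b variants carry no gradients (GradientStart == CoordStart), the combine now returns None and leaves the call on the f32 variant:

  ; %bias comes straight from a function argument, so it cannot be
  ; safely shrunk; the call must stay as-is:
  %res = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float %bias, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)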
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 673011f48289e..e7ee364476824 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -49,6 +49,9 @@ struct ImageDimIntrinsicInfo {
unsigned BaseOpcode;
MIMGDim Dim;
+ uint8_t NumOffsetArgs;
+ uint8_t NumBiasArgs;
+ uint8_t NumZCompareArgs;
uint8_t NumGradients;
uint8_t NumDmask;
uint8_t NumData;
@@ -57,6 +60,9 @@ struct ImageDimIntrinsicInfo {
uint8_t DMaskIndex;
uint8_t VAddrStart;
+ uint8_t OffsetIndex;
+ uint8_t BiasIndex;
+ uint8_t ZCompareIndex;
uint8_t GradientStart;
uint8_t CoordStart;
uint8_t LodIndex;
@@ -68,6 +74,7 @@ struct ImageDimIntrinsicInfo {
uint8_t TexFailCtrlIndex;
uint8_t CachePolicyIndex;
+ uint8_t BiasTyArg;
uint8_t GradientTyArg;
uint8_t CoordTyArg;
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 20b2b0f1be0ce..5092e0f553e2a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4273,15 +4273,18 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
if ((I < Intr->GradientStart) ||
(I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
(I >= Intr->CoordStart && !IsA16)) {
- // Handle any gradient or coordinate operands that should not be packed
if ((I < Intr->GradientStart) && IsA16 &&
(B.getMRI()->getType(AddrReg) == S16)) {
+ assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
// Special handling of bias when A16 is on. Bias is of type half but
// occupies full 32-bit.
PackedAddrs.push_back(
B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
.getReg(0));
} else {
+ assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
+ "Bias needs to be converted to 16 bit in A16 mode");
+ // Handle any gradient or coordinate operands that should not be packed
AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
PackedAddrs.push_back(AddrReg);
}
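For intuition (an illustrative layout, assuming a c.b.2d sample with A16 enabled): the bias is the only 16-bit extra address argument, yet it still occupies a full dword, while coordinates pack two per dword:

  ; Packed VADDR dwords for sample.c.b.2d with A16:
  ;   dword 0: { bias, undef }   ; half bias padded to 32 bits
  ;   dword 1: zcompare          ; stays a full float
  ;   dword 2: { s, t }          ; two half coordinates packed together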
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 6dd886367302a..1d8a558359378 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -1070,6 +1070,9 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
AMDGPUDimProps Dim = I.P.Dim;
AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>;
+ bits<8> NumOffsetArgs = DimEval.NumOffsetArgs;
+ bits<8> NumBiasArgs = DimEval.NumBiasArgs;
+ bits<8> NumZCompareArgs = DimEval.NumZCompareArgs;
bits<8> NumGradients = DimEval.NumGradientArgs;
bits<8> NumDmask = DimEval.NumDmaskArgs;
bits<8> NumData = DimEval.NumDataArgs;
@@ -1078,6 +1081,9 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
bits<8> DMaskIndex = DimEval.DmaskArgIndex;
bits<8> VAddrStart = DimEval.VAddrArgIndex;
+ bits<8> OffsetIndex = DimEval.OffsetArgIndex;
+ bits<8> BiasIndex = DimEval.BiasArgIndex;
+ bits<8> ZCompareIndex = DimEval.ZCompareArgIndex;
bits<8> GradientStart = DimEval.GradientArgIndex;
bits<8> CoordStart = DimEval.CoordArgIndex;
bits<8> LodIndex = DimEval.LodArgIndex;
@@ -1089,6 +1095,8 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
bits<8> TexFailCtrlIndex = DimEval.TexFailCtrlArgIndex;
bits<8> CachePolicyIndex = DimEval.CachePolicyArgIndex;
+ bits<8> BiasTyArg = !add(I.P.NumRetAndDataAnyTypes,
+ !if(!eq(NumOffsetArgs, 0), 0, I.P.ExtraAddrArgs[0].Type.isAny));
bits<8> GradientTyArg = !add(I.P.NumRetAndDataAnyTypes,
!foldl(0, I.P.ExtraAddrArgs, cnt, arg, !add(cnt, arg.Type.isAny)));
bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0));
@@ -1096,10 +1104,10 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
def ImageDimIntrinsicTable : GenericTable {
let FilterClass = "ImageDimIntrinsicInfo";
- let Fields = ["Intr", "BaseOpcode", "Dim", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs",
- "DMaskIndex", "VAddrStart", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd",
+ let Fields = ["Intr", "BaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs",
+ "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd",
"RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex",
- "GradientTyArg", "CoordTyArg"];
+ "BiasTyArg", "GradientTyArg", "CoordTyArg"];
string TypeOf_BaseOpcode = "MIMGBaseOpcode";
string TypeOf_Dim = "MIMGDim";
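As a worked example for the new BiasTyArg slot (checked against the mangled names in the tests below): @llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16 has one return/data overload, so:

  ; @llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16
  ;                                  ^slot0 ^slot1 (and coords = slot2)
  ; NumRetAndDataAnyTypes = 1 (the v4f32 return), BiasTyArg = 1,
  ; GradientTyArg = CoordTyArg = 2; zcompare is always float and
  ; takes no overload slot.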
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5176ba44afad6..26229b40f4dc5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6316,12 +6316,18 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
// Push back extra arguments.
for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
+ assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
// Special handling of bias when A16 is on. Bias is of type half but
// occupies full 32-bit.
- SDValue bias = DAG.getBuildVector( MVT::v2f16, DL, {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
- VAddrs.push_back(bias);
- } else
+ SDValue Bias = DAG.getBuildVector(
+ MVT::v2f16, DL,
+ {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
+ VAddrs.push_back(Bias);
+ } else {
+ assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
+ "Bias needs to be converted to 16 bit in A16 mode");
VAddrs.push_back(Op.getOperand(ArgOffset + I));
+ }
}
if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index a6ddcdd0a4a0e..9607fe63f4637 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -3019,9 +3019,23 @@ define amdgpu_kernel void @image_sample_a16_c_cl_2d(<4 x float> addrspace(1)* %o
ret void
}
-define amdgpu_kernel void @image_sample_a16_b_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s) {
-; CHECK-LABEL: @image_sample_a16_b_1d(
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+define amdgpu_kernel void @image_sample_a16_b16_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s) {
+; CHECK-LABEL: @image_sample_a16_b16_1d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+ %bias32 = fpext half %bias to float
+ %s32 = fpext half %s to float
+ %res = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float %bias32, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_b32_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s) {
+; CHECK-LABEL: @image_sample_a16_b32_1d(
+; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;
@@ -3031,9 +3045,25 @@ define amdgpu_kernel void @image_sample_a16_b_1d(<4 x float> addrspace(1)* %out,
ret void
}
-define amdgpu_kernel void @image_sample_a16_b_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) {
-; CHECK-LABEL: @image_sample_a16_b_2d(
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+define amdgpu_kernel void @image_sample_a16_b16_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %t) {
+; CHECK-LABEL: @image_sample_a16_b16_2d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+ %bias32 = fpext half %bias to float
+ %s32 = fpext half %s to float
+ %t32 = fpext half %t to float
+ %res = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float %bias32, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_b32_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) {
+; CHECK-LABEL: @image_sample_a16_b32_2d(
+; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S32]], float [[T32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;
@@ -3044,9 +3074,23 @@ define amdgpu_kernel void @image_sample_a16_b_2d(<4 x float> addrspace(1)* %out,
ret void
}
-define amdgpu_kernel void @image_sample_a16_c_b_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s) {
-; CHECK-LABEL: @image_sample_a16_c_b_1d(
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+define amdgpu_kernel void @image_sample_a16_c_b16_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s) {
+; CHECK-LABEL: @image_sample_a16_c_b16_1d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+ %bias32 = fpext half %bias to float
+ %s32 = fpext half %s to float
+ %res = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float %bias32, float %zcompare, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_c_b32_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s) {
+; CHECK-LABEL: @image_sample_a16_c_b32_1d(
+; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;
@@ -3056,9 +3100,25 @@ define amdgpu_kernel void @image_sample_a16_c_b_1d(<4 x float> addrspace(1)* %ou
ret void
}
-define amdgpu_kernel void @image_sample_a16_c_b_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) {
-; CHECK-LABEL: @image_sample_a16_c_b_2d(
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+define amdgpu_kernel void @image_sample_a16_c_b16_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %t) {
+; CHECK-LABEL: @image_sample_a16_c_b16_2d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+ %bias32 = fpext half %bias to float
+ %s32 = fpext half %s to float
+ %t32 = fpext half %t to float
+ %res = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 15, float %bias32, float %zcompare, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_c_b32_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) {
+; CHECK-LABEL: @image_sample_a16_c_b32_2d(
+; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S32]], float [[T32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;
@@ -3069,9 +3129,25 @@ define amdgpu_kernel void @image_sample_a16_c_b_2d(<4 x float> addrspace(1)* %ou
ret void
}
-define amdgpu_kernel void @image_sample_a16_b_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %clamp) {
-; CHECK-LABEL: @image_sample_a16_b_cl_1d(
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+define amdgpu_kernel void @image_sample_a16_b16_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %clamp) {
+; CHECK-LABEL: @image_sample_a16_b16_cl_1d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+ %bias32 = fpext half %bias to float
+ %s32 = fpext half %s to float
+ %clamp32 = fpext half %clamp to float
+ %res = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float %bias32, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_b32_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %clamp) {
+; CHECK-LABEL: @image_sample_a16_b32_cl_1d(
+; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP:%.*]] to float
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S32]], float [[CLAMP32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;
@@ -3082,9 +3158,27 @@ define amdgpu_kernel void @image_sample_a16_b_cl_1d(<4 x float> addrspace(1)* %o
ret void
}
-define amdgpu_kernel void @image_sample_a16_b_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) {
-; CHECK-LABEL: @image_sample_a16_b_cl_2d(
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+define amdgpu_kernel void @image_sample_a16_b16_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %t, half %clamp) {
+; CHECK-LABEL: @image_sample_a16_b16_cl_2d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+ %bias32 = fpext half %bias to float
+ %s32 = fpext half %s to float
+ %t32 = fpext half %t to float
+ %clamp32 = fpext half %clamp to float
+ %res = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float %bias32, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_b32_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) {
+; CHECK-LABEL: @image_sample_a16_b32_cl_2d(
+; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float
+; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP:%.*]] to float
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S32]], float [[T32]], float [[CLAMP32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;
@@ -3096,9 +3190,25 @@ define amdgpu_kernel void @image_sample_a16_b_cl_2d(<4 x float> addrspace(1)* %o
ret void
}
-define amdgpu_kernel void @image_sample_a16_c_b_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %clamp) {
-; CHECK-LABEL: @image_sample_a16_c_b_cl_1d(
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+define amdgpu_kernel void @image_sample_a16_c_b16_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %clamp) {
+; CHECK-LABEL: @image_sample_a16_c_b16_cl_1d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+ %bias32 = fpext half %bias to float
+ %s32 = fpext half %s to float
+ %clamp32 = fpext half %clamp to float
+ %res = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float %bias32, float %zcompare, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_c_b32_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %clamp) {
+; CHECK-LABEL: @image_sample_a16_c_b32_cl_1d(
+; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP:%.*]] to float
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S32]], float [[CLAMP32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;
@@ -3109,9 +3219,27 @@ define amdgpu_kernel void @image_sample_a16_c_b_cl_1d(<4 x float> addrspace(1)*
ret void
}
-define amdgpu_kernel void @image_sample_a16_c_b_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) {
-; CHECK-LABEL: @image_sample_a16_c_b_cl_2d(
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+define amdgpu_kernel void @image_sample_a16_c_b16_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %t, half %clamp) {
+; CHECK-LABEL: @image_sample_a16_c_b16_cl_2d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+ %bias32 = fpext half %bias to float
+ %s32 = fpext half %s to float
+ %t32 = fpext half %t to float
+ %clamp32 = fpext half %clamp to float
+ %res = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float %bias32, float %zcompare, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_c_b32_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) {
+; CHECK-LABEL: @image_sample_a16_c_b32_cl_2d(
+; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float
+; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP:%.*]] to float
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S32]], float [[T32]], float [[CLAMP32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;