[llvm-branch-commits] [clang] [llvm] [AMDGPU] Add v_cvt_sr|pk_bf8|fp8_f16 gfx1250 instructions (PR #151415)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jul 30 15:55:17 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-mc
@llvm/pr-subscribers-backend-amdgpu
Author: Stanislav Mekhanoshin (rampitec)
Changes:
---
Patch is 122.29 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151415.diff
28 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+4)
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+138)
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+24)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+4)
- (modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+23-1)
- (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp (+3)
- (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp (+3)
- (modified) llvm/lib/Target/AMDGPU/SIDefines.h (+1)
- (modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+1)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+2)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1)
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.td (+2)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+3)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+1)
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+51)
- (modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+16)
- (added) llvm/test/CodeGen/AMDGPU/code-size-estimate-gfx1250.ll (+28)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.f16.ll (+539)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s (+145)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s (+145)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16-fake16.s (+64)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16.s (+64)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp8-fake16.s (+64)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp8.s (+64)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s (+25)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt (+167-3)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_dpp16.txt (+64)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_dpp8.txt (+64)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index ec00fadf3039a..172ac467f7cad 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -702,6 +702,10 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_bf8, "V2hs", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f16, "sV2h", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f16, "sV2h", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f16, "ihiUiIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f16, "ihiUiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_u4_u8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 2595442ba7f9e..1c67fc3879bff 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -398,6 +398,144 @@ void test_cvt_pk_f16_bf8(global half2* out, short a)
out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a);
}
+// CHECK-LABEL: @test_cvt_pk_bf8_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x half> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.cvt.pk.bf8.f16(<2 x half> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
+// CHECK-NEXT: ret void
+//
+void test_cvt_pk_bf8_f16(global short* out, half2 a)
+{
+ *out = __builtin_amdgcn_cvt_pk_bf8_f16(a);
+}
+
+// CHECK-LABEL: @test_cvt_pk_fp8_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x half> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.cvt.pk.fp8.f16(<2 x half> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
+// CHECK-NEXT: ret void
+//
+void test_cvt_pk_fp8_f16(global short* out, half2 a)
+{
+ *out = __builtin_amdgcn_cvt_pk_fp8_f16(a);
+}
+
+// CHECK-LABEL: @test_cvt_sr_bf8_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5)
+// CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr
+// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store half [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: store i32 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 0)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i32 1)
+// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP10]], i32 [[TMP11]], i32 [[TMP12]], i32 2)
+// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
+// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP15]], i32 [[TMP16]], i32 [[TMP17]], i32 3)
+// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_sr_bf8_f16(global int* out, half a, uint sr, int old)
+{
+ *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 0);
+ *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 1);
+ *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 2);
+ *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 3);
+}
+
+// CHECK-LABEL: @test_cvt_sr_fp8_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5)
+// CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
+// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr
+// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store half [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: store i16 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 2
+// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP0]], i32 [[CONV]], i32 [[TMP2]], i32 0)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[TMP6]] to i32
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP5]], i32 [[CONV1]], i32 [[TMP7]], i32 1)
+// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP11]] to i32
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP10]], i32 [[CONV2]], i32 [[TMP12]], i32 2)
+// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
+// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP16:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[CONV3:%.*]] = sext i16 [[TMP16]] to i32
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP15]], i32 [[CONV3]], i32 [[TMP17]], i32 3)
+// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_sr_fp8_f16(global int* out, half a, short sr, int old)
+{
+ *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 0);
+ *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 1);
+ *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 2);
+ *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 3);
+}
+
// CHECK-LABEL: @test_sat_pk4_i4_i8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index c29f6c740f1a4..4a50558ca8e86 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -605,6 +605,30 @@ def int_amdgcn_cvt_pk_f16_bf8 : DefaultAttrsIntrinsic<
[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrSpeculatable]
>, ClangBuiltin<"__builtin_amdgcn_cvt_pk_f16_bf8">;
+def int_amdgcn_cvt_pk_fp8_f16
+ : DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty],
+ [IntrNoMem, IntrSpeculatable]>,
+ ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f16">;
+
+def int_amdgcn_cvt_pk_bf8_f16
+ : DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty],
+ [IntrNoMem, IntrSpeculatable]>,
+ ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f16">;
+
+// llvm.amdgcn.cvt.sr.fp8.f16 i32 vdst, half src, i32 seed, i32 old, imm byte_sel [0..3]
+// byte_sel selects byte to write in vdst.
+def int_amdgcn_cvt_sr_fp8_f16 : DefaultAttrsIntrinsic<
+ [llvm_i32_ty], [llvm_half_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
+>, ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f16">;
+
+// llvm.amdgcn.cvt.sr.bf8.f16 i32 vdst, half src, i32 seed, i32 old, imm byte_sel [0..3]
+// byte_sel selects byte to write in vdst.
+def int_amdgcn_cvt_sr_bf8_f16 : DefaultAttrsIntrinsic<
+ [llvm_i32_ty], [llvm_half_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
+>, ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f16">;
+
class AMDGPUCvtScaleF32Intrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic<
[DstTy], [Src0Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 25f428ac6c8a9..6bca2fec17c6d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4577,6 +4577,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_sr_pk_bf16_f32:
case Intrinsic::amdgcn_cvt_pk_f16_fp8:
case Intrinsic::amdgcn_cvt_pk_f16_bf8:
+ case Intrinsic::amdgcn_cvt_pk_fp8_f16:
+ case Intrinsic::amdgcn_cvt_pk_bf8_f16:
+ case Intrinsic::amdgcn_cvt_sr_fp8_f16:
+ case Intrinsic::amdgcn_cvt_sr_bf8_f16:
case Intrinsic::amdgcn_sat_pk4_i4_i8:
case Intrinsic::amdgcn_sat_pk4_u4_u8:
case Intrinsic::amdgcn_fmed3:
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 44e65b3588888..a4ea8cf423f84 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -689,6 +689,8 @@ class AMDGPUOperand : public MCParsedAsmOperand {
bool isVSrc_v2f16() const { return isVSrc_f16() || isLiteralImm(MVT::v2f16); }
+ bool isVSrc_NoInline_v2f16() const { return isVSrc_v2f16(); }
+
bool isVISrcB32() const {
return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::i32);
}
@@ -2036,6 +2038,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_KIMM16:
return &APFloat::IEEEhalf();
case AMDGPU::OPERAND_REG_IMM_BF16:
@@ -2405,6 +2408,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2FP32:
case AMDGPU::OPERAND_REG_IMM_V2INT32:
case AMDGPU::OPERAND_KIMM32:
@@ -2456,6 +2460,9 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
setImmKindConst();
return;
}
+ [[fallthrough]];
+
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
setImmKindLiteral();
@@ -3761,6 +3768,9 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
OperandType == AMDGPU::OPERAND_REG_INLINE_C_BF16)
return AMDGPU::isInlinableLiteralBF16(Val, hasInv2PiInlineImm());
+ if (OperandType == AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16)
+ return false;
+
llvm_unreachable("invalid operand type");
}
default:
@@ -9421,7 +9431,19 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
- Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12)) {
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp8_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp8_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp8_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp8_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_gfx1250 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_gfx1250)) {
Inst.addOperand(Inst.getOperand(0));
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 11b072e05a6e1..15088ac25863f 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -540,6 +540,8 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType,
printImmediateBFloat16(static_cast<uint16_t>(Imm), STI, O))
return;
break;
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
+ break;
default:
llvm_unreachable("bad operand type");
}
@@ -770,6 +772,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index c49ad798c1824..f3580842c6ff0 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -341,6 +341,9 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding(
return AMDGPU::getInlineEncodingV2BF16(static_cast<uint32_t>(Imm))
.value_or(255);
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
+ return 255;
+
case AMDGPU::OPERAND_KIMM32:
case AMDGPU::OPERAND_KIMM16:
case AMDGPU::OPERAND_KIMM64:
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 40b8bcde005c0..c56414519a6fe 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -208,6 +208,7 @@ enum OperandType : unsigned {
OPERAND_REG_IMM_V2BF16,
OPERAND_REG_IMM_V2FP16,
OPERAND_REG_IMM_V2INT16,
+ OPERAND_REG_IMM_NOINLINE_V2FP16,
OPERAND_REG_IMM_V2INT32,
OPERAND_REG_IMM_V2FP32,
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index b77da4d612dd4..e934152d08acb 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -468,6 +468,7 @@ bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c2da937552240..044a681bfed3d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4438,6 +4438,8 @@ bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
case AMDGPU::OPERAND_REG_IMM_V2BF16:
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
return AMDGPU::isInlinableLiteralV2BF16(Imm);
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
+ return false;
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 7324b6a347447..efcc88e564e65 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2859,6 +2859,7 @@ def VOP_I16_F16 : VOPProf...
[truncated]
``````````
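For reference, here is a minimal OpenCL sketch of how the four new builtins are meant to be used, based on the signatures in the BuiltinsAMDGPU.def hunk and the byte_sel semantics documented in the IntrinsicsAMDGPU.td hunk above. The kernel and its argument names are hypothetical (not part of the patch), and it assumes a gfx1250 compile with cl_khr_fp16 enabled:

```c
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Hypothetical example kernel; not part of the patch.
kernel void cvt_fp8_f16_demo(global short *pk_out, global int *sr_out,
                             half2 v, half x, uint seed, int old) {
  // Pack both f16 lanes of v into two fp8 (resp. bf8) bytes; the packed
  // result is 16 bits wide ("sV2h": short <- half2).
  pk_out[0] = __builtin_amdgcn_cvt_pk_fp8_f16(v);
  pk_out[1] = __builtin_amdgcn_cvt_pk_bf8_f16(v);
  // Stochastic-rounding converts: seed drives the rounding, old supplies
  // the preserved bytes, and the trailing immediate byte_sel in [0..3]
  // selects which byte of the 32-bit result receives the converted value.
  sr_out[0] = __builtin_amdgcn_cvt_sr_fp8_f16(x, seed, old, 0);
  sr_out[1] = __builtin_amdgcn_cvt_sr_bf8_f16(x, seed, old, 3);
}
```

At the ISA level these lower to the new VOP3 encodings added in VOP3Instructions.td and exercised by the gfx1250_asm_vop3*.s assembler tests listed above.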
https://github.com/llvm/llvm-project/pull/151415