[clang] [llvm] [Clang][AMDGPU] Add __builtin_amdgcn_cvt_off_f32_i4 (PR #133741)
Juan Manuel Martinez Caamaño via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 2 08:33:15 PDT 2025
https://github.com/jmmartinez updated https://github.com/llvm/llvm-project/pull/133741
>From 41af38793161b0f1535c98c4695c36e081ef2f67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Thu, 27 Mar 2025 17:46:34 +0100
Subject: [PATCH 01/10] [Clang][AMDGPU] Add __builtin_amdgcn_cvt_off_f32_i4
This builtin maps to V_CVT_OFF_F32_I4 which treats its input as a 4-bit
signed integer and returns 0.0625f * src.
SWDEV-518861
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 +
.../builtins-amdgcn-cvt-off-f32-i4.cl | 15 ++
.../builtins-amdgcn-cvt-off-f32-i4-err.cl | 8 +
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 +
.../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 9 +
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 1 +
llvm/lib/Target/AMDGPU/VOP1Instructions.td | 5 +
.../AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll | 23 +++
.../AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll | 158 ++++++++++++++++++
10 files changed, 227 insertions(+)
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl
create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 44ef404aee72f..f38148cc795dc 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -140,6 +140,7 @@ BUILTIN(__builtin_amdgcn_cvt_pknorm_u16, "E2Usff", "nc")
BUILTIN(__builtin_amdgcn_cvt_pk_i16, "E2sii", "nc")
BUILTIN(__builtin_amdgcn_cvt_pk_u16, "E2UsUiUi", "nc")
BUILTIN(__builtin_amdgcn_cvt_pk_u8_f32, "UifUiUi", "nc")
+BUILTIN(__builtin_amdgcn_cvt_off_f32_i4, "fUi", "nc")
BUILTIN(__builtin_amdgcn_sad_u8, "UiUiUiUi", "nc")
BUILTIN(__builtin_amdgcn_msad_u8, "UiUiUiUi", "nc")
BUILTIN(__builtin_amdgcn_sad_hi_u8, "UiUiUiUi", "nc")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl
new file mode 100644
index 0000000000000..6dc235f9cc6c7
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl
@@ -0,0 +1,15 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL1.2 \
+// RUN: -emit-llvm -o - | FileCheck %s
+
+// CHECK-LABEL: @test_builtin_amdgcn_cvt_off_f32_i4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store i32 [[N:%.*]], ptr addrspace(5) [[N_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.cvt.off.f32.i4(i32 [[TMP0]])
+// CHECK-NEXT: ret float [[TMP1]]
+//
+float test_builtin_amdgcn_cvt_off_f32_i4(unsigned n) {
+ return __builtin_amdgcn_cvt_off_f32_i4(n);
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl b/clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl
new file mode 100644
index 0000000000000..f5b02b80c37da
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple amdgcn-- -verify -S -o - %s
+
+void test_builtin_amdgcn_cvt_off_f32_i4(unsigned n) {
+ struct A{ unsigned x; } a;
+ __builtin_amdgcn_cvt_off_f32_i4(n, n); // expected-error {{too many arguments to function call, expected 1, have 2}}
+ __builtin_amdgcn_cvt_off_f32_i4(); // expected-error {{too few arguments to function call, expected 1, have 0}}
+ __builtin_amdgcn_cvt_off_f32_i4(a); // expected-error {{passing '__private struct A' to parameter of incompatible type 'unsigned int'}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f53016f62abbe..ebac0f9029791 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3375,6 +3375,12 @@ def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
[llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<3>>]>;
+// llvm.amdgcn.cvt.off.f32.i4 int srcA
+def int_amdgcn_cvt_off_f32_i4: ClangBuiltin<"__builtin_amdgcn_cvt_off_f32_i4">,
+ DefaultAttrsIntrinsic<[llvm_float_ty],
+ [llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable]>;
+
//===----------------------------------------------------------------------===//
// gfx950 intrinsics
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3246e575ea6a9..533ad349f7500 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -6042,6 +6042,7 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
// TODO: Handle more intrinsics
switch (IntrinsicID) {
case Intrinsic::amdgcn_cubeid:
+ case Intrinsic::amdgcn_cvt_off_f32_i4:
return true;
case Intrinsic::amdgcn_frexp_mant: {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 7cd97e95b0189..535fda7393bc1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -729,6 +729,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
break;
}
+ case Intrinsic::amdgcn_cvt_off_f32_i4: {
+ ConstantInt *CArg = dyn_cast<ConstantInt>(II.getArgOperand(0));
+ if (!CArg)
+ break;
+ int CI4BitAsInt = CArg->getValue().trunc(4).getSExtValue();
+ float ResVal = 0.0625 * CI4BitAsInt;
+ Constant *Res = ConstantFP::get(II.getType(), ResVal);
+ return IC.replaceInstUsesWith(II, Res);
+ }
case Intrinsic::amdgcn_ubfe:
case Intrinsic::amdgcn_sbfe: {
// Decompose simple cases into standard shifts.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 7df1e634b21ba..1d0e81db5a5db 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4585,6 +4585,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
case Intrinsic::amdgcn_cvt_f32_fp8:
case Intrinsic::amdgcn_cvt_f32_bf8:
+ case Intrinsic::amdgcn_cvt_off_f32_i4:
case Intrinsic::amdgcn_cvt_pk_f32_fp8:
case Intrinsic::amdgcn_cvt_pk_f32_bf8:
case Intrinsic::amdgcn_cvt_pk_fp8_f32:
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index def06c1e9a0d7..1dae2e432eb8c 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1578,6 +1578,11 @@ class MovDPP8Pattern<Predicate Pred, Instruction Inst, ValueType vt> : GCNPat <
let OtherPredicates = [Pred];
}
+def : GCNPat <
+ (f32 (int_amdgcn_cvt_off_f32_i4 i32:$src)),
+ (V_CVT_OFF_F32_I4_e32 VGPR_32:$src)
+>;
+
foreach vt = Reg32Types.types in {
def : MovDPP8Pattern<isGFX10Only, V_MOV_B32_dpp8_gfx10, vt>;
def : MovDPP8Pattern<isGFX11Only, V_MOV_B32_dpp8_gfx11, vt>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll
new file mode 100644
index 0000000000000..a25d9c30a2331
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs %s --global-isel -o - | FileCheck %s
+
+declare float @llvm.amdgcn.cvt.off.f32.i4(i32)
+
+define amdgpu_cs float @cvt_var(i32 %a) {
+; CHECK-LABEL: cvt_var:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cvt_off_f32_i4_e32 v0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 %a)
+ ret float %ret
+}
+
+define amdgpu_cs float @cvt_imm() {
+; CHECK-LABEL: cvt_imm:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_cvt_off_f32_i4_e32 v0, 4
+; CHECK-NEXT: ; return to shader part epilog
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 4)
+ ret float %ret
+}
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll b/llvm/test/CodeGen/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll
new file mode 100644
index 0000000000000..bac02bd61d0a9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll
@@ -0,0 +1,158 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=instcombine < %s | FileCheck %s
+
+declare float @llvm.amdgcn.cvt.off.f32.i4(i32)
+
+define float @cvt_var(i32 %a) {
+; CHECK-LABEL: define float @cvt_var(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[RET:%.*]] = call float @llvm.amdgcn.cvt.off.f32.i4(i32 [[A]])
+; CHECK-NEXT: ret float [[RET]]
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 %a)
+ ret float %ret
+}
+
+define float @cvt_imm_0() {
+; CHECK-LABEL: define float @cvt_imm_0() {
+; CHECK-NEXT: ret float 0.000000e+00
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 0)
+ ret float %ret
+}
+
+define float @cvt_imm_1() {
+; CHECK-LABEL: define float @cvt_imm_1() {
+; CHECK-NEXT: ret float 6.250000e-02
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 1)
+ ret float %ret
+}
+
+define float @cvt_imm_2() {
+; CHECK-LABEL: define float @cvt_imm_2() {
+; CHECK-NEXT: ret float 1.250000e-01
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 2)
+ ret float %ret
+}
+
+define float @cvt_imm_3() {
+; CHECK-LABEL: define float @cvt_imm_3() {
+; CHECK-NEXT: ret float 1.875000e-01
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 3)
+ ret float %ret
+}
+
+define float @cvt_imm_4() {
+; CHECK-LABEL: define float @cvt_imm_4() {
+; CHECK-NEXT: ret float 2.500000e-01
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 4)
+ ret float %ret
+}
+
+define float @cvt_imm_5() {
+; CHECK-LABEL: define float @cvt_imm_5() {
+; CHECK-NEXT: ret float 3.125000e-01
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 5)
+ ret float %ret
+}
+
+define float @cvt_imm_6() {
+; CHECK-LABEL: define float @cvt_imm_6() {
+; CHECK-NEXT: ret float 3.750000e-01
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 6)
+ ret float %ret
+}
+
+define float @cvt_imm_7() {
+; CHECK-LABEL: define float @cvt_imm_7() {
+; CHECK-NEXT: ret float 4.375000e-01
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 7)
+ ret float %ret
+}
+
+define float @cvt_imm_8() {
+; CHECK-LABEL: define float @cvt_imm_8() {
+; CHECK-NEXT: ret float -5.000000e-01
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 8)
+ ret float %ret
+}
+
+define float @cvt_imm_9() {
+; CHECK-LABEL: define float @cvt_imm_9() {
+; CHECK-NEXT: ret float -4.375000e-01
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 9)
+ ret float %ret
+}
+
+define float @cvt_imm_10() {
+; CHECK-LABEL: define float @cvt_imm_10() {
+; CHECK-NEXT: ret float -3.750000e-01
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 10)
+ ret float %ret
+}
+
+define float @cvt_imm_11() {
+; CHECK-LABEL: define float @cvt_imm_11() {
+; CHECK-NEXT: ret float -3.125000e-01
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 11)
+ ret float %ret
+}
+
+define float @cvt_imm_12() {
+; CHECK-LABEL: define float @cvt_imm_12() {
+; CHECK-NEXT: ret float -2.500000e-01
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 12)
+ ret float %ret
+}
+
+define float @cvt_imm_13() {
+; CHECK-LABEL: define float @cvt_imm_13() {
+; CHECK-NEXT: ret float -1.875000e-01
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 13)
+ ret float %ret
+}
+
+define float @cvt_imm_14() {
+; CHECK-LABEL: define float @cvt_imm_14() {
+; CHECK-NEXT: ret float -1.250000e-01
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 14)
+ ret float %ret
+}
+
+define float @cvt_imm_15() {
+; CHECK-LABEL: define float @cvt_imm_15() {
+; CHECK-NEXT: ret float -6.250000e-02
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 15)
+ ret float %ret
+}
+
+define float @cvt_imm_underflow() {
+; CHECK-LABEL: define float @cvt_imm_underflow() {
+; CHECK-NEXT: ret float -6.250000e-02
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 -1)
+ ret float %ret
+}
+
+define float @cvt_imm_overflow() {
+; CHECK-LABEL: define float @cvt_imm_overflow() {
+; CHECK-NEXT: ret float 0.000000e+00
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 16)
+ ret float %ret
+}
>From dc78e8a6e014b1addf2e99af3cdd0bc6b979eb4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Tue, 1 Apr 2025 09:47:43 +0200
Subject: [PATCH 02/10] [Review] Turn builtin input from unsigned to signed
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 2 +-
clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index f38148cc795dc..b1480675753b1 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -140,7 +140,7 @@ BUILTIN(__builtin_amdgcn_cvt_pknorm_u16, "E2Usff", "nc")
BUILTIN(__builtin_amdgcn_cvt_pk_i16, "E2sii", "nc")
BUILTIN(__builtin_amdgcn_cvt_pk_u16, "E2UsUiUi", "nc")
BUILTIN(__builtin_amdgcn_cvt_pk_u8_f32, "UifUiUi", "nc")
-BUILTIN(__builtin_amdgcn_cvt_off_f32_i4, "fUi", "nc")
+BUILTIN(__builtin_amdgcn_cvt_off_f32_i4, "fi", "nc")
BUILTIN(__builtin_amdgcn_sad_u8, "UiUiUiUi", "nc")
BUILTIN(__builtin_amdgcn_msad_u8, "UiUiUiUi", "nc")
BUILTIN(__builtin_amdgcn_sad_hi_u8, "UiUiUiUi", "nc")
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl b/clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl
index f5b02b80c37da..30ffbfc130a94 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl
@@ -1,8 +1,8 @@
// RUN: %clang_cc1 -triple amdgcn-- -verify -S -o - %s
-void test_builtin_amdgcn_cvt_off_f32_i4(unsigned n) {
+void test_builtin_amdgcn_cvt_off_f32_i4(int n) {
struct A{ unsigned x; } a;
__builtin_amdgcn_cvt_off_f32_i4(n, n); // expected-error {{too many arguments to function call, expected 1, have 2}}
__builtin_amdgcn_cvt_off_f32_i4(); // expected-error {{too few arguments to function call, expected 1, have 0}}
- __builtin_amdgcn_cvt_off_f32_i4(a); // expected-error {{passing '__private struct A' to parameter of incompatible type 'unsigned int'}}
+ __builtin_amdgcn_cvt_off_f32_i4(a); // expected-error {{passing '__private struct A' to parameter of incompatible type 'int'}}
}
>From 1cf0a9c62b1039f1ff92667a84dff2f659f57110 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Tue, 1 Apr 2025 10:45:43 +0200
Subject: [PATCH 03/10] [Review] Added signed OpenCL CodeGen test
---
.../builtins-amdgcn-cvt-off-f32-i4.cl | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl
index 6dc235f9cc6c7..f554d2f72f869 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl
@@ -2,7 +2,7 @@
// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL1.2 \
// RUN: -emit-llvm -o - | FileCheck %s
-// CHECK-LABEL: @test_builtin_amdgcn_cvt_off_f32_i4(
+// CHECK-LABEL: @test_builtin_amdgcn_cvt_off_f32_i4_ui(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store i32 [[N:%.*]], ptr addrspace(5) [[N_ADDR]], align 4
@@ -10,6 +10,18 @@
// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.cvt.off.f32.i4(i32 [[TMP0]])
// CHECK-NEXT: ret float [[TMP1]]
//
-float test_builtin_amdgcn_cvt_off_f32_i4(unsigned n) {
+float test_builtin_amdgcn_cvt_off_f32_i4_ui(unsigned n) {
+ return __builtin_amdgcn_cvt_off_f32_i4(n);
+}
+
+// CHECK-LABEL: @test_builtin_amdgcn_cvt_off_f32_i4_i(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store i32 [[N:%.*]], ptr addrspace(5) [[N_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.cvt.off.f32.i4(i32 [[TMP0]])
+// CHECK-NEXT: ret float [[TMP1]]
+//
+float test_builtin_amdgcn_cvt_off_f32_i4_i(int n) {
return __builtin_amdgcn_cvt_off_f32_i4(n);
}
>From c10a091e804fd896fdcacc50a49589b858af3eba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Tue, 1 Apr 2025 15:33:29 +0200
Subject: [PATCH 04/10] [Review] Add line in the Release notes
---
clang/docs/ReleaseNotes.rst | 1 +
1 file changed, 1 insertion(+)
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index daad01919ecd4..231bab37c1c91 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -165,6 +165,7 @@ Non-comprehensive list of changes in this release
- Support parsing the `cc` operand modifier and alias it to the `c` modifier (#GH127719).
- Added `__builtin_elementwise_exp10`.
+- For AMDGPU targets, added `__builtin_amdgcn_cvt_off_f32_i4` that maps to the `v_cvt_off_f32_i4` instruction.
New Compiler Flags
------------------
>From 73efe546b97bb5d1d1dc83dcb1de7e7023276d2d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Tue, 1 Apr 2025 16:39:37 +0200
Subject: [PATCH 05/10] [Review] Move pattern into instruction declaration
---
llvm/lib/Target/AMDGPU/VOP1Instructions.td | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 1dae2e432eb8c..170e794af1b4d 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -317,7 +317,7 @@ defm V_CVT_F32_BF16 : VOP1Inst_t16 <"v_cvt_f32_bf16", VOP_F32_BF16>;
let ReadsModeReg = 0, mayRaiseFPException = 0 in {
defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
-defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP1_F32_I32>;
+defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP1_F32_I32, int_amdgcn_cvt_off_f32_i4>;
} // End ReadsModeReg = 0, mayRaiseFPException = 0
} // End SchedRW = [WriteFloatCvt]
@@ -1578,11 +1578,6 @@ class MovDPP8Pattern<Predicate Pred, Instruction Inst, ValueType vt> : GCNPat <
let OtherPredicates = [Pred];
}
-def : GCNPat <
- (f32 (int_amdgcn_cvt_off_f32_i4 i32:$src)),
- (V_CVT_OFF_F32_I4_e32 VGPR_32:$src)
->;
-
foreach vt = Reg32Types.types in {
def : MovDPP8Pattern<isGFX10Only, V_MOV_B32_dpp8_gfx10, vt>;
def : MovDPP8Pattern<isGFX11Only, V_MOV_B32_dpp8_gfx11, vt>;
>From 89520eec0984418ae776c11e5a6cb987048f4562 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Tue, 1 Apr 2025 17:15:27 +0200
Subject: [PATCH 06/10] [Review] Handle undef & poison and update tests
---
.../Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 10 ++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll | 8 ++++++--
.../AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll | 16 ++++++++++++++++
3 files changed, 32 insertions(+), 2 deletions(-)
rename llvm/test/{CodeGen => Transforms/InstCombine}/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll (90%)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 535fda7393bc1..6c48037649cd7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -730,9 +730,19 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
break;
}
case Intrinsic::amdgcn_cvt_off_f32_i4: {
+ Value* Arg = II.getArgOperand(0);
+ Type *Ty = II.getType();
+
+ if (isa<PoisonValue>(Arg))
+ return IC.replaceInstUsesWith(II, PoisonValue::get(Ty));
+
+ if(IC.getSimplifyQuery().isUndefValue(Arg))
+ return IC.replaceInstUsesWith(II, Constant::getNullValue(Ty));
+
ConstantInt *CArg = dyn_cast<ConstantInt>(II.getArgOperand(0));
if (!CArg)
break;
+
int CI4BitAsInt = CArg->getValue().trunc(4).getSExtValue();
float ResVal = 0.0625 * CI4BitAsInt;
Constant *Res = ConstantFP::get(II.getType(), ResVal);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll
index a25d9c30a2331..e504eb7a5a124 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll
@@ -1,6 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs %s -o - | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs %s --global-isel -o - | FileCheck %s
+; RUN: llc --global-isel=0 -mtriple=amdgcn -mcpu=tahiti %s -o - | FileCheck %s
+; RUN: llc --global-isel=1 -mtriple=amdgcn -mcpu=tahiti %s -o - | FileCheck %s
+; RUN: llc --global-isel=0 -mtriple=amdgcn -mcpu=tonga %s -o - | FileCheck %s
+; RUN: llc --global-isel=1 -mtriple=amdgcn -mcpu=tonga %s -o - | FileCheck %s
+; RUN: llc --global-isel=0 -mtriple=amdgcn -mcpu=gfx90a %s -o - | FileCheck %s
+; RUN: llc --global-isel=1 -mtriple=amdgcn -mcpu=gfx90a %s -o - | FileCheck %s
declare float @llvm.amdgcn.cvt.off.f32.i4(i32)
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll
similarity index 90%
rename from llvm/test/CodeGen/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll
rename to llvm/test/Transforms/InstCombine/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll
index bac02bd61d0a9..0850c4e92ef02 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll
@@ -156,3 +156,19 @@ define float @cvt_imm_overflow() {
%ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 16)
ret float %ret
}
+
+define float @cvt_imm_poison() {
+; CHECK-LABEL: define float @cvt_imm_poison() {
+; CHECK-NEXT: ret float poison
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 poison)
+ ret float %ret
+}
+
+define float @cvt_imm_undef() {
+; CHECK-LABEL: define float @cvt_imm_undef() {
+; CHECK-NEXT: ret float 0.000000e+00
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 undef)
+ ret float %ret
+}
>From eea660c46bdc789be4ac02bb3089712d9d1b7e6a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Tue, 1 Apr 2025 17:16:59 +0200
Subject: [PATCH 07/10] CI4BitAsInt -> CArg4BitAsInt
---
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 6c48037649cd7..ab7d8b28152b8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -743,8 +743,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (!CArg)
break;
- int CI4BitAsInt = CArg->getValue().trunc(4).getSExtValue();
- float ResVal = 0.0625 * CI4BitAsInt;
+ int CArg4BitAsInt = CArg->getValue().trunc(4).getSExtValue();
+ float ResVal = 0.0625 * CArg4BitAsInt;
Constant *Res = ConstantFP::get(II.getType(), ResVal);
return IC.replaceInstUsesWith(II, Res);
}
>From 4b7a154af1afbc2105dd43189e99ff9ecdc17a93 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Tue, 1 Apr 2025 17:26:35 +0200
Subject: [PATCH 08/10] [Review] Forgot the constexpr case
---
.../AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll
index 0850c4e92ef02..1082c6ddb898b 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=instcombine < %s | FileCheck %s
-declare float @llvm.amdgcn.cvt.off.f32.i4(i32)
+@gv = constant i32 0
define float @cvt_var(i32 %a) {
; CHECK-LABEL: define float @cvt_var(
@@ -157,18 +157,27 @@ define float @cvt_imm_overflow() {
ret float %ret
}
-define float @cvt_imm_poison() {
-; CHECK-LABEL: define float @cvt_imm_poison() {
+define float @cvt_poison() {
+; CHECK-LABEL: define float @cvt_poison() {
; CHECK-NEXT: ret float poison
;
%ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 poison)
ret float %ret
}
-define float @cvt_imm_undef() {
-; CHECK-LABEL: define float @cvt_imm_undef() {
+define float @cvt_undef() {
+; CHECK-LABEL: define float @cvt_undef() {
; CHECK-NEXT: ret float 0.000000e+00
;
%ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 undef)
ret float %ret
}
+
+define float @cvt_constexpr() {
+; CHECK-LABEL: define float @cvt_constexpr() {
+; CHECK-NEXT: [[RET:%.*]] = call float @llvm.amdgcn.cvt.off.f32.i4(i32 ptrtoint (ptr @gv to i32))
+; CHECK-NEXT: ret float [[RET]]
+;
+ %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 ptrtoint (ptr @gv to i32))
+ ret float %ret
+}
>From 57f299e49f02c5de4fa0e8bbbf3452496f80e323 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Tue, 1 Apr 2025 17:26:35 +0200
Subject: [PATCH 09/10] [Review] Tabulate the operation result to avoid
operations that could depend on the host
---
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index ab7d8b28152b8..2dd85afdfd2ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -743,9 +743,13 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (!CArg)
break;
- int CArg4BitAsInt = CArg->getValue().trunc(4).getSExtValue();
- float ResVal = 0.0625 * CArg4BitAsInt;
- Constant *Res = ConstantFP::get(II.getType(), ResVal);
+ // Tabulated 0.0625 * (sext (CArg & 0xf)).
+ constexpr size_t ResValsSize = 16;
+ const float ResVals[ResValsSize] = {
+ 0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375,
+ -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625};
+ Constant *Res =
+ ConstantFP::get(Ty, ResVals[CArg->getZExtValue() % ResValsSize]);
return IC.replaceInstUsesWith(II, Res);
}
case Intrinsic::amdgcn_ubfe:
>From e87dd398714821a81d8ee319d6b9f9bf7bf9f901 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Wed, 2 Apr 2025 17:25:18 +0200
Subject: [PATCH 10/10] [Review] static constexpr and mask
---
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 2dd85afdfd2ca..00967bee0b387 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -745,11 +745,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
// Tabulated 0.0625 * (sext (CArg & 0xf)).
constexpr size_t ResValsSize = 16;
- const float ResVals[ResValsSize] = {
+ static constexpr float ResVals[ResValsSize] = {
0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375,
-0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625};
Constant *Res =
- ConstantFP::get(Ty, ResVals[CArg->getZExtValue() % ResValsSize]);
+ ConstantFP::get(Ty, ResVals[CArg->getZExtValue() & (ResValsSize - 1)]);
return IC.replaceInstUsesWith(II, Res);
}
case Intrinsic::amdgcn_ubfe:
More information about the llvm-commits
mailing list