[clang] [llvm] AMDGPU: Change bitop3 intrinsic operand to i32 (PR #118647)
Matt Arsenault via cfe-commits
cfe-commits at lists.llvm.org
Wed Dec 4 08:28:15 PST 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/118647
>From 03f9150adebd56e83dceaf45d07225a4d47e721d Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 4 Dec 2024 09:18:42 -0500
Subject: [PATCH 1/2] AMDGPU: Change bitop3 intrinsic operand to i32
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 +-
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 12 +++----
.../test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll | 32 +++++++++----------
4 files changed, 24 insertions(+), 24 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 112c26d20db14e..92418b9104ad14 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -774,7 +774,7 @@ def int_amdgcn_prng_b32 : DefaultAttrsIntrinsic<
def int_amdgcn_bitop3 :
DefaultAttrsIntrinsic<[llvm_anyint_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i8_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]>;
} // TargetPrefix = "amdgcn"
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index d8eb9d155315a6..fc8c12a674e466 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1271,7 +1271,7 @@ def ByteSel : NamedIntOperand<"byte_sel"> {
let Validator = "isUInt<2>";
}
-def BitOp3 : CustomOperand<i8, 1, "BitOp3">;
+def BitOp3 : CustomOperand<i32, 1, "BitOp3">;
def bitop3_0 : DefaultOperand<BitOp3, 0>;
class KImmFPOperand<ValueType vt> : ImmOperand<vt> {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index ff9376e635af96..a00785bf29c77a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1291,28 +1291,28 @@ let SubtargetPredicate = isGFX12Plus in {
let SubtargetPredicate = HasBitOp3Insts in {
let isReMaterializable = 1 in {
defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16",
- VOP3_BITOP3_Profile<VOPProfile_True16<VOPProfile <[i16, i16, i16, i16, i8]>>,
+ VOP3_BITOP3_Profile<VOPProfile_True16<VOPProfile <[i16, i16, i16, i16, i32]>>,
VOP3_OPSEL>>;
defm V_BITOP3_B32 : VOP3Inst <"v_bitop3_b32",
- VOP3_BITOP3_Profile<VOPProfile <[i32, i32, i32, i32, i8]>, VOP3_REGULAR>>;
+ VOP3_BITOP3_Profile<VOPProfile <[i32, i32, i32, i32, i32]>, VOP3_REGULAR>>;
}
def : GCNPat<
- (i32 (int_amdgcn_bitop3 i32:$src0, i32:$src1, i32:$src2, i8:$bitop3)),
+ (i32 (int_amdgcn_bitop3 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)),
(i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3))
>;
def : GCNPat<
- (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i8:$bitop3)),
+ (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
(i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
>;
def : GCNPat<
- (i32 (BITOP3_32 i32:$src0, i32:$src1, i32:$src2, i8:$bitop3)),
+ (i32 (BITOP3_32 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)),
(i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3))
>;
def : GCNPat<
- (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i8:$bitop3)),
+ (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)),
(i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
>;
} // End SubtargetPredicate = HasBitOp3Insts
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll
index ff2f4db0d7a5f9..b6232cbc384967 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll
@@ -2,15 +2,15 @@
; RUN: llc -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950-SDAG %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950-GISEL %s
-declare i32 @llvm.amdgcn.bitop3.i32(i32, i32, i32, i8)
-declare i16 @llvm.amdgcn.bitop3.i16(i16, i16, i16, i8)
+declare i32 @llvm.amdgcn.bitop3.i32(i32, i32, i32, i32)
+declare i16 @llvm.amdgcn.bitop3.i16(i16, i16, i16, i32)
define amdgpu_ps float @bitop3_b32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: bitop3_b32_vvv:
; GCN: ; %bb.0:
; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0xf
; GCN-NEXT: ; return to shader part epilog
- %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 %c, i8 15)
+ %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 %c, i32 15)
%ret_cast = bitcast i32 %ret to float
ret float %ret_cast
}
@@ -20,7 +20,7 @@ define amdgpu_ps float @bitop3_b32_svv(i32 inreg %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_bitop3_b32 v0, s0, v0, v1 bitop3:0x10
; GCN-NEXT: ; return to shader part epilog
- %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 %c, i8 16)
+ %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 %c, i32 16)
%ret_cast = bitcast i32 %ret to float
ret float %ret_cast
}
@@ -31,7 +31,7 @@ define amdgpu_ps float @bitop3_b32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_bitop3_b32 v0, s0, v1, v0 bitop3:0x11
; GCN-NEXT: ; return to shader part epilog
- %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 %c, i8 17)
+ %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 %c, i32 17)
%ret_cast = bitcast i32 %ret to float
ret float %ret_cast
}
@@ -43,7 +43,7 @@ define amdgpu_ps float @bitop3_b32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c)
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: v_bitop3_b32 v0, s0, v0, v1 bitop3:0x12
; GCN-NEXT: ; return to shader part epilog
- %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 %c, i8 18)
+ %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 %c, i32 18)
%ret_cast = bitcast i32 %ret to float
ret float %ret_cast
}
@@ -60,7 +60,7 @@ define amdgpu_ps float @bitop3_b32_vvi(i32 %a, i32 %b) {
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0x3e8
; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x13
; GFX950-GISEL-NEXT: ; return to shader part epilog
- %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 1000, i8 19)
+ %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 1000, i32 19)
%ret_cast = bitcast i32 %ret to float
ret float %ret_cast
}
@@ -79,7 +79,7 @@ define amdgpu_ps float @bitop3_b32_vii(i32 %a) {
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0x3e8
; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x14
; GFX950-GISEL-NEXT: ; return to shader part epilog
- %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 2000, i32 1000, i8 20)
+ %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 2000, i32 1000, i32 20)
%ret_cast = bitcast i32 %ret to float
ret float %ret_cast
}
@@ -102,7 +102,7 @@ define amdgpu_ps float @bitop3_b32_iii() {
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0x3e8
; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x15
; GFX950-GISEL-NEXT: ; return to shader part epilog
- %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 3000, i32 2000, i32 1000, i8 21)
+ %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 3000, i32 2000, i32 1000, i32 21)
%ret_cast = bitcast i32 %ret to float
ret float %ret_cast
}
@@ -112,7 +112,7 @@ define amdgpu_ps half @bitop3_b16_vvv(i16 %a, i16 %b, i16 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_bitop3_b16 v0, v0, v1, v2 bitop3:0xf
; GCN-NEXT: ; return to shader part epilog
- %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 %c, i8 15)
+ %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 %c, i32 15)
%ret_cast = bitcast i16 %ret to half
ret half %ret_cast
}
@@ -122,7 +122,7 @@ define amdgpu_ps half @bitop3_b16_svv(i16 inreg %a, i16 %b, i16 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_bitop3_b16 v0, s0, v0, v1 bitop3:0x10
; GCN-NEXT: ; return to shader part epilog
- %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 %c, i8 16)
+ %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 %c, i32 16)
%ret_cast = bitcast i16 %ret to half
ret half %ret_cast
}
@@ -133,7 +133,7 @@ define amdgpu_ps half @bitop3_b16_ssv(i16 inreg %a, i16 inreg %b, i16 %c) {
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_bitop3_b16 v0, s0, v1, v0 bitop3:0x11
; GCN-NEXT: ; return to shader part epilog
- %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 %c, i8 17)
+ %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 %c, i32 17)
%ret_cast = bitcast i16 %ret to half
ret half %ret_cast
}
@@ -145,7 +145,7 @@ define amdgpu_ps half @bitop3_b16_sss(i16 inreg %a, i16 inreg %b, i16 inreg %c)
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: v_bitop3_b16 v0, s0, v0, v1 bitop3:0x12
; GCN-NEXT: ; return to shader part epilog
- %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 %c, i8 18)
+ %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 %c, i32 18)
%ret_cast = bitcast i16 %ret to half
ret half %ret_cast
}
@@ -162,7 +162,7 @@ define amdgpu_ps half @bitop3_b16_vvi(i16 %a, i16 %b) {
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0x3e8
; GFX950-GISEL-NEXT: v_bitop3_b16 v0, v0, v1, v2 bitop3:0x13
; GFX950-GISEL-NEXT: ; return to shader part epilog
- %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 1000, i8 19)
+ %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 1000, i32 19)
%ret_cast = bitcast i16 %ret to half
ret half %ret_cast
}
@@ -181,7 +181,7 @@ define amdgpu_ps half @bitop3_b16_vii(i16 %a) {
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0x3e8
; GFX950-GISEL-NEXT: v_bitop3_b16 v0, v0, v1, v2 bitop3:0x14
; GFX950-GISEL-NEXT: ; return to shader part epilog
- %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 2000, i16 1000, i8 20)
+ %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 2000, i16 1000, i32 20)
%ret_cast = bitcast i16 %ret to half
ret half %ret_cast
}
@@ -203,7 +203,7 @@ define amdgpu_ps half @bitop3_b16_iii() {
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0x3e8
; GFX950-GISEL-NEXT: v_bitop3_b16 v0, v0, v1, v2 bitop3:0x15
; GFX950-GISEL-NEXT: ; return to shader part epilog
- %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 3000, i16 2000, i16 1000, i8 21)
+ %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 3000, i16 2000, i16 1000, i32 21)
%ret_cast = bitcast i16 %ret to half
ret half %ret_cast
}
>From 3a948435d3f7f59f4f2fa376c574d9388031822c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 4 Dec 2024 11:14:46 -0500
Subject: [PATCH 2/2] Fix clang builtins
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 ++--
clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 4758f6053ccb6d..14c1746716cdd6 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -616,8 +616,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32, "V6UiV32fUif", "nc
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16, "V6UiV32yUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16, "V6UiV32hUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32, "V6UiV32fUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
-TARGET_BUILTIN(__builtin_amdgcn_bitop3_b32, "iiiiIUc", "nc", "bitop3-insts")
-TARGET_BUILTIN(__builtin_amdgcn_bitop3_b16, "ssssIUc", "nc", "bitop3-insts")
+TARGET_BUILTIN(__builtin_amdgcn_bitop3_b32, "iiiiIUi", "nc", "bitop3-insts")
+TARGET_BUILTIN(__builtin_amdgcn_bitop3_b16, "ssssIUi", "nc", "bitop3-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf16_f32, "V2yV2yfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
index f1259ef678f1e0..8251d6c213e3d1 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
@@ -1673,7 +1673,7 @@ void test_cvt_scalef32_sr_fp8_f32(global unsigned *out, float src, uint seed, fl
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4
-// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.bitop3.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i8 1)
+// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.bitop3.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 1)
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
@@ -1696,7 +1696,7 @@ void test_bitop3_b32(global uint* out, uint a, uint b, uint c)
// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr addrspace(5) [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr addrspace(5) [[B_ADDR]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[C_ADDR]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.bitop3.i16(i16 [[TMP0]], i16 [[TMP1]], i16 [[TMP2]], i8 1)
+// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.bitop3.i16(i16 [[TMP0]], i16 [[TMP1]], i16 [[TMP2]], i32 1)
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[TMP4]], align 2
// CHECK-NEXT: ret void
More information about the cfe-commits mailing list