[llvm] [AMDGPU] Use "v_bfi_b32 x, y, -1" to implement (y | ~x) (PR #156653)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 3 05:22:55 PDT 2025
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/156653
None
>From f0e40f42e05cf0ca1727fe603e4eee2d260a0866 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 3 Sep 2025 13:19:55 +0100
Subject: [PATCH] [AMDGPU] Use "v_bfi_b32 x, y, -1" to implement (y | ~x)
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 16 +++++++
llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll | 42 +++++++------------
llvm/test/CodeGen/AMDGPU/andorn2.ll | 6 +--
llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll | 9 ++--
llvm/test/CodeGen/AMDGPU/bitop3.ll | 6 +--
.../CodeGen/AMDGPU/dag-divergence-atomic.ll | 3 +-
.../insert_waitcnt_for_precise_memory.ll | 22 ++++------
.../CodeGen/AMDGPU/private-memory-atomics.ll | 3 +-
8 files changed, 47 insertions(+), 60 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9cc9af7575db6..dd9f20019bb4d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2496,6 +2496,22 @@ def : AMDGPUPatIgnoreCopies <
(i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
>;
+// (y | ~x) ==> v_bfi_b32 x, y, -1, since BFI computes (x & y) | (~x & z).
+def : AMDGPUPatIgnoreCopies <
+ (DivergentBinFrag<or> i32:$y, (not_oneuse i32:$x)),
+ (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, (i32 -1))
+>;
+
+// 64-bit (y | ~x): one V_BFI_B32 per 32-bit half (sub0, sub1), joined by REG_SEQUENCE.
+def : AMDGPUPatIgnoreCopies <
+ (DivergentBinFrag<or> i64:$y, (not_oneuse i64:$x)),
+ (REG_SEQUENCE VReg_64,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), (i32 -1)), sub0,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), (i32 -1)), sub1)
+>;
+
// SHA-256 Ch function
// z ^ (x & (y ^ z))
def : AMDGPUPatIgnoreCopies <
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
index afabc7b62386f..917b50f14bfc4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -99,15 +99,13 @@ define i32 @v_orn2_i32(i32 %src0, i32 %src1) {
; GCN-LABEL: v_orn2_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_not_b32_e32 v1, v1
-; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_bfi_b32 v0, v1, v0, -1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_orn2_i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v1, v0, -1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%not.src1 = xor i32 %src1, -1
%or = or i32 %src0, %not.src1
@@ -117,14 +115,12 @@ define i32 @v_orn2_i32(i32 %src0, i32 %src1) {
define amdgpu_ps float @v_orn2_i32_sv(i32 inreg %src0, i32 %src1) {
; GCN-LABEL: v_orn2_i32_sv:
; GCN: ; %bb.0:
-; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: v_or_b32_e32 v0, s2, v0
+; GCN-NEXT: v_bfi_b32 v0, v0, s2, -1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_orn2_i32_sv:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, s2, -1
; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i32 %src1, -1
%or = or i32 %src0, %not.src1
@@ -135,14 +131,12 @@ define amdgpu_ps float @v_orn2_i32_sv(i32 inreg %src0, i32 %src1) {
define amdgpu_ps float @v_orn2_i32_vs(i32 %src0, i32 inreg %src1) {
; GCN-LABEL: v_orn2_i32_vs:
; GCN: ; %bb.0:
-; GCN-NEXT: s_not_b32 s0, s2
-; GCN-NEXT: v_or_b32_e32 v0, s0, v0
+; GCN-NEXT: v_bfi_b32 v0, s2, v0, -1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_orn2_i32_vs:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_not_b32 s0, s2
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX10PLUS-NEXT: v_bfi_b32 v0, s2, v0, -1
; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i32 %src1, -1
%or = or i32 %src0, %not.src1
@@ -247,19 +241,15 @@ define i64 @v_orn2_i64(i64 %src0, i64 %src1) {
; GCN-LABEL: v_orn2_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_not_b32_e32 v2, v2
-; GCN-NEXT: v_not_b32_e32 v3, v3
-; GCN-NEXT: v_or_b32_e32 v0, v0, v2
-; GCN-NEXT: v_or_b32_e32 v1, v1, v3
+; GCN-NEXT: v_bfi_b32 v0, v2, v0, -1
+; GCN-NEXT: v_bfi_b32 v1, v3, v1, -1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_orn2_i64:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_not_b32_e32 v2, v2
-; GFX10PLUS-NEXT: v_not_b32_e32 v3, v3
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX10PLUS-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v2, v0, -1
+; GFX10PLUS-NEXT: v_bfi_b32 v1, v3, v1, -1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%not.src1 = xor i64 %src1, -1
%or = or i64 %src0, %not.src1
@@ -269,18 +259,14 @@ define i64 @v_orn2_i64(i64 %src0, i64 %src1) {
define amdgpu_ps <2 x float> @v_orn2_i64_sv(i64 inreg %src0, i64 %src1) {
; GCN-LABEL: v_orn2_i64_sv:
; GCN: ; %bb.0:
-; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: v_not_b32_e32 v1, v1
-; GCN-NEXT: v_or_b32_e32 v0, s2, v0
-; GCN-NEXT: v_or_b32_e32 v1, s3, v1
+; GCN-NEXT: v_bfi_b32 v0, v0, s2, -1
+; GCN-NEXT: v_bfi_b32 v1, v1, s3, -1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_orn2_i64_sv:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0
-; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX10PLUS-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, s2, -1
+; GFX10PLUS-NEXT: v_bfi_b32 v1, v1, s3, -1
; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i64 %src1, -1
%or = or i64 %src0, %not.src1
diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll
index d0e32fc205144..1527d50e28b35 100644
--- a/llvm/test/CodeGen/AMDGPU/andorn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorn2.ll
@@ -72,8 +72,7 @@ entry:
}
; GCN-LABEL: {{^}}vector_orn2_i32_s_v_one_use
-; GCN: v_not_b32
-; GCN: v_or_b32
+; GCN: v_bfi_b32
define amdgpu_kernel void @vector_orn2_i32_s_v_one_use(
ptr addrspace(1) %r0, i32 %s) {
entry:
@@ -85,8 +84,7 @@ entry:
}
; GCN-LABEL: {{^}}vector_orn2_i32_v_s_one_use
-; GCN: s_not_b32
-; GCN: v_or_b32
+; GCN: v_bfi_b32
define amdgpu_kernel void @vector_orn2_i32_v_s_one_use(
ptr addrspace(1) %r0, i32 %s) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
index 2cd50b3b1b2a2..8816c5998a3fb 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
@@ -11,8 +11,7 @@ define i32 @atomic_nand_i32_lds(ptr addrspace(3) %ptr) nounwind {
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, v1
-; GCN-NEXT: v_not_b32_e32 v1, v2
-; GCN-NEXT: v_or_b32_e32 v1, -5, v1
+; GCN-NEXT: v_bfi_b32 v1, v2, -5, -1
; GCN-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
@@ -37,8 +36,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v3, v2
-; GCN-NEXT: v_not_b32_e32 v2, v3
-; GCN-NEXT: v_or_b32_e32 v2, -5, v2
+; GCN-NEXT: v_bfi_b32 v2, v3, -5, -1
; GCN-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1_vol
@@ -64,8 +62,7 @@ define i32 @atomic_nand_i32_flat(ptr %ptr) nounwind {
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v3, v2
-; GCN-NEXT: v_not_b32_e32 v2, v3
-; GCN-NEXT: v_or_b32_e32 v2, -5, v2
+; GCN-NEXT: v_bfi_b32 v2, v3, -5, -1
; GCN-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1_vol
diff --git a/llvm/test/CodeGen/AMDGPU/bitop3.ll b/llvm/test/CodeGen/AMDGPU/bitop3.ll
index 52d4780005aad..478460595b5b7 100644
--- a/llvm/test/CodeGen/AMDGPU/bitop3.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitop3.ll
@@ -183,9 +183,8 @@ define amdgpu_ps float @test_63(i32 %a, i32 %b) {
;
; GFX950-GISEL-LABEL: test_63:
; GFX950-GISEL: ; %bb.0:
-; GFX950-GISEL-NEXT: v_not_b32_e32 v0, v0
; GFX950-GISEL-NEXT: v_not_b32_e32 v1, v1
-; GFX950-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT: v_bfi_b32 v0, v0, v1, -1
; GFX950-GISEL-NEXT: ; return to shader part epilog
;
; GFX1250-SDAG-LABEL: test_63:
@@ -195,10 +194,9 @@ define amdgpu_ps float @test_63(i32 %a, i32 %b) {
;
; GFX1250-GISEL-LABEL: test_63:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_not_b32_e32 v0, v0
; GFX1250-GISEL-NEXT: v_not_b32_e32 v1, v1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX1250-GISEL-NEXT: v_bfi_b32 v0, v0, v1, -1
; GFX1250-GISEL-NEXT: ; return to shader part epilog
%nota = xor i32 %a, -1
%notb = xor i32 %b, -1
diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
index 12de3750640db..9c03c850c8242 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
@@ -128,8 +128,7 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1)
; CHECK-NEXT: .LBB5_1: ; %atomicrmw.start
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_mov_b32_e32 v3, v0
-; CHECK-NEXT: v_not_b32_e32 v0, v3
-; CHECK-NEXT: v_or_b32_e32 v2, -2, v0
+; CHECK-NEXT: v_bfi_b32 v2, v3, -2, -1
; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index a3b0a7768ca67..24512f2f7905a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -146,8 +146,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v3, v2
-; GFX9-NEXT: v_not_b32_e32 v2, v3
-; GFX9-NEXT: v_or_b32_e32 v2, -5, v2
+; GFX9-NEXT: v_bfi_b32 v2, v3, -5, -1
; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -169,8 +168,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: v_not_b32_e32 v2, v3
-; GFX90A-NEXT: v_or_b32_e32 v2, -5, v2
+; GFX90A-NEXT: v_bfi_b32 v2, v3, -5, -1
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -194,8 +192,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_not_b32_e32 v2, v3
-; GFX10-NEXT: v_or_b32_e32 v2, -5, v2
+; GFX10-NEXT: v_bfi_b32 v2, v3, -5, -1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -219,8 +216,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
; GFX9-FLATSCR-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX9-FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v2
-; GFX9-FLATSCR-NEXT: v_not_b32_e32 v2, v3
-; GFX9-FLATSCR-NEXT: v_or_b32_e32 v2, -5, v2
+; GFX9-FLATSCR-NEXT: v_bfi_b32 v2, v3, -5, -1
; GFX9-FLATSCR-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: buffer_wbinvl1_vol
@@ -242,9 +238,8 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v2, v3
-; GFX11-NEXT: v_or_b32_e32 v2, -5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v2, v3, -5, -1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -273,9 +268,8 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v2, v3
-; GFX12-NEXT: v_or_b32_e32 v2, -5, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_bfi_b32 v2, v3, -5, -1
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
index 24a4d8fbde200..c572185e7bbf6 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -282,8 +282,7 @@ define i32 @atomicrmw_nand_private_i32(ptr addrspace(5) %ptr) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_not_b32_e32 v2, v1
-; GCN-NEXT: v_or_b32_e32 v2, -5, v2
+; GCN-NEXT: v_bfi_b32 v2, v1, -5, -1
; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
More information about the llvm-commits mailing list