[llvm] [AMDGPU] Select v_bfe_u32 for i8/i16 (and (srl x, c), mask) (PR #182446)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 2 11:51:28 PST 2026
================
@@ -52,33 +52,118 @@ define i8 @bfe_i8(i8 %a) {
ret i8 %and
}
-; Negative: <2 x i1> element extractions should combine into a single AND
-; mask (v_and 3), not produce individual BFE instructions.
-define void @no_bfe_v2i1(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GFX9-LABEL: no_bfe_v2i1:
+; Negative: multiple uses of the shifted value should not match the one-use
+; narrow BFE pattern.
+define i16 @no_bfe_i16_multi_use(i16 %a) {
+; GFX9-LABEL: no_bfe_i16_multi_use:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b16_e32 v0, 4, v0
+; GFX9-NEXT: v_and_b32_e32 v1, 15, v0
; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
-; GFX9-NEXT: global_store_byte v[2:3], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-TRUE16-LABEL: no_bfe_v2i1:
+; GFX12-TRUE16-LABEL: no_bfe_i16_multi_use:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b16 v0.l, 4, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b16 v0.h, v0.l, 15
; GFX12-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 3
-; GFX12-TRUE16-NEXT: global_store_b8 v[2:3], v0, off
+; GFX12-TRUE16-NEXT: v_xor_b16 v0.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
- %a = load <2 x i1>, ptr addrspace(1) %in
- %freeze = freeze <2 x i1> %a
- store <2 x i1> %freeze, ptr addrspace(1) %out
+ %shr = lshr i16 %a, 4
+ %and0 = and i16 %shr, 15
+ %and1 = and i16 %shr, 3
+ %xor = xor i16 %and0, %and1
+ ret i16 %xor
+}
+
+; Pure uniform (SGPR) case.
+define amdgpu_kernel void @bfe_i16_uniform(i16 %a, ptr addrspace(1) %out) {
----------------
arsenm wrote:
Should use amdgpu_ps and return in integer
https://github.com/llvm/llvm-project/pull/182446
More information about the llvm-commits
mailing list