[llvm] Revert "AMDGPU: Don't avoid clamp of bit shift in BFE pattern (#115372)" (PR #116091)
Changpeng Fang via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 13 11:21:04 PST 2024
https://github.com/changpeng updated https://github.com/llvm/llvm-project/pull/116091
>From 60277700c59d5c1b321860113e5fd1b1827c156c Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Wed, 13 Nov 2024 10:21:34 -0800
Subject: [PATCH 1/3] Revert "AMDGPU: Use "countMaxActiveBits() <= 5" to define uint5Bits"
This reverts commit a2bacf8ab58af4c1a0247026ea131443d6066602.
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 2 +-
llvm/test/CodeGen/AMDGPU/extract-lowbits.ll | 25 ---------------------
2 files changed, 1 insertion(+), 26 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 25df5dabdc6aa1..2fdf69e068e5c1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3557,7 +3557,7 @@ def : AMDGPUPat <
>;
def uint5Bits : PatLeaf<(i32 VGPR_32:$width), [{
- return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxActiveBits() <= 5;
+ return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxTrailingOnes() <= 5;
}]>;
// x << (bitwidth - y) >> (bitwidth - y)
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index 0e5a68773a6ba8..3de8db2c6a448e 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -163,31 +163,6 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
ret i32 %masked
}
-define i32 @bzhi32_d0_even(i32 %val, i32 %numlowbits) nounwind {
-; SI-LABEL: bzhi32_d0_even:
-; SI: ; %bb.0:
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v1
-; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
-; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
-; SI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: bzhi32_d0_even:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v1
-; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
-; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
-; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
-; VI-NEXT: s_setpc_b64 s[30:31]
- %times2 = shl i32 %numlowbits, 1
- %numhighbits = sub i32 32, %times2
- %highbitscleared = shl i32 %val, %numhighbits
- %masked = lshr i32 %highbitscleared, %numhighbits
- ret i32 %masked
-}
-
define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
; SI-LABEL: bzhi32_d1_indexzext:
; SI: ; %bb.0:
>From d755816c6e51aaea31ff232d9479cf5038636cd5 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Wed, 13 Nov 2024 10:45:28 -0800
Subject: [PATCH 2/3] Revert "AMDGPU: Don't avoid clamp of bit shift in BFE pattern (#115372)"
This reverts commit bdf8e308b7ea430f619ca3aa1199a76eb6b4e2d4.
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 1 +
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 1 -
llvm/lib/Target/AMDGPU/SIInstructions.td | 17 ----------
llvm/test/CodeGen/AMDGPU/bfe-patterns.ll | 33 +++++++++----------
llvm/test/CodeGen/AMDGPU/extract-lowbits.ll | 24 +++++++++-----
5 files changed, 33 insertions(+), 43 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e3a330d45aaa57..21fffba14287ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -22,6 +22,7 @@
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 5ae0b179d7d0e6..11c4cdd560c2f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -17,7 +17,6 @@
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIModeRegisterDefaults.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2fdf69e068e5c1..5f4cca0645b0ef 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3556,23 +3556,6 @@ def : AMDGPUPat <
(V_BFE_U32_e64 $src, (i32 0), $width)
>;
-def uint5Bits : PatLeaf<(i32 VGPR_32:$width), [{
- return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxTrailingOnes() <= 5;
-}]>;
-
-// x << (bitwidth - y) >> (bitwidth - y)
-def : AMDGPUPat <
- (DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, uint5Bits:$width)),
- (sub 32, uint5Bits:$width)),
- (V_BFE_U32_e64 $src, (i32 0), $width)
->;
-
-def : AMDGPUPat <
- (DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, uint5Bits:$width)),
- (sub 32, uint5Bits:$width)),
- (V_BFE_I32_e64 $src, (i32 0), $width)
->;
-
// SHA-256 Ma patterns
// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
index 18d19673995115..ce54ad5c9a6a82 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -17,8 +17,9 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT: v_and_b32_e32 v3, 31, v3
-; SI-NEXT: v_bfe_u32 v2, v2, 0, v3
+; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
+; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -37,8 +38,9 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_and_b32_e32 v2, 31, v4
-; VI-NEXT: v_bfe_u32 v2, v3, 0, v2
+; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
+; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -47,8 +49,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
%src = load volatile i32, ptr addrspace(1) %in0.gep
%width = load volatile i32, ptr addrspace(1) %in0.gep
- %width5 = and i32 %width, 31
- %sub = sub i32 32, %width5
+ %sub = sub i32 32, %width
%shl = shl i32 %src, %sub
%bfe = lshr i32 %shl, %sub
store i32 %bfe, ptr addrspace(1) %out.gep
@@ -71,7 +72,6 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_and_b32_e32 v3, 31, v3
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, v3, v2
@@ -95,8 +95,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_and_b32_e32 v2, 31, v4
-; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v2
+; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -109,8 +108,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
%src = load volatile i32, ptr addrspace(1) %in0.gep
%width = load volatile i32, ptr addrspace(1) %in0.gep
- %width5 = and i32 %width, 31
- %sub = sub i32 32, %width5
+ %sub = sub i32 32, %width
%shl = shl i32 %src, %sub
%bfe = lshr i32 %shl, %sub
store i32 %bfe, ptr addrspace(1) %out.gep
@@ -221,8 +219,9 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT: v_and_b32_e32 v3, 31, v3
-; SI-NEXT: v_bfe_i32 v2, v2, 0, v3
+; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
+; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
+; SI-NEXT: v_ashrrev_i32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -241,8 +240,9 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_and_b32_e32 v2, 31, v4
-; VI-NEXT: v_bfe_i32 v2, v3, 0, v2
+; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
+; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; VI-NEXT: v_ashrrev_i32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -251,8 +251,7 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
%src = load volatile i32, ptr addrspace(1) %in0.gep
%width = load volatile i32, ptr addrspace(1) %in0.gep
- %width5 = and i32 %width, 31
- %sub = sub i32 32, %width5
+ %sub = sub i32 32, %width
%shl = shl i32 %src, %sub
%bfe = ashr i32 %shl, %sub
store i32 %bfe, ptr addrspace(1) %out.gep
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index 3de8db2c6a448e..3d9616f02d52d1 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -150,14 +150,22 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
; ---------------------------------------------------------------------------- ;
define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_d0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 31, v1
-; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %numlow5bits = and i32 %numlowbits, 31
- %numhighbits = sub i32 32, %numlow5bits
+; SI-LABEL: bzhi32_d0:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bzhi32_d0:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
+; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %numhighbits = sub i32 32, %numlowbits
%highbitscleared = shl i32 %val, %numhighbits
%masked = lshr i32 %highbitscleared, %numhighbits
ret i32 %masked
>From bd31c05b17226085aa6da8e790ffc4a9188c0dd7 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Wed, 13 Nov 2024 11:19:19 -0800
Subject: [PATCH 3/3] AMDGPU: Keep the test while we revert the patch
---
llvm/test/CodeGen/AMDGPU/extract-lowbits.ll | 25 +++++++++++++++++++++
1 file changed, 25 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index 3d9616f02d52d1..7f1f7133d69919 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -171,6 +171,31 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
ret i32 %masked
}
+define i32 @bzhi32_d0_5bits(i32 %val, i32 %numlowbits) nounwind {
+; SI-LABEL: bzhi32_d0_5bits:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 31, v1
+; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bzhi32_d0_5bits:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_and_b32_e32 v1, 31, v1
+; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
+; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %numlow5bits = and i32 %numlowbits, 31
+ %numhighbits = sub i32 32, %numlow5bits
+ %highbitscleared = shl i32 %val, %numhighbits
+ %masked = lshr i32 %highbitscleared, %numhighbits
+ ret i32 %masked
+}
+
define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
; SI-LABEL: bzhi32_d1_indexzext:
; SI: ; %bb.0:
More information about the llvm-commits mailing list