[llvm] AMDGPU: Disable pattern matching from x&(-1>>(32-y) to "bfe x, 0, y" (PR #116115)
Changpeng Fang via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 13 16:11:52 PST 2024
https://github.com/changpeng updated https://github.com/llvm/llvm-project/pull/116115
>From 24a19f1aea91c06e7bbe575420dbe04bebd6825b Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Wed, 13 Nov 2024 14:27:34 -0800
Subject: [PATCH 1/2] AMDGPU: Disable pattern matching x&(-1>>(32-y) to "bfe x,
0, y"
It is not correct to lower x&(-1>>(32-y)) to "bfe x, 0, y". When y
equals 32, "-1" is not shifted, so x&(-1>>(32-32)) is still x, but
"bfe x, 0, 32" is 0.
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 6 ----
llvm/test/CodeGen/AMDGPU/extract-lowbits.ll | 40 +++++++++++++++------
2 files changed, 30 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 5f4cca0645b0ef..2831d0339df6b9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3550,12 +3550,6 @@ def : AMDGPUPat <
(V_BFE_U32_e64 $src, (i32 0), $width)
>;
-// x & (-1 >> (bitwidth - y))
-def : AMDGPUPat <
- (DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, i32:$width))),
- (V_BFE_U32_e64 $src, (i32 0), $width)
->;
-
// SHA-256 Ma patterns
// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index 7f1f7133d69919..cc2ef0c48f1152 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -99,11 +99,21 @@ define i32 @bzhi32_b4_commutative(i32 %val, i32 %numlowbits) nounwind {
; ---------------------------------------------------------------------------- ;
define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_c0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: bzhi32_c0:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
+; SI-NEXT: v_lshr_b32_e32 v1, -1, v1
+; SI-NEXT: v_and_b32_e32 v0, v1, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bzhi32_c0:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
+; VI-NEXT: v_lshrrev_b32_e64 v1, v1, -1
+; VI-NEXT: v_and_b32_e32 v0, v1, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
%numhighbits = sub i32 32, %numlowbits
%mask = lshr i32 -1, %numhighbits
%masked = and i32 %mask, %val
@@ -134,11 +144,21 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
}
define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_c4_commutative:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: bzhi32_c4_commutative:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
+; SI-NEXT: v_lshr_b32_e32 v1, -1, v1
+; SI-NEXT: v_and_b32_e32 v0, v0, v1
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bzhi32_c4_commutative:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
+; VI-NEXT: v_lshrrev_b32_e64 v1, v1, -1
+; VI-NEXT: v_and_b32_e32 v0, v0, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
%numhighbits = sub i32 32, %numlowbits
%mask = lshr i32 -1, %numhighbits
%masked = and i32 %val, %mask ; swapped order
>From fa21d88a45b77debc1a236d5856c7c0d72d9c3b5 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Wed, 13 Nov 2024 16:08:00 -0800
Subject: [PATCH 2/2] AMDGPU: Allow clamp for pattern x&(-1>>(32-y) to "bfe x,
0, y"
It is fine to match the pattern if y is known to have at most five
active bits (i.e. y < 32).
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 1 -
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 1 +
llvm/lib/Target/AMDGPU/SIInstructions.td | 10 ++++++++++
llvm/test/CodeGen/AMDGPU/extract-lowbits.ll | 14 ++++++++++++++
4 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 21fffba14287ef..e3a330d45aaa57 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -22,7 +22,6 @@
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 11c4cdd560c2f3..5ae0b179d7d0e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -17,6 +17,7 @@
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIModeRegisterDefaults.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2831d0339df6b9..3f211e7cbdde50 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3550,6 +3550,16 @@ def : AMDGPUPat <
(V_BFE_U32_e64 $src, (i32 0), $width)
>;
+def uint5Bits : PatLeaf<(i32 VGPR_32:$width), [{
+ return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxActiveBits() <= 5;
+}]>;
+
+// x & (-1 >> (bitwidth - y))
+def : AMDGPUPat <
+ (DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, uint5Bits:$width))),
+ (V_BFE_U32_e64 $src, (i32 0), $width)
+>;
+
// SHA-256 Ma patterns
// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index cc2ef0c48f1152..5e637ba071d977 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -120,6 +120,20 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
ret i32 %masked
}
+define i32 @bzhi32_c0_clamp(i32 %val, i32 %numlowbits) nounwind {
+; GCN-LABEL: bzhi32_c0_clamp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 31, v1
+; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %low5bits = and i32 %numlowbits, 31
+ %numhighbits = sub i32 32, %low5bits
+ %mask = lshr i32 -1, %numhighbits
+ %masked = and i32 %mask, %val
+ ret i32 %masked
+}
+
define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
; SI-LABEL: bzhi32_c1_indexzext:
; SI: ; %bb.0:
More information about the llvm-commits
mailing list