[llvm] 0a3d755 - [AMDGPU] Enable divergence-driven BFE selection
alex-t via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 3 13:25:15 PDT 2021
Author: alex-t
Date: 2021-11-03T23:26:59+03:00
New Revision: 0a3d755ee9fcef1a84e0290e217d3eba68cdee22
URL: https://github.com/llvm/llvm-project/commit/0a3d755ee9fcef1a84e0290e217d3eba68cdee22
DIFF: https://github.com/llvm/llvm-project/commit/0a3d755ee9fcef1a84e0290e217d3eba68cdee22.diff
LOG: [AMDGPU] Enable divergence-driven BFE selection
Detailed description: This change enables bit-field-extract patterns to be
selected to s_bfe_u32 or v_bfe_u32 depending on the divergence of the
pattern's root node.
Reviewed By: rampitec
Differential Revision: https://reviews.llvm.org/D110950
Added:
llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 20defbc883c1..cee56ee97294 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -641,8 +641,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
uint32_t OffsetVal = Offset->getZExtValue();
uint32_t WidthVal = Width->getZExtValue();
- ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
- SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
+ ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
+ WidthVal));
return;
}
case AMDGPUISD::DIV_SCALE: {
@@ -1947,9 +1947,17 @@ bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
return true;
}
-SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
+SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
SDValue Val, uint32_t Offset,
uint32_t Width) {
+ if (Val->isDivergent()) {
+ unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
+ SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
+ SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
+
+ return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
+ }
+ unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
// Transformation function, pack the offset and width of a BFE into
// the format expected by the S_BFE_I32 / S_BFE_U32. In the second
// source, bits [5:0] contain the offset and bits [22:16] the width.
@@ -1974,10 +1982,8 @@ void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
if (0 < BVal && BVal <= CVal && CVal < 32) {
bool Signed = N->getOpcode() == ISD::SRA;
- unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
-
- ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal,
- 32 - CVal));
+ ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
+ 32 - CVal));
return;
}
}
@@ -2000,9 +2006,8 @@ void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
if (isMask_32(MaskVal)) {
uint32_t WidthVal = countPopulation(MaskVal);
-
- ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
- Srl.getOperand(0), ShiftVal, WidthVal));
+ ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
+ WidthVal));
return;
}
}
@@ -2022,9 +2027,8 @@ void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
if (isMask_32(MaskVal)) {
uint32_t WidthVal = countPopulation(MaskVal);
-
- ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
- And.getOperand(0), ShiftVal, WidthVal));
+ ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
+ WidthVal));
return;
}
}
@@ -2051,7 +2055,7 @@ void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
break;
unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
- ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
+ ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
Amt->getZExtValue(), Width));
return;
}
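A note on the two paths in getBFE32 above: when the root is divergent, the
VALU V_BFE_*_e64 forms take the offset and width as two separate operands,
while the uniform path falls through to S_BFE_I32 / S_BFE_U32, which pack
both into a single second source (offset in bits [5:0], width in bits
[22:16], per the retained comment). A minimal sketch of that packing;
packBFE is a hypothetical helper for illustration, not part of the patch:

  // Hypothetical helper mirroring what getBFE32 builds for the uniform
  // case: offset in bits [5:0], width in bits [22:16] of one immediate.
  #include <cstdint>
  #include <cstdio>

  static uint32_t packBFE(uint32_t Offset, uint32_t Width) {
    return (Width << 16) | (Offset & 0x3f);
  }

  int main() {
    // Offset 16, width 4 -> 0x40010, the s_bfe_u32 immediate checked in
    // the new divergence-driven-bfe-isel.ll test below.
    std::printf("0x%x\n", packBFE(16, 4));
    return 0;
  }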
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 6f4119165849..c1d9673f067e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -233,9 +233,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
void SelectMAD_64_32(SDNode *N);
void SelectFMA_W_CHAIN(SDNode *N);
void SelectFMUL_W_CHAIN(SDNode *N);
-
- SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
- uint32_t Offset, uint32_t Width);
+ SDNode *getBFE32(bool IsSigned, const SDLoc &DL, SDValue Val, uint32_t Offset,
+ uint32_t Width);
void SelectS_BFEFromShifts(SDNode *N);
void SelectS_BFE(SDNode *N);
bool isCBranchSCC(const SDNode *N) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 750d1981e763..d5f9cb8ba493 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1871,40 +1871,92 @@ def : GCNPat <
// Conversion Patterns
//===----------------------------------------------------------------------===//
-def : GCNPat<(i32 (sext_inreg i32:$src, i1)),
+class UniformSextInreg<ValueType VT> : PatFrag<
+ (ops node:$src),
+ (sext_inreg $src, VT),
+ [{ return !N->isDivergent(); }]>;
+
+def : GCNPat<(i32 (UniformSextInreg<i1> i32:$src)),
(S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
// Handle sext_inreg in i64
def : GCNPat <
- (i64 (sext_inreg i64:$src, i1)),
+ (i64 (UniformSextInreg<i1> i64:$src)),
(S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16
>;
def : GCNPat <
- (i16 (sext_inreg i16:$src, i1)),
+ (i16 (UniformSextInreg<i1> i16:$src)),
(S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16
>;
def : GCNPat <
- (i16 (sext_inreg i16:$src, i8)),
+ (i16 (UniformSextInreg<i8> i16:$src)),
(S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
>;
def : GCNPat <
- (i64 (sext_inreg i64:$src, i8)),
+ (i64 (UniformSextInreg<i8> i64:$src)),
(S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16
>;
def : GCNPat <
- (i64 (sext_inreg i64:$src, i16)),
+ (i64 (UniformSextInreg<i16> i64:$src)),
(S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16
>;
def : GCNPat <
- (i64 (sext_inreg i64:$src, i32)),
+ (i64 (UniformSextInreg<i32> i64:$src)),
(S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
>;
+
+class DivergentSextInreg<ValueType VT> : PatFrag<
+ (ops node:$src),
+ (sext_inreg $src, VT),
+ [{ return N->isDivergent(); }]>;
+
+def : GCNPat<(i32 (DivergentSextInreg<i1> i32:$src)),
+ (V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>;
+
+def : GCNPat <
+ (i16 (DivergentSextInreg<i1> i16:$src)),
+ (V_BFE_I32_e64 $src, (i32 0), (i32 1))
+>;
+
+def : GCNPat <
+ (i16 (DivergentSextInreg<i8> i16:$src)),
+ (V_BFE_I32_e64 $src, (i32 0), (i32 8))
+>;
+
+def : GCNPat <
+ (i64 (DivergentSextInreg<i1> i64:$src)),
+ (REG_SEQUENCE VReg_64,
+ (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1)), sub0,
+ (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1))), sub1)
+>;
+
+def : GCNPat <
+ (i64 (DivergentSextInreg<i8> i64:$src)),
+ (REG_SEQUENCE VReg_64,
+ (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)), sub0,
+ (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8))), sub1)
+>;
+
+def : GCNPat <
+ (i64 (DivergentSextInreg<i16> i64:$src)),
+ (REG_SEQUENCE VReg_64,
+ (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)), sub0,
+ (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16))), sub1)
+>;
+
+def : GCNPat <
+ (i64 (DivergentSextInreg<i32> i64:$src)),
+ (REG_SEQUENCE VReg_64,
+ (i32 (EXTRACT_SUBREG i64:$src, sub0)), sub0,
+ (V_ASHRREV_I32_e32 (i32 31), (i32 (EXTRACT_SUBREG i64:$src, sub0))), sub1)
+>;
+
def : GCNPat <
(i64 (zext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
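The divergent i64 DivergentSextInreg patterns above all share one shape:
run the 32-bit BFE on the low half of the source, then rebuild the high
half with an arithmetic shift right by 31 so it replicates the sign bit.
A sketch of the equivalent computation in plain C++, where bfe_i32 is a
stand-in for V_BFE_I32_e64 rather than a real API:

  #include <cstdint>

  // Stand-in for V_BFE_I32_e64: sign-extend the Width-bit field that
  // starts at bit Offset of Val.
  static int32_t bfe_i32(int32_t Val, unsigned Offset, unsigned Width) {
    return static_cast<int32_t>(static_cast<uint32_t>(Val)
                                << (32 - Offset - Width)) >> (32 - Width);
  }

  // Mirrors the (i64 (DivergentSextInreg<i8> i64:$src)) pattern: BFE on
  // the low 32 bits, then ashr 31 to materialize the high 32 bits.
  static int64_t sextInregI64I8(int64_t Src) {
    uint32_t Lo =
        static_cast<uint32_t>(bfe_i32(static_cast<int32_t>(Src), 0, 8));
    // V_ASHRREV_I32_e32 with shift amount 31.
    uint32_t Hi = static_cast<uint32_t>(static_cast<int32_t>(Lo) >> 31);
    return static_cast<int64_t>((static_cast<uint64_t>(Hi) << 32) | Lo);
  }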
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll
new file mode 100644
index 000000000000..ead58214bdf9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll
@@ -0,0 +1,25 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}bfe_uniform:
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40010
+define amdgpu_kernel void @bfe_uniform(i32 %val, i32 addrspace(1)* %out) {
+ %hibits = lshr i32 %val, 16
+ %masked = and i32 %hibits, 15
+ store i32 %masked, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_divergent:
+; GCN: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 4
+define amdgpu_kernel void @bfe_divergent(i32 %val, i32 addrspace(1)* %out) {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %divergent = add i32 %val, %tid
+ %hibits = lshr i32 %divergent, 16
+ %masked = and i32 %hibits, 15
+ store i32 %masked, i32 addrspace(1)* %out
+ ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index e68b93bed96f..0220940f2934 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -79,14 +79,14 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[6:7]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
-; SI-NEXT: s_movk_i32 s7, 0xfc01
+; SI-NEXT: s_movk_i32 s6, 0xfc01
; SI-NEXT: s_mov_b32 s0, -1
; SI-NEXT: s_mov_b32 s1, 0xfffff
-; SI-NEXT: s_brev_b32 s6, -2
+; SI-NEXT: s_brev_b32 s7, -2
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfe_u32 v4, v3, 20, 11
-; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v4
+; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4
; SI-NEXT: v_lshr_b64 v[4:5], s[0:1], v6
; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
; SI-NEXT: v_not_b32_e32 v4, v4
@@ -100,7 +100,7 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5]
-; SI-NEXT: v_bfi_b32 v2, s6, v8, v3
+; SI-NEXT: v_bfi_b32 v2, s7, v8, v3
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
index 9bafe57b786e..9b24fc80e6f4 100644
--- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
@@ -62,13 +62,13 @@ define <3 x i1> @test_srem_vec(<3 x i31> %X) nounwind {
; CHECK-NEXT: v_bfe_i32 v3, v2, 0, 31
; CHECK-NEXT: v_bfe_i32 v4, v1, 0, 31
; CHECK-NEXT: v_bfe_i32 v5, v0, 0, 31
-; CHECK-NEXT: s_mov_b32 s6, 0x38e38e39
-; CHECK-NEXT: s_mov_b32 s7, 0xc71c71c7
-; CHECK-NEXT: s_brev_b32 s4, -2
-; CHECK-NEXT: s_mov_b32 s5, 0x7ffffffd
-; CHECK-NEXT: v_mul_hi_i32 v5, v5, s6
-; CHECK-NEXT: v_mul_hi_i32 v4, v4, s6
-; CHECK-NEXT: v_mul_hi_i32 v3, v3, s7
+; CHECK-NEXT: s_mov_b32 s4, 0x38e38e39
+; CHECK-NEXT: s_mov_b32 s5, 0xc71c71c7
+; CHECK-NEXT: s_brev_b32 s6, -2
+; CHECK-NEXT: s_mov_b32 s7, 0x7ffffffd
+; CHECK-NEXT: v_mul_hi_i32 v5, v5, s4
+; CHECK-NEXT: v_mul_hi_i32 v4, v4, s4
+; CHECK-NEXT: v_mul_hi_i32 v3, v3, s5
; CHECK-NEXT: v_lshrrev_b32_e32 v6, 31, v5
; CHECK-NEXT: v_lshrrev_b32_e32 v5, 1, v5
; CHECK-NEXT: v_lshrrev_b32_e32 v7, 31, v4
@@ -84,12 +84,12 @@ define <3 x i1> @test_srem_vec(<3 x i31> %X) nounwind {
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
-; CHECK-NEXT: v_and_b32_e32 v2, s4, v2
-; CHECK-NEXT: v_and_b32_e32 v1, s4, v1
-; CHECK-NEXT: v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT: v_and_b32_e32 v2, s6, v2
+; CHECK-NEXT: v_and_b32_e32 v1, s6, v1
+; CHECK-NEXT: v_and_b32_e32 v0, s6, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 3, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
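With an AMDGPU-enabled build, the new test can be run on its own via
llvm-lit; the exact paths depend on the local build layout, e.g. from an
llvm-project checkout with a build/ directory:

  $ build/bin/llvm-lit -v llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll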