[llvm] d159b44 - [AMDGPU] Enable divergence predicates for negative inline constant subtraction
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 10 04:03:30 PST 2022
Author: alex-t
Date: 2022-03-10T15:03:22+03:00
New Revision: d159b4444c880b8f42c4d507a49107535657be94
URL: https://github.com/llvm/llvm-project/commit/d159b4444c880b8f42c4d507a49107535657be94
DIFF: https://github.com/llvm/llvm-project/commit/d159b4444c880b8f42c4d507a49107535657be94.diff
LOG: [AMDGPU] Enable divergence predicates for negative inline constant subtraction
We have a pattern that undoes the sub x, c -> add x, -c canonicalization, since c is more likely
to be an inline immediate than -c. This patch enables the pattern to select a scalar or vector subtraction based on the divergence of the input node.
Reviewed By: foad
Differential Revision: https://reviews.llvm.org/D121360
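To illustrate the motivation (a minimal example, not part of the patch): AMDGPU integer inline immediates span -16..64, so the constant 32 in "sub i32 %x, 32" can be encoded inline, but after the generic canonicalization to "add i32 %x, -32" the constant -32 requires a literal. The patterns changed below match the canonicalized add and rewrite it back into a subtraction, now choosing the SALU or VALU opcode by divergence:

  %r = add i32 %x, -32   ; canonicalized form of: sub i32 %x, 32
  ; if %x is uniform   -> selected as S_SUB_I32 %x, 32
  ; if %x is divergent -> selected as V_SUB_U32_e64 (or V_SUB_CO_U32_e64) %x, 32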
Added:
llvm/test/CodeGen/AMDGPU/divergence-driven-negsubinlineconst.ll
Modified:
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/test/CodeGen/AMDGPU/shift-i128.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 26acaa738c863..511e905d1a471 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2764,18 +2764,18 @@ def : GCNPat <
// an inline immediate than -c.
// TODO: Also do for 64-bit.
def : GCNPat<
- (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+ (UniformBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
(S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1)
>;
def : GCNPat<
- (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+ (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
(V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
let SubtargetPredicate = HasAddNoCarryInsts;
}
def : GCNPat<
- (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+ (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
(V_SUB_CO_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
let SubtargetPredicate = NotHasAddNoCarryInsts;
}
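For context, UniformBinFrag and DivergentBinFrag are PatFrags that gate a pattern on the divergence bit of the SelectionDAG node. A minimal sketch of how such fragments can be written (the real definitions live in llvm/lib/Target/AMDGPU/AMDGPUInstructions.td and carry additional GlobalISel hooks):

  // Sketch: match the binary operation Op only when the node is uniform.
  class UniformBinFrag<SDPatternOperator Op> : PatFrag<
    (ops node:$src0, node:$src1),
    (Op $src0, $src1),
    [{ return !N->isDivergent(); }]>;

  // Sketch: the divergent counterpart, steering selection to the VALU form.
  class DivergentBinFrag<SDPatternOperator Op> : PatFrag<
    (ops node:$src0, node:$src1),
    (Op $src0, $src1),
    [{ return N->isDivergent(); }]>;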
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-negsubinlineconst.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-negsubinlineconst.ll
new file mode 100644
index 0000000000000..9ba2810b5c967
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-negsubinlineconst.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
+
+; GCN-LABEL: {{^}}uniform_add_SIC:
+; GCN: S_SUB_I32 killed %{{[0-9]+}}, 32
+define amdgpu_kernel void @uniform_add_SIC(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %a = load i32, i32 addrspace(1)* %in
+ %result = add i32 %a, -32
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}divergent_add_SIC:
+; SI: V_SUB_CO_U32_e64 killed %{{[0-9]+}}, 32
+; GFX900: V_SUB_U32_e64 killed %{{[0-9]+}}, 32
+define amdgpu_kernel void @divergent_add_SIC(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
+ %a = load volatile i32, i32 addrspace(1)* %gep
+ %result = add i32 %a, -32
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index b6457168d4eb2..78ad2625a673a 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -306,43 +306,43 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v8
; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v16
; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v8
-; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
-; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
+; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
-; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8
+; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8
; GCN-NEXT: v_or_b32_e32 v19, v19, v17
; GCN-NEXT: v_or_b32_e32 v18, v18, v16
; GCN-NEXT: v_or_b32_e32 v10, v8, v10
; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v9
-; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc
+; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
+; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[4:5]
; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12
-; GCN-NEXT: v_cndmask_b32_e64 v11, v17, v19, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc
; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v9
; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v12
-; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5]
; GCN-NEXT: v_or_b32_e32 v16, v16, v9
-; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
-; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
-; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12
+; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[12:13]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
+; GCN-NEXT: v_subrev_i32_e64 v9, s[8:9], 64, v12
; GCN-NEXT: v_or_b32_e32 v11, v17, v10
; GCN-NEXT: v_lshl_b64 v[9:10], v[4:5], v9
; GCN-NEXT: v_or_b32_e32 v15, v13, v15
; GCN-NEXT: v_or_b32_e32 v14, v12, v14
-; GCN-NEXT: s_and_b64 vcc, s[8:9], s[6:7]
+; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
-; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v16, s[4:5]
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v12
; GCN-NEXT: v_cndmask_b32_e64 v6, v9, v6, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v9, v10, v11, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = shl <2 x i128> %lhs, %rhs
ret <2 x i128> %shl
@@ -355,43 +355,43 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v8
; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16
; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8
-; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
-; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
+; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
-; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8
+; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8
; GCN-NEXT: v_or_b32_e32 v19, v19, v17
; GCN-NEXT: v_or_b32_e32 v18, v18, v16
; GCN-NEXT: v_or_b32_e32 v10, v8, v10
; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v9
-; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
+; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc
+; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
+; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5]
; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12
-; GCN-NEXT: v_cndmask_b32_e64 v11, v17, v19, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc
; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9
; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12
-; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v11, v1, s[4:5]
; GCN-NEXT: v_or_b32_e32 v16, v16, v9
-; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
-; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
-; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12
+; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[12:13]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
+; GCN-NEXT: v_subrev_i32_e64 v9, s[8:9], 64, v12
; GCN-NEXT: v_or_b32_e32 v11, v17, v10
; GCN-NEXT: v_lshr_b64 v[9:10], v[6:7], v9
; GCN-NEXT: v_or_b32_e32 v15, v13, v15
; GCN-NEXT: v_or_b32_e32 v14, v12, v14
-; GCN-NEXT: s_and_b64 vcc, s[8:9], s[6:7]
+; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
-; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v16, s[4:5]
; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v8
; GCN-NEXT: v_lshr_b64 v[6:7], v[6:7], v12
; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v9, v10, v11, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v7, 0, v7, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = lshr <2 x i128> %lhs, %rhs
ret <2 x i128> %shl
@@ -404,45 +404,45 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v8
; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16
; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8
-; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
-; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
+; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
-; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8
+; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8
; GCN-NEXT: v_or_b32_e32 v19, v19, v17
; GCN-NEXT: v_or_b32_e32 v18, v18, v16
; GCN-NEXT: v_or_b32_e32 v10, v8, v10
; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v9
-; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
+; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc
+; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
+; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5]
; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12
-; GCN-NEXT: v_cndmask_b32_e64 v11, v17, v19, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc
; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9
; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12
-; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v11, v1, s[4:5]
; GCN-NEXT: v_or_b32_e32 v16, v16, v9
-; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
-; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
-; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12
+; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[12:13]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
+; GCN-NEXT: v_subrev_i32_e64 v9, s[8:9], 64, v12
; GCN-NEXT: v_or_b32_e32 v11, v17, v10
; GCN-NEXT: v_ashr_i64 v[9:10], v[6:7], v9
; GCN-NEXT: v_or_b32_e32 v15, v13, v15
; GCN-NEXT: v_or_b32_e32 v14, v12, v14
-; GCN-NEXT: s_and_b64 vcc, s[8:9], s[6:7]
+; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
-; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v16, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v9, v10, v11, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7]
; GCN-NEXT: v_ashr_i64 v[8:9], v[2:3], v8
; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3
-; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v8, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
; GCN-NEXT: v_ashr_i64 v[8:9], v[6:7], v12
; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7
-; GCN-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v6, v7, v8, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = ashr <2 x i128> %lhs, %rhs
ret <2 x i128> %shl