[llvm] 48a9cf9 - [AMDGPU] Enable SEXT divergence driven selection.
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 17 07:30:31 PDT 2020
Author: alex-t
Date: 2020-03-17T17:30:11+03:00
New Revision: 48a9cf90439aafaa40260d94897741ce12a730b0
URL: https://github.com/llvm/llvm-project/commit/48a9cf90439aafaa40260d94897741ce12a730b0
DIFF: https://github.com/llvm/llvm-project/commit/48a9cf90439aafaa40260d94897741ce12a730b0.diff
LOG: [AMDGPU] Enable SEXT divergence driven selection.
Summary: This change enables divergence-driven selection for the SEXT DAG opcode.
Reviewers: vpykhtin, rampitec
Reviewed By: vpykhtin
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits
Differential Revision: https://reviews.llvm.org/D76230
Added:
llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
Modified:
llvm/lib/Target/AMDGPU/SOPInstructions.td
llvm/lib/Target/AMDGPU/VOP3Instructions.td
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 054ce02b14d2..95233a0a0f34 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1256,7 +1256,7 @@ def : GCNPat <
// Same as a 32-bit inreg
def : GCNPat<
- (i32 (sext i16:$src)),
+ (i32 (UniformUnaryFrag<sext> i16:$src)),
(S_SEXT_I32_I16 $src)
>;
@@ -1283,7 +1283,7 @@ def : GCNPat<
>;
def : GCNPat <
- (i64 (sext i16:$src)),
+ (i64 (UniformUnaryFrag<sext> i16:$src)),
(REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0,
(i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (S_MOV_B32 (i32 31))), SGPR_32)), sub1)
>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index a33cd3b8014d..30345f82fae6 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -402,6 +402,20 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, as
} // End SubtargetPredicate = isGFX8Plus
} // End SchedRW = [Write64Bit]
+def : GCNPat<
+ (i64 (getDivergentFrag<sext>.ret i16:$src)),
+ (REG_SEQUENCE VReg_64,
+ (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10)))), sub0,
+ (i32 (COPY_TO_REGCLASS
+ (V_ASHRREV_I32_e32 (S_MOV_B32 (i32 0x1f)), (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
+ ), VGPR_32)), sub1)
+>;
+
+def : GCNPat<
+ (i32 (getDivergentFrag<sext>.ret i16:$src)),
+ (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
+>;
+
let SubtargetPredicate = isGFX6GFX7GFX10 in {
def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
} // End SubtargetPredicate = isGFX6GFX7GFX10
diff --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
new file mode 100755
index 000000000000..6ce316d79d4c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+
+define amdgpu_kernel void @sext_i16_to_i32_uniform(i32 addrspace(1)* %out, i16 %a, i32 %b) {
+; GCN-LABEL: sext_i16_to_i32_uniform:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_sext_i32_i16 s0, s0
+; GCN-NEXT: s_add_i32 s0, s1, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %sext = sext i16 %a to i32
+ %res = add i32 %b, %sext
+ store i32 %res, i32 addrspace(1)* %out
+ ret void
+}
+
+
+define amdgpu_kernel void @sext_i16_to_i64_uniform(i64 addrspace(1)* %out, i16 %a, i64 %b) {
+; GCN-LABEL: sext_i16_to_i64_uniform:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT: s_load_dword s2, s[0:1], 0xb
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
+; GCN-NEXT: s_add_u32 s0, s0, s2
+; GCN-NEXT: s_addc_u32 s1, s1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %sext = sext i16 %a to i64
+ %res = add i64 %b, %sext
+ store i64 %res, i64 addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @sext_i16_to_i32_divergent(i32 addrspace(1)* %out, i16 %a, i32 %b) {
+; GCN-LABEL: sext_i16_to_i32_divergent:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT: s_load_dword s0, s[0:1], 0xb
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.truncated = trunc i32 %tid to i16
+ %divergent.a = add i16 %a, %tid.truncated
+ %sext = sext i16 %divergent.a to i32
+ store i32 %sext, i32 addrspace(1)* %out
+ ret void
+}
+
+
+define amdgpu_kernel void @sext_i16_to_i64_divergent(i64 addrspace(1)* %out, i16 %a, i64 %b) {
+; GCN-LABEL: sext_i16_to_i64_divergent:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT: s_load_dword s0, s[0:1], 0xb
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.truncated = trunc i32 %tid to i16
+ %divergent.a = add i16 %a, %tid.truncated
+ %sext = sext i16 %divergent.a to i64
+ store i64 %sext, i64 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
More information about the llvm-commits mailing list