[llvm] d88a146 - [AMDGPU] Missed sign/zero extend patterns for divergence-driven instruction selection
via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 10 08:33:05 PST 2022
Author: alex-t
Date: 2022-02-10T19:36:12+03:00
New Revision: d88a146f2bc1422997f98ac9c8030af32e744433
URL: https://github.com/llvm/llvm-project/commit/d88a146f2bc1422997f98ac9c8030af32e744433
DIFF: https://github.com/llvm/llvm-project/commit/d88a146f2bc1422997f98ac9c8030af32e744433.diff
LOG: [AMDGPU] Missed sign/zero extend patterns for divergence-driven instruction selection
This change adds the TableGen sign/zero-extension patterns that were missed by https://reviews.llvm.org/D110950 and https://reviews.llvm.org/D76230; a short sketch of the intended selection follows below.
Reviewed By: foad
Differential Revision: https://reviews.llvm.org/D119302
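In short, the sext_inreg/sext/zext patterns are split into uniform and divergent variants via PatFrags predicated on N->isDivergent(): uniform nodes keep selecting SALU instructions (S_SEXT_I32_I8, S_SEXT_I32_I16, S_AND_B32, S_ASHR_I32), while divergent nodes now select their VALU counterparts (V_BFE_I32_e64, V_AND_B32_e64, V_ASHRREV_I32_e64). A minimal IR sketch of the intended selection (function and value names are illustrative only; the tests added below exercise these cases in full):

; Uniform case: all operands are kernel arguments (SGPR values), so the
; sext_inreg node is uniform and should select s_sext_i32_i8.
define amdgpu_kernel void @uniform_case(i32 addrspace(1)* %out, i32 %a) {
  %shl = shl i32 %a, 24
  %sext = ashr i32 %shl, 24                ; folds to sext_inreg %a, i8
  store i32 %sext, i32 addrspace(1)* %out
  ret void
}

; Divergent case: mixing in the workitem id makes the node divergent,
; so it should select v_bfe_i32 instead.
define amdgpu_kernel void @divergent_case(i32 addrspace(1)* %out, i32 %a) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %d = add i32 %a, %tid
  %shl = shl i32 %d, 24
  %sext = ashr i32 %shl, 24                ; folds to sext_inreg %d, i8
  store i32 %sext, i32 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()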
Added:
llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll
llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
Modified:
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/lib/Target/AMDGPU/SOPInstructions.td
llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 095ca4c303bad..952eecb07459b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -14,6 +14,16 @@ class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateContro
}
+class UniformSextInreg<ValueType VT> : PatFrag<
+ (ops node:$src),
+ (sext_inreg $src, VT),
+ [{ return !N->isDivergent(); }]>;
+
+class DivergentSextInreg<ValueType VT> : PatFrag<
+ (ops node:$src),
+ (sext_inreg $src, VT),
+ [{ return N->isDivergent(); }]>;
+
include "SOPInstructions.td"
include "VOPInstructions.td"
include "SMInstructions.td"
@@ -1939,12 +1949,6 @@ def : GCNPat <
//===----------------------------------------------------------------------===//
// Conversion Patterns
//===----------------------------------------------------------------------===//
-
-class UniformSextInreg<ValueType VT> : PatFrag<
- (ops node:$src),
- (sext_inreg $src, VT),
- [{ return !N->isDivergent(); }]>;
-
def : GCNPat<(i32 (UniformSextInreg<i1> i32:$src)),
(S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
@@ -1979,13 +1983,8 @@ def : GCNPat <
(S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
>;
-
-class DivergentSextInreg<ValueType VT> : PatFrag<
- (ops node:$src),
- (sext_inreg $src, VT),
- [{ return N->isDivergent(); }]>;
-
-def : GCNPat<(i32 (DivergentSextInreg<i1> i32:$src)),
+def : GCNPat<
+ (i32 (DivergentSextInreg<i1> i32:$src)),
(V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>;
def : GCNPat <
@@ -1998,6 +1997,16 @@ def : GCNPat <
(V_BFE_I32_e64 $src, (i32 0), (i32 8))
>;
+def : GCNPat<
+ (i32 (DivergentSextInreg<i8> i32:$src)),
+ (V_BFE_I32_e64 i32:$src, (i32 0), (i32 8))
+>;
+
+def : GCNPat <
+ (i32 (DivergentSextInreg<i16> i32:$src)),
+ (V_BFE_I32_e64 $src, (i32 0), (i32 16))
+>;
+
def : GCNPat <
(i64 (DivergentSextInreg<i1> i64:$src)),
(REG_SEQUENCE VReg_64,
@@ -2051,11 +2060,17 @@ def : ZExt_i64_i1_Pat<anyext>;
// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
// REG_SEQUENCE patterns don't support instructions with multiple outputs.
def : GCNPat <
- (i64 (sext i32:$src)),
+ (i64 (UniformUnaryFrag<sext> i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0,
(i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
>;
+def : GCNPat <
+ (i64 (DivergentUnaryFrag<sext> i32:$src)),
+ (REG_SEQUENCE VReg_64, $src, sub0,
+ (i32 (COPY_TO_REGCLASS (V_ASHRREV_I32_e64 (i32 31), $src), VGPR_32)), sub1)
+>;
+
def : GCNPat <
(i64 (sext i1:$src)),
(REG_SEQUENCE VReg_64,
@@ -2232,6 +2247,18 @@ def : GCNPat <
// certainty what the source behavior is without more context on how
// the src is lowered. e.g. fptrunc + fma may be lowered to a
// v_fma_mix* instruction which does not zero, or may not.
+def : GCNPat<
+ (i32 (DivergentUnaryFrag<zext> i16:$src)),
+ (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src)
+>;
+
+def : GCNPat<
+ (i64 (DivergentUnaryFrag<zext> i16:$src)),
+ (REG_SEQUENCE VReg_64,
+ (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src), sub0,
+ (S_MOV_B32 (i32 0)), sub1)
+>;
+
def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
(COPY VSrc_b16:$src)>;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 3f7837f7dbf11..25c33c2cc5d71 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -276,10 +276,10 @@ def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32",
>;
def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">;
def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8",
- [(set i32:$sdst, (sext_inreg i32:$src0, i8))]
+ [(set i32:$sdst, (UniformSextInreg<i8> i32:$src0))]
>;
def S_SEXT_I32_I16 : SOP1_32 <"s_sext_i32_i16",
- [(set i32:$sdst, (sext_inreg i32:$src0, i16))]
+ [(set i32:$sdst, (UniformSextInreg<i16> i32:$src0))]
>;
} // End isReMaterializable = 1
@@ -1408,7 +1408,7 @@ def : GCNPat <
// REG_SEQUENCE patterns don't support instructions with multiple
// outputs.
def : GCNPat<
- (i64 (zext i16:$src)),
+ (i64 (UniformUnaryFrag<zext> i16:$src)),
(REG_SEQUENCE SReg_64,
(i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (S_MOV_B32 (i32 0xffff))), SGPR_32)), sub0,
(S_MOV_B32 (i32 0)), sub1)
@@ -1421,7 +1421,7 @@ def : GCNPat <
>;
def : GCNPat<
- (i32 (zext i16:$src)),
+ (i32 (UniformUnaryFrag<zext> i16:$src)),
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
>;
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll
new file mode 100644
index 0000000000000..d76d3b542a00c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn < %s | FileCheck -enable-var-scope --check-prefixes=GCN %s
+
+define amdgpu_kernel void @uniform_sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+; GCN-LABEL: uniform_sext_in_reg_i8_to_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_i32 s2, s4, s5
+; GCN-NEXT: s_sext_i32_i8 s4, s2
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %c = add i32 %a, %b ; add to prevent folding into extload
+ %shl = shl i32 %c, 24
+ %ashr = ashr i32 %shl, 24
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @divergent_sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+; GCN-LABEL: divergent_sext_in_reg_i8_to_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_i32 s4, s4, s5
+; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0
+; GCN-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %c = add i32 %a, %b ; add to prevent folding into extload
+ %c.divergent = add i32 %c, %tid
+ %shl = shl i32 %c.divergent, 24
+ %ashr = ashr i32 %shl, 24
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @uniform_sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+; GCN-LABEL: uniform_sext_in_reg_i16_to_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_i32 s2, s4, s5
+; GCN-NEXT: s_sext_i32_i16 s4, s2
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %c = add i32 %a, %b ; add to prevent folding into extload
+ %shl = shl i32 %c, 16
+ %ashr = ashr i32 %shl, 16
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @divergent_sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+; GCN-LABEL: divergent_sext_in_reg_i16_to_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_i32 s4, s4, s5
+; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0
+; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %c = add i32 %a, %b ; add to prevent folding into extload
+ %c.divergent = add i32 %c, %tid
+ %shl = shl i32 %c.divergent, 16
+ %ashr = ashr i32 %shl, 16
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index 4f84302ced52f..b7b2fe5d02492 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -41,17 +41,15 @@ define i1 @divergent_trunc_i16_to_i1(i1 addrspace(1)* %out, i16 %x, i1 %z) {
; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY1]], implicit $exec
; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec
- ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY2]]
- ; GCN-NEXT: [[S_SEXT_I32_I16_:%[0-9]+]]:sreg_32 = S_SEXT_I32_I16 [[COPY3]]
+ ; GCN-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[COPY2]], 0, 16, implicit $exec
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_]]
- ; GCN-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[S_SEXT_I32_I16_]], [[COPY4]], implicit $exec
+ ; GCN-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[V_BFE_I32_e64_]], killed [[S_MOV_B32_]], implicit $exec
; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
- ; GCN-NEXT: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; GCN-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
- ; GCN-NEXT: [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY5]]
- ; GCN-NEXT: S_SETPC_B64_return [[COPY6]], implicit $vgpr0
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+ ; GCN-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
%setcc = icmp slt i16 %x, 0
%select = select i1 %setcc, i1 true, i1 %z
ret i1 %select
diff --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
index a11b4033f8ff0..14e3645655cb7 100755
--- a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
@@ -85,6 +85,47 @@ define amdgpu_kernel void @sext_i16_to_i64_divergent(i64 addrspace(1)* %out, i16
ret void
}
+define amdgpu_kernel void @sext_i32_to_i64_uniform(i64 addrspace(1)* %out, i32 %a, i64 %b) {
+; GCN-LABEL: sext_i32_to_i64_uniform:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s6, s[0:1], 0xb
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_ashr_i32 s7, s6, 31
+; GCN-NEXT: s_add_u32 s4, s4, s6
+; GCN-NEXT: s_addc_u32 s5, s5, s7
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %sext = sext i32 %a to i64
+ %res = add i64 %b, %sext
+ store i64 %res, i64 addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @sext_i32_to_i64_divergent(i64 addrspace(1)* %out, i32 %a, i64 %b) {
+; GCN-LABEL: sext_i32_to_i64_divergent:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0xb
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0
+; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %divergent.a = add i32 %a, %tid
+ %sext = sext i32 %divergent.a to i64
+ store i64 %sext, i64 addrspace(1)* %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
new file mode 100644
index 0000000000000..2cbe36c196482
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+
+define amdgpu_kernel void @zext_i16_to_i32_uniform(i32 addrspace(1)* %out, i16 %a, i32 %b) {
+; GCN-LABEL: zext_i16_to_i32_uniform:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s4, s4, 0xffff
+; GCN-NEXT: s_add_i32 s4, s5, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %zext = zext i16 %a to i32
+ %res = add i32 %b, %zext
+ store i32 %res, i32 addrspace(1)* %out
+ ret void
+}
+
+
+define amdgpu_kernel void @zext_i16_to_i64_uniform(i64 addrspace(1)* %out, i16 %a, i64 %b) {
+; GCN-LABEL: zext_i16_to_i64_uniform:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s6, s[0:1], 0xb
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s6, s6, 0xffff
+; GCN-NEXT: s_add_u32 s4, s4, s6
+; GCN-NEXT: s_addc_u32 s5, s5, 0
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %zext = zext i16 %a to i64
+ %res = add i64 %b, %zext
+ store i64 %res, i64 addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @zext_i16_to_i32_divergent(i32 addrspace(1)* %out, i16 %a, i32 %b) {
+; GCN-LABEL: zext_i16_to_i32_divergent:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0xb
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.truncated = trunc i32 %tid to i16
+ %divergent.a = add i16 %a, %tid.truncated
+ %zext = zext i16 %divergent.a to i32
+ store i32 %zext, i32 addrspace(1)* %out
+ ret void
+}
+
+
+define amdgpu_kernel void @zext_i16_to_i64_divergent(i64 addrspace(1)* %out, i16 %a, i64 %b) {
+; GCN-LABEL: zext_i16_to_i64_divergent:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0xb
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.truncated = trunc i32 %tid to i16
+ %divergent.a = add i16 %a, %tid.truncated
+ %zext = zext i16 %divergent.a to i64
+ store i64 %zext, i64 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }