[llvm] [AMDGPU] Allow lane-op lowering for illegal types (PR #114887)
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 7 16:36:05 PST 2024
https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/114887
>From 72b7e7d847790f7243a03ac017d8ccd8b0edb122 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Mon, 4 Nov 2024 14:58:48 -0800
Subject: [PATCH 1/2] [AMDGPU] Allow lane-op lowering for illegal types
Currently, overloaded lane-op intrinsics only work for legal types. With
SelectionDAG they fail with 'Do not know how to promote this operator',
most notably for the i8 type. This patch fixes that by handling these
intrinsics in ReplaceNodeResults, so results with illegal types are
lowered via lowerLaneOp as well.
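For example, IR like the following, which previously hit the promotion
error with SelectionDAG because i8 is not a legal type on AMDGPU, now
compiles (a minimal sketch mirroring the new tests; the function name is
only for illustration):

define void @readlane_i8_example(ptr addrspace(1) %out, i8 %src, i32 %lane) {
  ; lane-op intrinsic with an illegal (i8) result type
  %v = call i8 @llvm.amdgcn.readlane.i8(i8 %src, i32 %lane)
  store i8 %v, ptr addrspace(1) %out
  ret void
}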
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 11 ++
.../CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll | 19 ++
.../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 158 +++++++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll | 27 +++
.../AMDGPU/llvm.amdgcn.readfirstlane.ll | 48 ++++++
.../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 48 ++++++
.../llvm.amdgcn.set.inactive.chain.arg.ll | 62 +++++++
.../AMDGPU/llvm.amdgcn.set.inactive.ll | 23 +++
.../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 162 ++++++++++++++++++
9 files changed, 558 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 419414e5bd993d..ab041d58f7f6fe 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6454,6 +6454,17 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(LoadVal);
return;
}
+ case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_writelane:
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16:
+ case Intrinsic::amdgcn_permlane64:
+ case Intrinsic::amdgcn_set_inactive:
+ case Intrinsic::amdgcn_set_inactive_chain_arg:
+ case Intrinsic::amdgcn_mov_dpp8:
+ Results.push_back(lowerLaneOp(*this, N, DAG));
+ return;
}
break;
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
index 049cc455ab01cb..1d6d2b315bccc8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
@@ -184,6 +184,25 @@ define amdgpu_ps void @dpp8_double(double %in, ptr addrspace(1) %out) {
ret void
}
+; GFX10PLUS-LABEL: {{^}}dpp8_i8:
+; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: global_store_{{byte|b8}} v[1:2], v0, off
+define amdgpu_ps void @dpp8_i8(i8 %in, ptr addrspace(1) %out) {
+ %tmp0 = call i8 @llvm.amdgcn.mov.dpp8.i8(i8 %in, i32 1)
+ store i8 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_i1:
+; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_and_b32_e32 v0, 1, v0
+; GFX10PLUS: global_store_{{byte|b8}} v[1:2], v0, off
+define amdgpu_ps void @dpp8_i1(i1 %in, ptr addrspace(1) %out) {
+ %tmp0 = call i1 @llvm.amdgcn.mov.dpp8.i1(i1 %in, i32 1)
+ store i1 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #0
attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index aa6069c67f62ee..b1cf33a530b538 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -8770,6 +8770,85 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s
ret void
}
+define void @v_permlane16_i8(ptr addrspace(1) %out, i8 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlane16_i8:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlane16_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-NEXT: global_store_b8 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %v = call i8 @llvm.amdgcn.permlane16.i8(i8 %src0, i8 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i8 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlane16_i1(ptr addrspace(1) %out, i1 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_i1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlane16_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlane16_i1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX12-NEXT: global_store_b8 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %v = call i1 @llvm.amdgcn.permlane16.i1(i1 %src0, i1 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i1 %v, ptr addrspace(1) %out
+ ret void
+}
+
define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlanex16_v2f32:
; GFX10-SDAG: ; %bb.0:
@@ -9258,3 +9337,82 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr
store <8 x i16> %v, ptr addrspace(1) %out
ret void
}
+
+define void @v_permlanex16_i8(ptr addrspace(1) %out, i8 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlanex16_i8:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlanex16_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-NEXT: global_store_b8 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %v = call i8 @llvm.amdgcn.permlanex16.i8(i8 %src0, i8 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i8 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlanex16_i1(ptr addrspace(1) %out, i1 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_i1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlanex16_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlanex16_i1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX12-NEXT: global_store_b8 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %v = call i1 @llvm.amdgcn.permlanex16.i1(i1 %src0, i1 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i1 %v, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
index 216731519731a0..449187cf385fa4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@@ -52,6 +52,33 @@ define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 {
store i32 %v, ptr addrspace(1) %out
ret void
}
+
+define void @test_i8(ptr addrspace(1) %out, i8 %src0) #1 {
+; GFX11-LABEL: test_i8:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_permlane64_b32 v2, v2
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %v = call i8 @llvm.amdgcn.permlane64.i8(i8 %src0)
+ store i8 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_i1(ptr addrspace(1) %out, i1 %src0) #1 {
+; GFX11-LABEL: test_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_permlane64_b32 v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %v = call i1 @llvm.amdgcn.permlane64.i1(i1 %src0)
+ store i1 %v, ptr addrspace(1) %out
+ ret void
+}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11-GISEL: {{.*}}
; GFX11-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 39a3b1c8adc9f1..bf66d9dbaf0565 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -700,3 +700,51 @@ define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) {
call void asm sideeffect "; use $0", "s"(<8 x i16> %x)
ret void
}
+
+define void @test_readfirstlane_i8(i8 %in, ptr addrspace(1) %out) {
+; CHECK-SDAG-LABEL: test_readfirstlane_i8:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-SDAG-NEXT: flat_store_byte v[1:2], v0
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_i8:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-GISEL-NEXT: flat_store_byte v[1:2], v0
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %tmp0 = call i8 @llvm.amdgcn.readfirstlane.i8(i8 %in)
+ store i8 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_readfirstlane_i1(i1 %in, ptr addrspace(1) %out) {
+; CHECK-SDAG-LABEL: test_readfirstlane_i1:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v0
+; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-SDAG-NEXT: flat_store_byte v[1:2], v0
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_i1:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; CHECK-GISEL-NEXT: s_and_b32 s4, s4, 1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-GISEL-NEXT: flat_store_byte v[1:2], v0
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %tmp0 = call i1 @llvm.amdgcn.readfirstlane.i1(i1 %in)
+ store i1 %tmp0, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 24a332fa211c15..b33929720ae1ba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -894,6 +894,54 @@ define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src
ret void
}
+define void @test_readlane_i8(i8 %in, ptr addrspace(1) %out) {
+; CHECK-SDAG-LABEL: test_readlane_i8:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v0, 1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-SDAG-NEXT: flat_store_byte v[1:2], v0
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_i8:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v0, 1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-GISEL-NEXT: flat_store_byte v[1:2], v0
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %tmp0 = call i8 @llvm.amdgcn.readlane.i8(i8 %in, i32 1)
+ store i8 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_readlane_i1(i1 %in, ptr addrspace(1) %out) {
+; CHECK-SDAG-LABEL: test_readlane_i1:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v0, 1
+; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-SDAG-NEXT: flat_store_byte v[1:2], v0
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readlane_i1:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v0, 1
+; CHECK-GISEL-NEXT: s_and_b32 s4, s4, 1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-GISEL-NEXT: flat_store_byte v[1:2], v0
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %tmp0 = call i1 @llvm.amdgcn.readlane.i1(i1 %in, i32 1)
+ store i1 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index fbf8c203dcb390..a9a03d3eefc2ac 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -861,6 +861,68 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
ret void
}
+define amdgpu_cs_chain void @set_inactive_chain_arg_i16(ptr addrspace(1) %out, i16 %inactive, i16 %active) {
+; GFX11-LABEL: set_inactive_chain_arg_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_or_saveexec_b32 s0, -1
+; GFX11-NEXT: v_mov_b32_e32 v0, v10
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_or_saveexec_b32 s0, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: global_store_b16 v[8:9], v1, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX10-LABEL: set_inactive_chain_arg_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_or_saveexec_b32 s0, -1
+; GFX10-NEXT: v_mov_b32_e32 v0, v10
+; GFX10-NEXT: s_mov_b32 exec_lo, s0
+; GFX10-NEXT: s_or_saveexec_b32 s0, -1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0
+; GFX10-NEXT: s_mov_b32 exec_lo, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: global_store_short v[8:9], v1, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11_W64-LABEL: set_inactive_chain_arg_i16:
+; GFX11_W64: ; %bb.0:
+; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10
+; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1]
+; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11_W64-NEXT: v_mov_b32_e32 v1, v0
+; GFX11_W64-NEXT: global_store_b16 v[8:9], v1, off
+; GFX11_W64-NEXT: s_endpgm
+;
+; GFX10_W64-LABEL: set_inactive_chain_arg_i16:
+; GFX10_W64: ; %bb.0:
+; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10
+; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1]
+; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX10_W64-NEXT: v_mov_b32_e32 v1, v0
+; GFX10_W64-NEXT: global_store_short v[8:9], v1, off
+; GFX10_W64-NEXT: s_endpgm
+ %tmp = call i16 @llvm.amdgcn.set.inactive.chain.arg.i16(i16 %active, i16 %inactive) #0
+ %wwm = call i16 @llvm.amdgcn.strict.wwm.i16(i16 %tmp)
+ store i16 %wwm, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64, i64) #0
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 6fb5a9ce47a843..b51011ec29699f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -504,6 +504,29 @@ define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(
ret void
}
+define void @set_inactive_i16(ptr addrspace(1) %out, i16 %in) {
+; GCN-LABEL: set_inactive_i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v3, 3, v2, s[4:5]
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v3
+; GCN-NEXT: flat_store_short v[0:1], v2
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %tmp.0 = call i16 @llvm.amdgcn.set.inactive.i16(i16 %in, i16 3) #0
+ %tmp = call i16 @llvm.amdgcn.strict.wwm.i16(i16 %tmp.0)
+ store i16 %tmp, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 837d484583d53f..0cd8e2c60589bf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -2698,6 +2698,168 @@ define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %sr
ret void
}
+define void @test_writelane_i8(ptr addrspace(1) %out, i8 %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_i8:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_ubyte v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: flat_store_byte v[0:1], v4
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_i8:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_ubyte v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: global_store_byte v[0:1], v4, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_i8:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_u8 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: global_store_b8 v[0:1], v4, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_i8:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_ubyte v4, v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT: flat_store_byte v[0:1], v4
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_i8:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_ubyte v4, v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT: global_store_byte v[0:1], v4, off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_i8:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_u8 v4, v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT: global_store_b8 v[0:1], v4, off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load i8, ptr addrspace(1) %out
+ %writelane = call i8 @llvm.amdgcn.writelane.i8(i8 %src, i32 %src1, i8 %oldval)
+ store i8 %writelane, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_writelane_i1(ptr addrspace(1) %out, i1 %src, i32 %src1) {
+; GFX802-SDAG-LABEL: test_writelane_i1:
+; GFX802-SDAG: ; %bb.0:
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-SDAG-NEXT: flat_load_ubyte v4, v[0:1]
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3
+; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-SDAG-NEXT: v_and_b32_e32 v2, 1, v4
+; GFX802-SDAG-NEXT: flat_store_byte v[0:1], v2
+; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-SDAG-LABEL: test_writelane_i1:
+; GFX1010-SDAG: ; %bb.0:
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-SDAG-NEXT: global_load_ubyte v4, v[0:1], off
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-SDAG-NEXT: v_and_b32_e32 v2, 1, v4
+; GFX1010-SDAG-NEXT: global_store_byte v[0:1], v2, off
+; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-LABEL: test_writelane_i1:
+; GFX1100-SDAG: ; %bb.0:
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_load_u8 v4, v[0:1], off
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-SDAG-NEXT: v_and_b32_e32 v2, 1, v4
+; GFX1100-SDAG-NEXT: global_store_b8 v[0:1], v2, off
+; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX802-GISEL-LABEL: test_writelane_i1:
+; GFX802-GISEL: ; %bb.0:
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX802-GISEL-NEXT: flat_load_ubyte v4, v[0:1]
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX802-GISEL-NEXT: s_mov_b32 m0, s5
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0
+; GFX802-GISEL-NEXT: v_and_b32_e32 v2, 1, v4
+; GFX802-GISEL-NEXT: flat_store_byte v[0:1], v2
+; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-GISEL-LABEL: test_writelane_i1:
+; GFX1010-GISEL: ; %bb.0:
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-GISEL-NEXT: global_load_ubyte v4, v[0:1], off
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5
+; GFX1010-GISEL-NEXT: v_and_b32_e32 v2, 1, v4
+; GFX1010-GISEL-NEXT: global_store_byte v[0:1], v2, off
+; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-LABEL: test_writelane_i1:
+; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_load_u8 v4, v[0:1], off
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1
+; GFX1100-GISEL-NEXT: v_and_b32_e32 v2, 1, v4
+; GFX1100-GISEL-NEXT: global_store_b8 v[0:1], v2, off
+; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %oldval = load i1, ptr addrspace(1) %out
+ %writelane = call i1 @llvm.amdgcn.writelane.i1(i1 %src, i32 %src1, i1 %oldval)
+ store i1 %writelane, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind readnone convergent }
>From 7c4fa7d8b8a6020e739b1acab2d993d2ed1cde2e Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Tue, 5 Nov 2024 14:20:27 -0800
Subject: [PATCH 2/2] Added amdgcn_update_dpp
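This follow-up extends the same ReplaceNodeResults handling to
llvm.amdgcn.update.dpp, so it also lowers for illegal result types, e.g.
(a minimal sketch mirroring the new test below; the function name is only
for illustration):

define void @update_dpp_i8_example(ptr addrspace(1) %out, i8 %in) {
  ; update.dpp with an illegal (i8) result type now goes through lowerLaneOp
  %v = call i8 @llvm.amdgcn.update.dpp.i8(i8 %in, i8 %in, i32 1, i32 1, i32 1, i1 false)
  store i8 %v, ptr addrspace(1) %out
  ret void
}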
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1 +
.../CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll | 18 ++++++++++++++++++
2 files changed, 19 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ab041d58f7f6fe..4c5bbc87521dff 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6463,6 +6463,7 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
case Intrinsic::amdgcn_mov_dpp8:
+ case Intrinsic::amdgcn_update_dpp:
Results.push_back(lowerLaneOp(*this, N, DAG));
return;
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index a7424831ae5dbf..a7a2141b4d5693 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -576,6 +576,24 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb8(ptr addrspace(1) %out, <2 x
ret void
}
+; GCN-LABEL: {{^}}dpp_i8:
+; GCN: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GCN: store_{{byte|b8}} v[0:1], v2
+define void @dpp_i8(ptr addrspace(1) %out, i8 %in) {
+ %tmp0 = call i8 @llvm.amdgcn.update.dpp.i8(i8 %in, i8 %in, i32 1, i32 1, i32 1, i1 false) #0
+ store i8 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_i1:
+; GCN: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GCN: store_{{byte|b8}} v[0:1], v2
+define void @dpp_i1(ptr addrspace(1) %out, i1 %in) {
+ %tmp0 = call i1 @llvm.amdgcn.update.dpp.i1(i1 %in, i1 %in, i32 1, i32 1, i32 1, i1 false) #0
+ store i1 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x()
declare void @llvm.amdgcn.s.barrier()
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0