[llvm] [AMDGCN][SDAG] Expand (INSERT|EXTRACT)_VECTOR_ELT for dynamic indices even with -O0 (PR #170323)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 2 08:32:56 PST 2025
Juan Manuel Martinez Caamaño
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/170323 at github.com>
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Juan Manuel Martinez Caamaño (jmmartinez)
Before this patch, `insertelement`/`extractelement` with a dynamic index would
fail to select at `-O0` for vector types that do not map to a
`SI_INDIRECT_SRC/DST` pattern.
To fix this, the expansion is now performed at `-O0` as well: the code that
does it is hoisted out of the second `switch` statement in
`SITargetLowering::PerformDAGCombine`, which is skipped entirely at
`CodeGenOptLevel::None`, and into the first `switch`, which runs at every
optimization level.
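
For illustration, here is a minimal standalone sketch of the control-flow idea only, not the actual LLVM code; all names (`performDAGCombineSketch`, `expandDynamicIndex`, the toy enums) are hypothetical. The point it models is that combines in the first `switch` fire regardless of optimization level, while the second `switch` sits behind the `CodeGenOptLevel::None` early return:

```cpp
// Standalone sketch of the two-switch structure; simplified and hypothetical,
// see SITargetLowering::PerformDAGCombine in the diff below for the real code.
#include <cstdio>

enum class Opcode { ExtractVectorElt, InsertVectorElt, Add, Other };
enum class OptLevel { None, Default };

// Stand-in for the dynamic-index expansion (the n-way select chain).
static bool expandDynamicIndex(Opcode Op) {
  return Op == Opcode::ExtractVectorElt || Op == Opcode::InsertVectorElt;
}

static bool performDAGCombineSketch(Opcode Op, OptLevel OL) {
  // First switch: combines that must run even at -O0.
  // After the patch, the dynamic-index expansion lives here.
  switch (Op) {
  case Opcode::ExtractVectorElt:
  case Opcode::InsertVectorElt:
    if (expandDynamicIndex(Op))
      return true; // expanded into a chain of selects / BUILD_VECTOR of selects
    break;
  default:
    break;
  }

  // Mirrors the CodeGenOptLevel::None early return in the real code:
  // everything below is optimization-only.
  if (OL == OptLevel::None)
    return false;

  // Second switch: optimization-level combines. Before the patch, the
  // expansion was only reachable from here, so it never ran at -O0.
  switch (Op) {
  case Opcode::Add:
    return true;
  default:
    return false;
  }
}

int main() {
  // At -O0 the dynamic-index expansion still fires, which is the point
  // of hoisting it into the first switch.
  std::printf("extract at -O0 expanded: %d\n",
              performDAGCombineSketch(Opcode::ExtractVectorElt, OptLevel::None));
}
```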
---
Patch is 413.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/170323.diff
6 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+43-26)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+4-1)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll (+2148)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+583-1298)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll (+64-78)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll (+4286)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a4f376aceaa4b..80ea9fc0789d5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15336,22 +15336,6 @@ SITargetLowering::performExtractVectorEltCombine(SDNode *N,
}
}
- // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
- if (shouldExpandVectorDynExt(N)) {
- SDLoc SL(N);
- SDValue Idx = N->getOperand(1);
- SDValue V;
- for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
- SDValue IC = DAG.getVectorIdxConstant(I, SL);
- SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
- if (I == 0)
- V = Elt;
- else
- V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
- }
- return V;
- }
-
if (!DCI.isBeforeLegalize())
return SDValue();
@@ -15393,19 +15377,45 @@ SITargetLowering::performExtractVectorEltCombine(SDNode *N,
return SDValue();
}
-SDValue
-SITargetLowering::performInsertVectorEltCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
+// EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
+SDValue SITargetLowering::performExtractVectorDynEltCombine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ if (!shouldExpandVectorDynExt(N))
+ return SDValue();
+
SDValue Vec = N->getOperand(0);
- SDValue Idx = N->getOperand(2);
+ SelectionDAG &DAG = DCI.DAG;
+
EVT VecVT = Vec.getValueType();
- EVT EltVT = VecVT.getVectorElementType();
+ EVT ResVT = N->getValueType(0);
+
+ SDLoc SL(N);
+ SDValue Idx = N->getOperand(1);
+ SDValue V;
+ for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
+ SDValue IC = DAG.getVectorIdxConstant(I, SL);
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
+ if (I == 0)
+ V = Elt;
+ else
+ V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
+ }
+ return V;
+}
- // INSERT_VECTOR_ELT (<n x e>, var-idx)
- // => BUILD_VECTOR n x select (e, const-idx)
+// INSERT_VECTOR_ELT (<n x e>, var-idx)
+// => BUILD_VECTOR n x select (e, const-idx)
+SDValue
+SITargetLowering::performInsertVectorDynEltCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
if (!shouldExpandVectorDynExt(N))
return SDValue();
+ SDValue Vec = N->getOperand(0);
+ SDValue Idx = N->getOperand(2);
+ EVT VecVT = Vec.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+
SelectionDAG &DAG = DCI.DAG;
SDLoc SL(N);
SDValue Ins = N->getOperand(1);
@@ -16943,12 +16953,21 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
return Res;
break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ if (SDValue V = performExtractVectorDynEltCombine(N, DCI))
+ return V;
+ break;
+ case ISD::INSERT_VECTOR_ELT:
+ if (SDValue V = performInsertVectorDynEltCombine(N, DCI))
+ return V;
+ break;
default:
break;
}
- if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
+ if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
return SDValue();
+ }
switch (N->getOpcode()) {
case ISD::ADD:
@@ -17063,8 +17082,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
}
case ISD::EXTRACT_VECTOR_ELT:
return performExtractVectorEltCombine(N, DCI);
- case ISD::INSERT_VECTOR_ELT:
- return performInsertVectorEltCombine(N, DCI);
case ISD::FP_ROUND:
return performFPRoundCombine(N, DCI);
case ISD::LOAD: {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 40c03ca024c6c..55e883e4c78c6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -223,7 +223,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performExtractVectorDynEltCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const;
+ SDValue performInsertVectorDynEltCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const;
SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index c69b0cce3d208..a4037c817c359 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck --check-prefixes=GCN-O0 %s
define amdgpu_kernel void @float4_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-LABEL: float4_extelt:
@@ -20,6 +21,36 @@ define amdgpu_kernel void @float4_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: float4_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s3, 1
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s3, 1.0
+; GCN-O0-NEXT: s_mov_b32 s6, 0
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s6
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT: s_mov_b32 s3, 2
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s3, 2.0
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT: s_mov_b32 s3, 3
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s4, 4.0
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s4
+; GCN-O0-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[2:3]
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT: flat_store_dword v[0:1], v2
+; GCN-O0-NEXT: s_endpgm
entry:
%ext = extractelement <4 x float> <float 0.0, float 1.0, float 2.0, float 4.0>, i32 %sel
store float %ext, ptr addrspace(1) %out
@@ -43,6 +74,28 @@ define amdgpu_kernel void @int4_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: int4_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s1, 1
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_cmp_eq_u32 s0, s1
+; GCN-O0-NEXT: s_mov_b32 s4, 0
+; GCN-O0-NEXT: s_cselect_b32 s4, s1, s4
+; GCN-O0-NEXT: s_mov_b32 s1, 2
+; GCN-O0-NEXT: s_cmp_eq_u32 s0, s1
+; GCN-O0-NEXT: s_cselect_b32 s1, s1, s4
+; GCN-O0-NEXT: s_mov_b32 s4, 3
+; GCN-O0-NEXT: s_cmp_eq_u32 s0, s4
+; GCN-O0-NEXT: s_mov_b32 s0, 4
+; GCN-O0-NEXT: s_cselect_b32 s0, s0, s1
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_mov_b32_e32 v2, s0
+; GCN-O0-NEXT: flat_store_dword v[0:1], v2
+; GCN-O0-NEXT: s_endpgm
entry:
%ext = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 4>, i32 %sel
store i32 %ext, ptr addrspace(1) %out
@@ -72,6 +125,56 @@ define amdgpu_kernel void @double4_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: double4_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s5, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s2, 0x3f847ae1
+; GCN-O0-NEXT: s_mov_b32 s8, 0x47ae147b
+; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
+; GCN-O0-NEXT: s_mov_b32 s9, s2
+; GCN-O0-NEXT: s_mov_b32 s3, s9
+; GCN-O0-NEXT: s_mov_b32 s2, 0x3ff028f5
+; GCN-O0-NEXT: s_mov_b32 s6, 0xc28f5c29
+; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT: s_mov_b32 s7, s2
+; GCN-O0-NEXT: s_mov_b32 s2, s7
+; GCN-O0-NEXT: s_mov_b32 s4, 1
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_cmp_eq_u32 s5, s4
+; GCN-O0-NEXT: s_cselect_b32 s3, s2, s3
+; GCN-O0-NEXT: s_mov_b32 s4, s8
+; GCN-O0-NEXT: s_mov_b32 s2, s6
+; GCN-O0-NEXT: s_cselect_b32 s4, s2, s4
+; GCN-O0-NEXT: s_mov_b32 s2, 0x4000147a
+; GCN-O0-NEXT: s_mov_b32 s6, 0xe147ae14
+; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT: s_mov_b32 s7, s2
+; GCN-O0-NEXT: s_mov_b32 s2, s6
+; GCN-O0-NEXT: s_mov_b32 s8, 2
+; GCN-O0-NEXT: s_cmp_eq_u32 s5, s8
+; GCN-O0-NEXT: s_cselect_b32 s4, s2, s4
+; GCN-O0-NEXT: s_mov_b32 s2, s7
+; GCN-O0-NEXT: s_cselect_b32 s3, s2, s3
+; GCN-O0-NEXT: s_mov_b32 s2, 0x40100a3d
+; GCN-O0-NEXT: s_mov_b32 s6, 0x70a3d70a
+; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT: s_mov_b32 s7, s2
+; GCN-O0-NEXT: s_mov_b32 s2, s7
+; GCN-O0-NEXT: s_mov_b32 s8, 3
+; GCN-O0-NEXT: s_cmp_eq_u32 s5, s8
+; GCN-O0-NEXT: s_cselect_b32 s2, s2, s3
+; GCN-O0-NEXT: s_mov_b32 s3, s6
+; GCN-O0-NEXT: s_cselect_b32 s3, s3, s4
+; GCN-O0-NEXT: v_mov_b32_e32 v2, s3
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
+; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT: v_mov_b32_e32 v3, v0
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GCN-O0-NEXT: s_endpgm
entry:
%ext = extractelement <4 x double> <double 0.01, double 1.01, double 2.01, double 4.01>, i32 %sel
store double %ext, ptr addrspace(1) %out
@@ -109,6 +212,65 @@ define amdgpu_kernel void @double5_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: double5_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s5, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s4, 0x3f847ae1
+; GCN-O0-NEXT: s_mov_b32 s2, 0x47ae147b
+; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; GCN-O0-NEXT: s_mov_b32 s3, s4
+; GCN-O0-NEXT: s_mov_b32 s8, s2
+; GCN-O0-NEXT: s_mov_b32 s4, 0x3ff028f5
+; GCN-O0-NEXT: s_mov_b32 s6, 0xc28f5c29
+; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT: s_mov_b32 s7, s4
+; GCN-O0-NEXT: s_mov_b32 s4, s6
+; GCN-O0-NEXT: s_mov_b32 s9, 1
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_cmp_eq_u32 s5, s9
+; GCN-O0-NEXT: s_cselect_b32 s4, s4, s8
+; GCN-O0-NEXT: ; kill: def $sgpr3 killed $sgpr3 killed $sgpr2_sgpr3
+; GCN-O0-NEXT: s_mov_b32 s2, s7
+; GCN-O0-NEXT: s_cselect_b32 s3, s2, s3
+; GCN-O0-NEXT: s_mov_b32 s2, 0x4000147a
+; GCN-O0-NEXT: s_mov_b32 s6, 0xe147ae14
+; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT: s_mov_b32 s7, s2
+; GCN-O0-NEXT: s_mov_b32 s2, s7
+; GCN-O0-NEXT: s_mov_b32 s8, 2
+; GCN-O0-NEXT: s_cmp_eq_u32 s5, s8
+; GCN-O0-NEXT: s_cselect_b32 s3, s2, s3
+; GCN-O0-NEXT: s_mov_b32 s2, s6
+; GCN-O0-NEXT: s_cselect_b32 s4, s2, s4
+; GCN-O0-NEXT: s_mov_b32 s2, 0x40100a3d
+; GCN-O0-NEXT: s_mov_b32 s6, 0x70a3d70a
+; GCN-O0-NEXT: s_mov_b32 s8, s6
+; GCN-O0-NEXT: s_mov_b32 s9, s2
+; GCN-O0-NEXT: s_mov_b32 s2, s8
+; GCN-O0-NEXT: s_mov_b32 s7, 3
+; GCN-O0-NEXT: s_cmp_eq_u32 s5, s7
+; GCN-O0-NEXT: s_cselect_b32 s4, s2, s4
+; GCN-O0-NEXT: s_mov_b32 s2, s9
+; GCN-O0-NEXT: s_cselect_b32 s3, s2, s3
+; GCN-O0-NEXT: s_mov_b32 s2, 0x40140a3d
+; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT: s_mov_b32 s7, s2
+; GCN-O0-NEXT: s_mov_b32 s2, s7
+; GCN-O0-NEXT: s_mov_b32 s8, 4
+; GCN-O0-NEXT: s_cmp_eq_u32 s5, s8
+; GCN-O0-NEXT: s_cselect_b32 s2, s2, s3
+; GCN-O0-NEXT: s_mov_b32 s3, s6
+; GCN-O0-NEXT: s_cselect_b32 s3, s3, s4
+; GCN-O0-NEXT: v_mov_b32_e32 v2, s3
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
+; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT: v_mov_b32_e32 v3, v0
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GCN-O0-NEXT: s_endpgm
entry:
%ext = extractelement <5 x double> <double 0.01, double 1.01, double 2.01, double 4.01, double 5.01>, i32 %sel
store double %ext, ptr addrspace(1) %out
@@ -130,6 +292,25 @@ define amdgpu_kernel void @half4_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: half4_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s4, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s5, 0x44004200
+; GCN-O0-NEXT: s_mov_b32 s0, 0x40003c00
+; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; GCN-O0-NEXT: s_mov_b32 s1, s5
+; GCN-O0-NEXT: s_mov_b32 s5, 4
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_lshl_b32 s4, s4, s5
+; GCN-O0-NEXT: s_lshr_b64 s[0:1], s[0:1], s4
+; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_mov_b32_e32 v2, s0
+; GCN-O0-NEXT: flat_store_short v[0:1], v2
+; GCN-O0-NEXT: s_endpgm
entry:
%ext = extractelement <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, i32 %sel
store half %ext, ptr addrspace(1) %out
@@ -149,6 +330,24 @@ define amdgpu_kernel void @float2_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: float2_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s3, 1
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s4, 1.0
+; GCN-O0-NEXT: s_mov_b32 s5, 0
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s5
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s4
+; GCN-O0-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[2:3]
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT: flat_store_dword v[0:1], v2
+; GCN-O0-NEXT: s_endpgm
entry:
%ext = extractelement <2 x float> <float 0.0, float 1.0>, i32 %sel
store float %ext, ptr addrspace(1) %out
@@ -172,6 +371,36 @@ define amdgpu_kernel void @double2_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: double2_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s8, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s2, 0x3f847ae1
+; GCN-O0-NEXT: s_mov_b32 s4, 0x47ae147b
+; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GCN-O0-NEXT: s_mov_b32 s5, s2
+; GCN-O0-NEXT: s_mov_b32 s3, s5
+; GCN-O0-NEXT: s_mov_b32 s2, 0x3ff028f5
+; GCN-O0-NEXT: s_mov_b32 s6, 0xc28f5c29
+; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT: s_mov_b32 s7, s2
+; GCN-O0-NEXT: s_mov_b32 s2, s7
+; GCN-O0-NEXT: s_mov_b32 s9, 1
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_cmp_eq_u32 s8, s9
+; GCN-O0-NEXT: s_cselect_b32 s2, s2, s3
+; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GCN-O0-NEXT: s_mov_b32 s3, s6
+; GCN-O0-NEXT: s_cselect_b32 s3, s3, s4
+; GCN-O0-NEXT: v_mov_b32_e32 v2, s3
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
+; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT: v_mov_b32_e32 v3, v0
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GCN-O0-NEXT: s_endpgm
entry:
%ext = extractelement <2 x double> <double 0.01, double 1.01>, i32 %sel
store double %ext, ptr addrspace(1) %out
@@ -217,6 +446,60 @@ define amdgpu_kernel void @half8_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: half8_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s3, 1
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s3, 0x4000
+; GCN-O0-NEXT: s_mov_b32 s6, 0x3c00
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s6
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT: s_mov_b32 s3, 2
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s3, 0x4200
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT: s_mov_b32 s3, 3
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s3, 0x4400
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT: s_mov_b32 s3, 4
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s3, 0x4500
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT: s_mov_b32 s3, 5
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s3, 0x4600
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT: s_mov_b32 s3, 6
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s3, 0x4700
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT: s_mov_b32 s3, 7
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s4, 0x4800
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s4
+; GCN-O0-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[2:3]
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT: flat_store_short v[0:1], v2
+; GCN-O0-NEXT: s_endpgm
entry:
%ext = extractelement <8 x half> <half 1.0, half 2.0, half 3.0, half 4.0, half 5.0, half 6.0, half 7.0, half 8.0>, i32 %sel
store half %ext, ptr addrspace(1) %out
@@ -248,6 +531,39 @@ define amdgpu_kernel void @short8_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: short8_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s1, 1
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_cmp_eq_u32 s0, s1
+; GCN-O0-NEXT: s_mov_b32 s4, 2
+; GCN-O0-NEXT: s_cselect_b32 s1, s4, s1
+; GCN-O0-NEXT: s_cmp_eq_u32 s0, s4
+; GCN-O0-NEX...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/170323
More information about the llvm-commits mailing list