[llvm] [AMDGCN][SDAG] Expand (INSERT|EXTRACT)_VECTOR_ELT for dynamic indices even with -O0 (PR #170323)

via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 2 08:32:56 PST 2025


Juan Manuel Martinez Caamaño,
Juan Manuel Martinez Caamaño
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/170323 at github.com>


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Juan Manuel Martinez Caamaño (jmmartinez)

<details>
<summary>Changes</summary>

Before this patch, `insertelement/extractelement` with dynamic indices would
fail to select with `-O0` for vector types that do not map to a
`SI_INDIRECT_SRC/DST` pattern.

To solve this issue, we allow this expansion with `-O0` by hoisting the
code that does the expansion outside of the second `switch` statement in
`SITargetLowering::PerformDAGCombine` and into the first `switch`.

---

Patch is 413.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/170323.diff


6 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+43-26) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+4-1) 
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll (+2148) 
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+583-1298) 
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll (+64-78) 
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll (+4286) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a4f376aceaa4b..80ea9fc0789d5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15336,22 +15336,6 @@ SITargetLowering::performExtractVectorEltCombine(SDNode *N,
     }
   }
 
-  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
-  if (shouldExpandVectorDynExt(N)) {
-    SDLoc SL(N);
-    SDValue Idx = N->getOperand(1);
-    SDValue V;
-    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
-      SDValue IC = DAG.getVectorIdxConstant(I, SL);
-      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
-      if (I == 0)
-        V = Elt;
-      else
-        V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
-    }
-    return V;
-  }
-
   if (!DCI.isBeforeLegalize())
     return SDValue();
 
@@ -15393,19 +15377,45 @@ SITargetLowering::performExtractVectorEltCombine(SDNode *N,
   return SDValue();
 }
 
-SDValue
-SITargetLowering::performInsertVectorEltCombine(SDNode *N,
-                                                DAGCombinerInfo &DCI) const {
+// EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
+SDValue SITargetLowering::performExtractVectorDynEltCombine(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  if (!shouldExpandVectorDynExt(N))
+    return SDValue();
+
   SDValue Vec = N->getOperand(0);
-  SDValue Idx = N->getOperand(2);
+  SelectionDAG &DAG = DCI.DAG;
+
   EVT VecVT = Vec.getValueType();
-  EVT EltVT = VecVT.getVectorElementType();
+  EVT ResVT = N->getValueType(0);
+
+  SDLoc SL(N);
+  SDValue Idx = N->getOperand(1);
+  SDValue V;
+  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
+    SDValue IC = DAG.getVectorIdxConstant(I, SL);
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
+    if (I == 0)
+      V = Elt;
+    else
+      V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
+  }
+  return V;
+}
 
-  // INSERT_VECTOR_ELT (<n x e>, var-idx)
-  // => BUILD_VECTOR n x select (e, const-idx)
+// INSERT_VECTOR_ELT (<n x e>, var-idx)
+// => BUILD_VECTOR n x select (e, const-idx)
+SDValue
+SITargetLowering::performInsertVectorDynEltCombine(SDNode *N,
+                                                   DAGCombinerInfo &DCI) const {
   if (!shouldExpandVectorDynExt(N))
     return SDValue();
 
+  SDValue Vec = N->getOperand(0);
+  SDValue Idx = N->getOperand(2);
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+
   SelectionDAG &DAG = DCI.DAG;
   SDLoc SL(N);
   SDValue Ins = N->getOperand(1);
@@ -16943,12 +16953,21 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
       return Res;
     break;
+  case ISD::EXTRACT_VECTOR_ELT:
+    if (SDValue V = performExtractVectorDynEltCombine(N, DCI))
+      return V;
+    break;
+  case ISD::INSERT_VECTOR_ELT:
+    if (SDValue V = performInsertVectorDynEltCombine(N, DCI))
+      return V;
+    break;
   default:
     break;
   }
 
-  if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
+  if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
     return SDValue();
+  }
 
   switch (N->getOpcode()) {
   case ISD::ADD:
@@ -17063,8 +17082,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   }
   case ISD::EXTRACT_VECTOR_ELT:
     return performExtractVectorEltCombine(N, DCI);
-  case ISD::INSERT_VECTOR_ELT:
-    return performInsertVectorEltCombine(N, DCI);
   case ISD::FP_ROUND:
     return performFPRoundCombine(N, DCI);
   case ISD::LOAD: {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 40c03ca024c6c..55e883e4c78c6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -223,7 +223,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
-  SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performExtractVectorDynEltCombine(SDNode *N,
+                                            DAGCombinerInfo &DCI) const;
+  SDValue performInsertVectorDynEltCombine(SDNode *N,
+                                           DAGCombinerInfo &DCI) const;
   SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index c69b0cce3d208..a4037c817c359 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck --check-prefixes=GCN-O0 %s
 
 define amdgpu_kernel void @float4_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-LABEL: float4_extelt:
@@ -20,6 +21,36 @@ define amdgpu_kernel void @float4_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GCN-O0-LABEL: float4_extelt:
+; GCN-O0:       ; %bb.0: ; %entry
+; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GCN-O0-NEXT:    s_mov_b32 s3, 1
+; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT:    s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT:    s_mov_b32 s3, 1.0
+; GCN-O0-NEXT:    s_mov_b32 s6, 0
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT:    s_mov_b32 s3, 2
+; GCN-O0-NEXT:    s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT:    s_mov_b32 s3, 2.0
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT:    s_mov_b32 s3, 3
+; GCN-O0-NEXT:    s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GCN-O0-NEXT:    s_mov_b32 s4, 4.0
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-O0-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[2:3]
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT:    flat_store_dword v[0:1], v2
+; GCN-O0-NEXT:    s_endpgm
 entry:
   %ext = extractelement <4 x float> <float 0.0, float 1.0, float 2.0, float 4.0>, i32 %sel
   store float %ext, ptr addrspace(1) %out
@@ -43,6 +74,28 @@ define amdgpu_kernel void @int4_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GCN-O0-LABEL: int4_extelt:
+; GCN-O0:       ; %bb.0: ; %entry
+; GCN-O0-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN-O0-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GCN-O0-NEXT:    s_mov_b32 s1, 1
+; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT:    s_cmp_eq_u32 s0, s1
+; GCN-O0-NEXT:    s_mov_b32 s4, 0
+; GCN-O0-NEXT:    s_cselect_b32 s4, s1, s4
+; GCN-O0-NEXT:    s_mov_b32 s1, 2
+; GCN-O0-NEXT:    s_cmp_eq_u32 s0, s1
+; GCN-O0-NEXT:    s_cselect_b32 s1, s1, s4
+; GCN-O0-NEXT:    s_mov_b32 s4, 3
+; GCN-O0-NEXT:    s_cmp_eq_u32 s0, s4
+; GCN-O0-NEXT:    s_mov_b32 s0, 4
+; GCN-O0-NEXT:    s_cselect_b32 s0, s0, s1
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-O0-NEXT:    flat_store_dword v[0:1], v2
+; GCN-O0-NEXT:    s_endpgm
 entry:
   %ext = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 4>, i32 %sel
   store i32 %ext, ptr addrspace(1) %out
@@ -72,6 +125,56 @@ define amdgpu_kernel void @double4_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
+;
+; GCN-O0-LABEL: double4_extelt:
+; GCN-O0:       ; %bb.0: ; %entry
+; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT:    s_load_dword s5, s[4:5], 0x2c
+; GCN-O0-NEXT:    s_mov_b32 s2, 0x3f847ae1
+; GCN-O0-NEXT:    s_mov_b32 s8, 0x47ae147b
+; GCN-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
+; GCN-O0-NEXT:    s_mov_b32 s9, s2
+; GCN-O0-NEXT:    s_mov_b32 s3, s9
+; GCN-O0-NEXT:    s_mov_b32 s2, 0x3ff028f5
+; GCN-O0-NEXT:    s_mov_b32 s6, 0xc28f5c29
+; GCN-O0-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT:    s_mov_b32 s7, s2
+; GCN-O0-NEXT:    s_mov_b32 s2, s7
+; GCN-O0-NEXT:    s_mov_b32 s4, 1
+; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT:    s_cmp_eq_u32 s5, s4
+; GCN-O0-NEXT:    s_cselect_b32 s3, s2, s3
+; GCN-O0-NEXT:    s_mov_b32 s4, s8
+; GCN-O0-NEXT:    s_mov_b32 s2, s6
+; GCN-O0-NEXT:    s_cselect_b32 s4, s2, s4
+; GCN-O0-NEXT:    s_mov_b32 s2, 0x4000147a
+; GCN-O0-NEXT:    s_mov_b32 s6, 0xe147ae14
+; GCN-O0-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT:    s_mov_b32 s7, s2
+; GCN-O0-NEXT:    s_mov_b32 s2, s6
+; GCN-O0-NEXT:    s_mov_b32 s8, 2
+; GCN-O0-NEXT:    s_cmp_eq_u32 s5, s8
+; GCN-O0-NEXT:    s_cselect_b32 s4, s2, s4
+; GCN-O0-NEXT:    s_mov_b32 s2, s7
+; GCN-O0-NEXT:    s_cselect_b32 s3, s2, s3
+; GCN-O0-NEXT:    s_mov_b32 s2, 0x40100a3d
+; GCN-O0-NEXT:    s_mov_b32 s6, 0x70a3d70a
+; GCN-O0-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT:    s_mov_b32 s7, s2
+; GCN-O0-NEXT:    s_mov_b32 s2, s7
+; GCN-O0-NEXT:    s_mov_b32 s8, 3
+; GCN-O0-NEXT:    s_cmp_eq_u32 s5, s8
+; GCN-O0-NEXT:    s_cselect_b32 s2, s2, s3
+; GCN-O0-NEXT:    s_mov_b32 s3, s6
+; GCN-O0-NEXT:    s_cselect_b32 s3, s3, s4
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT:    v_mov_b32_e32 v3, v0
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GCN-O0-NEXT:    s_endpgm
 entry:
   %ext = extractelement <4 x double> <double 0.01, double 1.01, double 2.01, double 4.01>, i32 %sel
   store double %ext, ptr addrspace(1) %out
@@ -109,6 +212,65 @@ define amdgpu_kernel void @double5_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
+;
+; GCN-O0-LABEL: double5_extelt:
+; GCN-O0:       ; %bb.0: ; %entry
+; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT:    s_load_dword s5, s[4:5], 0x2c
+; GCN-O0-NEXT:    s_mov_b32 s4, 0x3f847ae1
+; GCN-O0-NEXT:    s_mov_b32 s2, 0x47ae147b
+; GCN-O0-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; GCN-O0-NEXT:    s_mov_b32 s3, s4
+; GCN-O0-NEXT:    s_mov_b32 s8, s2
+; GCN-O0-NEXT:    s_mov_b32 s4, 0x3ff028f5
+; GCN-O0-NEXT:    s_mov_b32 s6, 0xc28f5c29
+; GCN-O0-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT:    s_mov_b32 s7, s4
+; GCN-O0-NEXT:    s_mov_b32 s4, s6
+; GCN-O0-NEXT:    s_mov_b32 s9, 1
+; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT:    s_cmp_eq_u32 s5, s9
+; GCN-O0-NEXT:    s_cselect_b32 s4, s4, s8
+; GCN-O0-NEXT:    ; kill: def $sgpr3 killed $sgpr3 killed $sgpr2_sgpr3
+; GCN-O0-NEXT:    s_mov_b32 s2, s7
+; GCN-O0-NEXT:    s_cselect_b32 s3, s2, s3
+; GCN-O0-NEXT:    s_mov_b32 s2, 0x4000147a
+; GCN-O0-NEXT:    s_mov_b32 s6, 0xe147ae14
+; GCN-O0-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT:    s_mov_b32 s7, s2
+; GCN-O0-NEXT:    s_mov_b32 s2, s7
+; GCN-O0-NEXT:    s_mov_b32 s8, 2
+; GCN-O0-NEXT:    s_cmp_eq_u32 s5, s8
+; GCN-O0-NEXT:    s_cselect_b32 s3, s2, s3
+; GCN-O0-NEXT:    s_mov_b32 s2, s6
+; GCN-O0-NEXT:    s_cselect_b32 s4, s2, s4
+; GCN-O0-NEXT:    s_mov_b32 s2, 0x40100a3d
+; GCN-O0-NEXT:    s_mov_b32 s6, 0x70a3d70a
+; GCN-O0-NEXT:    s_mov_b32 s8, s6
+; GCN-O0-NEXT:    s_mov_b32 s9, s2
+; GCN-O0-NEXT:    s_mov_b32 s2, s8
+; GCN-O0-NEXT:    s_mov_b32 s7, 3
+; GCN-O0-NEXT:    s_cmp_eq_u32 s5, s7
+; GCN-O0-NEXT:    s_cselect_b32 s4, s2, s4
+; GCN-O0-NEXT:    s_mov_b32 s2, s9
+; GCN-O0-NEXT:    s_cselect_b32 s3, s2, s3
+; GCN-O0-NEXT:    s_mov_b32 s2, 0x40140a3d
+; GCN-O0-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT:    s_mov_b32 s7, s2
+; GCN-O0-NEXT:    s_mov_b32 s2, s7
+; GCN-O0-NEXT:    s_mov_b32 s8, 4
+; GCN-O0-NEXT:    s_cmp_eq_u32 s5, s8
+; GCN-O0-NEXT:    s_cselect_b32 s2, s2, s3
+; GCN-O0-NEXT:    s_mov_b32 s3, s6
+; GCN-O0-NEXT:    s_cselect_b32 s3, s3, s4
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT:    v_mov_b32_e32 v3, v0
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GCN-O0-NEXT:    s_endpgm
 entry:
   %ext = extractelement <5 x double> <double 0.01, double 1.01, double 2.01, double 4.01, double 5.01>, i32 %sel
   store double %ext, ptr addrspace(1) %out
@@ -130,6 +292,25 @@ define amdgpu_kernel void @half4_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_short v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GCN-O0-LABEL: half4_extelt:
+; GCN-O0:       ; %bb.0: ; %entry
+; GCN-O0-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN-O0-NEXT:    s_load_dword s4, s[4:5], 0x2c
+; GCN-O0-NEXT:    s_mov_b32 s5, 0x44004200
+; GCN-O0-NEXT:    s_mov_b32 s0, 0x40003c00
+; GCN-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; GCN-O0-NEXT:    s_mov_b32 s1, s5
+; GCN-O0-NEXT:    s_mov_b32 s5, 4
+; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT:    s_lshl_b32 s4, s4, s5
+; GCN-O0-NEXT:    s_lshr_b64 s[0:1], s[0:1], s4
+; GCN-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-O0-NEXT:    flat_store_short v[0:1], v2
+; GCN-O0-NEXT:    s_endpgm
 entry:
   %ext = extractelement <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, i32 %sel
   store half %ext, ptr addrspace(1) %out
@@ -149,6 +330,24 @@ define amdgpu_kernel void @float2_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GCN-O0-LABEL: float2_extelt:
+; GCN-O0:       ; %bb.0: ; %entry
+; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GCN-O0-NEXT:    s_mov_b32 s3, 1
+; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT:    s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GCN-O0-NEXT:    s_mov_b32 s4, 1.0
+; GCN-O0-NEXT:    s_mov_b32 s5, 0
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, s5
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-O0-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[2:3]
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT:    flat_store_dword v[0:1], v2
+; GCN-O0-NEXT:    s_endpgm
 entry:
   %ext = extractelement <2 x float> <float 0.0, float 1.0>, i32 %sel
   store float %ext, ptr addrspace(1) %out
@@ -172,6 +371,36 @@ define amdgpu_kernel void @double2_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
+;
+; GCN-O0-LABEL: double2_extelt:
+; GCN-O0:       ; %bb.0: ; %entry
+; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT:    s_load_dword s8, s[4:5], 0x2c
+; GCN-O0-NEXT:    s_mov_b32 s2, 0x3f847ae1
+; GCN-O0-NEXT:    s_mov_b32 s4, 0x47ae147b
+; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GCN-O0-NEXT:    s_mov_b32 s5, s2
+; GCN-O0-NEXT:    s_mov_b32 s3, s5
+; GCN-O0-NEXT:    s_mov_b32 s2, 0x3ff028f5
+; GCN-O0-NEXT:    s_mov_b32 s6, 0xc28f5c29
+; GCN-O0-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT:    s_mov_b32 s7, s2
+; GCN-O0-NEXT:    s_mov_b32 s2, s7
+; GCN-O0-NEXT:    s_mov_b32 s9, 1
+; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT:    s_cmp_eq_u32 s8, s9
+; GCN-O0-NEXT:    s_cselect_b32 s2, s2, s3
+; GCN-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GCN-O0-NEXT:    s_mov_b32 s3, s6
+; GCN-O0-NEXT:    s_cselect_b32 s3, s3, s4
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT:    v_mov_b32_e32 v3, v0
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GCN-O0-NEXT:    s_endpgm
 entry:
   %ext = extractelement <2 x double> <double 0.01, double 1.01>, i32 %sel
   store double %ext, ptr addrspace(1) %out
@@ -217,6 +446,60 @@ define amdgpu_kernel void @half8_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_store_short v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GCN-O0-LABEL: half8_extelt:
+; GCN-O0:       ; %bb.0: ; %entry
+; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GCN-O0-NEXT:    s_mov_b32 s3, 1
+; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT:    s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT:    s_mov_b32 s3, 0x4000
+; GCN-O0-NEXT:    s_mov_b32 s6, 0x3c00
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT:    s_mov_b32 s3, 2
+; GCN-O0-NEXT:    s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT:    s_mov_b32 s3, 0x4200
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT:    s_mov_b32 s3, 3
+; GCN-O0-NEXT:    s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT:    s_mov_b32 s3, 0x4400
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT:    s_mov_b32 s3, 4
+; GCN-O0-NEXT:    s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT:    s_mov_b32 s3, 0x4500
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT:    s_mov_b32 s3, 5
+; GCN-O0-NEXT:    s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT:    s_mov_b32 s3, 0x4600
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT:    s_mov_b32 s3, 6
+; GCN-O0-NEXT:    s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT:    s_mov_b32 s3, 0x4700
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT:    s_mov_b32 s3, 7
+; GCN-O0-NEXT:    s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GCN-O0-NEXT:    s_mov_b32 s4, 0x4800
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-O0-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[2:3]
+; GCN-O0-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT:    flat_store_short v[0:1], v2
+; GCN-O0-NEXT:    s_endpgm
 entry:
   %ext = extractelement <8 x half> <half 1.0, half 2.0, half 3.0, half 4.0, half 5.0, half 6.0, half 7.0, half 8.0>, i32 %sel
   store half %ext, ptr addrspace(1) %out
@@ -248,6 +531,39 @@ define amdgpu_kernel void @short8_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_short v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GCN-O0-LABEL: short8_extelt:
+; GCN-O0:       ; %bb.0: ; %entry
+; GCN-O0-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN-O0-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GCN-O0-NEXT:    s_mov_b32 s1, 1
+; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT:    s_cmp_eq_u32 s0, s1
+; GCN-O0-NEXT:    s_mov_b32 s4, 2
+; GCN-O0-NEXT:    s_cselect_b32 s1, s4, s1
+; GCN-O0-NEXT:    s_cmp_eq_u32 s0, s4
+; GCN-O0-NEX...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/170323


More information about the llvm-commits mailing list