[llvm] [AMDGPU] In promote-alloca, if the index is dynamic, sandwich the load with bitcasts to reduce the number of extractelements, as they have a large expansion in the backend. (PR #171253)

Kevin Choi via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 15 10:55:19 PST 2025


https://github.com/choikwa updated https://github.com/llvm/llvm-project/pull/171253

>From 552663451d43a9238f76d281a0effc8e48c5ccf3 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Mon, 8 Dec 2025 22:13:27 -0600
Subject: [PATCH 1/2] [AMDGPU] In promote-alloca, if the index is dynamic, sandwich
 the load with bitcasts to reduce the number of extractelements, as they have a
 large expansion in the backend.

Investigation revealed that the scalarized copy results in a long chain of extract/insertelement instructions, which can explode into many generated temporaries in the AMDGPU backend, as there is no efficient representation for extracting a subvector with a dynamic index.
Using identity bitcasts reduces the number of extract/insertelement instructions down to one and produces much smaller, more efficient generated code.
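
For illustration, a minimal sketch of the two IR shapes (value names and element counts are arbitrary, and the single-extract form assumes the dynamic index selects a whole, aligned subvector slice):

  ; before: one extractelement/insertelement pair per lane of the <4 x i16> slice
  %e0 = extractelement <16 x i16> %vec, i32 %idx
  %v0 = insertelement <4 x i16> poison, i16 %e0, i64 0
  %i1 = add i32 %idx, 1
  %e1 = extractelement <16 x i16> %vec, i32 %i1
  %v1 = insertelement <4 x i16> %v0, i16 %e1, i64 1
  ; ... two more pairs for lanes 2 and 3 ...

  ; after: bitcast to wider elements, extract once, bitcast back
  %wide = bitcast <16 x i16> %vec to <4 x i64>
  %widx = udiv i32 %idx, 4
  %elt  = extractelement <4 x i64> %wide, i32 %widx
  %sub  = bitcast i64 %elt to <4 x i16>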
---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp |  30 ++
 .../CodeGen/AMDGPU/promote-alloca-subvecs.ll  |  17 +-
 ...-alloca-vector-dynamic-idx-bitcasts-llc.ll | 497 ++++++++++++++++++
 ...mote-alloca-vector-dynamic-idx-bitcasts.ll |  82 +++
 4 files changed, 614 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts-llc.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 83b463c630d71..1ed138b7badcc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -42,6 +42,7 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 
@@ -644,6 +645,35 @@ static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
       auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
       assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
 
+      // If the index is dynamic, sandwich the load with bitcasts.
+      // i.e. instead of <64 x i8> -> <16 x i8>, do
+      //     <64 x i8> -> <4 x i128> -> i128 -> <16 x i8>
+      // Extracting a subvector with a dynamic index has a very large
+      // expansion in the AMDGPU backend. Limit to powers of two for the UDiv.
+      if (!isa<ConstantInt>(Index) && SubVecTy->isIntOrIntVectorTy() &&
+          llvm::isPowerOf2_32(VectorTy->getNumElements()) &&
+          llvm::isPowerOf2_32(SubVecTy->getNumElements())) {
+        IntegerType *NewElemType = Builder.getIntNTy(
+            SubVecTy->getScalarSizeInBits() * SubVecTy->getNumElements());
+        const unsigned NewNumElts =
+            VectorTy->getNumElements() * VectorTy->getScalarSizeInBits() /
+              NewElemType->getScalarSizeInBits();
+        const unsigned IndexDivisor = VectorTy->getNumElements() / NewNumElts;
+        assert(VectorTy->getScalarSizeInBits() <
+            NewElemType->getScalarSizeInBits() &&
+            "New element type should be bigger");
+        assert(IndexDivisor > 0u && "Zero index divisor");
+        FixedVectorType *BitCastType =
+            FixedVectorType::get(NewElemType, NewNumElts);
+        Value *BCVal = Builder.CreateBitCast(CurVal, BitCastType);
+        Value *NewIdx = Builder.CreateUDiv(Index,
+            ConstantInt::get(Index->getType(), IndexDivisor));
+        Value *ExtVal = Builder.CreateExtractElement(BCVal, NewIdx);
+        Value *BCOut = Builder.CreateBitCast(ExtVal, SubVecTy);
+        Inst->replaceAllUsesWith(BCOut);
+        return nullptr;
+      }
+
       Value *SubVec = PoisonValue::get(SubVecTy);
       for (unsigned K = 0; K < NumLoadedElts; ++K) {
         Value *CurIdx =
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
index 62a04f3a6f86f..5aa09ae74ec36 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
@@ -436,18 +436,11 @@ define <4 x i16> @nonconst_indexes(i1 %cond, i32 %otheridx, <4 x i16> %store) #0
 ; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[INDEX_1]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i16> [[STORE]], i64 3
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x i16> [[TMP7]], i16 [[TMP9]], i32 [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP10]], i32 [[INDEX_2]]
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i16> poison, i16 [[TMP11]], i64 0
-; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[INDEX_2]], 1
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP10]], i32 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x i16> [[TMP12]], i16 [[TMP14]], i64 1
-; CHECK-NEXT:    [[TMP16:%.*]] = add i32 [[INDEX_2]], 2
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP10]], i32 [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i16> [[TMP15]], i16 [[TMP17]], i64 2
-; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[INDEX_2]], 3
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <16 x i16> [[TMP10]], i32 [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[TMP20]], i64 3
-; CHECK-NEXT:    ret <4 x i16> [[TMP21]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i16> [[TMP10]] to <4 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = udiv i32 [[INDEX_2]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP11]], i32 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP13]] to <4 x i16>
+; CHECK-NEXT:    ret <4 x i16> [[TMP14]]
 ;
 entry:
   %data = alloca [16 x i16], addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts-llc.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts-llc.ll
new file mode 100644
index 0000000000000..084b7a2d59b2f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts-llc.ll
@@ -0,0 +1,497 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck %s --check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GFX12
+
+define amdgpu_kernel void @test_bitcast_llc_v128i8_v16i8(ptr addrspace(1) %out, i32 %idx) {
+; GFX9-LABEL: test_bitcast_llc_v128i8_v16i8:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s33, s[4:5], 0x8
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX9-NEXT:    s_and_b32 s1, s0, 0xff
+; GFX9-NEXT:    s_or_b32 s0, s1, s0
+; GFX9-NEXT:    s_and_b32 s1, s0, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX9-NEXT:    s_or_b32 s0, s1, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_add_i32 s33, s33, s33
+; GFX9-NEXT:    s_mov_b32 s1, s0
+; GFX9-NEXT:    s_lshl_b32 s33, s33, 1
+; GFX9-NEXT:    s_mov_b32 s2, s0
+; GFX9-NEXT:    s_mov_b32 s3, s0
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s0
+; GFX9-NEXT:    s_mov_b32 s6, s0
+; GFX9-NEXT:    s_mov_b32 s7, s0
+; GFX9-NEXT:    s_mov_b32 s8, s0
+; GFX9-NEXT:    s_mov_b32 s9, s0
+; GFX9-NEXT:    s_mov_b32 s10, s0
+; GFX9-NEXT:    s_mov_b32 s11, s0
+; GFX9-NEXT:    s_mov_b32 s12, s0
+; GFX9-NEXT:    s_mov_b32 s13, s0
+; GFX9-NEXT:    s_mov_b32 s14, s0
+; GFX9-NEXT:    s_mov_b32 s15, s0
+; GFX9-NEXT:    s_mov_b32 s16, s0
+; GFX9-NEXT:    s_mov_b32 s17, s0
+; GFX9-NEXT:    s_mov_b32 s18, s0
+; GFX9-NEXT:    s_mov_b32 s19, s0
+; GFX9-NEXT:    s_mov_b32 s20, s0
+; GFX9-NEXT:    s_mov_b32 s21, s0
+; GFX9-NEXT:    s_mov_b32 s22, s0
+; GFX9-NEXT:    s_mov_b32 s23, s0
+; GFX9-NEXT:    s_mov_b32 s24, s0
+; GFX9-NEXT:    s_mov_b32 s25, s0
+; GFX9-NEXT:    s_mov_b32 s26, s0
+; GFX9-NEXT:    s_mov_b32 s27, s0
+; GFX9-NEXT:    s_mov_b32 s28, s0
+; GFX9-NEXT:    s_mov_b32 s29, s0
+; GFX9-NEXT:    s_mov_b32 s30, s0
+; GFX9-NEXT:    s_mov_b32 s31, s0
+; GFX9-NEXT:    s_add_i32 s36, s33, 3
+; GFX9-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX9-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX9-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX9-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX9-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX9-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX9-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX9-NEXT:    v_mov_b64_e32 v[16:17], s[16:17]
+; GFX9-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
+; GFX9-NEXT:    v_mov_b64_e32 v[20:21], s[20:21]
+; GFX9-NEXT:    v_mov_b64_e32 v[22:23], s[22:23]
+; GFX9-NEXT:    v_mov_b64_e32 v[24:25], s[24:25]
+; GFX9-NEXT:    v_mov_b64_e32 v[26:27], s[26:27]
+; GFX9-NEXT:    v_mov_b64_e32 v[28:29], s[28:29]
+; GFX9-NEXT:    v_mov_b64_e32 v[30:31], s[30:31]
+; GFX9-NEXT:    s_set_gpr_idx_on s36, gpr_idx(SRC0)
+; GFX9-NEXT:    v_mov_b32_e32 v35, v0
+; GFX9-NEXT:    s_set_gpr_idx_off
+; GFX9-NEXT:    s_add_i32 s0, s33, 2
+; GFX9-NEXT:    s_set_gpr_idx_on s0, gpr_idx(SRC0)
+; GFX9-NEXT:    v_mov_b32_e32 v34, v0
+; GFX9-NEXT:    s_set_gpr_idx_off
+; GFX9-NEXT:    v_mov_b32_e32 v36, 0
+; GFX9-NEXT:    s_set_gpr_idx_on s33, gpr_idx(SRC0)
+; GFX9-NEXT:    v_mov_b32_e32 v33, v1
+; GFX9-NEXT:    v_mov_b32_e32 v32, v0
+; GFX9-NEXT:    s_set_gpr_idx_off
+; GFX9-NEXT:    global_store_dwordx4 v36, v[32:35], s[34:35]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_bitcast_llc_v128i8_v16i8:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b64 s[34:35], s[4:5], 0x0
+; GFX11-NEXT:    s_load_b32 s33, s[4:5], 0x8
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX11-NEXT:    v_mov_b32_e32 v35, 0
+; GFX11-NEXT:    s_and_b32 s1, s0, 0xff
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-NEXT:    s_and_b32 s1, s0, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-NEXT:    s_mov_b32 s1, s0
+; GFX11-NEXT:    s_mov_b32 s2, s0
+; GFX11-NEXT:    s_mov_b32 s3, s0
+; GFX11-NEXT:    s_mov_b32 s4, s0
+; GFX11-NEXT:    s_mov_b32 s5, s0
+; GFX11-NEXT:    s_mov_b32 s6, s0
+; GFX11-NEXT:    s_mov_b32 s7, s0
+; GFX11-NEXT:    s_mov_b32 s8, s0
+; GFX11-NEXT:    s_mov_b32 s9, s0
+; GFX11-NEXT:    s_mov_b32 s10, s0
+; GFX11-NEXT:    s_mov_b32 s11, s0
+; GFX11-NEXT:    s_mov_b32 s12, s0
+; GFX11-NEXT:    s_mov_b32 s13, s0
+; GFX11-NEXT:    s_mov_b32 s14, s0
+; GFX11-NEXT:    s_mov_b32 s15, s0
+; GFX11-NEXT:    s_mov_b32 s16, s0
+; GFX11-NEXT:    s_mov_b32 s17, s0
+; GFX11-NEXT:    s_mov_b32 s18, s0
+; GFX11-NEXT:    s_mov_b32 s19, s0
+; GFX11-NEXT:    s_mov_b32 s20, s0
+; GFX11-NEXT:    s_mov_b32 s21, s0
+; GFX11-NEXT:    s_mov_b32 s22, s0
+; GFX11-NEXT:    s_mov_b32 s23, s0
+; GFX11-NEXT:    s_mov_b32 s24, s0
+; GFX11-NEXT:    s_mov_b32 s25, s0
+; GFX11-NEXT:    s_mov_b32 s26, s0
+; GFX11-NEXT:    s_mov_b32 s27, s0
+; GFX11-NEXT:    s_mov_b32 s28, s0
+; GFX11-NEXT:    s_mov_b32 s29, s0
+; GFX11-NEXT:    s_mov_b32 s30, s0
+; GFX11-NEXT:    s_mov_b32 s31, s0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_add_i32 s33, s33, s33
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-NEXT:    v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GFX11-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GFX11-NEXT:    v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
+; GFX11-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
+; GFX11-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-NEXT:    v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19
+; GFX11-NEXT:    v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21
+; GFX11-NEXT:    v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23
+; GFX11-NEXT:    v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25
+; GFX11-NEXT:    v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27
+; GFX11-NEXT:    v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29
+; GFX11-NEXT:    v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31
+; GFX11-NEXT:    s_lshl_b32 s0, s33, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 m0, s0, 3
+; GFX11-NEXT:    v_movrels_b32_e32 v34, v0
+; GFX11-NEXT:    s_add_i32 m0, s0, 2
+; GFX11-NEXT:    v_movrels_b32_e32 v33, v0
+; GFX11-NEXT:    s_mov_b32 m0, s0
+; GFX11-NEXT:    v_movrels_b32_e32 v32, v1
+; GFX11-NEXT:    v_movrels_b32_e32 v31, v0
+; GFX11-NEXT:    global_store_b128 v35, v[31:34], s[34:35]
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_bitcast_llc_v128i8_v16i8:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b96 s[36:38], s[4:5], 0x0
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX12-NEXT:    v_mov_b32_e32 v35, 0
+; GFX12-NEXT:    s_and_b32 s1, s0, 0xff
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_or_b32 s0, s1, s0
+; GFX12-NEXT:    s_and_b32 s1, s0, 0xffff
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_or_b32 s0, s1, s0
+; GFX12-NEXT:    s_mov_b32 s1, s0
+; GFX12-NEXT:    s_mov_b32 s2, s0
+; GFX12-NEXT:    s_mov_b32 s3, s0
+; GFX12-NEXT:    s_mov_b32 s4, s0
+; GFX12-NEXT:    s_mov_b32 s5, s0
+; GFX12-NEXT:    s_mov_b32 s6, s0
+; GFX12-NEXT:    s_mov_b32 s7, s0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s0
+; GFX12-NEXT:    s_mov_b32 s10, s0
+; GFX12-NEXT:    s_mov_b32 s11, s0
+; GFX12-NEXT:    s_mov_b32 s12, s0
+; GFX12-NEXT:    s_mov_b32 s13, s0
+; GFX12-NEXT:    s_mov_b32 s14, s0
+; GFX12-NEXT:    s_mov_b32 s15, s0
+; GFX12-NEXT:    s_mov_b32 s16, s0
+; GFX12-NEXT:    s_mov_b32 s17, s0
+; GFX12-NEXT:    s_mov_b32 s18, s0
+; GFX12-NEXT:    s_mov_b32 s19, s0
+; GFX12-NEXT:    s_mov_b32 s20, s0
+; GFX12-NEXT:    s_mov_b32 s21, s0
+; GFX12-NEXT:    s_mov_b32 s22, s0
+; GFX12-NEXT:    s_mov_b32 s23, s0
+; GFX12-NEXT:    s_mov_b32 s24, s0
+; GFX12-NEXT:    s_mov_b32 s25, s0
+; GFX12-NEXT:    s_mov_b32 s26, s0
+; GFX12-NEXT:    s_mov_b32 s27, s0
+; GFX12-NEXT:    s_mov_b32 s28, s0
+; GFX12-NEXT:    s_mov_b32 s29, s0
+; GFX12-NEXT:    s_mov_b32 s30, s0
+; GFX12-NEXT:    s_mov_b32 s31, s0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_add_co_i32 s33, s38, s38
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT:    v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX12-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GFX12-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GFX12-NEXT:    v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
+; GFX12-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
+; GFX12-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX12-NEXT:    v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19
+; GFX12-NEXT:    v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21
+; GFX12-NEXT:    v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23
+; GFX12-NEXT:    v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25
+; GFX12-NEXT:    v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27
+; GFX12-NEXT:    v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29
+; GFX12-NEXT:    v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31
+; GFX12-NEXT:    s_lshl_b32 s0, s33, 1
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 m0, s0, 3
+; GFX12-NEXT:    v_movrels_b32_e32 v34, v0
+; GFX12-NEXT:    s_add_co_i32 m0, s0, 2
+; GFX12-NEXT:    v_movrels_b32_e32 v33, v0
+; GFX12-NEXT:    s_mov_b32 m0, s0
+; GFX12-NEXT:    v_movrels_b32_e32 v32, v1
+; GFX12-NEXT:    v_movrels_b32_e32 v31, v0
+; GFX12-NEXT:    global_store_b128 v35, v[31:34], s[36:37]
+; GFX12-NEXT:    s_endpgm
+entry:
+  %alloca = freeze <128 x i8> poison
+  %allocabc = bitcast <128 x i8> %alloca to <8 x i128>
+  %vec = extractelement <8 x i128> %allocabc, i32 %idx
+  %vecbc = bitcast i128 %vec to <16 x i8>
+  store <16 x i8> %vecbc, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_bitcast_llc_v64i16_v8i16(ptr addrspace(1) %out, i32 %idx) {
+; GFX9-LABEL: test_bitcast_llc_v64i16_v8i16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_add_i32 s2, s2, s2
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX9-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v1
+; GFX9-NEXT:    s_add_i32 s3, s2, 3
+; GFX9-NEXT:    v_mov_b32_e32 v0, v0
+; GFX9-NEXT:    s_set_gpr_idx_off
+; GFX9-NEXT:    s_set_gpr_idx_on s3, gpr_idx(SRC0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_set_gpr_idx_off
+; GFX9-NEXT:    s_add_i32 s2, s2, 2
+; GFX9-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_set_gpr_idx_off
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_bitcast_llc_v64i16_v8i16:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_add_i32 s2, s2, s2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX11-NEXT:    s_mov_b32 m0, s2
+; GFX11-NEXT:    v_movrels_b32_e32 v1, v1
+; GFX11-NEXT:    v_movrels_b32_e32 v0, v0
+; GFX11-NEXT:    s_add_i32 m0, s2, 3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_movrels_b32_e32 v3, v0
+; GFX11-NEXT:    s_add_i32 m0, s2, 2
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    v_movrels_b32_e32 v2, v0
+; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_bitcast_llc_v64i16_v8i16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_add_co_i32 s2, s2, s2
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX12-NEXT:    s_mov_b32 m0, s2
+; GFX12-NEXT:    v_movrels_b32_e32 v1, v1
+; GFX12-NEXT:    v_movrels_b32_e32 v0, v0
+; GFX12-NEXT:    s_add_co_i32 m0, s2, 3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_movrels_b32_e32 v3, v0
+; GFX12-NEXT:    s_add_co_i32 m0, s2, 2
+; GFX12-NEXT:    v_mov_b32_e32 v4, 0
+; GFX12-NEXT:    v_movrels_b32_e32 v2, v0
+; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT:    s_endpgm
+entry:
+  %alloca = freeze <64 x i16> poison
+  %allocabc = bitcast <64 x i16> %alloca to <8 x i128>
+  %vec = extractelement <8 x i128> %allocabc, i32 %idx
+  %vecbc = bitcast i128 %vec to <8 x i16>
+  store <8 x i16> %vecbc, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_bitcast_llc_v32i32_v4i32(ptr addrspace(1) %out, i32 %idx) {
+; GFX9-LABEL: test_bitcast_llc_v32i32_v4i32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_add_i32 s2, s2, s2
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX9-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v1
+; GFX9-NEXT:    s_add_i32 s3, s2, 3
+; GFX9-NEXT:    v_mov_b32_e32 v0, v0
+; GFX9-NEXT:    s_set_gpr_idx_off
+; GFX9-NEXT:    s_set_gpr_idx_on s3, gpr_idx(SRC0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_set_gpr_idx_off
+; GFX9-NEXT:    s_add_i32 s2, s2, 2
+; GFX9-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_set_gpr_idx_off
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_bitcast_llc_v32i32_v4i32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_add_i32 s2, s2, s2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX11-NEXT:    s_mov_b32 m0, s2
+; GFX11-NEXT:    v_movrels_b32_e32 v1, v1
+; GFX11-NEXT:    v_movrels_b32_e32 v0, v0
+; GFX11-NEXT:    s_add_i32 m0, s2, 3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_movrels_b32_e32 v3, v0
+; GFX11-NEXT:    s_add_i32 m0, s2, 2
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    v_movrels_b32_e32 v2, v0
+; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_bitcast_llc_v32i32_v4i32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_add_co_i32 s2, s2, s2
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX12-NEXT:    s_mov_b32 m0, s2
+; GFX12-NEXT:    v_movrels_b32_e32 v1, v1
+; GFX12-NEXT:    v_movrels_b32_e32 v0, v0
+; GFX12-NEXT:    s_add_co_i32 m0, s2, 3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_movrels_b32_e32 v3, v0
+; GFX12-NEXT:    s_add_co_i32 m0, s2, 2
+; GFX12-NEXT:    v_mov_b32_e32 v4, 0
+; GFX12-NEXT:    v_movrels_b32_e32 v2, v0
+; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT:    s_endpgm
+entry:
+  %alloca = freeze <32 x i32> poison
+  %allocabc = bitcast <32 x i32> %alloca to <8 x i128>
+  %vec = extractelement <8 x i128> %allocabc, i32 %idx
+  %vecbc = bitcast i128 %vec to <4 x i32>
+  store <4 x i32> %vecbc, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_bitcast_llc_v16i64_v4i256(ptr addrspace(1) %out, i32 %idx) {
+; GFX9-LABEL: test_bitcast_llc_v16i64_v4i256:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_add_i32 s2, s2, s2
+; GFX9-NEXT:    s_add_i32 s3, s2, 1
+; GFX9-NEXT:    s_add_i32 s3, s3, s3
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX9-NEXT:    s_set_gpr_idx_on s3, gpr_idx(SRC0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v1
+; GFX9-NEXT:    s_add_i32 s4, s3, 3
+; GFX9-NEXT:    v_mov_b32_e32 v0, v0
+; GFX9-NEXT:    s_set_gpr_idx_off
+; GFX9-NEXT:    s_add_i32 s5, s3, 2
+; GFX9-NEXT:    s_set_gpr_idx_on s4, gpr_idx(SRC0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_set_gpr_idx_off
+; GFX9-NEXT:    s_add_i32 s2, s2, s2
+; GFX9-NEXT:    s_set_gpr_idx_on s5, gpr_idx(SRC0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_set_gpr_idx_off
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX9-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; GFX9-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    s_set_gpr_idx_off
+; GFX9-NEXT:    s_add_i32 s3, s2, 3
+; GFX9-NEXT:    s_set_gpr_idx_on s3, gpr_idx(SRC0)
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    s_set_gpr_idx_off
+; GFX9-NEXT:    s_add_i32 s2, s2, 2
+; GFX9-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    s_set_gpr_idx_off
+; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_bitcast_llc_v16i64_v4i256:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_add_i32 s2, s0, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s0, s2, 1
+; GFX11-NEXT:    s_add_i32 s2, s2, s2
+; GFX11-NEXT:    s_add_i32 s0, s0, s0
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX11-NEXT:    s_lshl_b32 s3, s0, 1
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-NEXT:    s_mov_b32 m0, s3
+; GFX11-NEXT:    v_movrels_b32_e32 v1, v1
+; GFX11-NEXT:    v_movrels_b32_e32 v0, v0
+; GFX11-NEXT:    s_add_i32 m0, s3, 3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_movrels_b32_e32 v3, v0
+; GFX11-NEXT:    s_add_i32 m0, s3, 2
+; GFX11-NEXT:    v_movrels_b32_e32 v2, v0
+; GFX11-NEXT:    s_mov_b32 m0, s2
+; GFX11-NEXT:    v_movrels_b32_e32 v5, v1
+; GFX11-NEXT:    v_movrels_b32_e32 v4, v0
+; GFX11-NEXT:    s_add_i32 m0, s2, 3
+; GFX11-NEXT:    v_movrels_b32_e32 v7, v0
+; GFX11-NEXT:    s_add_i32 m0, s2, 2
+; GFX11-NEXT:    v_mov_b32_e32 v8, 0
+; GFX11-NEXT:    v_movrels_b32_e32 v6, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
+; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_bitcast_llc_v16i64_v4i256:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_add_co_i32 s2, s2, s2
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_co_i32 s3, s2, 1
+; GFX12-NEXT:    s_add_co_i32 s2, s2, s2
+; GFX12-NEXT:    s_add_co_i32 s3, s3, s3
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX12-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX12-NEXT:    s_mov_b32 m0, s3
+; GFX12-NEXT:    v_movrels_b32_e32 v1, v1
+; GFX12-NEXT:    v_movrels_b32_e32 v0, v0
+; GFX12-NEXT:    s_add_co_i32 m0, s3, 3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_movrels_b32_e32 v3, v0
+; GFX12-NEXT:    s_add_co_i32 m0, s3, 2
+; GFX12-NEXT:    v_movrels_b32_e32 v2, v0
+; GFX12-NEXT:    s_mov_b32 m0, s2
+; GFX12-NEXT:    v_movrels_b32_e32 v5, v1
+; GFX12-NEXT:    v_movrels_b32_e32 v4, v0
+; GFX12-NEXT:    s_add_co_i32 m0, s2, 3
+; GFX12-NEXT:    v_movrels_b32_e32 v7, v0
+; GFX12-NEXT:    s_add_co_i32 m0, s2, 2
+; GFX12-NEXT:    v_mov_b32_e32 v8, 0
+; GFX12-NEXT:    v_movrels_b32_e32 v6, v0
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX12-NEXT:    s_endpgm
+entry:
+  %alloca = freeze <16 x i64> poison
+  %allocabc = bitcast <16 x i64> %alloca to <4 x i256>
+  %vec = extractelement <4 x i256> %allocabc, i32 %idx
+  %vecbc = bitcast i256 %vec to <4 x i64>
+  store <4 x i64> %vecbc, ptr addrspace(1) %out, align 16
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts.ll
new file mode 100644
index 0000000000000..cfee09551e92a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+define amdgpu_kernel void @test_bitcast_gen_64i8_v16i8(ptr addrspace(1) %out, i32 %idx) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @test_bitcast_gen_64i8_v16i8(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <64 x i8> poison
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <64 x i8> [[ALLOCA]] to <4 x i128>
+; CHECK-NEXT:    [[TMP1:%.*]] = udiv i32 [[IDX]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i128> [[TMP0]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i128 [[TMP2]] to <16 x i8>
+; CHECK-NEXT:    store <16 x i8> [[TMP3]], ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %alloca = alloca [4 x [16 x i8]], align 16, addrspace(5)
+  %gep = getelementptr <16 x i8>, ptr addrspace(5) %alloca, i32 0, i32 %idx
+  %load = load <16 x i8>, ptr addrspace(5) %gep, align 16
+  store <16 x i8> %load, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_bitcast_gen_32i16_v8i16(ptr addrspace(1) %out, i32 %idx) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @test_bitcast_gen_32i16_v8i16(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <32 x i16> poison
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <32 x i16> [[ALLOCA]] to <4 x i128>
+; CHECK-NEXT:    [[TMP1:%.*]] = udiv i32 [[IDX]], 8
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i128> [[TMP0]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i128 [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    store <8 x i16> [[TMP3]], ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %alloca = alloca [32 x i16], align 16, addrspace(5)
+  %gep = getelementptr <8 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %idx
+  %load = load <8 x i16>, ptr addrspace(5) %gep, align 16
+  store <8 x i16> %load, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_bitcast_gen_64i8_v32i8(ptr addrspace(1) %out, i32 %idx) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @test_bitcast_gen_64i8_v32i8(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <64 x i8> poison
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <64 x i8> [[ALLOCA]] to <2 x i256>
+; CHECK-NEXT:    [[TMP1:%.*]] = udiv i32 [[IDX]], 32
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i256> [[TMP0]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i256 [[TMP2]] to <32 x i8>
+; CHECK-NEXT:    store <32 x i8> [[TMP3]], ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %alloca = alloca [2 x [32 x i8]], align 16, addrspace(5)
+  %gep = getelementptr <32 x i8>, ptr addrspace(5) %alloca, i32 0, i32 %idx
+  %load = load <32 x i8>, ptr addrspace(5) %gep, align 16
+  store <32 x i8> %load, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_bitcast_gen_16i32_v4i32(ptr addrspace(1) %out, i32 %idx) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @test_bitcast_gen_16i32_v4i32(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <16 x i32> poison
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i32> [[ALLOCA]] to <4 x i128>
+; CHECK-NEXT:    [[TMP1:%.*]] = udiv i32 [[IDX]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i128> [[TMP0]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i128 [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %alloca = alloca [16 x i32], align 16, addrspace(5)
+  %gep = getelementptr <4 x i32>, ptr addrspace(5) %alloca, i32 0, i32 %idx
+  %load = load <4 x i32>, ptr addrspace(5) %gep, align 16
+  store <4 x i32> %load, ptr addrspace(1) %out, align 16
+  ret void
+}

>From 8ae99ccdee10322af98d20aae1b9865b58f02d97 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Mon, 15 Dec 2025 12:53:57 -0600
Subject: [PATCH 2/2] Address feedback, add an unaligned test case. Now checks that
 VectorTy is a multiple of SubVecTy, and also checks GEPTy.
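
The new GEPTy check only applies the bitcast form when the GEP's source element type is a fixed vector whose bit width matches the loaded subvector, which is this patch's proxy for the access being slice-aligned. A minimal sketch of the distinction, using the GEPs from the test cases in this patch (%alloca stands for the promoted [32 x i16] alloca; the alignments are those used in the tests):

  ; qualifies: the GEP is typed on the <8 x i16> subvector, so the pass can
  ; bitcast to <4 x i128>, udiv the index, and do a single extractelement
  %gep0  = getelementptr <8 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load0 = load <8 x i16>, ptr addrspace(5) %gep0, align 16

  ; rejected: the GEP is typed on i16, so the index may land mid-slice and the
  ; pass falls back to the per-element extract/insert copy
  %gep1  = getelementptr i16, ptr addrspace(5) %alloca, i32 %idx
  %load1 = load <8 x i16>, ptr addrspace(5) %gep1, align 1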

---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 60 ++++++++++++-------
 .../CodeGen/AMDGPU/promote-alloca-subvecs.ll  | 17 ++++--
 ...mote-alloca-vector-dynamic-idx-bitcasts.ll | 40 +++++++++++++
 3 files changed, 89 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 1ed138b7badcc..e962025542d31 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -646,32 +646,46 @@ static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
       assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
 
       // If the index is dynamic, sandwich the load with bitcasts.
-      // i.e. instead of <64 x i8> -> <16 x i8>, do
-      //     <64 x i8> -> <4 x i128> -> i128 -> <16 x i8>
+      // i.e. CurValTy                 SubVecTy  AccessTy
+      //     <64 x i8> ->             <16 x i8> <8 x i16>
+      //     <64 x i8> -> <4 x i128> -> i128 -> <8 x i16>
       // Extracting a subvector with a dynamic index has a very large
       // expansion in the AMDGPU backend. Limit to powers of two for the UDiv.
+      FixedVectorType *AccessVecTy = cast<FixedVectorType>(AccessTy);
+      auto *GEP = dyn_cast<GetElementPtrInst>(
+          cast<LoadInst>(Inst)->getPointerOperand());
       if (!isa<ConstantInt>(Index) && SubVecTy->isIntOrIntVectorTy() &&
-          llvm::isPowerOf2_32(VectorTy->getNumElements()) &&
-          llvm::isPowerOf2_32(SubVecTy->getNumElements())) {
-        IntegerType *NewElemType = Builder.getIntNTy(
-            SubVecTy->getScalarSizeInBits() * SubVecTy->getNumElements());
-        const unsigned NewNumElts =
-            VectorTy->getNumElements() * VectorTy->getScalarSizeInBits() /
-              NewElemType->getScalarSizeInBits();
-        const unsigned IndexDivisor = VectorTy->getNumElements() / NewNumElts;
-        assert(VectorTy->getScalarSizeInBits() <
-            NewElemType->getScalarSizeInBits() &&
-            "New element type should be bigger");
-        assert(IndexDivisor > 0u && "Zero index divisor");
-        FixedVectorType *BitCastType =
-            FixedVectorType::get(NewElemType, NewNumElts);
-        Value *BCVal = Builder.CreateBitCast(CurVal, BitCastType);
-        Value *NewIdx = Builder.CreateUDiv(Index,
-            ConstantInt::get(Index->getType(), IndexDivisor));
-        Value *ExtVal = Builder.CreateExtractElement(BCVal, NewIdx);
-        Value *BCOut = Builder.CreateBitCast(ExtVal, SubVecTy);
-        Inst->replaceAllUsesWith(BCOut);
-        return nullptr;
+          llvm::isPowerOf2_32(SubVecTy->getNumElements()) &&
+          VectorTy->getNumElements() % SubVecTy->getNumElements() == 0 &&
+          llvm::isPowerOf2_32(AccessVecTy->getNumElements())) {
+        uint64_t NumBits = SubVecTy->getScalarSizeInBits() *
+                           SubVecTy->getNumElements();
+        // Check that NumBits matches the GEP element type size (alignment).
+        if (auto *GEPTy =
+                dyn_cast<FixedVectorType>(GEP->getSourceElementType())) {
+          if (NumBits == GEPTy->getScalarSizeInBits() *
+                         GEPTy->getNumElements()) {
+            IntegerType *NewElemType = Builder.getIntNTy(NumBits);
+            const unsigned NewNumElts = VectorTy->getNumElements() *
+                                        VectorTy->getScalarSizeInBits() /
+                                        NewElemType->getScalarSizeInBits();
+            const unsigned IndexDivisor = VectorTy->getNumElements() /
+                                          NewNumElts;
+            assert(VectorTy->getScalarSizeInBits() <
+                       NewElemType->getScalarSizeInBits() &&
+                   "New element type should be bigger");
+            assert(IndexDivisor > 0u && "Zero index divisor");
+            FixedVectorType *BitCastType =
+                FixedVectorType::get(NewElemType, NewNumElts);
+            Value *BCVal = Builder.CreateBitCast(CurVal, BitCastType);
+            Value *NewIdx = Builder.CreateUDiv(
+                Index, ConstantInt::get(Index->getType(), IndexDivisor));
+            Value *ExtVal = Builder.CreateExtractElement(BCVal, NewIdx);
+            Value *BCOut = Builder.CreateBitCast(ExtVal, AccessTy);
+            Inst->replaceAllUsesWith(BCOut);
+            return nullptr;
+          }
+        }
       }
 
       Value *SubVec = PoisonValue::get(SubVecTy);
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
index 5aa09ae74ec36..62a04f3a6f86f 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
@@ -436,11 +436,18 @@ define <4 x i16> @nonconst_indexes(i1 %cond, i32 %otheridx, <4 x i16> %store) #0
 ; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[INDEX_1]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i16> [[STORE]], i64 3
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x i16> [[TMP7]], i16 [[TMP9]], i32 [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i16> [[TMP10]] to <4 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = udiv i32 [[INDEX_2]], 4
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i64> [[TMP11]], i32 [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP13]] to <4 x i16>
-; CHECK-NEXT:    ret <4 x i16> [[TMP14]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP10]], i32 [[INDEX_2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i16> poison, i16 [[TMP11]], i64 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[INDEX_2]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP10]], i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x i16> [[TMP12]], i16 [[TMP14]], i64 1
+; CHECK-NEXT:    [[TMP16:%.*]] = add i32 [[INDEX_2]], 2
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP10]], i32 [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i16> [[TMP15]], i16 [[TMP17]], i64 2
+; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[INDEX_2]], 3
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <16 x i16> [[TMP10]], i32 [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[TMP20]], i64 3
+; CHECK-NEXT:    ret <4 x i16> [[TMP21]]
 ;
 entry:
   %data = alloca [16 x i16], addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts.ll
index cfee09551e92a..793a9daabbcc5 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts.ll
@@ -80,3 +80,43 @@ entry:
   store <4 x i32> %load, ptr addrspace(1) %out, align 16
   ret void
 }
+
+
+define amdgpu_kernel void @test_bitcast_gen_unaligned_gep(ptr addrspace(1) %out, i32 %idx) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @test_bitcast_gen_unaligned_gep(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <32 x i16> poison
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <32 x i16> [[ALLOCA]], i32 [[IDX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[IDX]], 1
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <32 x i16> [[ALLOCA]], i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP22]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[IDX]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <32 x i16> [[ALLOCA]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i16> [[TMP4]], i16 [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[IDX]], 3
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <32 x i16> [[ALLOCA]], i32 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x i16> [[TMP7]], i16 [[TMP9]], i64 3
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[IDX]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <32 x i16> [[ALLOCA]], i32 [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x i16> [[TMP10]], i16 [[TMP12]], i64 4
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[IDX]], 5
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <32 x i16> [[ALLOCA]], i32 [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x i16> [[TMP13]], i16 [[TMP15]], i64 5
+; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[IDX]], 6
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <32 x i16> [[ALLOCA]], i32 [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[TMP18]], i64 6
+; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[IDX]], 7
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <32 x i16> [[ALLOCA]], i32 [[TMP20]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i16> [[TMP19]], i16 [[TMP21]], i64 7
+; CHECK-NEXT:    store <8 x i16> [[TMP3]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %alloca = alloca [32 x i16], align 1, addrspace(5)
+  %gep = getelementptr i16, ptr addrspace(5) %alloca, i32 %idx
+  %load = load <8 x i16>, ptr addrspace(5) %gep, align 1
+  store <8 x i16> %load, ptr addrspace(1) %out, align 1
+  ret void
+}


