[llvm] [DirectX] Scalarize `extractelement` and `insertelement` with dynamic indices (PR #141676)

Deric C. via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 18 11:26:28 PDT 2025


https://github.com/Icohedron updated https://github.com/llvm/llvm-project/pull/141676

>From d3f1a51cef21e74b281ba1dbf38bff16410b20e1 Mon Sep 17 00:00:00 2001
From: Icohedron <cheung.deric at gmail.com>
Date: Tue, 27 May 2025 21:28:12 +0000
Subject: [PATCH 1/9] Scalarize extractelement with dynamic index

---
 .../Target/DirectX/DXILDataScalarization.cpp  | 66 ++++++++++++++-----
 .../DirectX/scalarize-dynamic-vector-index.ll | 38 +++++++++++
 2 files changed, 86 insertions(+), 18 deletions(-)
 create mode 100644 llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll

diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index 06708cec00cec..7bd0539c6bfe0 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -27,6 +27,19 @@ static const int MaxVecSize = 4;
 
 using namespace llvm;
 
+// Recursively creates an array-like version of a given vector type.
+static Type *equivalentArrayTypeFromVector(Type *T) {
+  if (auto *VecTy = dyn_cast<VectorType>(T))
+    return ArrayType::get(VecTy->getElementType(),
+                          dyn_cast<FixedVectorType>(VecTy)->getNumElements());
+  if (auto *ArrayTy = dyn_cast<ArrayType>(T)) {
+    Type *NewElementType = equivalentArrayTypeFromVector(ArrayTy->getElementType());
+    return ArrayType::get(NewElementType, ArrayTy->getNumElements());
+  }
+  // If it's not a vector or array, return the original type.
+  return T;
+}
+
 class DXILDataScalarizationLegacy : public ModulePass {
 
 public:
@@ -55,7 +68,7 @@ class DataScalarizerVisitor : public InstVisitor<DataScalarizerVisitor, bool> {
   bool visitCastInst(CastInst &CI) { return false; }
   bool visitBitCastInst(BitCastInst &BCI) { return false; }
   bool visitInsertElementInst(InsertElementInst &IEI) { return false; }
-  bool visitExtractElementInst(ExtractElementInst &EEI) { return false; }
+  bool visitExtractElementInst(ExtractElementInst &EEI);
   bool visitShuffleVectorInst(ShuffleVectorInst &SVI) { return false; }
   bool visitPHINode(PHINode &PHI) { return false; }
   bool visitLoadInst(LoadInst &LI);
@@ -90,20 +103,6 @@ DataScalarizerVisitor::lookupReplacementGlobal(Value *CurrOperand) {
   return nullptr; // Not found
 }
 
-// Recursively creates an array version of the given vector type.
-static Type *replaceVectorWithArray(Type *T, LLVMContext &Ctx) {
-  if (auto *VecTy = dyn_cast<VectorType>(T))
-    return ArrayType::get(VecTy->getElementType(),
-                          dyn_cast<FixedVectorType>(VecTy)->getNumElements());
-  if (auto *ArrayTy = dyn_cast<ArrayType>(T)) {
-    Type *NewElementType =
-        replaceVectorWithArray(ArrayTy->getElementType(), Ctx);
-    return ArrayType::get(NewElementType, ArrayTy->getNumElements());
-  }
-  // If it's not a vector or array, return the original type.
-  return T;
-}
-
 static bool isArrayOfVectors(Type *T) {
   if (ArrayType *ArrType = dyn_cast<ArrayType>(T))
     return isa<VectorType>(ArrType->getElementType());
@@ -116,8 +115,7 @@ bool DataScalarizerVisitor::visitAllocaInst(AllocaInst &AI) {
 
   ArrayType *ArrType = cast<ArrayType>(AI.getAllocatedType());
   IRBuilder<> Builder(&AI);
-  LLVMContext &Ctx = AI.getContext();
-  Type *NewType = replaceVectorWithArray(ArrType, Ctx);
+  Type *NewType = equivalentArrayTypeFromVector(ArrType);
   AllocaInst *ArrAlloca =
       Builder.CreateAlloca(NewType, nullptr, AI.getName() + ".scalarize");
   ArrAlloca->setAlignment(AI.getAlign());
@@ -173,6 +171,38 @@ bool DataScalarizerVisitor::visitStoreInst(StoreInst &SI) {
   return false;
 }
 
+bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
+  // If the index is a constant then we don't need to scalarize it
+  Value *Index = EEI.getIndexOperand();
+  Type *IndexTy = Index->getType();
+  if (isa<ConstantInt>(Index))
+    return false;
+
+  IRBuilder<> Builder(&EEI);
+  VectorType *VecTy = EEI.getVectorOperandType();
+  assert(VecTy->getElementCount().isFixed() &&
+         "Vector operand of ExtractElement must have a fixed size");
+  
+  Type *ArrTy = equivalentArrayTypeFromVector(VecTy);
+  Value *ArrAlloca = Builder.CreateAlloca(ArrTy);
+
+  for (unsigned I = 0; I < ArrTy->getArrayNumElements(); ++I) {
+    Value *EE = Builder.CreateExtractElement(EEI.getVectorOperand(), I);
+    Value *GEP = Builder.CreateInBoundsGEP(
+        ArrTy, ArrAlloca,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, I)});
+    Builder.CreateStore(EE, GEP);
+  }
+
+  Value *GEP = Builder.CreateInBoundsGEP(ArrTy, ArrAlloca,
+                                         {ConstantInt::get(IndexTy, 0), Index});
+  Value *Load = Builder.CreateLoad(ArrTy->getArrayElementType(), GEP);
+
+  EEI.replaceAllUsesWith(Load);
+  EEI.eraseFromParent();
+  return true;
+}
+
 bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
 
   unsigned NumOperands = GEPI.getNumOperands();
@@ -257,7 +287,7 @@ static bool findAndReplaceVectors(Module &M) {
   for (GlobalVariable &G : M.globals()) {
     Type *OrigType = G.getValueType();
 
-    Type *NewType = replaceVectorWithArray(OrigType, Ctx);
+    Type *NewType = equivalentArrayTypeFromVector(OrigType);
     if (OrigType != NewType) {
       // Create a new global variable with the updated type
       // Note: Initializer is set via transformInitializer
diff --git a/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll b/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll
new file mode 100644
index 0000000000000..74e9202b540c1
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes='dxil-data-scalarization' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+
+define float @extract_float_vec_dynamic(<4 x float> %0, i32 %1) {
+; CHECK-LABEL: define float @extract_float_vec_dynamic(
+; CHECK-SAME: <4 x float> [[TMP0:%.*]], i32 [[TMP1:%.*]]) {
+; CHECK-NEXT:    [[TMP3:%.*]] = alloca [4 x float], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP3]], i32 0, i32 0
+; CHECK-NEXT:    store float [[TMP4]], ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP3]], i32 0, i32 1
+; CHECK-NEXT:    store float [[TMP6]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP3]], i32 0, i32 2
+; CHECK-NEXT:    store float [[TMP8]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP3]], i32 0, i32 3
+; CHECK-NEXT:    store float [[TMP10]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP3]], i32 0, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4
+; CHECK-NEXT:    ret float [[TMP13]]
+;
+  %e = extractelement <4 x float> %0, i32 %1
+  ret float %e
+}
+
+; An extractelement with a constant index should not be converted to array form
+define i16 @extract_i16_vec_constant(<4 x i16> %0) {
+; CHECK-LABEL: define i16 @extract_i16_vec_constant(
+; CHECK-SAME: <4 x i16> [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1
+; CHECK-NEXT:    ret i16 [[E]]
+;
+  %e = extractelement <4 x i16> %0, i32 1
+  ret i16 %e
+}
+

>From cda7e887104eafcb6b16705df39d0e7fb4df02be Mon Sep 17 00:00:00 2001
From: Icohedron <cheung.deric at gmail.com>
Date: Tue, 27 May 2025 21:52:03 +0000
Subject: [PATCH 2/9] Apply clang-format

---
 llvm/lib/Target/DirectX/DXILDataScalarization.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index 7bd0539c6bfe0..eb8c941c1f348 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -33,7 +33,8 @@ static Type *equivalentArrayTypeFromVector(Type *T) {
     return ArrayType::get(VecTy->getElementType(),
                           dyn_cast<FixedVectorType>(VecTy)->getNumElements());
   if (auto *ArrayTy = dyn_cast<ArrayType>(T)) {
-    Type *NewElementType = equivalentArrayTypeFromVector(ArrayTy->getElementType());
+    Type *NewElementType =
+        equivalentArrayTypeFromVector(ArrayTy->getElementType());
     return ArrayType::get(NewElementType, ArrayTy->getNumElements());
   }
   // If it's not a vector or array, return the original type.

>From be5d425ad04f461372849e4a4ce4f2dad19570e1 Mon Sep 17 00:00:00 2001
From: Icohedron <cheung.deric at gmail.com>
Date: Tue, 27 May 2025 22:41:03 +0000
Subject: [PATCH 3/9] Remove unnecessary assert

---
 llvm/lib/Target/DirectX/DXILDataScalarization.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index eb8c941c1f348..17d8b6fc0d7ef 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -181,9 +181,7 @@ bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
 
   IRBuilder<> Builder(&EEI);
   VectorType *VecTy = EEI.getVectorOperandType();
-  assert(VecTy->getElementCount().isFixed() &&
-         "Vector operand of ExtractElement must have a fixed size");
-  
+
   Type *ArrTy = equivalentArrayTypeFromVector(VecTy);
   Value *ArrAlloca = Builder.CreateAlloca(ArrTy);
 

>From 15cf98cfec409af748ca1d3f4012a7efbb3ce43f Mon Sep 17 00:00:00 2001
From: Icohedron <cheung.deric at gmail.com>
Date: Tue, 27 May 2025 23:38:08 +0000
Subject: [PATCH 4/9] Scalarize dynamically-indexed insertelement

---
 .../Target/DirectX/DXILDataScalarization.cpp  | 34 +++++++-
 .../DirectX/scalarize-dynamic-vector-index.ll | 78 +++++++++++++------
 2 files changed, 88 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index 17d8b6fc0d7ef..04080055c2f75 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -68,7 +68,7 @@ class DataScalarizerVisitor : public InstVisitor<DataScalarizerVisitor, bool> {
   bool visitGetElementPtrInst(GetElementPtrInst &GEPI);
   bool visitCastInst(CastInst &CI) { return false; }
   bool visitBitCastInst(BitCastInst &BCI) { return false; }
-  bool visitInsertElementInst(InsertElementInst &IEI) { return false; }
+  bool visitInsertElementInst(InsertElementInst &IEI);
   bool visitExtractElementInst(ExtractElementInst &EEI);
   bool visitShuffleVectorInst(ShuffleVectorInst &SVI) { return false; }
   bool visitPHINode(PHINode &PHI) { return false; }
@@ -172,6 +172,38 @@ bool DataScalarizerVisitor::visitStoreInst(StoreInst &SI) {
   return false;
 }
 
+bool DataScalarizerVisitor::visitInsertElementInst(InsertElementInst &IEI) {
+  Value *Vec = IEI.getOperand(0);
+  Value *Val = IEI.getOperand(1);
+  Value *Index = IEI.getOperand(2);
+  Type *IndexTy = Index->getType();
+
+  // If the index is a constant then we don't need to scalarize it
+  if (isa<ConstantInt>(Index))
+    return false;
+
+  IRBuilder<> Builder(&IEI);
+  Type *VecTy = Vec->getType();
+
+  Type *ArrTy = equivalentArrayTypeFromVector(VecTy);
+  Value *ArrAlloca = Builder.CreateAlloca(ArrTy);
+
+  for (unsigned I = 0; I < ArrTy->getArrayNumElements(); ++I) {
+    Value *EE = Builder.CreateExtractElement(Vec, I);
+    Value *GEP = Builder.CreateInBoundsGEP(
+        ArrTy, ArrAlloca,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, I)});
+    Builder.CreateStore(EE, GEP);
+  }
+
+  Value *GEP = Builder.CreateInBoundsGEP(ArrTy, ArrAlloca,
+                                         {ConstantInt::get(IndexTy, 0), Index});
+  Builder.CreateStore(Val, GEP);
+
+  IEI.eraseFromParent();
+  return true;
+}
+
 bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
   // If the index is a constant then we don't need to scalarize it
   Value *Index = EEI.getIndexOperand();
diff --git a/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll b/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll
index 74e9202b540c1..b1191d903fd49 100644
--- a/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll
+++ b/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll
@@ -1,38 +1,70 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes='dxil-data-scalarization' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
-define float @extract_float_vec_dynamic(<4 x float> %0, i32 %1) {
+define float @extract_float_vec_dynamic(<4 x float> %v, i32 %i) {
 ; CHECK-LABEL: define float @extract_float_vec_dynamic(
-; CHECK-SAME: <4 x float> [[TMP0:%.*]], i32 [[TMP1:%.*]]) {
-; CHECK-NEXT:    [[TMP3:%.*]] = alloca [4 x float], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP3]], i32 0, i32 0
+; CHECK-SAME: <4 x float> [[V:%.*]], i32 [[I:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca [4 x float], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[V]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP1]], i32 0, i32 0
+; CHECK-NEXT:    store float [[TMP2]], ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[V]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP1]], i32 0, i32 1
 ; CHECK-NEXT:    store float [[TMP4]], ptr [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP3]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[V]], i64 2
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP1]], i32 0, i32 2
 ; CHECK-NEXT:    store float [[TMP6]], ptr [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP0]], i64 2
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP3]], i32 0, i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[V]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP1]], i32 0, i32 3
 ; CHECK-NEXT:    store float [[TMP8]], ptr [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP0]], i64 3
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP3]], i32 0, i32 3
-; CHECK-NEXT:    store float [[TMP10]], ptr [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP3]], i32 0, i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4
-; CHECK-NEXT:    ret float [[TMP13]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP1]], i32 0, i32 [[I]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4
+; CHECK-NEXT:    ret float [[TMP11]]
 ;
-  %e = extractelement <4 x float> %0, i32 %1
-  ret float %e
+  %ee = extractelement <4 x float> %v, i32 %i
+  ret float %ee
+}
+
+define void @insert_i32_vec_dynamic(<3 x i32> %v, i32 %a, i32 %i) {
+; CHECK-LABEL: define void @insert_i32_vec_dynamic(
+; CHECK-SAME: <3 x i32> [[V:%.*]], i32 [[A:%.*]], i32 [[I:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca [3 x i32], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i32> [[V]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP1]], i32 0, i32 0
+; CHECK-NEXT:    store i32 [[TMP2]], ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <3 x i32> [[V]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP1]], i32 0, i32 1
+; CHECK-NEXT:    store i32 [[TMP4]], ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <3 x i32> [[V]], i64 2
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP1]], i32 0, i32 2
+; CHECK-NEXT:    store i32 [[TMP6]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP1]], i32 0, i32 [[I]]
+; CHECK-NEXT:    store i32 [[A]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    ret void
+;
+  insertelement <3 x i32> %v, i32 %a, i32 %i
+  ret void
 }
 
 ; An extractelement with a constant index should not be converted to array form
-define i16 @extract_i16_vec_constant(<4 x i16> %0) {
+define i16 @extract_i16_vec_constant(<4 x i16> %v) {
 ; CHECK-LABEL: define i16 @extract_i16_vec_constant(
-; CHECK-SAME: <4 x i16> [[TMP0:%.*]]) {
-; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1
-; CHECK-NEXT:    ret i16 [[E]]
+; CHECK-SAME: <4 x i16> [[V:%.*]]) {
+; CHECK-NEXT:    [[EE:%.*]] = extractelement <4 x i16> [[V]], i32 1
+; CHECK-NEXT:    ret i16 [[EE]]
+;
+  %ee = extractelement <4 x i16> %v, i32 1
+  ret i16 %ee
+}
+
+; An insertelement with a constant index should not be converted to array form
+define void @insert_half_vec_constant(<2 x half> %v, half %a) {
+; CHECK-LABEL: define void @insert_half_vec_constant(
+; CHECK-SAME: <2 x half> [[V:%.*]], half [[A:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x half> [[V]], half [[A]], i32 1
+; CHECK-NEXT:    ret void
 ;
-  %e = extractelement <4 x i16> %0, i32 1
-  ret i16 %e
+  insertelement <2 x half> %v, half %a, i32 1
+  ret void
 }
 

>From 4c28344d9684b2c861db003eb6485349710ecafd Mon Sep 17 00:00:00 2001
From: Icohedron <cheung.deric at gmail.com>
Date: Tue, 27 May 2025 23:53:46 +0000
Subject: [PATCH 5/9] Subroutine creating an array from a vector

---
 .../Target/DirectX/DXILDataScalarization.cpp  | 48 +++++++++----------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index 04080055c2f75..a2e27e4f4ff8b 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -172,6 +172,22 @@ bool DataScalarizerVisitor::visitStoreInst(StoreInst &SI) {
   return false;
 }
 
+// Allocates and populates an array equivalent to the vector operand Vec.
+// Returns the array and the type of the array.
+static std::pair<Value *, Type *>
+allocaArrayFromVector(IRBuilder<> &Builder, Value *Vec, Type *IdxTy) {
+  Type *ArrTy = equivalentArrayTypeFromVector(Vec->getType());
+  Value *ArrAlloca = Builder.CreateAlloca(ArrTy);
+  for (unsigned I = 0; I < ArrTy->getArrayNumElements(); ++I) {
+    Value *EE = Builder.CreateExtractElement(Vec, I);
+    Value *GEP = Builder.CreateInBoundsGEP(
+        ArrTy, ArrAlloca,
+        {ConstantInt::get(IdxTy, 0), ConstantInt::get(IdxTy, I)});
+    Builder.CreateStore(EE, GEP);
+  }
+  return std::make_pair(ArrAlloca, ArrTy);
+}
+
 bool DataScalarizerVisitor::visitInsertElementInst(InsertElementInst &IEI) {
   Value *Vec = IEI.getOperand(0);
   Value *Val = IEI.getOperand(1);
@@ -183,19 +199,9 @@ bool DataScalarizerVisitor::visitInsertElementInst(InsertElementInst &IEI) {
     return false;
 
   IRBuilder<> Builder(&IEI);
-  Type *VecTy = Vec->getType();
-
-  Type *ArrTy = equivalentArrayTypeFromVector(VecTy);
-  Value *ArrAlloca = Builder.CreateAlloca(ArrTy);
-
-  for (unsigned I = 0; I < ArrTy->getArrayNumElements(); ++I) {
-    Value *EE = Builder.CreateExtractElement(Vec, I);
-    Value *GEP = Builder.CreateInBoundsGEP(
-        ArrTy, ArrAlloca,
-        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, I)});
-    Builder.CreateStore(EE, GEP);
-  }
-
+  std::pair<Value *, Type *> Arr = allocaArrayFromVector(Builder, Vec, IndexTy);
+  Value *ArrAlloca = Arr.first;
+  Type *ArrTy = Arr.second;
   Value *GEP = Builder.CreateInBoundsGEP(ArrTy, ArrAlloca,
                                          {ConstantInt::get(IndexTy, 0), Index});
   Builder.CreateStore(Val, GEP);
@@ -212,18 +218,10 @@ bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
     return false;
 
   IRBuilder<> Builder(&EEI);
-  VectorType *VecTy = EEI.getVectorOperandType();
-
-  Type *ArrTy = equivalentArrayTypeFromVector(VecTy);
-  Value *ArrAlloca = Builder.CreateAlloca(ArrTy);
-
-  for (unsigned I = 0; I < ArrTy->getArrayNumElements(); ++I) {
-    Value *EE = Builder.CreateExtractElement(EEI.getVectorOperand(), I);
-    Value *GEP = Builder.CreateInBoundsGEP(
-        ArrTy, ArrAlloca,
-        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, I)});
-    Builder.CreateStore(EE, GEP);
-  }
+  std::pair<Value *, Type *> Arr =
+      allocaArrayFromVector(Builder, EEI.getVectorOperand(), IndexTy);
+  Value *ArrAlloca = Arr.first;
+  Type *ArrTy = Arr.second;
 
   Value *GEP = Builder.CreateInBoundsGEP(ArrTy, ArrAlloca,
                                          {ConstantInt::get(IndexTy, 0), Index});

>From 989d82e5e7cd047f8ec864f233a66f2536828daf Mon Sep 17 00:00:00 2001
From: Icohedron <cheung.deric at gmail.com>
Date: Tue, 27 May 2025 23:58:57 +0000
Subject: [PATCH 6/9] Refactor visitExtractElementInst and
 visitInsertElementInst

---
 .../Target/DirectX/DXILDataScalarization.cpp  | 30 ++++++++++++-------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index a2e27e4f4ff8b..8164850927e2a 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -188,36 +188,38 @@ allocaArrayFromVector(IRBuilder<> &Builder, Value *Vec, Type *IdxTy) {
   return std::make_pair(ArrAlloca, ArrTy);
 }
 
-bool DataScalarizerVisitor::visitInsertElementInst(InsertElementInst &IEI) {
+static bool replaceDynamicInsertElementInst(InsertElementInst &IEI) {
+  IRBuilder<> Builder(&IEI);
+
   Value *Vec = IEI.getOperand(0);
   Value *Val = IEI.getOperand(1);
   Value *Index = IEI.getOperand(2);
   Type *IndexTy = Index->getType();
 
-  // If the index is a constant then we don't need to scalarize it
-  if (isa<ConstantInt>(Index))
-    return false;
-
-  IRBuilder<> Builder(&IEI);
   std::pair<Value *, Type *> Arr = allocaArrayFromVector(Builder, Vec, IndexTy);
   Value *ArrAlloca = Arr.first;
   Type *ArrTy = Arr.second;
   Value *GEP = Builder.CreateInBoundsGEP(ArrTy, ArrAlloca,
                                          {ConstantInt::get(IndexTy, 0), Index});
   Builder.CreateStore(Val, GEP);
-
   IEI.eraseFromParent();
   return true;
 }
 
-bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
+bool DataScalarizerVisitor::visitInsertElementInst(InsertElementInst &IEI) {
   // If the index is a constant then we don't need to scalarize it
-  Value *Index = EEI.getIndexOperand();
-  Type *IndexTy = Index->getType();
+  Value *Index = IEI.getOperand(2);
   if (isa<ConstantInt>(Index))
     return false;
+  return replaceDynamicInsertElementInst(IEI);
+}
 
+static bool replaceDynamicExtractElementInst(ExtractElementInst &EEI) {
   IRBuilder<> Builder(&EEI);
+
+  Value *Index = EEI.getIndexOperand();
+  Type *IndexTy = Index->getType();
+
   std::pair<Value *, Type *> Arr =
       allocaArrayFromVector(Builder, EEI.getVectorOperand(), IndexTy);
   Value *ArrAlloca = Arr.first;
@@ -232,6 +234,14 @@ bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
   return true;
 }
 
+bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
+  // If the index is a constant then we don't need to scalarize it
+  Value *Index = EEI.getIndexOperand();
+  if (isa<ConstantInt>(Index))
+    return false;
+  return replaceDynamicExtractElementInst(EEI);
+}
+
 bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
 
   unsigned NumOperands = GEPI.getNumOperands();

>From bfa93eac0112f859c7813acba37c307fe70624ec Mon Sep 17 00:00:00 2001
From: Icohedron <cheung.deric at gmail.com>
Date: Thu, 29 May 2025 04:03:58 +0000
Subject: [PATCH 7/9] Complete scalarization of insertelement with dynamic
 index

---
 .../Target/DirectX/DXILDataScalarization.cpp  | 58 +++++++++++--------
 .../DirectX/scalarize-dynamic-vector-index.ll | 26 +++++----
 2 files changed, 50 insertions(+), 34 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index 8164850927e2a..9ef43c938d9b5 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -172,36 +172,41 @@ bool DataScalarizerVisitor::visitStoreInst(StoreInst &SI) {
   return false;
 }
 
-// Allocates and populates an array equivalent to the vector operand Vec.
-// Returns the array and the type of the array.
-static std::pair<Value *, Type *>
-allocaArrayFromVector(IRBuilder<> &Builder, Value *Vec, Type *IdxTy) {
+static bool replaceDynamicInsertElementInst(InsertElementInst &IEI) {
+  IRBuilder<> Builder(&IEI);
+
+  Value *Vec = IEI.getOperand(0);
+  Value *Val = IEI.getOperand(1);
+  Value *Index = IEI.getOperand(2);
+  Type *IndexTy = Index->getType();
+
   Type *ArrTy = equivalentArrayTypeFromVector(Vec->getType());
   Value *ArrAlloca = Builder.CreateAlloca(ArrTy);
-  for (unsigned I = 0; I < ArrTy->getArrayNumElements(); ++I) {
+  const uint64_t ArrNumElems = ArrTy->getArrayNumElements();
+
+  SmallVector<Value *, 4> GEPs(ArrNumElems);
+  for (unsigned I = 0; I < ArrNumElems; ++I) {
     Value *EE = Builder.CreateExtractElement(Vec, I);
     Value *GEP = Builder.CreateInBoundsGEP(
         ArrTy, ArrAlloca,
-        {ConstantInt::get(IdxTy, 0), ConstantInt::get(IdxTy, I)});
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, I)});
     Builder.CreateStore(EE, GEP);
+    GEPs[I] = GEP;
   }
-  return std::make_pair(ArrAlloca, ArrTy);
-}
 
-static bool replaceDynamicInsertElementInst(InsertElementInst &IEI) {
-  IRBuilder<> Builder(&IEI);
+  Value *GEPForStore = Builder.CreateInBoundsGEP(
+      ArrTy, ArrAlloca, {ConstantInt::get(IndexTy, 0), Index});
+  Builder.CreateStore(Val, GEPForStore);
 
-  Value *Vec = IEI.getOperand(0);
-  Value *Val = IEI.getOperand(1);
-  Value *Index = IEI.getOperand(2);
-  Type *IndexTy = Index->getType();
+  Value *NewIEI = PoisonValue::get(Vec->getType());
+  for (unsigned I = 0; I < ArrNumElems; ++I) {
+    Value *GEP = GEPs[I];
+    Value *Load = Builder.CreateLoad(ArrTy->getArrayElementType(), GEP);
+    NewIEI =
+        Builder.CreateInsertElement(NewIEI, Load, ConstantInt::get(IndexTy, I));
+  }
 
-  std::pair<Value *, Type *> Arr = allocaArrayFromVector(Builder, Vec, IndexTy);
-  Value *ArrAlloca = Arr.first;
-  Type *ArrTy = Arr.second;
-  Value *GEP = Builder.CreateInBoundsGEP(ArrTy, ArrAlloca,
-                                         {ConstantInt::get(IndexTy, 0), Index});
-  Builder.CreateStore(Val, GEP);
+  IEI.replaceAllUsesWith(NewIEI);
   IEI.eraseFromParent();
   return true;
 }
@@ -220,10 +225,15 @@ static bool replaceDynamicExtractElementInst(ExtractElementInst &EEI) {
   Value *Index = EEI.getIndexOperand();
   Type *IndexTy = Index->getType();
 
-  std::pair<Value *, Type *> Arr =
-      allocaArrayFromVector(Builder, EEI.getVectorOperand(), IndexTy);
-  Value *ArrAlloca = Arr.first;
-  Type *ArrTy = Arr.second;
+  Type *ArrTy = equivalentArrayTypeFromVector(EEI.getVectorOperandType());
+  Value *ArrAlloca = Builder.CreateAlloca(ArrTy);
+  for (unsigned I = 0; I < ArrTy->getArrayNumElements(); ++I) {
+    Value *EE = Builder.CreateExtractElement(EEI.getVectorOperand(), I);
+    Value *GEP = Builder.CreateInBoundsGEP(
+        ArrTy, ArrAlloca,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, I)});
+    Builder.CreateStore(EE, GEP);
+  }
 
   Value *GEP = Builder.CreateInBoundsGEP(ArrTy, ArrAlloca,
                                          {ConstantInt::get(IndexTy, 0), Index});
diff --git a/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll b/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll
index b1191d903fd49..1fe9868b88f65 100644
--- a/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll
+++ b/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll
@@ -25,8 +25,8 @@ define float @extract_float_vec_dynamic(<4 x float> %v, i32 %i) {
   ret float %ee
 }
 
-define void @insert_i32_vec_dynamic(<3 x i32> %v, i32 %a, i32 %i) {
-; CHECK-LABEL: define void @insert_i32_vec_dynamic(
+define <3 x i32> @insert_i32_vec_dynamic(<3 x i32> %v, i32 %a, i32 %i) {
+; CHECK-LABEL: define <3 x i32> @insert_i32_vec_dynamic(
 ; CHECK-SAME: <3 x i32> [[V:%.*]], i32 [[A:%.*]], i32 [[I:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca [3 x i32], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i32> [[V]], i64 0
@@ -40,10 +40,16 @@ define void @insert_i32_vec_dynamic(<3 x i32> %v, i32 %a, i32 %i) {
 ; CHECK-NEXT:    store i32 [[TMP6]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP1]], i32 0, i32 [[I]]
 ; CHECK-NEXT:    store i32 [[A]], ptr [[TMP8]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <3 x i32> poison, i32 [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <3 x i32> [[TMP10]], i32 [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <3 x i32> [[TMP12]], i32 [[TMP13]], i32 2
+; CHECK-NEXT:    ret <3 x i32> [[TMP14]]
 ;
-  insertelement <3 x i32> %v, i32 %a, i32 %i
-  ret void
+  %ie = insertelement <3 x i32> %v, i32 %a, i32 %i
+  ret <3 x i32> %ie
 }
 
 ; An extractelement with a constant index should not be converted to array form
@@ -58,13 +64,13 @@ define i16 @extract_i16_vec_constant(<4 x i16> %v) {
 }
 
 ; An insertelement with a constant index should not be converted to array form
-define void @insert_half_vec_constant(<2 x half> %v, half %a) {
-; CHECK-LABEL: define void @insert_half_vec_constant(
+define <2 x half> @insert_half_vec_constant(<2 x half> %v, half %a) {
+; CHECK-LABEL: define <2 x half> @insert_half_vec_constant(
 ; CHECK-SAME: <2 x half> [[V:%.*]], half [[A:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x half> [[V]], half [[A]], i32 1
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> [[TMP1]]
 ;
-  insertelement <2 x half> %v, half %a, i32 1
-  ret void
+  %ie = insertelement <2 x half> %v, half %a, i32 1
+  ret <2 x half> %ie
 }
 

>From f78b0590aaf4a2a93794dc15744b3e0631bb615d Mon Sep 17 00:00:00 2001
From: Icohedron <cheung.deric at gmail.com>
Date: Fri, 6 Jun 2025 20:27:33 +0000
Subject: [PATCH 8/9] Reuse allocas and place allocas in entry block

---
 .../Target/DirectX/DXILDataScalarization.cpp  | 105 +++++++++------
 .../DirectX/scalarize-dynamic-vector-index.ll | 125 ++++++++++++------
 2 files changed, 155 insertions(+), 75 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index 9ef43c938d9b5..922e86936230d 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -79,6 +79,16 @@ class DataScalarizerVisitor : public InstVisitor<DataScalarizerVisitor, bool> {
   friend bool findAndReplaceVectors(llvm::Module &M);
 
 private:
+  typedef std::pair<AllocaInst *, SmallVector<Value *, 4>> AllocaAndGEPs;
+  typedef SmallDenseMap<Value *, AllocaAndGEPs>
+      VectorToArrayMap; // A map from a vector-typed Value to its corresponding
+                        // AllocaInst and GEPs to each element of an array
+  VectorToArrayMap VectorAllocaMap;
+  AllocaAndGEPs createArrayFromVector(IRBuilder<> &Builder, Value *Vec,
+                                      const Twine &Name);
+  bool replaceDynamicInsertElementInst(InsertElementInst &IEI);
+  bool replaceDynamicExtractElementInst(ExtractElementInst &EEI);
+
   GlobalVariable *lookupReplacementGlobal(Value *CurrOperand);
   DenseMap<GlobalVariable *, GlobalVariable *> GlobalMap;
 };
@@ -90,6 +100,7 @@ bool DataScalarizerVisitor::visit(Function &F) {
     for (Instruction &I : make_early_inc_range(*BB))
       MadeChange |= InstVisitor::visit(I);
   }
+  VectorAllocaMap.clear();
   return MadeChange;
 }
 
@@ -172,38 +183,61 @@ bool DataScalarizerVisitor::visitStoreInst(StoreInst &SI) {
   return false;
 }
 
-static bool replaceDynamicInsertElementInst(InsertElementInst &IEI) {
-  IRBuilder<> Builder(&IEI);
+DataScalarizerVisitor::AllocaAndGEPs
+DataScalarizerVisitor::createArrayFromVector(IRBuilder<> &Builder, Value *Vec,
+                                             const Twine &Name = "") {
+  // If there is already an alloca for this vector, return it
+  auto VA = VectorAllocaMap.find(Vec);
+  if (VA != VectorAllocaMap.end())
+    return VA->second;
 
-  Value *Vec = IEI.getOperand(0);
-  Value *Val = IEI.getOperand(1);
-  Value *Index = IEI.getOperand(2);
-  Type *IndexTy = Index->getType();
+  auto InsertPoint = Builder.GetInsertPoint();
+  Builder.SetInsertPointPastAllocas(Builder.GetInsertBlock()->getParent());
 
   Type *ArrTy = equivalentArrayTypeFromVector(Vec->getType());
-  Value *ArrAlloca = Builder.CreateAlloca(ArrTy);
+  AllocaInst *ArrAlloca =
+      Builder.CreateAlloca(ArrTy, nullptr, Name + ".alloca");
   const uint64_t ArrNumElems = ArrTy->getArrayNumElements();
 
   SmallVector<Value *, 4> GEPs(ArrNumElems);
   for (unsigned I = 0; I < ArrNumElems; ++I) {
-    Value *EE = Builder.CreateExtractElement(Vec, I);
-    Value *GEP = Builder.CreateInBoundsGEP(
-        ArrTy, ArrAlloca,
-        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, I)});
-    Builder.CreateStore(EE, GEP);
-    GEPs[I] = GEP;
+    Value *EE = Builder.CreateExtractElement(Vec, I, Name + ".extract");
+    GEPs[I] = Builder.CreateInBoundsGEP(
+        ArrTy, ArrAlloca, {Builder.getInt32(0), Builder.getInt32(I)},
+        Name + ".index");
+    Builder.CreateStore(EE, GEPs[I]);
   }
 
-  Value *GEPForStore = Builder.CreateInBoundsGEP(
-      ArrTy, ArrAlloca, {ConstantInt::get(IndexTy, 0), Index});
+  VectorAllocaMap.insert({Vec, {ArrAlloca, GEPs}});
+  Builder.SetInsertPoint(InsertPoint);
+  return {ArrAlloca, GEPs};
+}
+
+bool DataScalarizerVisitor::replaceDynamicInsertElementInst(
+    InsertElementInst &IEI) {
+  IRBuilder<> Builder(&IEI);
+
+  Value *Vec = IEI.getOperand(0);
+  Value *Val = IEI.getOperand(1);
+  Value *Index = IEI.getOperand(2);
+
+  AllocaAndGEPs ArrAllocaAndGEPs =
+      createArrayFromVector(Builder, Vec, IEI.getName());
+  AllocaInst *ArrAlloca = ArrAllocaAndGEPs.first;
+  SmallVector<Value *, 4> &ArrGEPs = ArrAllocaAndGEPs.second;
+
+  Type *ArrTy = ArrAlloca->getAllocatedType();
+  Value *GEPForStore =
+      Builder.CreateInBoundsGEP(ArrTy, ArrAlloca, {Builder.getInt32(0), Index},
+                                IEI.getName() + ".dynindex");
   Builder.CreateStore(Val, GEPForStore);
 
   Value *NewIEI = PoisonValue::get(Vec->getType());
-  for (unsigned I = 0; I < ArrNumElems; ++I) {
-    Value *GEP = GEPs[I];
-    Value *Load = Builder.CreateLoad(ArrTy->getArrayElementType(), GEP);
-    NewIEI =
-        Builder.CreateInsertElement(NewIEI, Load, ConstantInt::get(IndexTy, I));
+  for (unsigned I = 0; I < ArrTy->getArrayNumElements(); ++I) {
+    Value *Load = Builder.CreateLoad(ArrTy->getArrayElementType(), ArrGEPs[I],
+                                     IEI.getName() + ".load");
+    NewIEI = Builder.CreateInsertElement(NewIEI, Load, Builder.getInt32(I),
+                                         IEI.getName() + ".insert");
   }
 
   IEI.replaceAllUsesWith(NewIEI);
@@ -219,25 +253,20 @@ bool DataScalarizerVisitor::visitInsertElementInst(InsertElementInst &IEI) {
   return replaceDynamicInsertElementInst(IEI);
 }
 
-static bool replaceDynamicExtractElementInst(ExtractElementInst &EEI) {
+bool DataScalarizerVisitor::replaceDynamicExtractElementInst(
+    ExtractElementInst &EEI) {
   IRBuilder<> Builder(&EEI);
 
-  Value *Index = EEI.getIndexOperand();
-  Type *IndexTy = Index->getType();
-
-  Type *ArrTy = equivalentArrayTypeFromVector(EEI.getVectorOperandType());
-  Value *ArrAlloca = Builder.CreateAlloca(ArrTy);
-  for (unsigned I = 0; I < ArrTy->getArrayNumElements(); ++I) {
-    Value *EE = Builder.CreateExtractElement(EEI.getVectorOperand(), I);
-    Value *GEP = Builder.CreateInBoundsGEP(
-        ArrTy, ArrAlloca,
-        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, I)});
-    Builder.CreateStore(EE, GEP);
-  }
+  AllocaAndGEPs ArrAllocaAndGEPs =
+      createArrayFromVector(Builder, EEI.getVectorOperand(), EEI.getName());
+  AllocaInst *ArrAlloca = ArrAllocaAndGEPs.first;
 
-  Value *GEP = Builder.CreateInBoundsGEP(ArrTy, ArrAlloca,
-                                         {ConstantInt::get(IndexTy, 0), Index});
-  Value *Load = Builder.CreateLoad(ArrTy->getArrayElementType(), GEP);
+  Type *ArrTy = ArrAlloca->getAllocatedType();
+  Value *GEP = Builder.CreateInBoundsGEP(
+      ArrTy, ArrAlloca, {Builder.getInt32(0), EEI.getIndexOperand()},
+      EEI.getName() + ".index");
+  Value *Load = Builder.CreateLoad(ArrTy->getArrayElementType(), GEP,
+                                   EEI.getName() + ".load");
 
   EEI.replaceAllUsesWith(Load);
   EEI.eraseFromParent();
@@ -276,8 +305,8 @@ bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
   return true;
 }
 
-Constant *transformInitializer(Constant *Init, Type *OrigType, Type *NewType,
-                               LLVMContext &Ctx) {
+static Constant *transformInitializer(Constant *Init, Type *OrigType,
+                                      Type *NewType, LLVMContext &Ctx) {
   // Handle ConstantAggregateZero (zero-initialized constants)
   if (isa<ConstantAggregateZero>(Init)) {
     return ConstantAggregateZero::get(NewType);
diff --git a/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll b/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll
index 1fe9868b88f65..e6d01489ea961 100644
--- a/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll
+++ b/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll
@@ -1,25 +1,76 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes='dxil-data-scalarization' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
+; Allocas should be placed in the entry block.
+; Allocas should also be reused across multiple insertelement and extractelement instructions for the same vector
+define void @alloca_placement_and_reuse(<3 x i32> %v1, <3 x i32> %v2, i32 %a, i32 %i, i32 %j) {
+; CHECK-LABEL: define void @alloca_placement_and_reuse(
+; CHECK-SAME: <3 x i32> [[V1:%.*]], <3 x i32> [[V2:%.*]], i32 [[A:%.*]], i32 [[I:%.*]], i32 [[J:%.*]]) {
+; CHECK-NEXT:    [[AL:%.*]] = alloca [3 x i32], align 4
+; CHECK-NEXT:    [[EE1_ALLOCA:%.*]] = alloca [3 x i32], align 4
+; CHECK-NEXT:    [[EE2_ALLOCA:%.*]] = alloca [3 x i32], align 4
+; CHECK-NEXT:    [[EE2_EXTRACT:%.*]] = extractelement <3 x i32> [[V2]], i64 0
+; CHECK-NEXT:    [[EE2_INDEX:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE2_ALLOCA]], i32 0, i32 0
+; CHECK-NEXT:    store i32 [[EE2_EXTRACT]], ptr [[EE2_INDEX]], align 4
+; CHECK-NEXT:    [[EE2_EXTRACT10:%.*]] = extractelement <3 x i32> [[V2]], i64 1
+; CHECK-NEXT:    [[EE2_INDEX11:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE2_ALLOCA]], i32 0, i32 1
+; CHECK-NEXT:    store i32 [[EE2_EXTRACT10]], ptr [[EE2_INDEX11]], align 4
+; CHECK-NEXT:    [[EE2_EXTRACT12:%.*]] = extractelement <3 x i32> [[V2]], i64 2
+; CHECK-NEXT:    [[EE2_INDEX13:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE2_ALLOCA]], i32 0, i32 2
+; CHECK-NEXT:    store i32 [[EE2_EXTRACT12]], ptr [[EE2_INDEX13]], align 4
+; CHECK-NEXT:    [[EE1_EXTRACT:%.*]] = extractelement <3 x i32> [[V1]], i64 0
+; CHECK-NEXT:    [[EE1_INDEX:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE1_ALLOCA]], i32 0, i32 0
+; CHECK-NEXT:    store i32 [[EE1_EXTRACT]], ptr [[EE1_INDEX]], align 4
+; CHECK-NEXT:    [[EE1_EXTRACT1:%.*]] = extractelement <3 x i32> [[V1]], i64 1
+; CHECK-NEXT:    [[EE1_INDEX2:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE1_ALLOCA]], i32 0, i32 1
+; CHECK-NEXT:    store i32 [[EE1_EXTRACT1]], ptr [[EE1_INDEX2]], align 4
+; CHECK-NEXT:    [[EE1_EXTRACT3:%.*]] = extractelement <3 x i32> [[V1]], i64 2
+; CHECK-NEXT:    [[EE1_INDEX4:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE1_ALLOCA]], i32 0, i32 2
+; CHECK-NEXT:    store i32 [[EE1_EXTRACT3]], ptr [[EE1_INDEX4]], align 4
+; CHECK-NEXT:    br label %[[BODY:.*]]
+; CHECK:       [[BODY]]:
+; CHECK-NEXT:    [[EE1_INDEX5:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE1_ALLOCA]], i32 0, i32 [[I]]
+; CHECK-NEXT:    [[EE1_LOAD:%.*]] = load i32, ptr [[EE1_INDEX5]], align 4
+; CHECK-NEXT:    [[IE1_DYNINDEX:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE1_ALLOCA]], i32 0, i32 [[I]]
+; CHECK-NEXT:    store i32 [[A]], ptr [[IE1_DYNINDEX]], align 4
+; CHECK-NEXT:    [[IE1_LOAD:%.*]] = load i32, ptr [[EE1_INDEX]], align 4
+; CHECK-NEXT:    [[IE1_INSERT:%.*]] = insertelement <3 x i32> poison, i32 [[IE1_LOAD]], i32 0
+; CHECK-NEXT:    [[IE1_LOAD6:%.*]] = load i32, ptr [[EE1_INDEX2]], align 4
+; CHECK-NEXT:    [[IE1_INSERT7:%.*]] = insertelement <3 x i32> [[IE1_INSERT]], i32 [[IE1_LOAD6]], i32 1
+; CHECK-NEXT:    [[IE1_LOAD8:%.*]] = load i32, ptr [[EE1_INDEX4]], align 4
+; CHECK-NEXT:    [[IE1_INSERT9:%.*]] = insertelement <3 x i32> [[IE1_INSERT7]], i32 [[IE1_LOAD8]], i32 2
+; CHECK-NEXT:    [[EE2_INDEX14:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE2_ALLOCA]], i32 0, i32 [[J]]
+; CHECK-NEXT:    [[EE2_LOAD:%.*]] = load i32, ptr [[EE2_INDEX14]], align 4
+; CHECK-NEXT:    ret void
+;
+  %al = alloca [3 x i32], align 4
+  br label %body
+body:
+  %ee1 = extractelement <3 x i32> %v1, i32 %i
+  %ie1 = insertelement <3 x i32> %v1, i32 %a, i32 %i
+  %ee2 = extractelement <3 x i32> %v2, i32 %j
+  ret void
+}
+
 define float @extract_float_vec_dynamic(<4 x float> %v, i32 %i) {
 ; CHECK-LABEL: define float @extract_float_vec_dynamic(
 ; CHECK-SAME: <4 x float> [[V:%.*]], i32 [[I:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = alloca [4 x float], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[V]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP1]], i32 0, i32 0
-; CHECK-NEXT:    store float [[TMP2]], ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[V]], i64 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP1]], i32 0, i32 1
-; CHECK-NEXT:    store float [[TMP4]], ptr [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[V]], i64 2
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP1]], i32 0, i32 2
-; CHECK-NEXT:    store float [[TMP6]], ptr [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[V]], i64 3
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP1]], i32 0, i32 3
-; CHECK-NEXT:    store float [[TMP8]], ptr [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP1]], i32 0, i32 [[I]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4
-; CHECK-NEXT:    ret float [[TMP11]]
+; CHECK-NEXT:    [[EE_ALLOCA:%.*]] = alloca [4 x float], align 4
+; CHECK-NEXT:    [[EE_EXTRACT:%.*]] = extractelement <4 x float> [[V]], i64 0
+; CHECK-NEXT:    [[EE_INDEX:%.*]] = getelementptr inbounds [4 x float], ptr [[EE_ALLOCA]], i32 0, i32 0
+; CHECK-NEXT:    store float [[EE_EXTRACT]], ptr [[EE_INDEX]], align 4
+; CHECK-NEXT:    [[EE_EXTRACT1:%.*]] = extractelement <4 x float> [[V]], i64 1
+; CHECK-NEXT:    [[EE_INDEX2:%.*]] = getelementptr inbounds [4 x float], ptr [[EE_ALLOCA]], i32 0, i32 1
+; CHECK-NEXT:    store float [[EE_EXTRACT1]], ptr [[EE_INDEX2]], align 4
+; CHECK-NEXT:    [[EE_EXTRACT3:%.*]] = extractelement <4 x float> [[V]], i64 2
+; CHECK-NEXT:    [[EE_INDEX4:%.*]] = getelementptr inbounds [4 x float], ptr [[EE_ALLOCA]], i32 0, i32 2
+; CHECK-NEXT:    store float [[EE_EXTRACT3]], ptr [[EE_INDEX4]], align 4
+; CHECK-NEXT:    [[EE_EXTRACT5:%.*]] = extractelement <4 x float> [[V]], i64 3
+; CHECK-NEXT:    [[EE_INDEX6:%.*]] = getelementptr inbounds [4 x float], ptr [[EE_ALLOCA]], i32 0, i32 3
+; CHECK-NEXT:    store float [[EE_EXTRACT5]], ptr [[EE_INDEX6]], align 4
+; CHECK-NEXT:    [[EE_INDEX7:%.*]] = getelementptr inbounds [4 x float], ptr [[EE_ALLOCA]], i32 0, i32 [[I]]
+; CHECK-NEXT:    [[EE_LOAD:%.*]] = load float, ptr [[EE_INDEX7]], align 4
+; CHECK-NEXT:    ret float [[EE_LOAD]]
 ;
   %ee = extractelement <4 x float> %v, i32 %i
   ret float %ee
@@ -28,25 +79,25 @@ define float @extract_float_vec_dynamic(<4 x float> %v, i32 %i) {
 define <3 x i32> @insert_i32_vec_dynamic(<3 x i32> %v, i32 %a, i32 %i) {
 ; CHECK-LABEL: define <3 x i32> @insert_i32_vec_dynamic(
 ; CHECK-SAME: <3 x i32> [[V:%.*]], i32 [[A:%.*]], i32 [[I:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = alloca [3 x i32], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i32> [[V]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP1]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[TMP2]], ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <3 x i32> [[V]], i64 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP1]], i32 0, i32 1
-; CHECK-NEXT:    store i32 [[TMP4]], ptr [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <3 x i32> [[V]], i64 2
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP1]], i32 0, i32 2
-; CHECK-NEXT:    store i32 [[TMP6]], ptr [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP1]], i32 0, i32 [[I]]
-; CHECK-NEXT:    store i32 [[A]], ptr [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <3 x i32> poison, i32 [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <3 x i32> [[TMP10]], i32 [[TMP11]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <3 x i32> [[TMP12]], i32 [[TMP13]], i32 2
-; CHECK-NEXT:    ret <3 x i32> [[TMP14]]
+; CHECK-NEXT:    [[IE_ALLOCA:%.*]] = alloca [3 x i32], align 4
+; CHECK-NEXT:    [[IE_EXTRACT:%.*]] = extractelement <3 x i32> [[V]], i64 0
+; CHECK-NEXT:    [[IE_INDEX:%.*]] = getelementptr inbounds [3 x i32], ptr [[IE_ALLOCA]], i32 0, i32 0
+; CHECK-NEXT:    store i32 [[IE_EXTRACT]], ptr [[IE_INDEX]], align 4
+; CHECK-NEXT:    [[IE_EXTRACT1:%.*]] = extractelement <3 x i32> [[V]], i64 1
+; CHECK-NEXT:    [[IE_INDEX2:%.*]] = getelementptr inbounds [3 x i32], ptr [[IE_ALLOCA]], i32 0, i32 1
+; CHECK-NEXT:    store i32 [[IE_EXTRACT1]], ptr [[IE_INDEX2]], align 4
+; CHECK-NEXT:    [[IE_EXTRACT3:%.*]] = extractelement <3 x i32> [[V]], i64 2
+; CHECK-NEXT:    [[IE_INDEX4:%.*]] = getelementptr inbounds [3 x i32], ptr [[IE_ALLOCA]], i32 0, i32 2
+; CHECK-NEXT:    store i32 [[IE_EXTRACT3]], ptr [[IE_INDEX4]], align 4
+; CHECK-NEXT:    [[IE_DYNINDEX:%.*]] = getelementptr inbounds [3 x i32], ptr [[IE_ALLOCA]], i32 0, i32 [[I]]
+; CHECK-NEXT:    store i32 [[A]], ptr [[IE_DYNINDEX]], align 4
+; CHECK-NEXT:    [[IE_LOAD:%.*]] = load i32, ptr [[IE_INDEX]], align 4
+; CHECK-NEXT:    [[IE_INSERT:%.*]] = insertelement <3 x i32> poison, i32 [[IE_LOAD]], i32 0
+; CHECK-NEXT:    [[IE_LOAD5:%.*]] = load i32, ptr [[IE_INDEX2]], align 4
+; CHECK-NEXT:    [[IE_INSERT6:%.*]] = insertelement <3 x i32> [[IE_INSERT]], i32 [[IE_LOAD5]], i32 1
+; CHECK-NEXT:    [[IE_LOAD7:%.*]] = load i32, ptr [[IE_INDEX4]], align 4
+; CHECK-NEXT:    [[IE_INSERT8:%.*]] = insertelement <3 x i32> [[IE_INSERT6]], i32 [[IE_LOAD7]], i32 2
+; CHECK-NEXT:    ret <3 x i32> [[IE_INSERT8]]
 ;
   %ie = insertelement <3 x i32> %v, i32 %a, i32 %i
   ret <3 x i32> %ie
@@ -67,8 +118,8 @@ define i16 @extract_i16_vec_constant(<4 x i16> %v) {
 define <2 x half> @insert_half_vec_constant(<2 x half> %v, half %a) {
 ; CHECK-LABEL: define <2 x half> @insert_half_vec_constant(
 ; CHECK-SAME: <2 x half> [[V:%.*]], half [[A:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x half> [[V]], half [[A]], i32 1
-; CHECK-NEXT:    ret <2 x half> [[TMP1]]
+; CHECK-NEXT:    [[IE:%.*]] = insertelement <2 x half> [[V]], half [[A]], i32 1
+; CHECK-NEXT:    ret <2 x half> [[IE]]
 ;
   %ie = insertelement <2 x half> %v, half %a, i32 1
   ret <2 x half> %ie

>From 63afe6eebea1cdf495fcb12389feb0237676759b Mon Sep 17 00:00:00 2001
From: Icohedron <cheung.deric at gmail.com>
Date: Tue, 17 Jun 2025 22:28:55 +0000
Subject: [PATCH 9/9] Ensure correctness in control flow and multiple
 insertelements

---
 .../Target/DirectX/DXILDataScalarization.cpp  |  45 +++++--
 .../DirectX/scalarize-dynamic-vector-index.ll | 113 +++++++++++++-----
 2 files changed, 117 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index 922e86936230d..1d94362a05459 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -192,13 +192,19 @@ DataScalarizerVisitor::createArrayFromVector(IRBuilder<> &Builder, Value *Vec,
     return VA->second;
 
   auto InsertPoint = Builder.GetInsertPoint();
-  Builder.SetInsertPointPastAllocas(Builder.GetInsertBlock()->getParent());
 
+  // Allocate the array to hold the vector elements
+  Builder.SetInsertPointPastAllocas(Builder.GetInsertBlock()->getParent());
   Type *ArrTy = equivalentArrayTypeFromVector(Vec->getType());
   AllocaInst *ArrAlloca =
       Builder.CreateAlloca(ArrTy, nullptr, Name + ".alloca");
   const uint64_t ArrNumElems = ArrTy->getArrayNumElements();
 
+  // Extract each vector element and store it into the array, placing these
+  // instructions immediately after the original vector's defining instruction
+  // if available, else immediately after the alloca
+  if (auto *Instr = dyn_cast<Instruction>(Vec))
+    Builder.SetInsertPoint(Instr->getNextNonDebugInstruction());
   SmallVector<Value *, 4> GEPs(ArrNumElems);
   for (unsigned I = 0; I < ArrNumElems; ++I) {
     Value *EE = Builder.CreateExtractElement(Vec, I, Name + ".extract");
@@ -213,6 +219,19 @@ DataScalarizerVisitor::createArrayFromVector(IRBuilder<> &Builder, Value *Vec,
   return {ArrAlloca, GEPs};
 }
 
+/// Returns a pair of Value* with the first being a GEP into ArrAlloca using
+/// indices {0, Index}, and the second Value* being a Load of the GEP
+static std::pair<Value *, Value *>
+dynamicallyLoadArray(IRBuilder<> &Builder, AllocaInst *ArrAlloca, Value *Index,
+                     const Twine &Name = "") {
+  Type *ArrTy = ArrAlloca->getAllocatedType();
+  Value *GEP = Builder.CreateInBoundsGEP(
+      ArrTy, ArrAlloca, {Builder.getInt32(0), Index}, Name + ".index");
+  Value *Load =
+      Builder.CreateLoad(ArrTy->getArrayElementType(), GEP, Name + ".load");
+  return std::make_pair(GEP, Load);
+}
+
 bool DataScalarizerVisitor::replaceDynamicInsertElementInst(
     InsertElementInst &IEI) {
   IRBuilder<> Builder(&IEI);
@@ -224,14 +243,15 @@ bool DataScalarizerVisitor::replaceDynamicInsertElementInst(
   AllocaAndGEPs ArrAllocaAndGEPs =
       createArrayFromVector(Builder, Vec, IEI.getName());
   AllocaInst *ArrAlloca = ArrAllocaAndGEPs.first;
+  Type *ArrTy = ArrAlloca->getAllocatedType();
   SmallVector<Value *, 4> &ArrGEPs = ArrAllocaAndGEPs.second;
 
-  Type *ArrTy = ArrAlloca->getAllocatedType();
-  Value *GEPForStore =
-      Builder.CreateInBoundsGEP(ArrTy, ArrAlloca, {Builder.getInt32(0), Index},
-                                IEI.getName() + ".dynindex");
-  Builder.CreateStore(Val, GEPForStore);
+  auto GEPAndLoad =
+      dynamicallyLoadArray(Builder, ArrAlloca, Index, IEI.getName());
+  Value *GEP = GEPAndLoad.first;
+  Value *Load = GEPAndLoad.second;
 
+  Builder.CreateStore(Val, GEP);
   Value *NewIEI = PoisonValue::get(Vec->getType());
   for (unsigned I = 0; I < ArrTy->getArrayNumElements(); ++I) {
     Value *Load = Builder.CreateLoad(ArrTy->getArrayElementType(), ArrGEPs[I],
@@ -240,6 +260,10 @@ bool DataScalarizerVisitor::replaceDynamicInsertElementInst(
                                          IEI.getName() + ".insert");
   }
 
+  // Restore the overwritten element so the alloca again matches the source
+  // vector and can be reused by later insert/extractelement instructions
+  Builder.CreateStore(Load, GEP);
+
   IEI.replaceAllUsesWith(NewIEI);
   IEI.eraseFromParent();
   return true;
@@ -261,12 +285,9 @@ bool DataScalarizerVisitor::replaceDynamicExtractElementInst(
       createArrayFromVector(Builder, EEI.getVectorOperand(), EEI.getName());
   AllocaInst *ArrAlloca = ArrAllocaAndGEPs.first;
 
-  Type *ArrTy = ArrAlloca->getAllocatedType();
-  Value *GEP = Builder.CreateInBoundsGEP(
-      ArrTy, ArrAlloca, {Builder.getInt32(0), EEI.getIndexOperand()},
-      EEI.getName() + ".index");
-  Value *Load = Builder.CreateLoad(ArrTy->getArrayElementType(), GEP,
-                                   EEI.getName() + ".load");
+  auto GEPAndLoad = dynamicallyLoadArray(Builder, ArrAlloca,
+                                         EEI.getIndexOperand(), EEI.getName());
+  Value *Load = GEPAndLoad.second;
 
   EEI.replaceAllUsesWith(Load);
   EEI.eraseFromParent();
diff --git a/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll b/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll
index e6d01489ea961..0eb65bd4fc751 100644
--- a/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll
+++ b/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll
@@ -1,37 +1,95 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes='dxil-data-scalarization' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
-; Allocas should be placed in the entry block.
-; Allocas should also be reused across multiple insertelement and extractelement instructions for the same vector
-define void @alloca_placement_and_reuse(<3 x i32> %v1, <3 x i32> %v2, i32 %a, i32 %i, i32 %j) {
-; CHECK-LABEL: define void @alloca_placement_and_reuse(
-; CHECK-SAME: <3 x i32> [[V1:%.*]], <3 x i32> [[V2:%.*]], i32 [[A:%.*]], i32 [[I:%.*]], i32 [[J:%.*]]) {
-; CHECK-NEXT:    [[AL:%.*]] = alloca [3 x i32], align 4
+; Ensure that insertelement instructions have no side effects on each other
+; even in the presence of control flow
+define void @test_multiple_insert(i32 %c, i32 %i, i32 %j) {
+; CHECK-LABEL: define void @test_multiple_insert(
+; CHECK-SAME: i32 [[C:%.*]], i32 [[I:%.*]], i32 [[J:%.*]]) {
+; CHECK-NEXT:    [[V0_ALLOCA:%.*]] = alloca [2 x i32], align 4
+; CHECK-NEXT:    [[V_ALLOCA:%.*]] = alloca [2 x i32], align 4
+; CHECK-NEXT:    [[V0_0:%.*]] = insertelement <2 x i32> poison, i32 0, i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <2 x i32> [[V0_0]], i32 0, i32 1
+; CHECK-NEXT:    [[V0_EXTRACT0:%.*]] = extractelement <2 x i32> [[V0]], i64 0
+; CHECK-NEXT:    [[V0_INDEX0:%.*]] = getelementptr inbounds [2 x i32], ptr [[V0_ALLOCA]], i32 0, i32 0
+; CHECK-NEXT:    store i32 [[V0_EXTRACT0]], ptr [[V0_INDEX0]], align 4
+; CHECK-NEXT:    [[V0_EXTRACT1:%.*]] = extractelement <2 x i32> [[V0]], i64 1
+; CHECK-NEXT:    [[V0_INDEX1:%.*]] = getelementptr inbounds [2 x i32], ptr [[V0_ALLOCA]], i32 0, i32 1
+; CHECK-NEXT:    store i32 [[V0_EXTRACT1]], ptr [[V0_INDEX1]], align 4
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[C]], 1
+; CHECK-NEXT:    br i1 [[COND]], label %[[IF:.*]], label %[[ELSE:.*]]
+; CHECK:       [[IF]]:
+; CHECK-NEXT:    [[V1_INDEX:%.*]] = getelementptr inbounds [2 x i32], ptr [[V0_ALLOCA]], i32 0, i32 [[I]]
+; CHECK-NEXT:    [[V1_LOAD:%.*]] = load i32, ptr [[V1_INDEX]], align 4
+; CHECK-NEXT:    store i32 1, ptr [[V1_INDEX]], align 4
+; CHECK-NEXT:    [[V1_LOAD0:%.*]] = load i32, ptr [[V0_INDEX0]], align 4
+; CHECK-NEXT:    [[V1_INSERT0:%.*]] = insertelement <2 x i32> poison, i32 [[V1_LOAD0]], i32 0
+; CHECK-NEXT:    [[V1_LOAD1:%.*]] = load i32, ptr [[V0_INDEX1]], align 4
+; CHECK-NEXT:    [[V1_INSERT1:%.*]] = insertelement <2 x i32> [[V1_INSERT0]], i32 [[V1_LOAD1]], i32 1
+; CHECK-NEXT:    store i32 [[V1_LOAD]], ptr [[V1_INDEX]], align 4
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[ELSE]]:
+; CHECK-NEXT:    [[V2_INDEX:%.*]] = getelementptr inbounds [2 x i32], ptr [[V0_ALLOCA]], i32 0, i32 [[I]]
+; CHECK-NEXT:    [[V2_LOAD:%.*]] = load i32, ptr [[V2_INDEX]], align 4
+; CHECK-NEXT:    store i32 2, ptr [[V2_INDEX]], align 4
+; CHECK-NEXT:    [[V2_LOAD0:%.*]] = load i32, ptr [[V0_INDEX0]], align 4
+; CHECK-NEXT:    [[V2_INSERT0:%.*]] = insertelement <2 x i32> poison, i32 [[V2_LOAD0]], i32 0
+; CHECK-NEXT:    [[V2_LOAD1:%.*]] = load i32, ptr [[V0_INDEX1]], align 4
+; CHECK-NEXT:    [[V2_INSERT1:%.*]] = insertelement <2 x i32> [[V2_INSERT0]], i32 [[V2_LOAD1]], i32 1
+; CHECK-NEXT:    store i32 [[V2_LOAD]], ptr [[V2_INDEX]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[V:%.*]] = phi <2 x i32> [ [[V1_INSERT1]], %[[IF]] ], [ [[V2_INSERT1]], %[[ELSE]] ]
+; CHECK-NEXT:    [[V_EXTRACT:%.*]] = extractelement <2 x i32> [[V]], i64 0
+; CHECK-NEXT:    [[V_INDEX:%.*]] = getelementptr inbounds [2 x i32], ptr [[V_ALLOCA]], i32 0, i32 0
+; CHECK-NEXT:    store i32 [[V_EXTRACT]], ptr [[V_INDEX]], align 4
+; CHECK-NEXT:    [[V_EXTRACT10:%.*]] = extractelement <2 x i32> [[V]], i64 1
+; CHECK-NEXT:    [[V_INDEX1:%.*]] = getelementptr inbounds [2 x i32], ptr [[V_ALLOCA]], i32 0, i32 1
+; CHECK-NEXT:    store i32 [[V_EXTRACT10]], ptr [[V_INDEX1]], align 4
+; CHECK-NEXT:    [[V3_INDEXJ:%.*]] = getelementptr inbounds [2 x i32], ptr [[V_ALLOCA]], i32 0, i32 [[J]]
+; CHECK-NEXT:    [[V3_LOAD:%.*]] = load i32, ptr [[V3_INDEXJ]], align 4
+; CHECK-NEXT:    store i32 3, ptr [[V3_INDEXJ]], align 4
+; CHECK-NEXT:    [[V3_LOAD0:%.*]] = load i32, ptr [[V_INDEX]], align 4
+; CHECK-NEXT:    [[V3_INSERT0:%.*]] = insertelement <2 x i32> poison, i32 [[V3_LOAD0]], i32 0
+; CHECK-NEXT:    [[V3_LOAD1:%.*]] = load i32, ptr [[V_INDEX1]], align 4
+; CHECK-NEXT:    [[V3_INSERT1:%.*]] = insertelement <2 x i32> [[V3_INSERT0]], i32 [[V3_LOAD1]], i32 1
+; CHECK-NEXT:    store i32 [[V3_LOAD]], ptr [[V3_INDEXJ]], align 4
+; CHECK-NEXT:    ret void
+;
+  %v0_0 = insertelement <2 x i32> poison, i32 0, i32 0
+  %v0 = insertelement <2 x i32> %v0_0, i32 0, i32 1
+  %cond = icmp eq i32 %c, 1
+  br i1 %cond, label %if, label %else
+if:
+  %v1 = insertelement <2 x i32> %v0, i32 1, i32 %i
+  br label %exit
+else:
+  %v2 = insertelement <2 x i32> %v0, i32 2, i32 %i
+  br label %exit
+exit:
+  %v = phi <2 x i32> [ %v1, %if ], [ %v2, %else ]
+  %v3 = insertelement <2 x i32> %v, i32 3, i32 %j
+  ret void
+}
+
+; Allocas can be reused across insert/extractelement instructions on the same vector
+define void @test_alloca_reuse(<3 x i32> %v, i32 %a, i32 %i) {
+; CHECK-LABEL: define void @test_alloca_reuse(
+; CHECK-SAME: <3 x i32> [[V:%.*]], i32 [[A:%.*]], i32 [[I:%.*]]) {
 ; CHECK-NEXT:    [[EE1_ALLOCA:%.*]] = alloca [3 x i32], align 4
-; CHECK-NEXT:    [[EE2_ALLOCA:%.*]] = alloca [3 x i32], align 4
-; CHECK-NEXT:    [[EE2_EXTRACT:%.*]] = extractelement <3 x i32> [[V2]], i64 0
-; CHECK-NEXT:    [[EE2_INDEX:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE2_ALLOCA]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[EE2_EXTRACT]], ptr [[EE2_INDEX]], align 4
-; CHECK-NEXT:    [[EE2_EXTRACT10:%.*]] = extractelement <3 x i32> [[V2]], i64 1
-; CHECK-NEXT:    [[EE2_INDEX11:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE2_ALLOCA]], i32 0, i32 1
-; CHECK-NEXT:    store i32 [[EE2_EXTRACT10]], ptr [[EE2_INDEX11]], align 4
-; CHECK-NEXT:    [[EE2_EXTRACT12:%.*]] = extractelement <3 x i32> [[V2]], i64 2
-; CHECK-NEXT:    [[EE2_INDEX13:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE2_ALLOCA]], i32 0, i32 2
-; CHECK-NEXT:    store i32 [[EE2_EXTRACT12]], ptr [[EE2_INDEX13]], align 4
-; CHECK-NEXT:    [[EE1_EXTRACT:%.*]] = extractelement <3 x i32> [[V1]], i64 0
+; CHECK-NEXT:    [[EE1_EXTRACT:%.*]] = extractelement <3 x i32> [[V]], i64 0
 ; CHECK-NEXT:    [[EE1_INDEX:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE1_ALLOCA]], i32 0, i32 0
 ; CHECK-NEXT:    store i32 [[EE1_EXTRACT]], ptr [[EE1_INDEX]], align 4
-; CHECK-NEXT:    [[EE1_EXTRACT1:%.*]] = extractelement <3 x i32> [[V1]], i64 1
+; CHECK-NEXT:    [[EE1_EXTRACT1:%.*]] = extractelement <3 x i32> [[V]], i64 1
 ; CHECK-NEXT:    [[EE1_INDEX2:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE1_ALLOCA]], i32 0, i32 1
 ; CHECK-NEXT:    store i32 [[EE1_EXTRACT1]], ptr [[EE1_INDEX2]], align 4
-; CHECK-NEXT:    [[EE1_EXTRACT3:%.*]] = extractelement <3 x i32> [[V1]], i64 2
+; CHECK-NEXT:    [[EE1_EXTRACT3:%.*]] = extractelement <3 x i32> [[V]], i64 2
 ; CHECK-NEXT:    [[EE1_INDEX4:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE1_ALLOCA]], i32 0, i32 2
 ; CHECK-NEXT:    store i32 [[EE1_EXTRACT3]], ptr [[EE1_INDEX4]], align 4
-; CHECK-NEXT:    br label %[[BODY:.*]]
-; CHECK:       [[BODY]]:
 ; CHECK-NEXT:    [[EE1_INDEX5:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE1_ALLOCA]], i32 0, i32 [[I]]
 ; CHECK-NEXT:    [[EE1_LOAD:%.*]] = load i32, ptr [[EE1_INDEX5]], align 4
 ; CHECK-NEXT:    [[IE1_DYNINDEX:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE1_ALLOCA]], i32 0, i32 [[I]]
+; CHECK-NEXT:    [[IE1_LOAD1:%.*]] = load i32, ptr [[IE1_DYNINDEX]], align 4
 ; CHECK-NEXT:    store i32 [[A]], ptr [[IE1_DYNINDEX]], align 4
 ; CHECK-NEXT:    [[IE1_LOAD:%.*]] = load i32, ptr [[EE1_INDEX]], align 4
 ; CHECK-NEXT:    [[IE1_INSERT:%.*]] = insertelement <3 x i32> poison, i32 [[IE1_LOAD]], i32 0
@@ -39,16 +97,11 @@ define void @alloca_placement_and_reuse(<3 x i32> %v1, <3 x i32> %v2, i32 %a, i3
 ; CHECK-NEXT:    [[IE1_INSERT7:%.*]] = insertelement <3 x i32> [[IE1_INSERT]], i32 [[IE1_LOAD6]], i32 1
 ; CHECK-NEXT:    [[IE1_LOAD8:%.*]] = load i32, ptr [[EE1_INDEX4]], align 4
 ; CHECK-NEXT:    [[IE1_INSERT9:%.*]] = insertelement <3 x i32> [[IE1_INSERT7]], i32 [[IE1_LOAD8]], i32 2
-; CHECK-NEXT:    [[EE2_INDEX14:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE2_ALLOCA]], i32 0, i32 [[J]]
-; CHECK-NEXT:    [[EE2_LOAD:%.*]] = load i32, ptr [[EE2_INDEX14]], align 4
+; CHECK-NEXT:    store i32 [[IE1_LOAD1]], ptr [[IE1_DYNINDEX]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %al = alloca [3 x i32], align 4
-  br label %body
-body:
-  %ee1 = extractelement <3 x i32> %v1, i32 %i
-  %ie1 = insertelement <3 x i32> %v1, i32 %a, i32 %i
-  %ee2 = extractelement <3 x i32> %v2, i32 %j
+  %ee1 = extractelement <3 x i32> %v, i32 %i
+  %ie1 = insertelement <3 x i32> %v, i32 %a, i32 %i
   ret void
 }
 
@@ -90,6 +143,7 @@ define <3 x i32> @insert_i32_vec_dynamic(<3 x i32> %v, i32 %a, i32 %i) {
 ; CHECK-NEXT:    [[IE_INDEX4:%.*]] = getelementptr inbounds [3 x i32], ptr [[IE_ALLOCA]], i32 0, i32 2
 ; CHECK-NEXT:    store i32 [[IE_EXTRACT3]], ptr [[IE_INDEX4]], align 4
 ; CHECK-NEXT:    [[IE_DYNINDEX:%.*]] = getelementptr inbounds [3 x i32], ptr [[IE_ALLOCA]], i32 0, i32 [[I]]
+; CHECK-NEXT:    [[IE_LOAD1:%.*]] = load i32, ptr [[IE_DYNINDEX]], align 4
 ; CHECK-NEXT:    store i32 [[A]], ptr [[IE_DYNINDEX]], align 4
 ; CHECK-NEXT:    [[IE_LOAD:%.*]] = load i32, ptr [[IE_INDEX]], align 4
 ; CHECK-NEXT:    [[IE_INSERT:%.*]] = insertelement <3 x i32> poison, i32 [[IE_LOAD]], i32 0
@@ -97,6 +151,7 @@ define <3 x i32> @insert_i32_vec_dynamic(<3 x i32> %v, i32 %a, i32 %i) {
 ; CHECK-NEXT:    [[IE_INSERT6:%.*]] = insertelement <3 x i32> [[IE_INSERT]], i32 [[IE_LOAD5]], i32 1
 ; CHECK-NEXT:    [[IE_LOAD7:%.*]] = load i32, ptr [[IE_INDEX4]], align 4
 ; CHECK-NEXT:    [[IE_INSERT8:%.*]] = insertelement <3 x i32> [[IE_INSERT6]], i32 [[IE_LOAD7]], i32 2
+; CHECK-NEXT:    store i32 [[IE_LOAD1]], ptr [[IE_DYNINDEX]], align 4
 ; CHECK-NEXT:    ret <3 x i32> [[IE_INSERT8]]
 ;
   %ie = insertelement <3 x i32> %v, i32 %a, i32 %i



More information about the llvm-commits mailing list