[llvm] [DirectX] Split long vectors in DXILResourceAccess (PR #184732)

via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 4 19:49:50 PST 2026


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-directx

Author: Justin Bogner (bogner)

<details>
<summary>Changes</summary>

The `resource.load` and `resource.store` intrinsics can't handle vectors with
more than 4 elements. Split such loads and stores into multiple calls, each
covering at most 4 elements.

Fixes #167542

---
Full diff: https://github.com/llvm/llvm-project/pull/184732.diff


3 Files Affected:

- (modified) llvm/lib/Target/DirectX/DXILResourceAccess.cpp (+88-33) 
- (added) llvm/test/CodeGen/DirectX/ResourceAccess/load-rawbuffer-wide.ll (+71) 
- (added) llvm/test/CodeGen/DirectX/ResourceAccess/store-rawbuffer-wide.ll (+58) 


``````````diff
diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
index 2896c68d11de7..f939dea85de6e 100644
--- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
+++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
@@ -10,6 +10,7 @@
 #include "DirectX.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/DXILResource.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/Frontend/HLSL/HLSLResource.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/DiagnosticInfo.h"
@@ -142,8 +143,24 @@ static void createTypedBufferStore(IntrinsicInst *II, StoreInst *SI,
   SI->replaceAllUsesWith(Inst);
 }
 
-static void createRawStore(IntrinsicInst *II, StoreInst *SI,
-                           dxil::ResourceTypeInfo &RTI) {
+static void emitRawStore(IRBuilder<> &Builder, Value *Buffer, Value *Index,
+                         Value *Offset, Value *V, dxil::ResourceTypeInfo &RTI) {
+  // For raw buffer (ie, HLSL's ByteAddressBuffer), we need to fold the access
+  // entirely into the index.
+  if (!RTI.isStruct()) {
+    auto *ConstantOffset = dyn_cast<ConstantInt>(Offset);
+    if (!ConstantOffset || !ConstantOffset->isZero())
+      Index = Builder.CreateAdd(Index, Offset);
+    Offset = llvm::PoisonValue::get(Builder.getInt32Ty());
+  }
+
+  Builder.CreateIntrinsic(Builder.getVoidTy(),
+                          Intrinsic::dx_resource_store_rawbuffer,
+                          {Buffer, Index, Offset, V});
+}
+
+static void createRawStores(IntrinsicInst *II, StoreInst *SI,
+                            dxil::ResourceTypeInfo &RTI) {
   const DataLayout &DL = SI->getDataLayout();
   IRBuilder<> Builder(SI);
 
@@ -157,19 +174,27 @@ static void createRawStore(IntrinsicInst *II, StoreInst *SI,
   Value *Offset =
       traverseGEPOffsets(DL, Builder, SI->getPointerOperand(), AccessSize);
 
-  // For raw buffer (ie, HLSL's ByteAddressBuffer), we need to fold the access
-  // entirely into the index.
-  if (!RTI.isStruct()) {
-    auto *ConstantOffset = dyn_cast<ConstantInt>(Offset);
-    if (!ConstantOffset || !ConstantOffset->isZero())
-      Index = Builder.CreateAdd(Index, Offset);
-    Offset = llvm::PoisonValue::get(Builder.getInt32Ty());
-  }
+  auto *VT = dyn_cast<FixedVectorType>(V->getType());
+  if (VT && VT->getNumElements() > 4) {
+    // Split into stores of at most 4 elements.
+    Type *EltTy = VT->getElementType();
+    Value *Stride = ConstantInt::get(Builder.getInt32Ty(),
+                                     4 * (DL.getTypeSizeInBits(EltTy) / 8));
 
-  auto *Inst = Builder.CreateIntrinsic(Builder.getVoidTy(),
-                                       Intrinsic::dx_resource_store_rawbuffer,
-                                       {II->getOperand(0), Index, Offset, V});
-  SI->replaceAllUsesWith(Inst);
+    SmallVector<int, 4> Indices;
+    for (unsigned int I = 0, N = VT->getNumElements(); I < N; I += 4) {
+      if (I > 0)
+        Offset = Builder.CreateAdd(Offset, Stride);
+
+      for (unsigned int J = I, E = std::min(N, J + 4); J < E; ++J)
+        Indices.push_back(J);
+      Value *Part = Builder.CreateShuffleVector(V, Indices);
+      emitRawStore(Builder, II->getOperand(0), Index, Offset, Part, RTI);
+
+      Indices.clear();
+    }
+  } else
+    emitRawStore(Builder, II->getOperand(0), Index, Offset, V, RTI);
 }
 
 static void createStoreIntrinsic(IntrinsicInst *II, StoreInst *SI,
@@ -179,7 +204,7 @@ static void createStoreIntrinsic(IntrinsicInst *II, StoreInst *SI,
     return createTypedBufferStore(II, SI, RTI);
   case dxil::ResourceKind::RawBuffer:
   case dxil::ResourceKind::StructuredBuffer:
-    return createRawStore(II, SI, RTI);
+    return createRawStores(II, SI, RTI);
   case dxil::ResourceKind::Texture1D:
   case dxil::ResourceKind::Texture2D:
   case dxil::ResourceKind::Texture2DMS:
@@ -234,34 +259,64 @@ static void createTypedBufferLoad(IntrinsicInst *II, LoadInst *LI,
   LI->replaceAllUsesWith(V);
 }
 
-static void createRawLoad(IntrinsicInst *II, LoadInst *LI,
+static Value *emitRawLoad(IRBuilder<> &Builder, Type *Ty, Value *Buffer,
+                          Value *Index, Value *Offset,
                           dxil::ResourceTypeInfo &RTI) {
+  // For raw buffer (ie, HLSL's ByteAddressBuffer), we need to fold the access
+  // entirely into the index.
+  if (!RTI.isStruct()) {
+    auto *ConstantOffset = dyn_cast<ConstantInt>(Offset);
+    if (!ConstantOffset || !ConstantOffset->isZero())
+      Index = Builder.CreateAdd(Index, Offset);
+    Offset = llvm::PoisonValue::get(Builder.getInt32Ty());
+  }
+
+  // The load intrinsic includes the bit for CheckAccessFullyMapped, so we need
+  // to add that to the return type.
+  Type *TypeWithCheck = StructType::get(Ty, Builder.getInt1Ty());
+  Value *V = Builder.CreateIntrinsic(TypeWithCheck,
+                                     Intrinsic::dx_resource_load_rawbuffer,
+                                     {Buffer, Index, Offset});
+  return Builder.CreateExtractValue(V, {0});
+}
+
+static void createRawLoads(IntrinsicInst *II, LoadInst *LI,
+                           dxil::ResourceTypeInfo &RTI) {
   const DataLayout &DL = LI->getDataLayout();
   IRBuilder<> Builder(LI);
 
-  Type *LoadType = StructType::get(LI->getType(), Builder.getInt1Ty());
-  assert(!LI->getType()->isAggregateType() &&
-         "Resource load should be scalar or vector type");
-
   Value *Index = II->getOperand(1);
   // The offset for the rawbuffer load and store ops is always in bytes.
   uint64_t AccessSize = 1;
   Value *Offset =
       traverseGEPOffsets(DL, Builder, LI->getPointerOperand(), AccessSize);
 
-  // For raw buffer (ie, HLSL's ByteAddressBuffer), we need to fold the access
-  // entirely into the index.
-  if (!RTI.isStruct()) {
-    auto *ConstantOffset = dyn_cast<ConstantInt>(Offset);
-    if (!ConstantOffset || !ConstantOffset->isZero())
-      Index = Builder.CreateAdd(Index, Offset);
-    Offset = llvm::PoisonValue::get(Builder.getInt32Ty());
-  }
+  // TODO: We could make this handle aggregates by walking the structure and
+  // handling each field individually, but we don't ever generate code that
+  // would hit that so it seems superfluous.
+  assert(!LI->getType()->isAggregateType() &&
+         "Resource load should be scalar or vector type");
 
-  Value *V =
-      Builder.CreateIntrinsic(LoadType, Intrinsic::dx_resource_load_rawbuffer,
-                              {II->getOperand(0), Index, Offset});
-  V = Builder.CreateExtractValue(V, {0});
+  Value *V;
+  if (auto *VT = dyn_cast<FixedVectorType>(LI->getType())) {
+    // Split into loads of at most 4 elements.
+    Type *EltTy = VT->getElementType();
+    Value *Stride = ConstantInt::get(Builder.getInt32Ty(),
+                                     4 * (DL.getTypeSizeInBits(EltTy) / 8));
+
+    SmallVector<Value *> Parts;
+    for (unsigned int I = 0, N = VT->getNumElements(); I < N; I += 4) {
+      Type *Ty = FixedVectorType::get(EltTy, N - I < 4 ? N - I : 4);
+      if (I > 0)
+        Offset = Builder.CreateAdd(Offset, Stride);
+      Parts.push_back(
+          emitRawLoad(Builder, Ty, II->getOperand(0), Index, Offset, RTI));
+    }
+
+    V = Parts.size() > 1 ? concatenateVectors(Builder, Parts) : Parts[0];
+  } else
+    V = emitRawLoad(Builder, LI->getType(), II->getOperand(0), Index, Offset,
+                    RTI);
 
   LI->replaceAllUsesWith(V);
 }
@@ -415,7 +470,7 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI,
     return createTypedBufferLoad(II, LI, RTI);
   case dxil::ResourceKind::RawBuffer:
   case dxil::ResourceKind::StructuredBuffer:
-    return createRawLoad(II, LI, RTI);
+    return createRawLoads(II, LI, RTI);
   case dxil::ResourceKind::CBuffer:
     return createCBufferLoad(II, LI, RTI);
   case dxil::ResourceKind::Texture1D:
diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-rawbuffer-wide.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-rawbuffer-wide.ll
new file mode 100644
index 0000000000000..4c767d93f08db
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-rawbuffer-wide.ll
@@ -0,0 +1,71 @@
+; RUN: opt -S -dxil-resource-access %s | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.6-compute"
+
+declare void @v8f32_user(<8 x float>)
+declare void @v7f32_user(<7 x float>)
+
+; CHECK-LABEL: define void @loadfloat4x2_struct
+define void @loadfloat4x2_struct(i32 %index) {
+  %buffer = call target("dx.RawBuffer", [2 x <4 x float>], 1, 0)
+      @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
+
+  ; CHECK-NOT: @llvm.dx.resource.getpointer
+  %ptr = call ptr @llvm.dx.resource.getpointer(
+      target("dx.RawBuffer", [2 x <4 x float>], 1, 0) %buffer, i32 %index)
+
+  ; CHECK: %[[LOAD1:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.rawbuffer.v4f32.tdx.RawBuffer_a2v4f32_1_0t(target("dx.RawBuffer", [2 x <4 x float>], 1, 0) %buffer, i32 %index, i32 0)
+  ; CHECK: %[[VAL1:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD1]], 0
+  ; CHECK: %[[LOAD2:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.rawbuffer.v4f32.tdx.RawBuffer_a2v4f32_1_0t(target("dx.RawBuffer", [2 x <4 x float>], 1, 0) %buffer, i32 %index, i32 16)
+  ; CHECK: %[[VAL2:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD2]], 0
+  ; CHECK: %[[MERGED:.*]] = shufflevector <4 x float> %[[VAL1]], <4 x float> %[[VAL2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ; CHECK: call void @v8f32_user(<8 x float> %[[MERGED]])
+  %data = load <8 x float>, ptr %ptr
+  call void @v8f32_user(<8 x float> %data)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadfloat4x2_byte
+define void @loadfloat4x2_byte(i32 %index) {
+  %buffer = call target("dx.RawBuffer", i8, 1, 0)
+      @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
+
+  ; CHECK-NOT: @llvm.dx.resource.getpointer
+  %ptr = call ptr @llvm.dx.resource.getpointer(
+      target("dx.RawBuffer", i8, 1, 0) %buffer, i32 %index)
+
+  ; CHECK: %[[LOAD1:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.rawbuffer.v4f32.tdx.RawBuffer_i8_1_0t(target("dx.RawBuffer", i8, 1, 0) %buffer, i32 %index, i32 poison)
+  ; CHECK: %[[VAL1:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD1]], 0
+  ; CHECK: %[[NEXTINDEX:.*]] = add i32 %index, 16
+  ; CHECK: %[[LOAD2:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.rawbuffer.v4f32.tdx.RawBuffer_i8_1_0t(target("dx.RawBuffer", i8, 1, 0) %buffer, i32 %[[NEXTINDEX]], i32 poison)
+  ; CHECK: %[[VAL2:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD2]], 0
+  ; CHECK: %[[MERGED:.*]] = shufflevector <4 x float> %[[VAL1]], <4 x float> %[[VAL2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ; CHECK: call void @v8f32_user(<8 x float> %[[MERGED]])
+  %data = load <8 x float>, ptr %ptr
+  call void @v8f32_user(<8 x float> %data)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadfloat7
+define void @loadfloat7(i32 %index) {
+  %buffer = call target("dx.RawBuffer", <7 x float>, 1, 0)
+      @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
+
+  ; CHECK-NOT: @llvm.dx.resource.getpointer
+  %ptr = call ptr @llvm.dx.resource.getpointer(
+      target("dx.RawBuffer", <7 x float>, 1, 0) %buffer, i32 %index)
+
+  ; CHECK: %[[LOAD1:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.rawbuffer.v4f32.tdx.RawBuffer_v7f32_1_0t(target("dx.RawBuffer", <7 x float>, 1, 0) %buffer, i32 %index, i32 0)
+  ; CHECK: %[[VAL1:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD1]], 0
+  ; CHECK: %[[LOAD2:.*]] = call { <3 x float>, i1 } @llvm.dx.resource.load.rawbuffer.v3f32.tdx.RawBuffer_v7f32_1_0t(target("dx.RawBuffer", <7 x float>, 1, 0) %buffer, i32 %index, i32 16)
+  ; CHECK: %[[VAL2:.*]] = extractvalue { <3 x float>, i1 } %[[LOAD2]], 0
+  ; CHECK: %[[TMP:.*]] = shufflevector <3 x float> %[[VAL2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+  ; CHECK: %[[MERGED:.*]] = shufflevector <4 x float> %[[VAL1]], <4 x float> %[[TMP]], <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
+  ; CHECK: call void @v7f32_user(<7 x float> %[[MERGED]])
+  %data = load <7 x float>, ptr %ptr
+  call void @v7f32_user(<7 x float> %data)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/store-rawbuffer-wide.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/store-rawbuffer-wide.ll
new file mode 100644
index 0000000000000..097db741331c5
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/store-rawbuffer-wide.ll
@@ -0,0 +1,58 @@
+; RUN: opt -S -dxil-resource-access %s | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.6-compute"
+
+; CHECK-LABEL: define void @storefloat4x2_struct
+define void @storefloat4x2_struct(i32 %index, <8 x float> %data) {
+  %buffer = call target("dx.RawBuffer", [2 x <4 x float>], 1, 0)
+      @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
+
+  ; CHECK-NOT: @llvm.dx.resource.getpointer
+  %ptr = call ptr @llvm.dx.resource.getpointer(
+      target("dx.RawBuffer", [2 x <4 x float>], 1, 0) %buffer, i32 %index)
+
+  ; CHECK: %[[DATA1:.*]] = shufflevector <8 x float> %data, <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ; CHECK: call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_a2v4f32_1_0t.v4f32(target("dx.RawBuffer", [2 x <4 x float>], 1, 0) %buffer, i32 %index, i32 0, <4 x float> %[[DATA1]])
+  ; CHECK: %[[DATA2:.*]] = shufflevector <8 x float> %data, <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ; CHECK: call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_a2v4f32_1_0t.v4f32(target("dx.RawBuffer", [2 x <4 x float>], 1, 0) %buffer, i32 %index, i32 16, <4 x float> %[[DATA2]])
+  store <8 x float> %data, ptr %ptr
+
+  ret void
+}
+
+; CHECK-LABEL: define void @storefloat4x2_byte
+define void @storefloat4x2_byte(i32 %index, <8 x float> %data) {
+  %buffer = call target("dx.RawBuffer", i8, 1, 0)
+      @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
+
+  ; CHECK-NOT: @llvm.dx.resource.getpointer
+  %ptr = call ptr @llvm.dx.resource.getpointer(
+      target("dx.RawBuffer", i8, 1, 0) %buffer, i32 %index)
+
+  ; CHECK: %[[DATA1:.*]] = shufflevector <8 x float> %data, <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ; CHECK: call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_i8_1_0t.v4f32(target("dx.RawBuffer", i8, 1, 0) %buffer, i32 %index, i32 poison, <4 x float> %[[DATA1]])
+  ; CHECK: %[[DATA2:.*]] = shufflevector <8 x float> %data, <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ; CHECK: %[[NEXTINDEX:.*]] = add i32 %index, 16
+  ; CHECK: call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_i8_1_0t.v4f32(target("dx.RawBuffer", i8, 1, 0) %buffer, i32 %[[NEXTINDEX]], i32 poison, <4 x float> %[[DATA2]])
+  store <8 x float> %data, ptr %ptr
+
+  ret void
+}
+
+; CHECK-LABEL: define void @storev7f32
+define void @storev7f32(i32 %index, <7 x float> %data) {
+  %buffer = call target("dx.RawBuffer", <7 x float>, 1, 0)
+      @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
+
+  ; CHECK-NOT: @llvm.dx.resource.getpointer
+  %ptr = call ptr @llvm.dx.resource.getpointer(
+      target("dx.RawBuffer", <7 x float>, 1, 0) %buffer, i32 %index)
+
+  ; CHECK: %[[DATA1:.*]] = shufflevector <7 x float> %data, <7 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ; CHECK: call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_v7f32_1_0t.v4f32(target("dx.RawBuffer", <7 x float>, 1, 0) %buffer, i32 %index, i32 0, <4 x float> %[[DATA1]])
+  ; CHECK: %[[DATA2:.*]] = shufflevector <7 x float> %data, <7 x float> poison, <3 x i32> <i32 4, i32 5, i32 6>
+  ; CHECK: call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_v7f32_1_0t.v3f32(target("dx.RawBuffer", <7 x float>, 1, 0) %buffer, i32 %index, i32 16, <3 x float> %[[DATA2]])
+  store <7 x float> %data, ptr %ptr
+
+  ret void
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/184732


More information about the llvm-commits mailing list