[llvm] [DirectX] add support for i64 buffer load/stores (PR #145047)

Fri Jun 20 11:18:17 PDT 2025

https://github.com/farzonl updated https://github.com/llvm/llvm-project/pull/145047

>From c7810528ae60427ec487db4ffb68a288920f5efb Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Wed, 18 Jun 2025 11:53:52 -0400
Subject: [PATCH 1/3] [DirectX] add support for i64 buffer load/stores

---
 .../Target/DirectX/DXILIntrinsicExpansion.cpp | 179 +++++++++++++-----
 llvm/test/CodeGen/DirectX/BufferLoadDouble.ll |   4 +-
 llvm/test/CodeGen/DirectX/BufferLoadInt64.ll  |  56 ++++++
 .../test/CodeGen/DirectX/BufferStoreDouble.ll |  43 +++++
 llvm/test/CodeGen/DirectX/BufferStoreInt64.ll |  46 +++++
 5 files changed, 281 insertions(+), 47 deletions(-)
 create mode 100644 llvm/test/CodeGen/DirectX/BufferLoadInt64.ll
 create mode 100644 llvm/test/CodeGen/DirectX/BufferStoreInt64.ll

diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index f99e8e7ccdc5d..eb9268e78a9ad 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -25,6 +25,7 @@
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 
@@ -70,15 +71,17 @@ static bool isIntrinsicExpansion(Function &F) {
   case Intrinsic::vector_reduce_add:
   case Intrinsic::vector_reduce_fadd:
     return true;
-  case Intrinsic::dx_resource_load_typedbuffer:
-    // We need to handle doubles and vector of doubles.
-    return F.getReturnType()
-        ->getStructElementType(0)
-        ->getScalarType()
-        ->isDoubleTy();
-  case Intrinsic::dx_resource_store_typedbuffer:
-    // We need to handle doubles and vector of doubles.
-    return F.getFunctionType()->getParamType(2)->getScalarType()->isDoubleTy();
+  case Intrinsic::dx_resource_load_typedbuffer: {
+    // We need to handle i64, doubles, and vectors of them.
+    Type *ScalarTy =
+        F.getReturnType()->getStructElementType(0)->getScalarType();
+    return ScalarTy->isDoubleTy() || ScalarTy->isIntegerTy(64);
+  }
+  case Intrinsic::dx_resource_store_typedbuffer: {
+    // We need to handle i64 and doubles and vectors of i64 and doubles.
+    Type *ScalarTy = F.getFunctionType()->getParamType(2)->getScalarType();
+    return ScalarTy->isDoubleTy() || ScalarTy->isIntegerTy(64);
+  }
   }
   return false;
 }
@@ -545,13 +548,15 @@ static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) {
   IRBuilder<> Builder(Orig);
 
   Type *BufferTy = Orig->getType()->getStructElementType(0);
-  assert(BufferTy->getScalarType()->isDoubleTy() &&
-         "Only expand double or double2");
+  Type *ScalarTy = BufferTy->getScalarType();
+  bool IsDouble = ScalarTy->isDoubleTy();
+  assert(IsDouble || ScalarTy->isIntegerTy(64) &&
+                         "Only expand double or int64 scalars or vectors");
 
   unsigned ExtractNum = 2;
   if (auto *VT = dyn_cast<FixedVectorType>(BufferTy)) {
     assert(VT->getNumElements() == 2 &&
-           "TypedBufferLoad double vector has wrong size");
+           "TypedBufferLoad vector must be size 2");
     ExtractNum = 4;
   }
 
@@ -570,22 +575,54 @@ static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) {
     ExtractElements.push_back(
         Builder.CreateExtractElement(Extract, Builder.getInt32(I)));
 
-  // combine into double(s)
+  // combine into double(s) or int64(s)
   Value *Result = PoisonValue::get(BufferTy);
   for (unsigned I = 0; I < ExtractNum; I += 2) {
-    Value *Dbl =
-        Builder.CreateIntrinsic(Builder.getDoubleTy(), Intrinsic::dx_asdouble,
-                                {ExtractElements[I], ExtractElements[I + 1]});
+    Value *Combined = nullptr;
+    if (IsDouble) {
+      // For doubles, use dx_asdouble intrinsic
+      Combined =
+          Builder.CreateIntrinsic(Builder.getDoubleTy(), Intrinsic::dx_asdouble,
+                                  {ExtractElements[I], ExtractElements[I + 1]});
+    } else {
+      // For int64, manually combine two int32s
+      // First, zero-extend both values to i64
+      Value *Lo = Builder.CreateZExt(ExtractElements[I], Builder.getInt64Ty());
+      Value *Hi =
+          Builder.CreateZExt(ExtractElements[I + 1], Builder.getInt64Ty());
+      // Shift the high bits left by 32 bits
+      Value *ShiftedHi = Builder.CreateShl(Hi, Builder.getInt64(32));
+      // OR the high and low bits together
+      Combined = Builder.CreateOr(Lo, ShiftedHi);
+    }
+
     if (ExtractNum == 4)
-      Result =
-          Builder.CreateInsertElement(Result, Dbl, Builder.getInt32(I / 2));
+      Result = Builder.CreateInsertElement(Result, Combined,
+                                           Builder.getInt32(I / 2));
     else
-      Result = Dbl;
+      Result = Combined;
   }
 
   Value *CheckBit = nullptr;
   for (User *U : make_early_inc_range(Orig->users())) {
-    auto *EVI = cast<ExtractValueInst>(U);
+    if (auto *Ret = dyn_cast<ReturnInst>(U)) {
+      // For return instructions, we need to handle the case where the function
+      // is directly returning the result of the call
+      Type *RetTy = Ret->getFunction()->getReturnType();
+      Value *StructRet = PoisonValue::get(RetTy);
+      StructRet = Builder.CreateInsertValue(StructRet, Result, {0});
+      Value *CheckBitForRet = Builder.CreateExtractValue(Load, {1});
+      StructRet = Builder.CreateInsertValue(StructRet, CheckBitForRet, {1});
+      Ret->setOperand(0, StructRet);
+      continue;
+    }
+    auto *EVI = dyn_cast<ExtractValueInst>(U);
+    if (!EVI) {
+      // If it's not a ReturnInst or ExtractValueInst, we don't know how to
+      // handle it
+      llvm_unreachable("Unexpected user of typedbufferload");
+    }
+
     ArrayRef<unsigned> Indices = EVI->getIndices();
     assert(Indices.size() == 1);
 
@@ -609,38 +646,90 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) {
   IRBuilder<> Builder(Orig);
 
   Type *BufferTy = Orig->getFunctionType()->getParamType(2);
-  assert(BufferTy->getScalarType()->isDoubleTy() &&
-         "Only expand double or double2");
+  Type *ScalarTy = BufferTy->getScalarType();
+  bool IsDouble = ScalarTy->isDoubleTy();
+  assert((IsDouble || ScalarTy->isIntegerTy(64)) &&
+         "Only expand double or int64 scalars or vectors");
 
   unsigned ExtractNum = 2;
   if (auto *VT = dyn_cast<FixedVectorType>(BufferTy)) {
     assert(VT->getNumElements() == 2 &&
-           "TypedBufferStore double vector has wrong size");
+           "TypedBufferStore vector must be size 2");
     ExtractNum = 4;
   }
+  if (IsDouble) {
+    Type *SplitElementTy = Builder.getInt32Ty();
+    if (ExtractNum == 4)
+      SplitElementTy = VectorType::get(SplitElementTy, 2, false);
+
+    // Handle double type(s) - keep original behavior
+    auto *SplitTy = llvm::StructType::get(SplitElementTy, SplitElementTy);
+    Value *Split = Builder.CreateIntrinsic(SplitTy, Intrinsic::dx_splitdouble,
+                                           {Orig->getOperand(2)});
+    // create our vector
+    Value *LowBits = Builder.CreateExtractValue(Split, 0);
+    Value *HighBits = Builder.CreateExtractValue(Split, 1);
+    Value *Val;
+    if (ExtractNum == 2) {
+      Val = PoisonValue::get(VectorType::get(Builder.getInt32Ty(), 2, false));
+      Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0));
+      Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1));
+    } else
+      Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3});
+
+    Builder.CreateIntrinsic(Builder.getVoidTy(),
+                            Intrinsic::dx_resource_store_typedbuffer,
+                            {Orig->getOperand(0), Orig->getOperand(1), Val});
+  } else {
+    // Handle int64 type(s)
+    Value *InputVal = Orig->getOperand(2);
+    Value *Val;
+
+    if (ExtractNum == 4) {
+      // Handle vector of int64
+      Type *Int32x4Ty = VectorType::get(Builder.getInt32Ty(), 4, false);
+      Val = PoisonValue::get(Int32x4Ty);
+
+      for (unsigned I = 0; I < 2; ++I) {
+        // Extract each int64 element
+        Value *Int64Val =
+            Builder.CreateExtractElement(InputVal, Builder.getInt32(I));
+
+        // Get low 32 bits by truncating to i32
+        Value *LowBits = Builder.CreateTrunc(Int64Val, Builder.getInt32Ty());
+
+        // Get high 32 bits by shifting right by 32 and truncating
+        Value *ShiftedVal = Builder.CreateLShr(Int64Val, Builder.getInt64(32));
+        Value *HighBits = Builder.CreateTrunc(ShiftedVal, Builder.getInt32Ty());
+
+        // Insert into our final vector
+        Val =
+            Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(I * 2));
+        Val = Builder.CreateInsertElement(Val, HighBits,
+                                          Builder.getInt32(I * 2 + 1));
+      }
+    } else {
+      // Handle scalar int64
+      Type *Int32x2Ty = VectorType::get(Builder.getInt32Ty(), 2, false);
+      Val = PoisonValue::get(Int32x2Ty);
+
+      // Get low 32 bits by truncating to i32
+      Value *LowBits = Builder.CreateTrunc(InputVal, Builder.getInt32Ty());
+
+      // Get high 32 bits by shifting right by 32 and truncating
+      Value *ShiftedVal = Builder.CreateLShr(InputVal, Builder.getInt64(32));
+      Value *HighBits = Builder.CreateTrunc(ShiftedVal, Builder.getInt32Ty());
+
+      // Insert into our final vector
+      Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0));
+      Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1));
+    }
+
+    Builder.CreateIntrinsic(Builder.getVoidTy(),
+                            Intrinsic::dx_resource_store_typedbuffer,
+                            {Orig->getOperand(0), Orig->getOperand(1), Val});
+  }
 
-  Type *SplitElementTy = Builder.getInt32Ty();
-  if (ExtractNum == 4)
-    SplitElementTy = VectorType::get(SplitElementTy, 2, false);
-
-  // split our double(s)
-  auto *SplitTy = llvm::StructType::get(SplitElementTy, SplitElementTy);
-  Value *Split = Builder.CreateIntrinsic(SplitTy, Intrinsic::dx_splitdouble,
-                                         Orig->getOperand(2));
-  // create our vector
-  Value *LowBits = Builder.CreateExtractValue(Split, 0);
-  Value *HighBits = Builder.CreateExtractValue(Split, 1);
-  Value *Val;
-  if (ExtractNum == 2) {
-    Val = PoisonValue::get(VectorType::get(SplitElementTy, 2, false));
-    Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0));
-    Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1));
-  } else
-    Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3});
-
-  Builder.CreateIntrinsic(Builder.getVoidTy(),
-                          Intrinsic::dx_resource_store_typedbuffer,
-                          {Orig->getOperand(0), Orig->getOperand(1), Val});
   Orig->eraseFromParent();
   return true;
 }
diff --git a/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll b/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll
index 80a071a66364b..af3ec9df37967 100644
--- a/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll
+++ b/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -S -dxil-intrinsic-expansion %s | FileCheck %s
 
-target triple = "dxil-pc-shadermodel6.6-compute"
+target triple = "dxil-pc-shadermodel6.2-compute"
 
 define void @loadf64() {
   ; check the handle from binding is unchanged
@@ -88,4 +88,4 @@ define void @loadf64WithCheckBit() {
   ; CHECK-NOT: extractvalue { double, i1 }
   %cb = extractvalue {double, i1} %load0, 1
   ret void
-}
\ No newline at end of file
+}
diff --git a/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll b/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll
new file mode 100644
index 0000000000000..cea475524945c
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll
@@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -dxil-intrinsic-expansion %s | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.2-compute"
+
+define { i64, i1 } @loadi64() {
+; CHECK-LABEL: define { i64, i1 } @loadi64() {
+; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+; CHECK-NEXT:    [[TMP1:%.*]] = call { <2 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v2i32.tdx.TypedBuffer_i64_1_0_0t(target("dx.TypedBuffer", i64, 1, 0, 0) [[BUFFER]], i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { i64, i1 } poison, i64 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { i64, i1 } [[TMP9]], i1 [[TMP10]], 1
+; CHECK-NEXT:    ret { i64, i1 } [[TMP11]]
+;
+  %buffer = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+  %result = call { i64, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_i64_1_0_0t(
+  target("dx.TypedBuffer", i64, 1, 0, 0) %buffer, i32 0)
+  ret { i64, i1 } %result
+}
+
+define { <2 x i64>, i1 } @loadv2i64() {
+; CHECK-LABEL: define { <2 x i64>, i1 } @loadv2i64() {
+; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+; CHECK-NEXT:    [[TMP1:%.*]] = call { <4 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v4i32.tdx.TypedBuffer_v2i64_1_0_0t(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) [[BUFFER]], i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = shl i64 [[TMP8]], 32
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = shl i64 [[TMP13]], 32
+; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP12]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP15]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = insertvalue { <2 x i64>, i1 } poison, <2 x i64> [[TMP16]], 0
+; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 1
+; CHECK-NEXT:    [[TMP19:%.*]] = insertvalue { <2 x i64>, i1 } [[TMP17]], i1 [[TMP18]], 1
+; CHECK-NEXT:    ret { <2 x i64>, i1 } [[TMP19]]
+;
+  %buffer = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+  %result = call { <2 x i64>, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t(
+  target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) %buffer, i32 0)
+  ret { <2 x i64>, i1 } %result
+}
diff --git a/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll b/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll
index 9c3dab0cc1e46..882948b6dce74 100644
--- a/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll
+++ b/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll
@@ -45,3 +45,46 @@ define void @storev2f64(<2 x double> %0) {
       <2 x double> %0)
   ret void
 }
+
+define { double, i1 } @loadAndReturnf64() {
+; CHECK-LABEL: define { double, i1 } @loadAndReturnf64() {
+; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", double, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+; CHECK-NEXT:    [[TMP1:%.*]] = call { <2 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v2i32.tdx.TypedBuffer_f64_1_0_0t(target("dx.TypedBuffer", double, 1, 0, 0) [[BUFFER]], i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP3]], i32 [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { double, i1 } poison, double [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP6]], i1 [[TMP7]], 1
+; CHECK-NEXT:    ret { double, i1 } [[TMP8]]
+;
+  %buffer = tail call target("dx.TypedBuffer", double, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+  %result = call { double, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_f64_1_0_0t(
+  target("dx.TypedBuffer", double, 1, 0, 0) %buffer, i32 0)
+  ret { double, i1 } %result
+}
+
+define { <2 x double>, i1 } @loadAndReturnv2f64() {
+; CHECK-LABEL: define { <2 x double>, i1 } @loadAndReturnv2f64() {
+; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", <2 x double>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+; CHECK-NEXT:    [[TMP1:%.*]] = call { <4 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v4i32.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) [[BUFFER]], i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP3]], i32 [[TMP4]])
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP5]], i32 [[TMP6]])
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[TMP8]], double [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { <2 x double>, i1 } poison, <2 x double> [[TMP10]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = insertvalue { <2 x double>, i1 } [[TMP11]], i1 [[TMP12]], 1
+; CHECK-NEXT:    ret { <2 x double>, i1 } [[TMP13]]
+;
+  %buffer = tail call target("dx.TypedBuffer", <2 x double>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+  %result = call { <2 x double>, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_v2f64_1_0_0t(
+  target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 0)
+  ret { <2 x double>, i1 } %result
+}
diff --git a/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll b/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll
new file mode 100644
index 0000000000000..efb7c0ac104ed
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -dxil-intrinsic-expansion %s | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.6-compute"
+
+define void @storei64(i64 %0) {
+; CHECK-LABEL: define void @storei64(
+; CHECK-SAME: i64 [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP0]], 32
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i32 1
+; CHECK-NEXT:    call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_i64_1_0_0t.v2i32(target("dx.TypedBuffer", i64, 1, 0, 0) [[BUFFER]], i32 0, <2 x i32> [[TMP6]])
+; CHECK-NEXT:    ret void
+;
+  %buffer = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+  call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_i64_1_0_0t(target("dx.TypedBuffer", i64, 1, 0, 0) %buffer, i32 0,i64 %0)
+  ret void
+}
+
+
+define void @storev2i64(<2 x i64> %0) {
+; CHECK-LABEL: define void @storev2i64(
+; CHECK-SAME: <2 x i64> [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i64 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = lshr i64 [[TMP8]], 32
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP9]], i32 2
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP11]], i32 3
+; CHECK-NEXT:    call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t.v4i32(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) [[BUFFER]], i32 0, <4 x i32> [[TMP13]])
+; CHECK-NEXT:    ret void
+;
+  %buffer = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
+  call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) %buffer, i32 0, <2 x i64> %0)
+  ret void
+}

>From e7d5e228c41ee4b77f3ed2d86489c73c6c8d8268 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Fri, 20 Jun 2025 10:58:21 -0400
Subject: [PATCH 2/3] minimize code diff between double and i64

---
 .../Target/DirectX/DXILIntrinsicExpansion.cpp | 74 +++++++++----------
 1 file changed, 33 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index eb9268e78a9ad..45d8e497165cf 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -651,58 +651,56 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) {
   assert((IsDouble || ScalarTy->isIntegerTy(64)) &&
          "Only expand double or int64 scalars or vectors");
 
-  unsigned ExtractNum = 2;
-  if (auto *VT = dyn_cast<FixedVectorType>(BufferTy)) {
-    assert(VT->getNumElements() == 2 &&
+  // Determine if we're dealing with a vector or scalar
+  bool IsVector = isa<FixedVectorType>(BufferTy);
+  if (IsVector) {
+    assert(cast<FixedVectorType>(BufferTy)->getNumElements() == 2 &&
            "TypedBufferStore vector must be size 2");
-    ExtractNum = 4;
   }
+
+  // Create the appropriate vector type for the result
+  Type *Int32Ty = Builder.getInt32Ty();
+  Type *ResultTy = VectorType::get(Int32Ty, IsVector ? 4 : 2, false);
+  Value *Val = PoisonValue::get(ResultTy);
+
+  // Split the 64-bit values into 32-bit components
   if (IsDouble) {
-    Type *SplitElementTy = Builder.getInt32Ty();
-    if (ExtractNum == 4)
+    // Handle double type(s)
+    Type *SplitElementTy = Int32Ty;
+    if (IsVector)
       SplitElementTy = VectorType::get(SplitElementTy, 2, false);
 
-    // Handle double type(s) - keep original behavior
     auto *SplitTy = llvm::StructType::get(SplitElementTy, SplitElementTy);
     Value *Split = Builder.CreateIntrinsic(SplitTy, Intrinsic::dx_splitdouble,
                                            {Orig->getOperand(2)});
-    // create our vector
     Value *LowBits = Builder.CreateExtractValue(Split, 0);
     Value *HighBits = Builder.CreateExtractValue(Split, 1);
-    Value *Val;
-    if (ExtractNum == 2) {
-      Val = PoisonValue::get(VectorType::get(Builder.getInt32Ty(), 2, false));
+
+    if (IsVector) {
+      // For vector doubles, use shuffle to create the final vector
+      Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3});
+    } else {
+      // For scalar doubles, insert the elements
       Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0));
       Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1));
-    } else
-      Val = Builder.CreateShuffleVector(LowBits, HighBits, {0, 2, 1, 3});
-
-    Builder.CreateIntrinsic(Builder.getVoidTy(),
-                            Intrinsic::dx_resource_store_typedbuffer,
-                            {Orig->getOperand(0), Orig->getOperand(1), Val});
+    }
   } else {
     // Handle int64 type(s)
     Value *InputVal = Orig->getOperand(2);
-    Value *Val;
 
-    if (ExtractNum == 4) {
+    if (IsVector) {
       // Handle vector of int64
-      Type *Int32x4Ty = VectorType::get(Builder.getInt32Ty(), 4, false);
-      Val = PoisonValue::get(Int32x4Ty);
-
       for (unsigned I = 0; I < 2; ++I) {
         // Extract each int64 element
         Value *Int64Val =
             Builder.CreateExtractElement(InputVal, Builder.getInt32(I));
 
-        // Get low 32 bits by truncating to i32
-        Value *LowBits = Builder.CreateTrunc(Int64Val, Builder.getInt32Ty());
-
-        // Get high 32 bits by shifting right by 32 and truncating
+        // Split into low and high 32-bit parts
+        Value *LowBits = Builder.CreateTrunc(Int64Val, Int32Ty);
         Value *ShiftedVal = Builder.CreateLShr(Int64Val, Builder.getInt64(32));
-        Value *HighBits = Builder.CreateTrunc(ShiftedVal, Builder.getInt32Ty());
+        Value *HighBits = Builder.CreateTrunc(ShiftedVal, Int32Ty);
 
-        // Insert into our final vector
+        // Insert into result vector
         Val =
             Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(I * 2));
         Val = Builder.CreateInsertElement(Val, HighBits,
@@ -710,26 +708,20 @@ static bool expandTypedBufferStoreIntrinsic(CallInst *Orig) {
       }
     } else {
       // Handle scalar int64
-      Type *Int32x2Ty = VectorType::get(Builder.getInt32Ty(), 2, false);
-      Val = PoisonValue::get(Int32x2Ty);
-
-      // Get low 32 bits by truncating to i32
-      Value *LowBits = Builder.CreateTrunc(InputVal, Builder.getInt32Ty());
-
-      // Get high 32 bits by shifting right by 32 and truncating
+      Value *LowBits = Builder.CreateTrunc(InputVal, Int32Ty);
       Value *ShiftedVal = Builder.CreateLShr(InputVal, Builder.getInt64(32));
-      Value *HighBits = Builder.CreateTrunc(ShiftedVal, Builder.getInt32Ty());
+      Value *HighBits = Builder.CreateTrunc(ShiftedVal, Int32Ty);
 
-      // Insert into our final vector
       Val = Builder.CreateInsertElement(Val, LowBits, Builder.getInt32(0));
       Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1));
     }
-
-    Builder.CreateIntrinsic(Builder.getVoidTy(),
-                            Intrinsic::dx_resource_store_typedbuffer,
-                            {Orig->getOperand(0), Orig->getOperand(1), Val});
   }
 
+  // Create the final intrinsic call
+  Builder.CreateIntrinsic(Builder.getVoidTy(),
+                          Intrinsic::dx_resource_store_typedbuffer,
+                          {Orig->getOperand(0), Orig->getOperand(1), Val});
+
   Orig->eraseFromParent();
   return true;
 }

>From f7b3d844001508f4bb8cd58d9af0ae1c76e8daf9 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Fri, 20 Jun 2025 14:16:19 -0400
Subject: [PATCH 3/3] remove return handling

---
 .../Target/DirectX/DXILIntrinsicExpansion.cpp | 18 ++------
 llvm/test/CodeGen/DirectX/BufferLoadDouble.ll |  2 +-
 llvm/test/CodeGen/DirectX/BufferLoadInt64.ll  | 28 +++++-------
 .../test/CodeGen/DirectX/BufferStoreDouble.ll | 43 -------------------
 4 files changed, 14 insertions(+), 77 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index 45d8e497165cf..d50279461800e 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -605,23 +605,11 @@ static bool expandTypedBufferLoadIntrinsic(CallInst *Orig) {
 
   Value *CheckBit = nullptr;
   for (User *U : make_early_inc_range(Orig->users())) {
-    if (auto *Ret = dyn_cast<ReturnInst>(U)) {
-      // For return instructions, we need to handle the case where the function
-      // is directly returning the result of the call
-      Type *RetTy = Ret->getFunction()->getReturnType();
-      Value *StructRet = PoisonValue::get(RetTy);
-      StructRet = Builder.CreateInsertValue(StructRet, Result, {0});
-      Value *CheckBitForRet = Builder.CreateExtractValue(Load, {1});
-      StructRet = Builder.CreateInsertValue(StructRet, CheckBitForRet, {1});
-      Ret->setOperand(0, StructRet);
-      continue;
-    }
+    // If it's not an ExtractValueInst, we don't know how to
+    // handle it
     auto *EVI = dyn_cast<ExtractValueInst>(U);
-    if (!EVI) {
-      // If it's not a ReturnInst or ExtractValueInst, we don't know how to
-      // handle it
+    if (!EVI)
       llvm_unreachable("Unexpected user of typedbufferload");
-    }
 
     ArrayRef<unsigned> Indices = EVI->getIndices();
     assert(Indices.size() == 1);
diff --git a/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll b/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll
index af3ec9df37967..25abf2111060c 100644
--- a/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll
+++ b/llvm/test/CodeGen/DirectX/BufferLoadDouble.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -S -dxil-intrinsic-expansion %s | FileCheck %s
 
-target triple = "dxil-pc-shadermodel6.2-compute"
+target triple = "dxil-pc-shadermodel6.6-compute"
 
 define void @loadf64() {
   ; check the handle from binding is unchanged
diff --git a/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll b/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll
index cea475524945c..42c0012ff3475 100644
--- a/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll
+++ b/llvm/test/CodeGen/DirectX/BufferLoadInt64.ll
@@ -3,8 +3,8 @@
 
 target triple = "dxil-pc-shadermodel6.2-compute"
 
-define { i64, i1 } @loadi64() {
-; CHECK-LABEL: define { i64, i1 } @loadi64() {
+define void @loadi64() {
+; CHECK-LABEL: define void @loadi64() {
 ; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
 ; CHECK-NEXT:    [[TMP1:%.*]] = call { <2 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v2i32.tdx.TypedBuffer_i64_1_0_0t(target("dx.TypedBuffer", i64, 1, 0, 0) [[BUFFER]], i32 0)
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 0
@@ -14,19 +14,15 @@ define { i64, i1 } @loadi64() {
 ; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
 ; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { i64, i1 } poison, i64 [[TMP8]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { i64, i1 } [[TMP9]], i1 [[TMP10]], 1
-; CHECK-NEXT:    ret { i64, i1 } [[TMP11]]
+; CHECK-NEXT:    ret void
 ;
   %buffer = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
-  %result = call { i64, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_i64_1_0_0t(
-  target("dx.TypedBuffer", i64, 1, 0, 0) %buffer, i32 0)
-  ret { i64, i1 } %result
+  %result = call { i64, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_i64_1_0_0t(target("dx.TypedBuffer", i64, 1, 0, 0) %buffer, i32 0)
+  ret void
 }
 
-define { <2 x i64>, i1 } @loadv2i64() {
-; CHECK-LABEL: define { <2 x i64>, i1 } @loadv2i64() {
+define void @loadv2i64() {
+; CHECK-LABEL: define void @loadv2i64() {
 ; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
 ; CHECK-NEXT:    [[TMP1:%.*]] = call { <4 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v4i32.tdx.TypedBuffer_v2i64_1_0_0t(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) [[BUFFER]], i32 0)
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 0
@@ -44,13 +40,9 @@ define { <2 x i64>, i1 } @loadv2i64() {
 ; CHECK-NEXT:    [[TMP14:%.*]] = shl i64 [[TMP13]], 32
 ; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP12]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP15]], i32 1
-; CHECK-NEXT:    [[TMP17:%.*]] = insertvalue { <2 x i64>, i1 } poison, <2 x i64> [[TMP16]], 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 1
-; CHECK-NEXT:    [[TMP19:%.*]] = insertvalue { <2 x i64>, i1 } [[TMP17]], i1 [[TMP18]], 1
-; CHECK-NEXT:    ret { <2 x i64>, i1 } [[TMP19]]
+; CHECK-NEXT:    ret void
 ;
   %buffer = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
-  %result = call { <2 x i64>, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t(
-  target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) %buffer, i32 0)
-  ret { <2 x i64>, i1 } %result
+  %result = call { <2 x i64>, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) %buffer, i32 0)
+  ret void
 }
diff --git a/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll b/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll
index 882948b6dce74..9c3dab0cc1e46 100644
--- a/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll
+++ b/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll
@@ -45,46 +45,3 @@ define void @storev2f64(<2 x double> %0) {
       <2 x double> %0)
   ret void
 }
-
-define { double, i1 } @loadAndReturnf64() {
-; CHECK-LABEL: define { double, i1 } @loadAndReturnf64() {
-; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", double, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
-; CHECK-NEXT:    [[TMP1:%.*]] = call { <2 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v2i32.tdx.TypedBuffer_f64_1_0_0t(target("dx.TypedBuffer", double, 1, 0, 0) [[BUFFER]], i32 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP3]], i32 [[TMP4]])
-; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { double, i1 } poison, double [[TMP5]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x i32>, i1 } [[TMP1]], 1
-; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP6]], i1 [[TMP7]], 1
-; CHECK-NEXT:    ret { double, i1 } [[TMP8]]
-;
-  %buffer = tail call target("dx.TypedBuffer", double, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
-  %result = call { double, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_f64_1_0_0t(
-  target("dx.TypedBuffer", double, 1, 0, 0) %buffer, i32 0)
-  ret { double, i1 } %result
-}
-
-define { <2 x double>, i1 } @loadAndReturnv2f64() {
-; CHECK-LABEL: define { <2 x double>, i1 } @loadAndReturnv2f64() {
-; CHECK-NEXT:    [[BUFFER:%.*]] = tail call target("dx.TypedBuffer", <2 x double>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
-; CHECK-NEXT:    [[TMP1:%.*]] = call { <4 x i32>, i1 } @llvm.dx.resource.load.typedbuffer.v4i32.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) [[BUFFER]], i32 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP3]], i32 [[TMP4]])
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = call double @llvm.dx.asdouble.i32(i32 [[TMP5]], i32 [[TMP6]])
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[TMP8]], double [[TMP9]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { <2 x double>, i1 } poison, <2 x double> [[TMP10]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <4 x i32>, i1 } [[TMP1]], 1
-; CHECK-NEXT:    [[TMP13:%.*]] = insertvalue { <2 x double>, i1 } [[TMP11]], i1 [[TMP12]], 1
-; CHECK-NEXT:    ret { <2 x double>, i1 } [[TMP13]]
-;
-  %buffer = tail call target("dx.TypedBuffer", <2 x double>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2f64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
-  %result = call { <2 x double>, i1 } @llvm.dx.resource.load.typedbuffer.tdx.TypedBuffer_v2f64_1_0_0t(
-  target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 0)
-  ret { <2 x double>, i1 } %result
-}