[llvm] 9970273 - [AMDGPURewriteOutArguments] Don't use pointer element type

Nikita Popov via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 8 07:10:48 PST 2022


Author: Nikita Popov
Date: 2022-02-08T16:10:41+01:00
New Revision: 997027347db7c704d14d5901c828a0f249d30c3f

URL: https://github.com/llvm/llvm-project/commit/997027347db7c704d14d5901c828a0f249d30c3f
DIFF: https://github.com/llvm/llvm-project/commit/997027347db7c704d14d5901c828a0f249d30c3f.diff

LOG: [AMDGPURewriteOutArguments] Don't use pointer element type

Instead of using the pointer element type, determine the stored type from
how the pointer is actually used in store instructions, looking through any
intervening bitcasts. This makes the transform compatible with opaque
pointers and a bit more general.

It's worth noting that I have dropped the 3-vector to 4-vector
shufflevector special case, because it is now handled differently: if the
value is actually used as a 4-vector, that type is used directly instead
of shuffling to a 3-vector in between.
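
To make the dropped special case concrete, here is a sketch of the
OpenCL-style 3-vector access (adapted from the bitcast_pointer_v4i32_v3i32
test below; the function name is illustrative). The store goes through a
bitcast to <4 x i32>*, so <4 x i32> becomes the returned type directly, and
the caller stores it back through the same kind of bitcast, so no shuffle
back to a 3-vector is needed:

  define void @sketch_v3(<3 x i32>* %out) {
    %load = load volatile <4 x i32>, <4 x i32> addrspace(1)* undef
    %bitcast = bitcast <3 x i32>* %out to <4 x i32>*
    store <4 x i32> %load, <4 x i32>* %bitcast
    ret void
  }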

Differential Revision: https://reviews.llvm.org/D119237

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
    llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll
    llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index 1c6c63dd5b251..4f8a61a770973 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -83,12 +83,8 @@ class AMDGPURewriteOutArguments : public FunctionPass {
   const DataLayout *DL = nullptr;
   MemoryDependenceResults *MDA = nullptr;
 
-  bool checkArgumentUses(Value &Arg) const;
-  bool isOutArgumentCandidate(Argument &Arg) const;
-
-#ifndef NDEBUG
-  bool isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const;
-#endif
+  Type *getStoredType(Value &Arg) const;
+  Type *getOutArgumentType(Argument &Arg) const;
 
 public:
   static char ID;
@@ -114,72 +110,61 @@ INITIALIZE_PASS_END(AMDGPURewriteOutArguments, DEBUG_TYPE,
 
 char AMDGPURewriteOutArguments::ID = 0;
 
-bool AMDGPURewriteOutArguments::checkArgumentUses(Value &Arg) const {
+Type *AMDGPURewriteOutArguments::getStoredType(Value &Arg) const {
   const int MaxUses = 10;
   int UseCount = 0;
 
-  for (Use &U : Arg.uses()) {
-    StoreInst *SI = dyn_cast<StoreInst>(U.getUser());
-    if (UseCount > MaxUses)
-      return false;
+  SmallVector<Use *> Worklist;
+  for (Use &U : Arg.uses())
+    Worklist.push_back(&U);
 
-    if (!SI) {
-      auto *BCI = dyn_cast<BitCastInst>(U.getUser());
-      if (!BCI || !BCI->hasOneUse())
-        return false;
-
-      // We don't handle multiple stores currently, so stores to aggregate
-      // pointers aren't worth the trouble since they are canonically split up.
-      Type *DestEltTy = BCI->getType()->getPointerElementType();
-      if (DestEltTy->isAggregateType())
-        return false;
-
-      // We could handle these if we had a convenient way to bitcast between
-      // them.
-      Type *SrcEltTy = Arg.getType()->getPointerElementType();
-      if (SrcEltTy->isArrayTy())
-        return false;
-
-      // Special case handle structs with single members. It is useful to handle
-      // some casts between structs and non-structs, but we can't bitcast
-      // directly between them. Blender uses some casts that look like
-      // { <3 x float> }* to <4 x float>*
-      if ((SrcEltTy->isStructTy() && (SrcEltTy->getStructNumElements() != 1)))
-        return false;
-
-      // Clang emits OpenCL 3-vector type accesses with a bitcast to the
-      // equivalent 4-element vector and accesses that, and we're looking for
-      // this pointer cast.
-      if (DL->getTypeAllocSize(SrcEltTy) != DL->getTypeAllocSize(DestEltTy))
-        return false;
-
-      return checkArgumentUses(*BCI);
+  Type *StoredType = nullptr;
+  while (!Worklist.empty()) {
+    Use *U = Worklist.pop_back_val();
+
+    if (auto *BCI = dyn_cast<BitCastInst>(U->getUser())) {
+      for (Use &U : BCI->uses())
+        Worklist.push_back(&U);
+      continue;
     }
 
-    if (!SI->isSimple() ||
-        U.getOperandNo() != StoreInst::getPointerOperandIndex())
-      return false;
+    if (auto *SI = dyn_cast<StoreInst>(U->getUser())) {
+      if (UseCount++ > MaxUses)
+        return nullptr;
+
+      if (!SI->isSimple() ||
+          U->getOperandNo() != StoreInst::getPointerOperandIndex())
+        return nullptr;
 
-    ++UseCount;
+      if (StoredType && StoredType != SI->getValueOperand()->getType())
+        return nullptr; // More than one type.
+      StoredType = SI->getValueOperand()->getType();
+      continue;
+    }
+
+    // Unsupported user.
+    return nullptr;
   }
 
-  // Skip unused arguments.
-  return UseCount > 0;
+  return StoredType;
 }
 
-bool AMDGPURewriteOutArguments::isOutArgumentCandidate(Argument &Arg) const {
+Type *AMDGPURewriteOutArguments::getOutArgumentType(Argument &Arg) const {
   const unsigned MaxOutArgSizeBytes = 4 * MaxNumRetRegs;
   PointerType *ArgTy = dyn_cast<PointerType>(Arg.getType());
 
   // TODO: It might be useful for any out arguments, not just privates.
   if (!ArgTy || (ArgTy->getAddressSpace() != DL->getAllocaAddrSpace() &&
                  !AnyAddressSpace) ||
-      Arg.hasByValAttr() || Arg.hasStructRetAttr() ||
-      DL->getTypeStoreSize(ArgTy->getPointerElementType()) > MaxOutArgSizeBytes) {
-    return false;
+      Arg.hasByValAttr() || Arg.hasStructRetAttr()) {
+    return nullptr;
   }
 
-  return checkArgumentUses(Arg);
+  Type *StoredType = getStoredType(Arg);
+  if (!StoredType || DL->getTypeStoreSize(StoredType) > MaxOutArgSizeBytes)
+    return nullptr;
+
+  return StoredType;
 }
 
 bool AMDGPURewriteOutArguments::doInitialization(Module &M) {
@@ -187,22 +172,6 @@ bool AMDGPURewriteOutArguments::doInitialization(Module &M) {
   return false;
 }
 
-#ifndef NDEBUG
-bool AMDGPURewriteOutArguments::isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const {
-  auto *VT0 = dyn_cast<FixedVectorType>(Ty0);
-  auto *VT1 = dyn_cast<FixedVectorType>(Ty1);
-  if (!VT0 || !VT1)
-    return false;
-
-  if (VT0->getNumElements() != 3 ||
-      VT1->getNumElements() != 4)
-    return false;
-
-  return DL->getTypeSizeInBits(VT0->getElementType()) ==
-         DL->getTypeSizeInBits(VT1->getElementType());
-}
-#endif
-
 bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
@@ -215,7 +184,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
   MDA = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
 
   unsigned ReturnNumRegs = 0;
-  SmallSet<int, 4> OutArgIndexes;
+  SmallDenseMap<int, Type *, 4> OutArgIndexes;
   SmallVector<Type *, 4> ReturnTypes;
   Type *RetTy = F.getReturnType();
   if (!RetTy->isVoidTy()) {
@@ -227,12 +196,12 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
     ReturnTypes.push_back(RetTy);
   }
 
-  SmallVector<Argument *, 4> OutArgs;
+  SmallVector<std::pair<Argument *, Type *>, 4> OutArgs;
   for (Argument &Arg : F.args()) {
-    if (isOutArgumentCandidate(Arg)) {
+    if (Type *Ty = getOutArgumentType(Arg)) {
       LLVM_DEBUG(dbgs() << "Found possible out argument " << Arg
                         << " in function " << F.getName() << '\n');
-      OutArgs.push_back(&Arg);
+      OutArgs.push_back({&Arg, Ty});
     }
   }
 
@@ -264,11 +233,12 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
     // first. On the second iteration we've removed that out clobbering argument
     // (by effectively moving it into another function) and will find the second
     // argument is OK to move.
-    for (Argument *OutArg : OutArgs) {
+    for (const auto &Pair : OutArgs) {
       bool ThisReplaceable = true;
       SmallVector<std::pair<ReturnInst *, StoreInst *>, 4> ReplaceableStores;
 
-      Type *ArgTy = OutArg->getType()->getPointerElementType();
+      Argument *OutArg = Pair.first;
+      Type *ArgTy = Pair.second;
 
       // Skip this argument if converting it will push us over the register
       // count to return limit.
@@ -324,7 +294,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
 
       if (ThisReplaceable) {
         ReturnTypes.push_back(ArgTy);
-        OutArgIndexes.insert(OutArg->getArgNo());
+        OutArgIndexes.insert({OutArg->getArgNo(), ArgTy});
         ++NumOutArgumentsReplaced;
         Changing = true;
       }
@@ -376,32 +346,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
     if (RetVal)
       NewRetVal = B.CreateInsertValue(NewRetVal, RetVal, RetIdx++);
 
-    for (std::pair<Argument *, Value *> ReturnPoint : Replacement.second) {
-      Argument *Arg = ReturnPoint.first;
-      Value *Val = ReturnPoint.second;
-      Type *EltTy = Arg->getType()->getPointerElementType();
-      if (Val->getType() != EltTy) {
-        Type *EffectiveEltTy = EltTy;
-        if (StructType *CT = dyn_cast<StructType>(EltTy)) {
-          assert(CT->getNumElements() == 1);
-          EffectiveEltTy = CT->getElementType(0);
-        }
-
-        if (DL->getTypeSizeInBits(EffectiveEltTy) !=
-            DL->getTypeSizeInBits(Val->getType())) {
-          assert(isVec3ToVec4Shuffle(EffectiveEltTy, Val->getType()));
-          Val = B.CreateShuffleVector(Val, ArrayRef<int>{0, 1, 2});
-        }
-
-        Val = B.CreateBitCast(Val, EffectiveEltTy);
-
-        // Re-create single element composite.
-        if (EltTy != EffectiveEltTy)
-          Val = B.CreateInsertValue(UndefValue::get(EltTy), Val, 0);
-      }
-
-      NewRetVal = B.CreateInsertValue(NewRetVal, Val, RetIdx++);
-    }
+    for (std::pair<Argument *, Value *> ReturnPoint : Replacement.second)
+      NewRetVal = B.CreateInsertValue(NewRetVal, ReturnPoint.second, RetIdx++);
 
     if (RetVal)
       RI->setOperand(0, NewRetVal);
@@ -433,7 +379,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
 
     PointerType *ArgType = cast<PointerType>(Arg.getType());
 
-    auto *EltTy = ArgType->getPointerElementType();
+    Type *EltTy = OutArgIndexes[Arg.getArgNo()];
     const auto Align =
         DL->getValueOrABITypeAlignment(Arg.getParamAlign(), EltTy);
 

diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll
index e23ffc96342ae..d150e68d5b7b7 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-any-address-space-out-arguments -amdgpu-rewrite-out-arguments < %s | FileCheck %s
 
 ; CHECK: %void_one_out_non_private_arg_i32_1_use = type { i32 }
-; CHECK: %bitcast_pointer_as1 = type { <3 x i32> }
+; CHECK: %bitcast_pointer_as1 = type { <4 x i32> }
 
 ; CHECK-LABEL: define private %void_one_out_non_private_arg_i32_1_use @void_one_out_non_private_arg_i32_1_use.body(i32 addrspace(1)* %val) #0 {
 ; CHECK-NEXT: ret %void_one_out_non_private_arg_i32_1_use zeroinitializer
@@ -19,9 +19,8 @@ define void @void_one_out_non_private_arg_i32_1_use(i32 addrspace(1)* %val) #0 {
 ; CHECK-LABEL: define private %bitcast_pointer_as1 @bitcast_pointer_as1.body(<3 x i32> addrspace(1)* %out) #0 {
 ; CHECK-NEXT: %load = load volatile <4 x i32>, <4 x i32> addrspace(1)* undef
 ; CHECK-NEXT: %bitcast = bitcast <3 x i32> addrspace(1)* %out to <4 x i32> addrspace(1)*
-; CHECK-NEXT: %1 = shufflevector <4 x i32> %load, <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT: %2 = insertvalue %bitcast_pointer_as1 undef, <3 x i32> %1, 0
-; CHECK-NEXT: ret %bitcast_pointer_as1 %2
+; CHECK-NEXT: %1 = insertvalue %bitcast_pointer_as1 undef, <4 x i32> %load, 0
+; CHECK-NEXT: ret %bitcast_pointer_as1 %1
 
 ; CHECK-LABEL: define void @bitcast_pointer_as1(<3 x i32> addrspace(1)* %0) #1 {
 ; CHECK-NEXT: %2 = call %bitcast_pointer_as1 @bitcast_pointer_as1.body(<3 x i32> addrspace(1)* undef)

diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll
index 7e39cd6920340..af4fcce2791e8 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll
@@ -814,16 +814,16 @@ attributes #2 = { alwaysinline nounwind }
 ; CHECK-SAME: (void ()** [[OUT:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FUNC:%.*]] = load i32 ()*, i32 ()** undef, align 8
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast void ()** [[OUT]] to i32 ()**
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 ()* [[FUNC]] to void ()*
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[BITCAST_FUNC_PTR_TYPE:%.*]] undef, void ()* [[TMP1]], 0
-; CHECK-NEXT:    ret [[BITCAST_FUNC_PTR_TYPE]] [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_FUNC_PTR_TYPE:%.*]] undef, i32 ()* [[FUNC]], 0
+; CHECK-NEXT:    ret [[BITCAST_FUNC_PTR_TYPE]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_func_ptr_type
 ; CHECK-SAME: (void ()** [[TMP0:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_FUNC_PTR_TYPE:%.*]] @bitcast_func_ptr_type.body(void ()** undef)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_FUNC_PTR_TYPE]] [[TMP2]], 0
-; CHECK-NEXT:    store void ()* [[TMP3]], void ()** [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast void ()** [[TMP0]] to i32 ()**
+; CHECK-NEXT:    store i32 ()* [[TMP3]], i32 ()** [[TMP4]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ;
@@ -925,16 +925,16 @@ attributes #2 = { alwaysinline nounwind }
 ; CHECK-SAME: (<3 x i32>* [[OUT:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LOAD:%.*]] = load volatile <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
 ; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast <3 x i32>* [[OUT]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[LOAD]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3I32:%.*]] undef, <3 x i32> [[TMP1]], 0
-; CHECK-NEXT:    ret [[BITCAST_POINTER_V4I32_V3I32]] [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3I32:%.*]] undef, <4 x i32> [[LOAD]], 0
+; CHECK-NEXT:    ret [[BITCAST_POINTER_V4I32_V3I32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3i32
 ; CHECK-SAME: (<3 x i32>* [[TMP0:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_V4I32_V3I32:%.*]] @bitcast_pointer_v4i32_v3i32.body(<3 x i32>* undef)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_V4I32_V3I32]] [[TMP2]], 0
-; CHECK-NEXT:    store <3 x i32> [[TMP3]], <3 x i32>* [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <3 x i32>* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
@@ -942,17 +942,16 @@ attributes #2 = { alwaysinline nounwind }
 ; CHECK-SAME: (<3 x float>* [[OUT:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LOAD:%.*]] = load volatile <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
 ; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast <3 x float>* [[OUT]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[LOAD]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <3 x i32> [[TMP1]] to <3 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3F32:%.*]] undef, <3 x float> [[TMP2]], 0
-; CHECK-NEXT:    ret [[BITCAST_POINTER_V4I32_V3F32]] [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3F32:%.*]] undef, <4 x i32> [[LOAD]], 0
+; CHECK-NEXT:    ret [[BITCAST_POINTER_V4I32_V3F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3f32
 ; CHECK-SAME: (<3 x float>* [[TMP0:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_V4I32_V3F32:%.*]] @bitcast_pointer_v4i32_v3f32.body(<3 x float>* undef)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_V4I32_V3F32]] [[TMP2]], 0
-; CHECK-NEXT:    store <3 x float> [[TMP3]], <3 x float>* [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <3 x float>* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
@@ -960,32 +959,50 @@ attributes #2 = { alwaysinline nounwind }
 ; CHECK-SAME: (float* [[OUT:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
 ; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float* [[OUT]] to i32*
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[LOAD]] to float
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[BITCAST_POINTER_I32_F32:%.*]] undef, float [[TMP1]], 0
-; CHECK-NEXT:    ret [[BITCAST_POINTER_I32_F32]] [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_I32_F32:%.*]] undef, i32 [[LOAD]], 0
+; CHECK-NEXT:    ret [[BITCAST_POINTER_I32_F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f32
 ; CHECK-SAME: (float* [[TMP0:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_I32_F32:%.*]] @bitcast_pointer_i32_f32.body(float* undef)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_I32_F32]] [[TMP2]], 0
-; CHECK-NEXT:    store float [[TMP3]], float* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP0]] to i32*
+; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f16
+; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f16.body
 ; CHECK-SAME: (half* [[OUT:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
 ; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast half* [[OUT]] to i32*
-; CHECK-NEXT:    store i32 [[LOAD]], i32* [[BITCAST]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_I32_F16:%.*]] undef, i32 [[LOAD]], 0
+; CHECK-NEXT:    ret [[BITCAST_POINTER_I32_F16]] [[TMP1]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f16
+; CHECK-SAME: (half* [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_I32_F16:%.*]] @bitcast_pointer_i32_f16.body(half* undef)
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_I32_F16]] [[TMP2]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast half* [[TMP0]] to i32*
+; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_f16_i32
+; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_f16_i32.body
 ; CHECK-SAME: (i32* [[OUT:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LOAD:%.*]] = load volatile half, half addrspace(1)* undef, align 2
 ; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32* [[OUT]] to half*
-; CHECK-NEXT:    store half [[LOAD]], half* [[BITCAST]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_F16_I32:%.*]] undef, half [[LOAD]], 0
+; CHECK-NEXT:    ret [[BITCAST_POINTER_F16_I32]] [[TMP1]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_f16_i32
+; CHECK-SAME: (i32* [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_F16_I32:%.*]] @bitcast_pointer_f16_i32.body(i32* undef)
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_F16_I32]] [[TMP2]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP0]] to half*
+; CHECK-NEXT:    store half [[TMP3]], half* [[TMP4]], align 2
 ; CHECK-NEXT:    ret void
 ;
 ;
@@ -993,17 +1010,16 @@ attributes #2 = { alwaysinline nounwind }
 ; CHECK-SAME: (%struct.v3f32* [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x float>*
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[EXTRACTVEC]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[STRUCT_V3F32:%.*]] undef, <3 x float> [[TMP1]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3F32:%.*]] undef, [[STRUCT_V3F32]] [[TMP2]], 0
-; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V3F32]] [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3F32:%.*]] undef, <4 x float> [[EXTRACTVEC]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V3F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3f32
 ; CHECK-SAME: (%struct.v3f32* [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V3F32:%.*]] @bitcast_struct_v3f32_v3f32.body(%struct.v3f32* undef, <3 x float> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V3F32]] [[TMP3]], 0
-; CHECK-NEXT:    store [[STRUCT_V3F32:%.*]] [[TMP4]], %struct.v3f32* [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32* [[TMP0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
@@ -1011,52 +1027,48 @@ attributes #2 = { alwaysinline nounwind }
 ; CHECK-SAME: (%struct.v3f32* [[OUT:%.*]], <3 x i32> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[VALUE]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[EXTRACTVEC]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <3 x i32> [[TMP1]] to <3 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue [[STRUCT_V3F32:%.*]] undef, <3 x float> [[TMP2]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3I32:%.*]] undef, [[STRUCT_V3F32]] [[TMP3]], 0
-; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V3I32]] [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3I32:%.*]] undef, <4 x i32> [[EXTRACTVEC]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V3I32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3i32
 ; CHECK-SAME: (%struct.v3f32* [[TMP0:%.*]], <3 x i32> [[TMP1:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V3I32:%.*]] @bitcast_struct_v3f32_v3i32.body(%struct.v3f32* undef, <3 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V3I32]] [[TMP3]], 0
-; CHECK-NEXT:    store [[STRUCT_V3F32:%.*]] [[TMP4]], %struct.v3f32* [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v4f32.body
 ; CHECK-SAME: (%struct.v4f32* [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v4f32* [[OUT]] to <4 x float>*
-; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[STRUCT_V4F32:%.*]] undef, <4 x float> [[VALUE]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V4F32:%.*]] undef, [[STRUCT_V4F32]] [[TMP1]], 0
-; CHECK-NEXT:    ret [[BITCAST_STRUCT_V4F32_V4F32]] [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V4F32:%.*]] undef, <4 x float> [[VALUE]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_V4F32_V4F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v4f32
 ; CHECK-SAME: (%struct.v4f32* [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V4F32_V4F32:%.*]] @bitcast_struct_v4f32_v4f32.body(%struct.v4f32* undef, <4 x float> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V4F32_V4F32]] [[TMP3]], 0
-; CHECK-NEXT:    store [[STRUCT_V4F32:%.*]] [[TMP4]], %struct.v4f32* [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v4f32* [[TMP0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v4i32.body
 ; CHECK-SAME: (%struct.v3f32* [[OUT:%.*]], <4 x i32> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[VALUE]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <3 x i32> [[TMP1]] to <3 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue [[STRUCT_V3F32:%.*]] undef, <3 x float> [[TMP2]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V4I32:%.*]] undef, [[STRUCT_V3F32]] [[TMP3]], 0
-; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V4I32]] [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V4I32:%.*]] undef, <4 x i32> [[VALUE]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V4I32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v4i32
 ; CHECK-SAME: (%struct.v3f32* [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V4I32:%.*]] @bitcast_struct_v3f32_v4i32.body(%struct.v3f32* undef, <4 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V4I32]] [[TMP3]], 0
-; CHECK-NEXT:    store [[STRUCT_V3F32:%.*]] [[TMP4]], %struct.v3f32* [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
@@ -1064,62 +1076,97 @@ attributes #2 = { alwaysinline nounwind }
 ; CHECK-SAME: (%struct.v4f32* [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v4f32* [[OUT]] to <4 x float>*
-; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[STRUCT_V4F32:%.*]] undef, <4 x float> [[EXTRACTVEC]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V3F32:%.*]] undef, [[STRUCT_V4F32]] [[TMP1]], 0
-; CHECK-NEXT:    ret [[BITCAST_STRUCT_V4F32_V3F32]] [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V3F32:%.*]] undef, <4 x float> [[EXTRACTVEC]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_V4F32_V3F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v3f32
 ; CHECK-SAME: (%struct.v4f32* [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V4F32_V3F32:%.*]] @bitcast_struct_v4f32_v3f32.body(%struct.v4f32* undef, <3 x float> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V4F32_V3F32]] [[TMP3]], 0
-; CHECK-NEXT:    store [[STRUCT_V4F32:%.*]] [[TMP4]], %struct.v4f32* [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v4f32* [[TMP0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v2f32
+; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v2f32.body
 ; CHECK-SAME: (%struct.v3f32* [[OUT:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32* [[OUT]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[VALUE]], <2 x float>* [[CAST]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V2F32:%.*]] undef, <2 x float> [[VALUE]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V2F32]] [[TMP1]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v2f32
+; CHECK-SAME: (%struct.v3f32* [[TMP0:%.*]], <2 x float> [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V2F32:%.*]] @bitcast_struct_v3f32_v2f32.body(%struct.v3f32* undef, <2 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V2F32]] [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32* [[TMP0]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP4]], <2 x float>* [[TMP5]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v3f32
+; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v3f32.body
 ; CHECK-SAME: (%struct.v3f32.f32* [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32.f32* [[OUT]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[EXTRACTVEC]], <4 x float>* [[CAST]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_F32_V3F32:%.*]] undef, <4 x float> [[EXTRACTVEC]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_F32_V3F32]] [[TMP1]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v3f32
+; CHECK-SAME: (%struct.v3f32.f32* [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_F32_V3F32:%.*]] @bitcast_struct_v3f32_f32_v3f32.body(%struct.v3f32.f32* undef, <3 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_F32_V3F32]] [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32.f32* [[TMP0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v4f32
+; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v4f32.body
 ; CHECK-SAME: (%struct.v3f32.f32* [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32.f32* [[OUT]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[VALUE]], <4 x float>* [[CAST]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_F32_V4F32:%.*]] undef, <4 x float> [[VALUE]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_F32_V4F32]] [[TMP1]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v4f32
+; CHECK-SAME: (%struct.v3f32.f32* [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_F32_V4F32:%.*]] @bitcast_struct_v3f32_f32_v4f32.body(%struct.v3f32.f32* undef, <4 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_F32_V4F32]] [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32.f32* [[TMP0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_i128_v4f32.body
 ; CHECK-SAME: (%struct.i128* [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.i128* [[OUT]] to <4 x float>*
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[VALUE]] to i128
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[STRUCT_I128:%.*]] undef, i128 [[TMP1]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue [[BITCAST_STRUCT_I128_V4F32:%.*]] undef, [[STRUCT_I128]] [[TMP2]], 0
-; CHECK-NEXT:    ret [[BITCAST_STRUCT_I128_V4F32]] [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_I128_V4F32:%.*]] undef, <4 x float> [[VALUE]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_I128_V4F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_i128_v4f32
 ; CHECK-SAME: (%struct.i128* [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_I128_V4F32:%.*]] @bitcast_struct_i128_v4f32.body(%struct.i128* undef, <4 x float> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_I128_V4F32]] [[TMP3]], 0
-; CHECK-NEXT:    store [[STRUCT_I128:%.*]] [[TMP4]], %struct.i128* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.i128* [[TMP0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_array_v4i32_v4f32
+; CHECK-LABEL: define {{[^@]+}}@bitcast_array_v4i32_v4f32.body
 ; CHECK-SAME: ([4 x i32]* [[OUT:%.*]], [4 x float] [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast [4 x i32]* [[OUT]] to [4 x float]*
-; CHECK-NEXT:    store [4 x float] [[VALUE]], [4 x float]* [[CAST]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_ARRAY_V4I32_V4F32:%.*]] undef, [4 x float] [[VALUE]], 0
+; CHECK-NEXT:    ret [[BITCAST_ARRAY_V4I32_V4F32]] [[TMP1]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@bitcast_array_v4i32_v4f32
+; CHECK-SAME: ([4 x i32]* [[TMP0:%.*]], [4 x float] [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_ARRAY_V4I32_V4F32:%.*]] @bitcast_array_v4i32_v4f32.body([4 x i32]* undef, [4 x float] [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_ARRAY_V4I32_V4F32]] [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast [4 x i32]* [[TMP0]] to [4 x float]*
+; CHECK-NEXT:    store [4 x float] [[TMP4]], [4 x float]* [[TMP5]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
@@ -1130,30 +1177,36 @@ attributes #2 = { alwaysinline nounwind }
 ; CHECK:       ret0:
 ; CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
 ; CHECK-NEXT:    [[CAST0:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x float>*
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> [[EXTRACTVEC]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[STRUCT_V3F32:%.*]] undef, <3 x float> [[TMP0]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32:%.*]] undef, [[STRUCT_V3F32]] [[TMP1]], 0
-; CHECK-NEXT:    ret [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32:%.*]] undef, <4 x float> [[EXTRACTVEC]], 0
+; CHECK-NEXT:    ret [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP0]]
 ; CHECK:       ret1:
 ; CHECK-NEXT:    [[CAST1:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x float>*
 ; CHECK-NEXT:    [[LOAD:%.*]] = load <4 x float>, <4 x float> addrspace(1)* undef, align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[LOAD]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[STRUCT_V3F32]] undef, <3 x float> [[TMP3]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] undef, [[STRUCT_V3F32]] [[TMP4]], 0
-; CHECK-NEXT:    ret [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] undef, <4 x float> [[LOAD]], 0
+; CHECK-NEXT:    ret [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@multi_return_bitcast_struct_v3f32_v3f32
 ; CHECK-SAME: (i1 [[TMP0:%.*]], %struct.v3f32* [[TMP1:%.*]], <3 x float> [[TMP2:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP4:%.*]] = call [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32:%.*]] @multi_return_bitcast_struct_v3f32_v3f32.body(i1 [[TMP0]], %struct.v3f32* undef, <3 x float> [[TMP2]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP4]], 0
-; CHECK-NEXT:    store [[STRUCT_V3F32:%.*]] [[TMP5]], %struct.v3f32* [[TMP1]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.v3f32* [[TMP1]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* [[TMP6]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_v3f32_struct_v3f32
+; CHECK-LABEL: define {{[^@]+}}@bitcast_v3f32_struct_v3f32.body
 ; CHECK-SAME: (<3 x float>* [[OUT:%.*]], [[STRUCT_V3F32:%.*]] [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast <3 x float>* [[OUT]] to %struct.v3f32*
-; CHECK-NEXT:    store [[STRUCT_V3F32]] [[VALUE]], %struct.v3f32* [[CAST]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_V3F32_STRUCT_V3F32:%.*]] undef, [[STRUCT_V3F32]] [[VALUE]], 0
+; CHECK-NEXT:    ret [[BITCAST_V3F32_STRUCT_V3F32]] [[TMP1]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@bitcast_v3f32_struct_v3f32
+; CHECK-SAME: (<3 x float>* [[TMP0:%.*]], [[STRUCT_V3F32:%.*]] [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_V3F32_STRUCT_V3F32:%.*]] @bitcast_v3f32_struct_v3f32.body(<3 x float>* undef, [[STRUCT_V3F32]] [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_V3F32_STRUCT_V3F32]] [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <3 x float>* [[TMP0]] to %struct.v3f32*
+; CHECK-NEXT:    store [[STRUCT_V3F32]] [[TMP4]], %struct.v3f32* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;


        

