[clang] [llvm] [SROA] Prefer integer types over non-promotable aggregates (PR #167771)

Wed Nov 26 13:50:32 PST 2025

https://github.com/YonahGoldberg updated https://github.com/llvm/llvm-project/pull/167771

>From b64130735b97f310ddc226c85fe718880fa6877a Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 12 Nov 2025 21:53:29 +0000
Subject: [PATCH 01/12] prefer integer partitions

---
 llvm/lib/Transforms/Scalar/SROA.cpp             |  4 +++-
 .../Transforms/SROA/prefer-integer-partition.ll | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/SROA/prefer-integer-partition.ll

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 5c60fad6f91aa..7905cfe95336d 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5234,7 +5234,9 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
       SliceTy = TypePartitionTy;
 
   // If still not, can we use the largest bitwidth integer type used?
-  if (!SliceTy && CommonUseTy.second)
+  // If SliceTy is a non-promotable aggregate, prefer to represent as an integer type
+  // because it's more likely to be promotable.
+  if ((!SliceTy || !SliceTy->isSingleValueType()) && CommonUseTy.second)
     if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) {
       SliceTy = CommonUseTy.second;
       SliceVecTy = dyn_cast<VectorType>(SliceTy);
diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
new file mode 100644
index 0000000000000..3606af8debd69
--- /dev/null
+++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=sroa -S | FileCheck %s
+
+; Ensure that the [2 x half] alloca is spanned by an i32 partition.
+
+define void @test() {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32 42 to float
+; CHECK-NEXT:    ret void
+;
+entry:
+  %alloca = alloca [2 x half]
+  store i32 42, ptr %alloca
+  %val = load float, ptr %alloca
+  ret void
+}

>From b4d756acbeabbdf58744d470fa96c5b418ca8637 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 12 Nov 2025 22:16:38 +0000
Subject: [PATCH 02/12] format

---
 llvm/lib/Transforms/Scalar/SROA.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 7905cfe95336d..e0eeb416092b1 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5234,8 +5234,8 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
       SliceTy = TypePartitionTy;
 
   // If still not, can we use the largest bitwidth integer type used?
-  // If SliceTy is a non-promotable aggregate, prefer to represent as an integer type
-  // because it's more likely to be promotable.
+  // If SliceTy is a non-promotable aggregate, prefer to represent as an integer
+  // type because it's more likely to be promotable.
   if ((!SliceTy || !SliceTy->isSingleValueType()) && CommonUseTy.second)
     if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) {
       SliceTy = CommonUseTy.second;

>From 89e8fbce60922a98fac635c4be5418a12c9d7cc3 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 13 Nov 2025 01:06:26 +0000
Subject: [PATCH 03/12] julia fix

---
 llvm/lib/Transforms/Scalar/SROA.cpp           |  4 +--
 .../SROA/prefer-integer-partition.ll          | 36 +++++++++++++++++--
 2 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index e0eeb416092b1..883a9c0c4612c 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5241,8 +5241,8 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
       SliceTy = CommonUseTy.second;
       SliceVecTy = dyn_cast<VectorType>(SliceTy);
     }
-  if ((!SliceTy || (SliceTy->isArrayTy() &&
-                    SliceTy->getArrayElementType()->isIntegerTy())) &&
+  // Try representing the partition as a legal integer type of the same size as the alloca.
+  if ((!SliceTy || SliceTy->isArrayTy()) &&
       DL.isLegalInteger(P.size() * 8)) {
     SliceTy = Type::getIntNTy(*C, P.size() * 8);
   }
diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
index 3606af8debd69..0ed400f18bc37 100644
--- a/llvm/test/Transforms/SROA/prefer-integer-partition.ll
+++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
@@ -1,10 +1,35 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=sroa -S | FileCheck %s
 
-; Ensure that the [2 x half] alloca is spanned by an i32 partition.
+; Test that SROA converts array types to integer types for promotion.
 
-define void @test() {
-; CHECK-LABEL: @test(
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32-ni:10:11:12:13"
+
+define void @test_float_array_only_intrinsics() {
+; CHECK-LABEL: @test_float_array_only_intrinsics(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %src = alloca [2 x float], align 4
+  %dst = alloca [2 x float], align 4
+  
+  ; Initialize src
+  call void @llvm.lifetime.start.p0(i64 8, ptr %src)
+  call void @llvm.lifetime.start.p0(i64 8, ptr %dst)
+  
+  ; Only intrinsic uses - no scalar loads/stores to establish common type
+  call void @llvm.memset.p0.i64(ptr %src, i8 42, i64 8, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 8, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr %src, ptr %dst, i64 8, i1 false)
+  
+  call void @llvm.lifetime.end.p0(i64 8, ptr %dst)
+  call void @llvm.lifetime.end.p0(i64 8, ptr %src)
+  ret void
+}
+
+define void @test_mixed_types() {
+; CHECK-LABEL: @test_mixed_types(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32 42 to float
 ; CHECK-NEXT:    ret void
@@ -15,3 +40,8 @@ entry:
   %val = load float, ptr %alloca
   ret void
 }
+
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)

>From 758018e1d4de6c02a207bcd7805c3094add22732 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 13 Nov 2025 01:14:32 +0000
Subject: [PATCH 04/12] format

---
 llvm/lib/Transforms/Scalar/SROA.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 883a9c0c4612c..f348bf2ca7353 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5241,9 +5241,9 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
       SliceTy = CommonUseTy.second;
       SliceVecTy = dyn_cast<VectorType>(SliceTy);
     }
-  // Try representing the partition as a legal integer type of the same size as the alloca.
-  if ((!SliceTy || SliceTy->isArrayTy()) &&
-      DL.isLegalInteger(P.size() * 8)) {
+  // Try representing the partition as a legal integer type of the same size as
+  // the alloca.
+  if ((!SliceTy || SliceTy->isArrayTy()) && DL.isLegalInteger(P.size() * 8)) {
     SliceTy = Type::getIntNTy(*C, P.size() * 8);
   }
 

>From 5a58db436fc9ea39677f454fa735af055037e6d2 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 13 Nov 2025 01:21:45 +0000
Subject: [PATCH 05/12] remove comment

---
 llvm/test/Transforms/SROA/prefer-integer-partition.ll | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
index 0ed400f18bc37..78d3b22d3fdc0 100644
--- a/llvm/test/Transforms/SROA/prefer-integer-partition.ll
+++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
@@ -14,7 +14,6 @@ entry:
   %src = alloca [2 x float], align 4
   %dst = alloca [2 x float], align 4
   
-  ; Initialize src
   call void @llvm.lifetime.start.p0(i64 8, ptr %src)
   call void @llvm.lifetime.start.p0(i64 8, ptr %dst)
   

>From 74cd7b7021d0df53ef9940fe15403114a04606cc Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 19 Nov 2025 22:48:36 +0000
Subject: [PATCH 06/12] test

---
 llvm/test/Transforms/SROA/prefer-integer-partition.ll | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
index 78d3b22d3fdc0..b9a7af6276565 100644
--- a/llvm/test/Transforms/SROA/prefer-integer-partition.ll
+++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
@@ -13,15 +13,15 @@ define void @test_float_array_only_intrinsics() {
 entry:
   %src = alloca [2 x float], align 4
   %dst = alloca [2 x float], align 4
-  
+
   call void @llvm.lifetime.start.p0(i64 8, ptr %src)
   call void @llvm.lifetime.start.p0(i64 8, ptr %dst)
-  
+
   ; Only intrinsic uses - no scalar loads/stores to establish common type
   call void @llvm.memset.p0.i64(ptr %src, i8 42, i64 8, i1 false)
   call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 8, i1 false)
   call void @llvm.memcpy.p0.p0.i64(ptr %src, ptr %dst, i64 8, i1 false)
-  
+
   call void @llvm.lifetime.end.p0(i64 8, ptr %dst)
   call void @llvm.lifetime.end.p0(i64 8, ptr %src)
   ret void

>From 7b49609c2de6461ee74f77d58e5df4a8906e6de6 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 25 Nov 2025 18:22:17 +0000
Subject: [PATCH 07/12] solve regression

---
 llvm/lib/Transforms/Scalar/SROA.cpp | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index f348bf2ca7353..e906358a9ba50 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5262,12 +5262,28 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
     SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
   assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size());
 
-  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
-
-  VectorType *VecTy =
-      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale);
-  if (VecTy)
-    SliceTy = VecTy;
+  // Prefer vector promotion over integer widening for floating-point vectors
+  // because it is more likely the user is just accessing whole vector elements
+  // and not doing bitsise arithmetic.
+  bool PreferVectorPromotion = false;
+  if (auto *FixedVecSliceTy = dyn_cast<FixedVectorType>(SliceTy))
+    PreferVectorPromotion = FixedVecSliceTy->getElementType()->isFloatingPointTy();
+
+  bool IsIntegerPromotable = false;
+  VectorType *VecTy = nullptr;
+
+  if (PreferVectorPromotion) {
+    // For float vectors, try vector promotion first
+    VecTy = isVectorPromotionViable(P, DL, VScale);
+    if (!VecTy)
+      IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
+  } else {
+    // For integer vectors (especially small integers like i8), try integer
+    // widening first as InstCombine can optimize the resulting operations
+    IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
+    if (!IsIntegerPromotable)
+      VecTy = isVectorPromotionViable(P, DL, VScale);
+  }
 
   // Check for the case where we're going to rewrite to a new alloca of the
   // exact same type as the original, and with the same access offsets. In that

>From 035337851338f723fa149a1bbaa5a73b9e5f56c3 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 26 Nov 2025 08:06:00 +0000
Subject: [PATCH 08/12] updated to fix regression

---
 llvm/lib/Transforms/Scalar/SROA.cpp | 124 +++++++++++-----------------
 1 file changed, 50 insertions(+), 74 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index e906358a9ba50..a017c2d3b49ff 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5209,81 +5209,57 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
 /// promoted.
 AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
                                    Partition &P) {
-  // Try to compute a friendly type for this partition of the alloca. This
-  // won't always succeed, in which case we fall back to a legal integer type
-  // or an i8 array of an appropriate size.
-  Type *SliceTy = nullptr;
-  VectorType *SliceVecTy = nullptr;
   const DataLayout &DL = AI.getDataLayout();
-  unsigned VScale = AI.getFunction()->getVScaleValue();
-
-  std::pair<Type *, IntegerType *> CommonUseTy =
-      findCommonType(P.begin(), P.end(), P.endOffset());
-  // Do all uses operate on the same type?
-  if (CommonUseTy.first) {
-    TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first);
-    if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
-      SliceTy = CommonUseTy.first;
-      SliceVecTy = dyn_cast<VectorType>(SliceTy);
+  auto ComputePartitionTy = [&]() -> std::tuple<Type *, bool, VectorType *> {
+    // First check if the partition is viable for vetor promotion. If it is
+    // via a floating-point vector, we are done because we would never prefer integer widening.
+    VectorType *VecTy = isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue());
+    if (VecTy) {
+      if (VecTy->getElementType()->isFloatingPointTy()) {
+        return {VecTy, false, VecTy};
+      }
     }
-  }
-  // If not, can we find an appropriate subtype in the original allocated type?
-  if (!SliceTy)
-    if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
-                                                 P.beginOffset(), P.size()))
-      SliceTy = TypePartitionTy;
-
-  // If still not, can we use the largest bitwidth integer type used?
-  // If SliceTy is a non-promotable aggregate, prefer to represent as an integer
-  // type because it's more likely to be promotable.
-  if ((!SliceTy || !SliceTy->isSingleValueType()) && CommonUseTy.second)
-    if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) {
-      SliceTy = CommonUseTy.second;
-      SliceVecTy = dyn_cast<VectorType>(SliceTy);
+
+    // Otherwise, check if there is a common type that all slices of the
+    // partition use. Collect the largest integer type used as a backup.
+    auto CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset());
+    // If there is a common type that spans the partition, use it.
+    if (CommonUseTy.first) {
+      TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first);
+      if (CommonUseSize.isFixed() &&
+          CommonUseSize.getFixedValue() >= P.size()) {
+
+        if (VecTy)
+          return {VecTy, false, VecTy};
+        return {CommonUseTy.first, isIntegerWideningViable(P, CommonUseTy.first, DL), nullptr};
+      }
     }
-  // Try representing the partition as a legal integer type of the same size as
-  // the alloca.
-  if ((!SliceTy || SliceTy->isArrayTy()) && DL.isLegalInteger(P.size() * 8)) {
-    SliceTy = Type::getIntNTy(*C, P.size() * 8);
-  }
-
-  // If the common use types are not viable for promotion then attempt to find
-  // another type that is viable.
-  if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL, VScale))
-    if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
-                                                 P.beginOffset(), P.size())) {
-      VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy);
-      if (TypePartitionVecTy &&
-          checkVectorTypeForPromotion(P, TypePartitionVecTy, DL, VScale))
-        SliceTy = TypePartitionTy;
+
+    // If not, can we find an appropriate subtype in the original allocated type?
+    if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), P.beginOffset(), P.size())) {
+      if (TypePartitionTy->isArrayTy() && TypePartitionTy->getArrayElementType()->isIntegerTy() && DL.isLegalInteger(P.size() * 8))
+        TypePartitionTy = Type::getIntNTy(*C, P.size() * 8);
+      
+      if (isIntegerWideningViable(P, TypePartitionTy, DL))
+        return {TypePartitionTy, true, nullptr};
+      if (VecTy)
+        return {VecTy, false, VecTy};
+      if (CommonUseTy.second && DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size() && isIntegerWideningViable(P, CommonUseTy.second, DL))
+        return {CommonUseTy.second, true, nullptr};
+      return {TypePartitionTy, false, nullptr};
     }
 
-  if (!SliceTy)
-    SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
-  assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size());
-
-  // Prefer vector promotion over integer widening for floating-point vectors
-  // because it is more likely the user is just accessing whole vector elements
-  // and not doing bitsise arithmetic.
-  bool PreferVectorPromotion = false;
-  if (auto *FixedVecSliceTy = dyn_cast<FixedVectorType>(SliceTy))
-    PreferVectorPromotion = FixedVecSliceTy->getElementType()->isFloatingPointTy();
-
-  bool IsIntegerPromotable = false;
-  VectorType *VecTy = nullptr;
-
-  if (PreferVectorPromotion) {
-    // For float vectors, try vector promotion first
-    VecTy = isVectorPromotionViable(P, DL, VScale);
-    if (!VecTy)
-      IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
-  } else {
-    // For integer vectors (especially small integers like i8), try integer
-    // widening first as InstCombine can optimize the resulting operations
-    IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
-    if (!IsIntegerPromotable)
-      VecTy = isVectorPromotionViable(P, DL, VScale);
-  }
+    // If still not, can we use the largest bitwidth integer type used?
+    if (CommonUseTy.second && DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size())
+      return {CommonUseTy.second, false, nullptr};
+
+    if (DL.isLegalInteger(P.size() * 8))
+      return {Type::getIntNTy(*C, P.size() * 8), false, nullptr};
+
+    return {ArrayType::get(Type::getInt8Ty(*C), P.size()), false, nullptr};
+  };
+
+  auto [PartitionTy, IsIntegerPromotable, VecTy] = ComputePartitionTy();
 
   // Check for the case where we're going to rewrite to a new alloca of the
   // exact same type as the original, and with the same access offsets. In that
@@ -5292,7 +5268,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
   // P.beginOffset() can be non-zero even with the same type in a case with
   // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
   AllocaInst *NewAI;
-  if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
+  if (PartitionTy == AI.getAllocatedType() && P.beginOffset() == 0) {
     NewAI = &AI;
     // FIXME: We should be able to bail at this point with "nothing changed".
     // FIXME: We might want to defer PHI speculation until after here.
@@ -5302,10 +5278,10 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
     const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
     // If we will get at least this much alignment from the type alone, leave
     // the alloca's alignment unconstrained.
-    const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
+    const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(PartitionTy);
     NewAI = new AllocaInst(
-        SliceTy, AI.getAddressSpace(), nullptr,
-        IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
+        PartitionTy, AI.getAddressSpace(), nullptr,
+        IsUnconstrained ? DL.getPrefTypeAlign(PartitionTy) : Alignment,
         AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
         AI.getIterator());
     // Copy the old AI debug location over to the new one.

>From c55ee9f5592d1dbf63c2298accd80fc719af5b0d Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 26 Nov 2025 08:10:43 +0000
Subject: [PATCH 09/12] remove julia test

---
 .../SROA/prefer-integer-partition.ll          | 82 +++++++++++++------
 1 file changed, 56 insertions(+), 26 deletions(-)

diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
index b9a7af6276565..5b639169cc207 100644
--- a/llvm/test/Transforms/SROA/prefer-integer-partition.ll
+++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
@@ -1,30 +1,65 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=sroa -S | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt %s -passes=sroa -S | FileCheck %s
 
-; Test that SROA converts array types to integer types for promotion.
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
 
-target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32-ni:10:11:12:13"
+%"struct.pbrt::RaySamples" = type { %struct.anon.45, %struct.anon.46, i8, %struct.anon.47 }
+%struct.anon.45 = type { %"class.pbrt::Point2", float }
+%"class.pbrt::Point2" = type { %"class.pbrt::Tuple2" }
+%"class.pbrt::Tuple2" = type { float, float }
+%struct.anon.46 = type { float, float, %"class.pbrt::Point2" }
+%struct.anon.47 = type { float, %"class.pbrt::Point2" }
 
-define void @test_float_array_only_intrinsics() {
-; CHECK-LABEL: @test_float_array_only_intrinsics(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    ret void
+define <2 x float> @subsurface_test() local_unnamed_addr {
+; CHECK-LABEL: define <2 x float> @subsurface_test() local_unnamed_addr {
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr inttoptr (i64 12 to ptr), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi float [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i1
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[BB4:.*]], label %[[_ZNK4PBRT3SOAINS_10RAYSAMPLESEEIXEI_EXIT:.*]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = load volatile { <2 x float>, <2 x float> }, ptr null, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP5]], 1
+; CHECK-NEXT:    [[BC_I:%.*]] = bitcast <2 x float> [[TMP6]] to <2 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[BC_I]], i64 1
+; CHECK-NEXT:    [[BC2_I:%.*]] = bitcast <2 x float> [[TMP7]] to <2 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[BC2_I]], i64 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32 [[TMP8]] to float
+; CHECK-NEXT:    [[DOTSROA_1_36_VEC_INSERT:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32 [[TMP9]] to float
+; CHECK-NEXT:    [[DOTSROA_1_40_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_1_36_VEC_INSERT]], float [[TMP11]], i32 1
+; CHECK-NEXT:    br label %[[_ZNK4PBRT3SOAINS_10RAYSAMPLESEEIXEI_EXIT]]
+; CHECK:       [[_ZNK4PBRT3SOAINS_10RAYSAMPLESEEIXEI_EXIT]]:
+; CHECK-NEXT:    [[TMP10:%.*]] = phi <2 x float> [ [[DOTSROA_1_40_VEC_INSERT]], %[[BB4]] ], [ zeroinitializer, [[TMP0:%.*]] ]
+; CHECK-NEXT:    ret <2 x float> [[TMP10]]
 ;
-entry:
-  %src = alloca [2 x float], align 4
-  %dst = alloca [2 x float], align 4
+  %1 = alloca %"struct.pbrt::RaySamples", align 4
+  %2 = getelementptr i8, ptr %1, i64 36
+  store i64 0, ptr %2, align 4
+  %3 = load float, ptr inttoptr (i64 12 to ptr), align 4
+  %4 = fptosi float %3 to i32
+  %5 = trunc i32 %4 to i1
+  br i1 %5, label %6, label %_ZNK4pbrt3SOAINS_10RaySamplesEEixEi.exit
 
-  call void @llvm.lifetime.start.p0(i64 8, ptr %src)
-  call void @llvm.lifetime.start.p0(i64 8, ptr %dst)
+6:                                                ; preds = %0
+  %7 = load volatile { <2 x float>, <2 x float> }, ptr null, align 8
+  %8 = extractvalue { <2 x float>, <2 x float> } %7, 0
+  %9 = extractvalue { <2 x float>, <2 x float> } %7, 1
+  store float 0.000000e+00, ptr %1, align 4
+  %bc.i = bitcast <2 x float> %8 to <2 x i32>
+  %10 = extractelement <2 x i32> %bc.i, i64 1
+  %bc2.i = bitcast <2 x float> %9 to <2 x i32>
+  %11 = extractelement <2 x i32> %bc2.i, i64 0
+  store i32 %10, ptr %2, align 4
+  %.sroa_idx1.i = getelementptr i8, ptr %1, i64 40
+  store i32 %11, ptr %.sroa_idx1.i, align 4
+  br label %_ZNK4pbrt3SOAINS_10RaySamplesEEixEi.exit
 
-  ; Only intrinsic uses - no scalar loads/stores to establish common type
-  call void @llvm.memset.p0.i64(ptr %src, i8 42, i64 8, i1 false)
-  call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 8, i1 false)
-  call void @llvm.memcpy.p0.p0.i64(ptr %src, ptr %dst, i64 8, i1 false)
-
-  call void @llvm.lifetime.end.p0(i64 8, ptr %dst)
-  call void @llvm.lifetime.end.p0(i64 8, ptr %src)
-  ret void
+_ZNK4pbrt3SOAINS_10RaySamplesEEixEi.exit:         ; preds = %0, %6
+  %12 = getelementptr inbounds nuw i8, ptr %1, i64 36
+  %.sroa.01.0.copyload = load <2 x float>, ptr %12, align 4
+  ret <2 x float> %.sroa.01.0.copyload
 }
 
 define void @test_mixed_types() {
@@ -39,8 +74,3 @@ entry:
   %val = load float, ptr %alloca
   ret void
 }
-
-declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
-declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture)

>From ee7fb53531008e1088d8115557ab547f842bcdb9 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 26 Nov 2025 08:10:51 +0000
Subject: [PATCH 10/12] format

---
 llvm/lib/Transforms/Scalar/SROA.cpp | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index a017c2d3b49ff..f583dd67d60a8 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5212,8 +5212,10 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
   const DataLayout &DL = AI.getDataLayout();
   auto ComputePartitionTy = [&]() -> std::tuple<Type *, bool, VectorType *> {
     // First check if the partition is viable for vetor promotion. If it is
-    // via a floating-point vector, we are done because we would never prefer integer widening.
-    VectorType *VecTy = isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue());
+    // via a floating-point vector, we are done because we would never prefer
+    // integer widening.
+    VectorType *VecTy =
+        isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue());
     if (VecTy) {
       if (VecTy->getElementType()->isFloatingPointTy()) {
         return {VecTy, false, VecTy};
@@ -5231,26 +5233,34 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
 
         if (VecTy)
           return {VecTy, false, VecTy};
-        return {CommonUseTy.first, isIntegerWideningViable(P, CommonUseTy.first, DL), nullptr};
+        return {CommonUseTy.first,
+                isIntegerWideningViable(P, CommonUseTy.first, DL), nullptr};
       }
     }
 
-    // If not, can we find an appropriate subtype in the original allocated type?
-    if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), P.beginOffset(), P.size())) {
-      if (TypePartitionTy->isArrayTy() && TypePartitionTy->getArrayElementType()->isIntegerTy() && DL.isLegalInteger(P.size() * 8))
+    // If not, can we find an appropriate subtype in the original allocated
+    // type?
+    if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
+                                                 P.beginOffset(), P.size())) {
+      if (TypePartitionTy->isArrayTy() &&
+          TypePartitionTy->getArrayElementType()->isIntegerTy() &&
+          DL.isLegalInteger(P.size() * 8))
         TypePartitionTy = Type::getIntNTy(*C, P.size() * 8);
-      
+
       if (isIntegerWideningViable(P, TypePartitionTy, DL))
         return {TypePartitionTy, true, nullptr};
       if (VecTy)
         return {VecTy, false, VecTy};
-      if (CommonUseTy.second && DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size() && isIntegerWideningViable(P, CommonUseTy.second, DL))
+      if (CommonUseTy.second &&
+          DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size() &&
+          isIntegerWideningViable(P, CommonUseTy.second, DL))
         return {CommonUseTy.second, true, nullptr};
       return {TypePartitionTy, false, nullptr};
     }
 
     // If still not, can we use the largest bitwidth integer type used?
-    if (CommonUseTy.second && DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size())
+    if (CommonUseTy.second &&
+        DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size())
       return {CommonUseTy.second, false, nullptr};
 
     if (DL.isLegalInteger(P.size() * 8))

>From 1aa16e3737600aa1daad2c2000c09fc939c4da33 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 26 Nov 2025 21:00:40 +0000
Subject: [PATCH 11/12] improvements on arm

---
 .../AArch64/neon-scalar-x-indexed-elem.c      |  98 +++-----
 .../CodeGen/arm-bf16-convert-intrinsics.c     | 237 ++++++++----------
 2 files changed, 143 insertions(+), 192 deletions(-)

diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
index 9b98126500444..2b1af62789eac 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
@@ -111,8 +111,8 @@ float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x double> [[A]], i32 0
-// CHECK-NEXT:    [[VGET_LANE4:%.*]] = extractelement <1 x double> [[B]], i32 0
-// CHECK-NEXT:    [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE4]])
+// CHECK-NEXT:    [[VGET_LANE3:%.*]] = extractelement <1 x double> [[B]], i32 0
+// CHECK-NEXT:    [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE3]])
 // CHECK-NEXT:    [[VSET_LANE:%.*]] = insertelement <1 x double> [[A]], double [[VMULXD_F64_I]], i32 0
 // CHECK-NEXT:    ret <1 x double> [[VSET_LANE]]
 //
@@ -196,19 +196,13 @@ float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
 // CHECK-LABEL: define dso_local <1 x double> @test_vfma_lane_f64(
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT:    [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
-// CHECK-NEXT:    [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
-// CHECK-NEXT:    [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
 // CHECK-NEXT:    [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
 // CHECK-NEXT:    ret <1 x double> [[FMLA2]]
 //
@@ -219,20 +213,14 @@ float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
 // CHECK-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT:    [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
 // CHECK-NEXT:    [[FNEG:%.*]] = fneg <1 x double> [[B]]
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64
-// CHECK-NEXT:    [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
-// CHECK-NEXT:    [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
 // CHECK-NEXT:    [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
 // CHECK-NEXT:    ret <1 x double> [[FMLA2]]
 //
@@ -243,21 +231,16 @@ float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
 // CHECK-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64(
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT:    [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
-// CHECK-NEXT:    [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
-// CHECK-NEXT:    [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
-// CHECK-NEXT:    ret <1 x double> [[TMP10]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-NEXT:    [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
+// CHECK-NEXT:    ret <1 x double> [[TMP7]]
 //
 float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfma_laneq_f64(a, b, v, 0);
@@ -266,22 +249,17 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
 // CHECK-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64(
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT:    [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
 // CHECK-NEXT:    [[FNEG:%.*]] = fneg <1 x double> [[B]]
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64
-// CHECK-NEXT:    [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
-// CHECK-NEXT:    [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
-// CHECK-NEXT:    ret <1 x double> [[TMP10]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-NEXT:    [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
+// CHECK-NEXT:    ret <1 x double> [[TMP7]]
 //
 float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfms_laneq_f64(a, b, v, 0);
@@ -555,8 +533,8 @@ int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
 // CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
-// CHECK-NEXT:    [[VGET_LANE9:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
-// CHECK-NEXT:    [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE9]])
+// CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK-NEXT:    [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE8]])
 // CHECK-NEXT:    [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
 // CHECK-NEXT:    ret <1 x double> [[VSET_LANE]]
 //
diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
index 65a23dc0325c8..ee1c1af53811d 100644
--- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
@@ -26,21 +26,19 @@
 // CHECK-A64-NEXT:  entry:
 // CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16>
 // CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-A64-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-// CHECK-A64-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
-// CHECK-A64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
-// CHECK-A64-NEXT:    ret <4 x float> [[TMP4]]
+// CHECK-A64-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
+// CHECK-A64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A64-NEXT:    ret <4 x float> [[TMP3]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvt_f32_bf16(
 // CHECK-A32-HARDFP-NEXT:  entry:
 // CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16>
 // CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP4]]
+// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP3]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvt_f32_bf16(
 // CHECK-A32-SOFTFP-NEXT:  entry:
@@ -49,11 +47,10 @@
 // CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x bfloat>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <4 x i16>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP6]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP7]]
+// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP5]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP6]]
 //
 float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
   return vcvt_f32_bf16(a);
@@ -64,22 +61,20 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
 // CHECK-A64-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
 // CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-A64-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-// CHECK-A64-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
-// CHECK-A64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A64-NEXT:    ret <4 x float> [[TMP4]]
+// CHECK-A64-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
+// CHECK-A64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A64-NEXT:    ret <4 x float> [[TMP3]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_f32_bf16(
 // CHECK-A32-HARDFP-NEXT:  entry:
 // CHECK-A32-HARDFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
 // CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP4]]
+// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP3]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_f32_bf16(
 // CHECK-A32-SOFTFP-NEXT:  entry:
@@ -95,11 +90,10 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
 // CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP8]] to <4 x i16>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP12:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP13]]
+// CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = sext <4 x i16> [[TMP9]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP12]]
 //
 float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
   return vcvtq_low_f32_bf16(a);
@@ -110,22 +104,20 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
 // CHECK-A64-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
 // CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-A64-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-// CHECK-A64-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
-// CHECK-A64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A64-NEXT:    ret <4 x float> [[TMP4]]
+// CHECK-A64-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
+// CHECK-A64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A64-NEXT:    ret <4 x float> [[TMP3]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_f32_bf16(
 // CHECK-A32-HARDFP-NEXT:  entry:
 // CHECK-A32-HARDFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
 // CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP4]]
+// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP3]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_high_f32_bf16(
 // CHECK-A32-SOFTFP-NEXT:  entry:
@@ -141,11 +133,10 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
 // CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP8]] to <4 x i16>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP12:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP13]]
+// CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = sext <4 x i16> [[TMP9]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP12]]
 //
 float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
   return vcvtq_high_f32_bf16(a);
@@ -153,33 +144,30 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
 
 // CHECK-A64-LABEL: @test_vcvt_bf16_f32(
 // CHECK-A64-NEXT:  entry:
-// CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-A64-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-A64-NEXT:    [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x bfloat>
-// CHECK-A64-NEXT:    ret <4 x bfloat> [[TMP3]]
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A64-NEXT:    [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
+// CHECK-A64-NEXT:    [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT:    [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <16 x i8>
+// CHECK-A64-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A64-NEXT:    ret <4 x bfloat> [[SHUFFLE_I]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
 // CHECK-A32-HARDFP-NEXT:  entry:
-// CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]])
+// CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
 // CHECK-A32-HARDFP-NEXT:    ret <4 x bfloat> [[VCVTFP2BF1_I]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvt_bf16_f32(
 // CHECK-A32-SOFTFP-NEXT:  entry:
-// CHECK-A32-SOFTFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]])
-// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = bitcast <4 x bfloat> [[TMP4]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    ret <2 x i32> [[TMP7]]
+// CHECK-A32-SOFTFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
+// CHECK-A32-SOFTFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = bitcast <4 x bfloat> [[TMP1]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    ret <2 x i32> [[TMP6]]
 //
 bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
   return vcvt_bf16_f32(a);
@@ -187,44 +175,36 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
 
 // CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
 // CHECK-A64-NEXT:  entry:
-// CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-A64-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-A64-NEXT:    [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x bfloat>
-// CHECK-A64-NEXT:    [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP3]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A64-NEXT:    ret <8 x bfloat> [[TMP4]]
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A64-NEXT:    [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
+// CHECK-A64-NEXT:    [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT:    [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <16 x i8>
+// CHECK-A64-NEXT:    ret <8 x bfloat> [[TMP2]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
 // CHECK-A32-HARDFP-NEXT:  entry:
-// CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast i64 0 to <4 x bfloat>
-// CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]])
-// CHECK-A32-HARDFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP0]], <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
+// CHECK-A32-HARDFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 // CHECK-A32-HARDFP-NEXT:    ret <8 x bfloat> [[SHUFFLE_I]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_bf16_f32(
 // CHECK-A32-SOFTFP-NEXT:  entry:
-// CHECK-A32-SOFTFP-NEXT:    [[TMP0:%.*]] = bitcast i64 0 to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]])
-// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
+// CHECK-A32-SOFTFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = bitcast <4 x bfloat> [[TMP1]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x bfloat>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP0]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP6]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP8]], <4 x bfloat> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP10:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP12:%.*]] = bitcast <8 x bfloat> [[TMP11]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> [[TMP12]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    ret <4 x i32> [[TMP14]]
+// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> zeroinitializer to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP5]], <4 x bfloat> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = bitcast <8 x bfloat> [[TMP10]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    ret <4 x i32> [[TMP11]]
 //
 bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
   return vcvtq_low_bf16_f32(a);
@@ -232,23 +212,18 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
 
 // CHECK-A64-LABEL: @test_vcvtq_high_bf16_f32(
 // CHECK-A64-NEXT:  entry:
-// CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <8 x i16>
-// CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-A64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-A64-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x bfloat>
-// CHECK-A64-NEXT:    [[TMP5:%.*]] = shufflevector <8 x bfloat> [[TMP4]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A64-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-A64-NEXT:    [[TMP7:%.*]] = fptrunc <4 x float> [[TMP6]] to <4 x bfloat>
-// CHECK-A64-NEXT:    [[TMP8:%.*]] = shufflevector <4 x bfloat> [[TMP5]], <4 x bfloat> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A64-NEXT:    ret <8 x bfloat> [[TMP8]]
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
+// CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A64-NEXT:    [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A64-NEXT:    [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
+// CHECK-A64-NEXT:    [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT:    [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[TMP4]] to <16 x i8>
+// CHECK-A64-NEXT:    ret <8 x bfloat> [[TMP4]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
 // CHECK-A32-HARDFP-NEXT:  entry:
-// CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]])
+// CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
 // CHECK-A32-HARDFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[INACTIVE:%.*]], <8 x bfloat> [[INACTIVE]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-A32-HARDFP-NEXT:    [[SHUFFLE_I8:%.*]] = shufflevector <4 x bfloat> [[VCVTFP2BF1_I]], <4 x bfloat> [[SHUFFLE_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 // CHECK-A32-HARDFP-NEXT:    ret <8 x bfloat> [[SHUFFLE_I8]]
@@ -258,29 +233,27 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
 // CHECK-A32-SOFTFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[INACTIVE_COERCE:%.*]] to <8 x bfloat>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP1:%.*]] = bitcast <8 x bfloat> [[TMP0]] to <4 x i32>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]])
-// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP9]], <8 x bfloat> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP10:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP12:%.*]] = bitcast <4 x bfloat> [[TMP7]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP13:%.*]] = bitcast <4 x bfloat> [[TMP11]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
+// CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = bitcast <4 x bfloat> [[TMP4]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP8]], <8 x bfloat> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP12:%.*]] = bitcast <4 x bfloat> [[TMP10]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP13:%.*]] = bitcast <2 x i32> [[TMP11]] to <4 x bfloat>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP12]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[TMP14]], <4 x bfloat> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP16:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I17]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP17:%.*]] = bitcast <4 x i32> [[TMP16]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP18:%.*]] = bitcast <8 x bfloat> [[TMP17]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP19:%.*]] = bitcast <4 x i32> [[TMP18]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP20:%.*]] = bitcast <8 x bfloat> [[TMP19]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    ret <4 x i32> [[TMP20]]
+// CHECK-A32-SOFTFP-NEXT:    [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[TMP13]], <4 x bfloat> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP15:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I17]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> [[TMP15]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP17:%.*]] = bitcast <8 x bfloat> [[TMP16]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP18:%.*]] = bitcast <4 x i32> [[TMP17]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP19:%.*]] = bitcast <8 x bfloat> [[TMP18]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    ret <4 x i32> [[TMP19]]
 //
 bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {
   return vcvtq_high_bf16_f32(inactive, a);
@@ -308,7 +281,7 @@ bfloat16_t test_vcvth_bf16_f32(float32_t a) {
 // CHECK-LABEL: @test_vcvtah_f32_bf16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast bfloat [[A:%.*]] to i16
-// CHECK-NEXT:    [[CONV_I:%.*]] = zext i16 [[TMP0]] to i32
+// CHECK-NEXT:    [[CONV_I:%.*]] = sext i16 [[TMP0]] to i32
 // CHECK-NEXT:    [[SHL_I:%.*]] = shl i32 [[CONV_I]], 16
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[SHL_I]] to float
 // CHECK-NEXT:    ret float [[TMP1]]

>From 3c3a6773827811c9099032b98c20d76a0f17a412 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 26 Nov 2025 21:50:11 +0000
Subject: [PATCH 12/12] arm changes

---
 .../AArch64/neon-scalar-x-indexed-elem.c      | 120 +++++----
 .../CodeGen/arm-bf16-convert-intrinsics.c     | 231 ++++++++++--------
 2 files changed, 199 insertions(+), 152 deletions(-)

diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
index 2b1af62789eac..a86a80a939b16 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
@@ -56,8 +56,8 @@ float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to double
 // CHECK-NEXT:    [[TMP1:%.*]] = fmul double [[TMP0]], [[B]]
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast double [[TMP1]] to <1 x double>
-// CHECK-NEXT:    ret <1 x double> [[TMP2]]
+// CHECK-NEXT:    [[REF_TMP_I_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double [[TMP1]], i32 0
+// CHECK-NEXT:    ret <1 x double> [[REF_TMP_I_0_VEC_INSERT]]
 //
 float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
   return vmul_n_f64(a, b);
@@ -111,8 +111,8 @@ float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x double> [[A]], i32 0
-// CHECK-NEXT:    [[VGET_LANE3:%.*]] = extractelement <1 x double> [[B]], i32 0
-// CHECK-NEXT:    [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE3]])
+// CHECK-NEXT:    [[VGET_LANE4:%.*]] = extractelement <1 x double> [[B]], i32 0
+// CHECK-NEXT:    [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE4]])
 // CHECK-NEXT:    [[VSET_LANE:%.*]] = insertelement <1 x double> [[A]], double [[VMULXD_F64_I]], i32 0
 // CHECK-NEXT:    ret <1 x double> [[VSET_LANE]]
 //
@@ -196,13 +196,19 @@ float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
 // CHECK-LABEL: define dso_local <1 x double> @test_vfma_lane_f64(
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V]] to <8 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT:    [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// CHECK-NEXT:    [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
+// CHECK-NEXT:    [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer
+// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
 // CHECK-NEXT:    [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
 // CHECK-NEXT:    ret <1 x double> [[FMLA2]]
 //
@@ -213,14 +219,20 @@ float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
 // CHECK-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT:    [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
 // CHECK-NEXT:    [[FNEG:%.*]] = fneg <1 x double> [[B]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V]] to <8 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64
+// CHECK-NEXT:    [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
+// CHECK-NEXT:    [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer
+// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
 // CHECK-NEXT:    [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
 // CHECK-NEXT:    ret <1 x double> [[FMLA2]]
 //
@@ -231,16 +243,21 @@ float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
 // CHECK-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64(
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <16 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// CHECK-NEXT:    [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// CHECK-NEXT:    ret <1 x double> [[TMP7]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT:    [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// CHECK-NEXT:    [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
+// CHECK-NEXT:    [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
+// CHECK-NEXT:    ret <1 x double> [[TMP10]]
 //
 float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfma_laneq_f64(a, b, v, 0);
@@ -249,17 +266,22 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
 // CHECK-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64(
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT:    [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
 // CHECK-NEXT:    [[FNEG:%.*]] = fneg <1 x double> [[B]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <16 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// CHECK-NEXT:    [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// CHECK-NEXT:    ret <1 x double> [[TMP7]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64
+// CHECK-NEXT:    [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
+// CHECK-NEXT:    [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
+// CHECK-NEXT:    ret <1 x double> [[TMP10]]
 //
 float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfms_laneq_f64(a, b, v, 0);
@@ -530,12 +552,12 @@ int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
 // CHECK-LABEL: define dso_local <1 x double> @test_vmulx_lane_f64_0(
 // CHECK-SAME: ) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
-// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
-// CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
-// CHECK-NEXT:    [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE8]])
-// CHECK-NEXT:    [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
+// CHECK-NEXT:    [[__PROMOTE_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FD6304BC43AB5C2, i32 0
+// CHECK-NEXT:    [[__PROMOTE2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FEE211E215AEEF3, i32 0
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], i32 0
+// CHECK-NEXT:    [[VGET_LANE9:%.*]] = extractelement <1 x double> [[__PROMOTE2_SROA_0_0_VEC_INSERT]], i32 0
+// CHECK-NEXT:    [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE9]])
+// CHECK-NEXT:    [[VSET_LANE:%.*]] = insertelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], double [[VMULXD_F64_I]], i32 0
 // CHECK-NEXT:    ret <1 x double> [[VSET_LANE]]
 //
 float64x1_t test_vmulx_lane_f64_0() {
@@ -552,13 +574,13 @@ float64x1_t test_vmulx_lane_f64_0() {
 // CHECK-LABEL: define dso_local <1 x double> @test_vmulx_laneq_f64_2(
 // CHECK-SAME: ) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
-// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
-// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
+// CHECK-NEXT:    [[__PROMOTE_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FD6304BC43AB5C2, i32 0
+// CHECK-NEXT:    [[__PROMOTE2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FEE211E215AEEF3, i32 0
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], <1 x double> [[__PROMOTE2_SROA_0_0_VEC_INSERT]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], i32 0
 // CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[SHUFFLE_I]], i32 1
 // CHECK-NEXT:    [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
-// CHECK-NEXT:    [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
+// CHECK-NEXT:    [[VSET_LANE:%.*]] = insertelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], double [[VMULXD_F64_I]], i32 0
 // CHECK-NEXT:    ret <1 x double> [[VSET_LANE]]
 //
 float64x1_t test_vmulx_laneq_f64_2() {
diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
index ee1c1af53811d..b7f961e4ce15c 100644
--- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
@@ -26,19 +26,21 @@
 // CHECK-A64-NEXT:  entry:
 // CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16>
 // CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A64-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
-// CHECK-A64-NEXT:    ret <4 x float> [[TMP3]]
+// CHECK-A64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A64-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+// CHECK-A64-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
+// CHECK-A64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A64-NEXT:    ret <4 x float> [[TMP4]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvt_f32_bf16(
 // CHECK-A32-HARDFP-NEXT:  entry:
 // CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16>
 // CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP3]]
+// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP4]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvt_f32_bf16(
 // CHECK-A32-SOFTFP-NEXT:  entry:
@@ -47,10 +49,11 @@
 // CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x bfloat>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <4 x i16>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP5]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP6]]
+// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP6]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP7]]
 //
 float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
   return vcvt_f32_bf16(a);
@@ -61,20 +64,22 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
 // CHECK-A64-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
 // CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A64-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A64-NEXT:    ret <4 x float> [[TMP3]]
+// CHECK-A64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A64-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+// CHECK-A64-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
+// CHECK-A64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A64-NEXT:    ret <4 x float> [[TMP4]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_f32_bf16(
 // CHECK-A32-HARDFP-NEXT:  entry:
 // CHECK-A32-HARDFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
 // CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP3]]
+// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP4]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_f32_bf16(
 // CHECK-A32-SOFTFP-NEXT:  entry:
@@ -90,10 +95,11 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
 // CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP8]] to <4 x i16>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = sext <4 x i16> [[TMP9]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP12]]
+// CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP12:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP13]]
 //
 float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
   return vcvtq_low_f32_bf16(a);
@@ -104,20 +110,22 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
 // CHECK-A64-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
 // CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A64-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A64-NEXT:    ret <4 x float> [[TMP3]]
+// CHECK-A64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A64-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+// CHECK-A64-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
+// CHECK-A64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A64-NEXT:    ret <4 x float> [[TMP4]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_f32_bf16(
 // CHECK-A32-HARDFP-NEXT:  entry:
 // CHECK-A32-HARDFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
 // CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP3]]
+// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP4]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_high_f32_bf16(
 // CHECK-A32-SOFTFP-NEXT:  entry:
@@ -133,10 +141,11 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
 // CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP8]] to <4 x i16>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = sext <4 x i16> [[TMP9]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP12]]
+// CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP12:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP13]]
 //
 float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
   return vcvtq_high_f32_bf16(a);
@@ -144,30 +153,33 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
 
 // CHECK-A64-LABEL: @test_vcvt_bf16_f32(
 // CHECK-A64-NEXT:  entry:
-// CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT:    [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
-// CHECK-A64-NEXT:    [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A64-NEXT:    [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <16 x i8>
-// CHECK-A64-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A64-NEXT:    ret <4 x bfloat> [[SHUFFLE_I]]
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-A64-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-A64-NEXT:    [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x bfloat>
+// CHECK-A64-NEXT:    ret <4 x bfloat> [[TMP3]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
 // CHECK-A32-HARDFP-NEXT:  entry:
-// CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
+// CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]])
 // CHECK-A32-HARDFP-NEXT:    ret <4 x bfloat> [[VCVTFP2BF1_I]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvt_bf16_f32(
 // CHECK-A32-SOFTFP-NEXT:  entry:
-// CHECK-A32-SOFTFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
-// CHECK-A32-SOFTFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = bitcast <4 x bfloat> [[TMP1]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    ret <2 x i32> [[TMP6]]
+// CHECK-A32-SOFTFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]])
+// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = bitcast <4 x bfloat> [[TMP4]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    ret <2 x i32> [[TMP7]]
 //
 bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
   return vcvt_bf16_f32(a);
@@ -175,36 +187,42 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
 
 // CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
 // CHECK-A64-NEXT:  entry:
-// CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT:    [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
-// CHECK-A64-NEXT:    [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A64-NEXT:    [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <16 x i8>
-// CHECK-A64-NEXT:    ret <8 x bfloat> [[TMP2]]
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-A64-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-A64-NEXT:    [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x bfloat>
+// CHECK-A64-NEXT:    [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP3]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT:    ret <8 x bfloat> [[TMP4]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
 // CHECK-A32-HARDFP-NEXT:  entry:
-// CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
+// CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]])
 // CHECK-A32-HARDFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 // CHECK-A32-HARDFP-NEXT:    ret <8 x bfloat> [[SHUFFLE_I]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_bf16_f32(
 // CHECK-A32-SOFTFP-NEXT:  entry:
-// CHECK-A32-SOFTFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
-// CHECK-A32-SOFTFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = bitcast <4 x bfloat> [[TMP1]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> zeroinitializer to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP5]], <4 x bfloat> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]])
+// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = bitcast <4 x bfloat> zeroinitializer to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP4]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP6]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP7]], <4 x bfloat> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I]] to <4 x i32>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <8 x bfloat>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = bitcast <8 x bfloat> [[TMP10]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    ret <4 x i32> [[TMP11]]
+// CHECK-A32-SOFTFP-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP13:%.*]] = bitcast <8 x bfloat> [[TMP12]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    ret <4 x i32> [[TMP13]]
 //
 bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
   return vcvtq_low_bf16_f32(a);
@@ -212,18 +230,23 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
 
 // CHECK-A64-LABEL: @test_vcvtq_high_bf16_f32(
 // CHECK-A64-NEXT:  entry:
-// CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
-// CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT:    [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A64-NEXT:    [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
-// CHECK-A64-NEXT:    [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A64-NEXT:    [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[TMP4]] to <16 x i8>
-// CHECK-A64-NEXT:    ret <8 x bfloat> [[TMP4]]
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <8 x i16>
+// CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-A64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-A64-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x bfloat>
+// CHECK-A64-NEXT:    [[TMP5:%.*]] = shufflevector <8 x bfloat> [[TMP4]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A64-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-A64-NEXT:    [[TMP7:%.*]] = fptrunc <4 x float> [[TMP6]] to <4 x bfloat>
+// CHECK-A64-NEXT:    [[TMP8:%.*]] = shufflevector <4 x bfloat> [[TMP5]], <4 x bfloat> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT:    ret <8 x bfloat> [[TMP8]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
 // CHECK-A32-HARDFP-NEXT:  entry:
-// CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
+// CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]])
 // CHECK-A32-HARDFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[INACTIVE:%.*]], <8 x bfloat> [[INACTIVE]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-A32-HARDFP-NEXT:    [[SHUFFLE_I8:%.*]] = shufflevector <4 x bfloat> [[VCVTFP2BF1_I]], <4 x bfloat> [[SHUFFLE_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 // CHECK-A32-HARDFP-NEXT:    ret <8 x bfloat> [[SHUFFLE_I8]]
@@ -233,27 +256,29 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
 // CHECK-A32-SOFTFP-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[INACTIVE_COERCE:%.*]] to <8 x bfloat>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP1:%.*]] = bitcast <8 x bfloat> [[TMP0]] to <4 x i32>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
-// CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = bitcast <4 x bfloat> [[TMP4]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP8]], <8 x bfloat> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP12:%.*]] = bitcast <4 x bfloat> [[TMP10]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP13:%.*]] = bitcast <2 x i32> [[TMP11]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]])
+// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP9]], <8 x bfloat> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP10:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP12:%.*]] = bitcast <4 x bfloat> [[TMP7]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP13:%.*]] = bitcast <4 x bfloat> [[TMP11]] to <2 x i32>
 // CHECK-A32-SOFTFP-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP12]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[TMP13]], <4 x bfloat> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP15:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I17]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> [[TMP15]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP17:%.*]] = bitcast <8 x bfloat> [[TMP16]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP18:%.*]] = bitcast <4 x i32> [[TMP17]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP19:%.*]] = bitcast <8 x bfloat> [[TMP18]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    ret <4 x i32> [[TMP19]]
+// CHECK-A32-SOFTFP-NEXT:    [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[TMP14]], <4 x bfloat> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP16:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I17]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP17:%.*]] = bitcast <4 x i32> [[TMP16]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP18:%.*]] = bitcast <8 x bfloat> [[TMP17]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP19:%.*]] = bitcast <4 x i32> [[TMP18]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP20:%.*]] = bitcast <8 x bfloat> [[TMP19]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    ret <4 x i32> [[TMP20]]
 //
 bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {
   return vcvtq_high_bf16_f32(inactive, a);
@@ -281,7 +306,7 @@ bfloat16_t test_vcvth_bf16_f32(float32_t a) {
 // CHECK-LABEL: @test_vcvtah_f32_bf16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast bfloat [[A:%.*]] to i16
-// CHECK-NEXT:    [[CONV_I:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK-NEXT:    [[CONV_I:%.*]] = zext i16 [[TMP0]] to i32
 // CHECK-NEXT:    [[SHL_I:%.*]] = shl i32 [[CONV_I]], 16
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[SHL_I]] to float
 // CHECK-NEXT:    ret float [[TMP1]]