[clang] [llvm] [SROA] Refactor rewritePartition alloca type selection process (PR #167771)
Yonah Goldberg via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 9 16:47:10 PST 2025
https://github.com/YonahGoldberg updated https://github.com/llvm/llvm-project/pull/167771
>From 53ec55fa11d0ff7d67dfde04c43fbf668d12b1e4 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 12 Nov 2025 21:53:29 +0000
Subject: [PATCH 01/41] prefer integer partitions
---
llvm/lib/Transforms/Scalar/SROA.cpp | 6 ++++--
.../Transforms/SROA/prefer-integer-partition.ll | 17 +++++++++++++++++
2 files changed, 21 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/Transforms/SROA/prefer-integer-partition.ll
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 3a70830cf8c0e..9a6b2396f2e7a 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5213,8 +5213,10 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
SliceTy = TypePartitionTy;
// If still not, can we use the largest bitwidth integer type used?
- if (!SliceTy && CommonUseTy.second)
- if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size())
+ // If SliceTy is a non-promotable aggregate, prefer to represent as an integer type
+ // because it's more likely to be promotable.
+ if ((!SliceTy || !SliceTy->isSingleValueType()) && CommonUseTy.second)
+ if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) {
SliceTy = CommonUseTy.second;
if ((!SliceTy || (SliceTy->isArrayTy() &&
SliceTy->getArrayElementType()->isIntegerTy())) &&
diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
new file mode 100644
index 0000000000000..3606af8debd69
--- /dev/null
+++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=sroa -S | FileCheck %s
+
+; Ensure that the [2 x half] alloca is spanned by an i32 partition.
+
+define void @test() {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 42 to float
+; CHECK-NEXT: ret void
+;
+entry:
+ %alloca = alloca [2 x half]
+ store i32 42, ptr %alloca
+ %val = load float, ptr %alloca
+ ret void
+}
>From 5779a34cb19f91b7fcaac2c2715a9b4ca9c540c4 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 12 Nov 2025 22:16:38 +0000
Subject: [PATCH 02/41] format
---
llvm/lib/Transforms/Scalar/SROA.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 9a6b2396f2e7a..9f9d564c1cdf4 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5213,8 +5213,8 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
SliceTy = TypePartitionTy;
// If still not, can we use the largest bitwidth integer type used?
- // If SliceTy is a non-promotable aggregate, prefer to represent as an integer type
- // because it's more likely to be promotable.
+ // If SliceTy is a non-promotable aggregate, prefer to represent as an integer
+ // type because it's more likely to be promotable.
if ((!SliceTy || !SliceTy->isSingleValueType()) && CommonUseTy.second)
if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) {
SliceTy = CommonUseTy.second;
>From fbdea1e64c9acd13ce24fda0cd23347d7327ba94 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 13 Nov 2025 01:06:26 +0000
Subject: [PATCH 03/41] julia fix
---
llvm/lib/Transforms/Scalar/SROA.cpp | 6 ++--
.../SROA/prefer-integer-partition.ll | 36 +++++++++++++++++--
2 files changed, 37 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 9f9d564c1cdf4..bd9fdcc9fee1a 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5218,8 +5218,10 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
if ((!SliceTy || !SliceTy->isSingleValueType()) && CommonUseTy.second)
if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) {
SliceTy = CommonUseTy.second;
- if ((!SliceTy || (SliceTy->isArrayTy() &&
- SliceTy->getArrayElementType()->isIntegerTy())) &&
+ SliceVecTy = dyn_cast<VectorType>(SliceTy);
+ }
+ // Try representing the partition as a legal integer type of the same size as the alloca.
+ if ((!SliceTy || SliceTy->isArrayTy()) &&
DL.isLegalInteger(P.size() * 8)) {
SliceTy = Type::getIntNTy(*C, P.size() * 8);
}
diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
index 3606af8debd69..0ed400f18bc37 100644
--- a/llvm/test/Transforms/SROA/prefer-integer-partition.ll
+++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
@@ -1,10 +1,35 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=sroa -S | FileCheck %s
-; Ensure that the [2 x half] alloca is spanned by an i32 partition.
+; Test that SROA converts array types to integer types for promotion.
-define void @test() {
-; CHECK-LABEL: @test(
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32-ni:10:11:12:13"
+
+define void @test_float_array_only_intrinsics() {
+; CHECK-LABEL: @test_float_array_only_intrinsics(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret void
+;
+entry:
+ %src = alloca [2 x float], align 4
+ %dst = alloca [2 x float], align 4
+
+ ; Initialize src
+ call void @llvm.lifetime.start.p0(i64 8, ptr %src)
+ call void @llvm.lifetime.start.p0(i64 8, ptr %dst)
+
+ ; Only intrinsic uses - no scalar loads/stores to establish common type
+ call void @llvm.memset.p0.i64(ptr %src, i8 42, i64 8, i1 false)
+ call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 8, i1 false)
+ call void @llvm.memcpy.p0.p0.i64(ptr %src, ptr %dst, i64 8, i1 false)
+
+ call void @llvm.lifetime.end.p0(i64 8, ptr %dst)
+ call void @llvm.lifetime.end.p0(i64 8, ptr %src)
+ ret void
+}
+
+define void @test_mixed_types() {
+; CHECK-LABEL: @test_mixed_types(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 42 to float
; CHECK-NEXT: ret void
@@ -15,3 +40,8 @@ entry:
%val = load float, ptr %alloca
ret void
}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
>From f9034d3d05fe80ef26bedbed7d977cba4421c551 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 13 Nov 2025 01:14:32 +0000
Subject: [PATCH 04/41] format
---
llvm/lib/Transforms/Scalar/SROA.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index bd9fdcc9fee1a..981bb41eba2ce 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5220,9 +5220,9 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
SliceTy = CommonUseTy.second;
SliceVecTy = dyn_cast<VectorType>(SliceTy);
}
- // Try representing the partition as a legal integer type of the same size as the alloca.
- if ((!SliceTy || SliceTy->isArrayTy()) &&
- DL.isLegalInteger(P.size() * 8)) {
+ // Try representing the partition as a legal integer type of the same size as
+ // the alloca.
+ if ((!SliceTy || SliceTy->isArrayTy()) && DL.isLegalInteger(P.size() * 8)) {
SliceTy = Type::getIntNTy(*C, P.size() * 8);
}
>From dce0066aaff2fe33a5bf20102e682233a115d138 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 13 Nov 2025 01:21:45 +0000
Subject: [PATCH 05/41] remove comment
---
llvm/test/Transforms/SROA/prefer-integer-partition.ll | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
index 0ed400f18bc37..78d3b22d3fdc0 100644
--- a/llvm/test/Transforms/SROA/prefer-integer-partition.ll
+++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
@@ -14,7 +14,6 @@ entry:
%src = alloca [2 x float], align 4
%dst = alloca [2 x float], align 4
- ; Initialize src
call void @llvm.lifetime.start.p0(i64 8, ptr %src)
call void @llvm.lifetime.start.p0(i64 8, ptr %dst)
>From 19033e91ac1031a91c5345c114bc05ff52643538 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 19 Nov 2025 22:48:36 +0000
Subject: [PATCH 06/41] test
---
llvm/test/Transforms/SROA/prefer-integer-partition.ll | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
index 78d3b22d3fdc0..b9a7af6276565 100644
--- a/llvm/test/Transforms/SROA/prefer-integer-partition.ll
+++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
@@ -13,15 +13,15 @@ define void @test_float_array_only_intrinsics() {
entry:
%src = alloca [2 x float], align 4
%dst = alloca [2 x float], align 4
-
+
call void @llvm.lifetime.start.p0(i64 8, ptr %src)
call void @llvm.lifetime.start.p0(i64 8, ptr %dst)
-
+
; Only intrinsic uses - no scalar loads/stores to establish common type
call void @llvm.memset.p0.i64(ptr %src, i8 42, i64 8, i1 false)
call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 8, i1 false)
call void @llvm.memcpy.p0.p0.i64(ptr %src, ptr %dst, i64 8, i1 false)
-
+
call void @llvm.lifetime.end.p0(i64 8, ptr %dst)
call void @llvm.lifetime.end.p0(i64 8, ptr %src)
ret void
>From 9143867ac80a8108bce4aa2cc414117e0ec83aa3 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 25 Nov 2025 18:22:17 +0000
Subject: [PATCH 07/41] solve regression
---
llvm/lib/Transforms/Scalar/SROA.cpp | 28 ++++++++++++++++++++++------
1 file changed, 22 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 981bb41eba2ce..c58f73455e68c 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5230,12 +5230,28 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size());
- bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
-
- VectorType *VecTy =
- IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale);
- if (VecTy)
- SliceTy = VecTy;
+ // Prefer vector promotion over integer widening for floating-point vectors
+ // because it is more likely the user is just accessing whole vector elements
+ // and not doing bitwise arithmetic.
+ bool PreferVectorPromotion = false;
+ if (auto *FixedVecSliceTy = dyn_cast<FixedVectorType>(SliceTy))
+ PreferVectorPromotion = FixedVecSliceTy->getElementType()->isFloatingPointTy();
+
+ bool IsIntegerPromotable = false;
+ VectorType *VecTy = nullptr;
+
+ if (PreferVectorPromotion) {
+ // For float vectors, try vector promotion first
+ VecTy = isVectorPromotionViable(P, DL, VScale);
+ if (!VecTy)
+ IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
+ } else {
+ // For integer vectors (especially small integers like i8), try integer
+ // widening first as InstCombine can optimize the resulting operations
+ IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
+ if (!IsIntegerPromotable)
+ VecTy = isVectorPromotionViable(P, DL, VScale);
+ }
// Check for the case where we're going to rewrite to a new alloca of the
// exact same type as the original, and with the same access offsets. In that
>From e7a51308e1df613478979579a89c0e24ad4db986 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 26 Nov 2025 08:06:00 +0000
Subject: [PATCH 08/41] updated to fix regression
---
llvm/lib/Transforms/Scalar/SROA.cpp | 116 +++++++++++++---------------
1 file changed, 53 insertions(+), 63 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index c58f73455e68c..20d907b4e071a 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5191,67 +5191,57 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
/// promoted.
AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
Partition &P) {
- // Try to compute a friendly type for this partition of the alloca. This
- // won't always succeed, in which case we fall back to a legal integer type
- // or an i8 array of an appropriate size.
- Type *SliceTy = nullptr;
const DataLayout &DL = AI.getDataLayout();
- unsigned VScale = AI.getFunction()->getVScaleValue();
-
- std::pair<Type *, IntegerType *> CommonUseTy =
- findCommonType(P.begin(), P.end(), P.endOffset());
- // Do all uses operate on the same type?
- if (CommonUseTy.first) {
- TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first);
- if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size())
- SliceTy = CommonUseTy.first;
- }
- // If not, can we find an appropriate subtype in the original allocated type?
- if (!SliceTy)
- if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
- P.beginOffset(), P.size()))
- SliceTy = TypePartitionTy;
-
- // If still not, can we use the largest bitwidth integer type used?
- // If SliceTy is a non-promotable aggregate, prefer to represent as an integer
- // type because it's more likely to be promotable.
- if ((!SliceTy || !SliceTy->isSingleValueType()) && CommonUseTy.second)
- if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) {
- SliceTy = CommonUseTy.second;
- SliceVecTy = dyn_cast<VectorType>(SliceTy);
+ auto ComputePartitionTy = [&]() -> std::tuple<Type *, bool, VectorType *> {
+ // First check if the partition is viable for vector promotion. If it is
+ // via a floating-point vector, we are done because we would never prefer integer widening.
+ VectorType *VecTy = isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue());
+ if (VecTy) {
+ if (VecTy->getElementType()->isFloatingPointTy()) {
+ return {VecTy, false, VecTy};
+ }
}
- // Try representing the partition as a legal integer type of the same size as
- // the alloca.
- if ((!SliceTy || SliceTy->isArrayTy()) && DL.isLegalInteger(P.size() * 8)) {
- SliceTy = Type::getIntNTy(*C, P.size() * 8);
- }
-
- if (!SliceTy)
- SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
- assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size());
-
- // Prefer vector promotion over integer widening for floating-point vectors
- // because it is more likely the user is just accessing whole vector elements
- // and not doing bitwise arithmetic.
- bool PreferVectorPromotion = false;
- if (auto *FixedVecSliceTy = dyn_cast<FixedVectorType>(SliceTy))
- PreferVectorPromotion = FixedVecSliceTy->getElementType()->isFloatingPointTy();
-
- bool IsIntegerPromotable = false;
- VectorType *VecTy = nullptr;
-
- if (PreferVectorPromotion) {
- // For float vectors, try vector promotion first
- VecTy = isVectorPromotionViable(P, DL, VScale);
- if (!VecTy)
- IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
- } else {
- // For integer vectors (especially small integers like i8), try integer
- // widening first as InstCombine can optimize the resulting operations
- IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
- if (!IsIntegerPromotable)
- VecTy = isVectorPromotionViable(P, DL, VScale);
- }
+
+ // Otherwise, check if there is a common type that all slices of the
+ // partition use. Collect the largest integer type used as a backup.
+ auto CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset());
+ // If there is a common type that spans the partition, use it.
+ if (CommonUseTy.first) {
+ TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first);
+ if (CommonUseSize.isFixed() &&
+ CommonUseSize.getFixedValue() >= P.size()) {
+
+ if (VecTy)
+ return {VecTy, false, VecTy};
+ return {CommonUseTy.first, isIntegerWideningViable(P, CommonUseTy.first, DL), nullptr};
+ }
+ }
+
+ // If not, can we find an appropriate subtype in the original allocated type?
+ if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), P.beginOffset(), P.size())) {
+ if (TypePartitionTy->isArrayTy() && TypePartitionTy->getArrayElementType()->isIntegerTy() && DL.isLegalInteger(P.size() * 8))
+ TypePartitionTy = Type::getIntNTy(*C, P.size() * 8);
+
+ if (isIntegerWideningViable(P, TypePartitionTy, DL))
+ return {TypePartitionTy, true, nullptr};
+ if (VecTy)
+ return {VecTy, false, VecTy};
+ if (CommonUseTy.second && DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size() && isIntegerWideningViable(P, CommonUseTy.second, DL))
+ return {CommonUseTy.second, true, nullptr};
+ return {TypePartitionTy, false, nullptr};
+ }
+
+ // If still not, can we use the largest bitwidth integer type used?
+ if (CommonUseTy.second && DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size())
+ return {CommonUseTy.second, false, nullptr};
+
+ if (DL.isLegalInteger(P.size() * 8))
+ return {Type::getIntNTy(*C, P.size() * 8), false, nullptr};
+
+ return {ArrayType::get(Type::getInt8Ty(*C), P.size()), false, nullptr};
+ };
+
+ auto [PartitionTy, IsIntegerPromotable, VecTy] = ComputePartitionTy();
// Check for the case where we're going to rewrite to a new alloca of the
// exact same type as the original, and with the same access offsets. In that
@@ -5260,7 +5250,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// P.beginOffset() can be non-zero even with the same type in a case with
// out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
AllocaInst *NewAI;
- if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
+ if (PartitionTy == AI.getAllocatedType() && P.beginOffset() == 0) {
NewAI = &AI;
// FIXME: We should be able to bail at this point with "nothing changed".
// FIXME: We might want to defer PHI speculation until after here.
@@ -5270,10 +5260,10 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
// If we will get at least this much alignment from the type alone, leave
// the alloca's alignment unconstrained.
- const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
+ const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(PartitionTy);
NewAI = new AllocaInst(
- SliceTy, AI.getAddressSpace(), nullptr,
- IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
+ PartitionTy, AI.getAddressSpace(), nullptr,
+ IsUnconstrained ? DL.getPrefTypeAlign(PartitionTy) : Alignment,
AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
AI.getIterator());
// Copy the old AI debug location over to the new one.
>From aa2e68bbafd0187ee58cb3a48f13a571feec1551 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 26 Nov 2025 08:10:43 +0000
Subject: [PATCH 09/41] remove julia test
---
.../SROA/prefer-integer-partition.ll | 82 +++++++++++++------
1 file changed, 56 insertions(+), 26 deletions(-)
diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
index b9a7af6276565..5b639169cc207 100644
--- a/llvm/test/Transforms/SROA/prefer-integer-partition.ll
+++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
@@ -1,30 +1,65 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=sroa -S | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt %s -passes=sroa -S | FileCheck %s
-; Test that SROA converts array types to integer types for promotion.
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
-target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32-ni:10:11:12:13"
+%"struct.pbrt::RaySamples" = type { %struct.anon.45, %struct.anon.46, i8, %struct.anon.47 }
+%struct.anon.45 = type { %"class.pbrt::Point2", float }
+%"class.pbrt::Point2" = type { %"class.pbrt::Tuple2" }
+%"class.pbrt::Tuple2" = type { float, float }
+%struct.anon.46 = type { float, float, %"class.pbrt::Point2" }
+%struct.anon.47 = type { float, %"class.pbrt::Point2" }
-define void @test_float_array_only_intrinsics() {
-; CHECK-LABEL: @test_float_array_only_intrinsics(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: ret void
+define <2 x float> @subsurface_test() local_unnamed_addr {
+; CHECK-LABEL: define <2 x float> @subsurface_test() local_unnamed_addr {
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr inttoptr (i64 12 to ptr), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fptosi float [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i1
+; CHECK-NEXT: br i1 [[TMP3]], label %[[BB4:.*]], label %[[_ZNK4PBRT3SOAINS_10RAYSAMPLESEEIXEI_EXIT:.*]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: [[TMP5:%.*]] = load volatile { <2 x float>, <2 x float> }, ptr null, align 8
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP5]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP5]], 1
+; CHECK-NEXT: [[BC_I:%.*]] = bitcast <2 x float> [[TMP6]] to <2 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[BC_I]], i64 1
+; CHECK-NEXT: [[BC2_I:%.*]] = bitcast <2 x float> [[TMP7]] to <2 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[BC2_I]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP8]] to float
+; CHECK-NEXT: [[DOTSROA_1_36_VEC_INSERT:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP9]] to float
+; CHECK-NEXT: [[DOTSROA_1_40_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_1_36_VEC_INSERT]], float [[TMP11]], i32 1
+; CHECK-NEXT: br label %[[_ZNK4PBRT3SOAINS_10RAYSAMPLESEEIXEI_EXIT]]
+; CHECK: [[_ZNK4PBRT3SOAINS_10RAYSAMPLESEEIXEI_EXIT]]:
+; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[DOTSROA_1_40_VEC_INSERT]], %[[BB4]] ], [ zeroinitializer, [[TMP0:%.*]] ]
+; CHECK-NEXT: ret <2 x float> [[TMP10]]
;
-entry:
- %src = alloca [2 x float], align 4
- %dst = alloca [2 x float], align 4
+ %1 = alloca %"struct.pbrt::RaySamples", align 4
+ %2 = getelementptr i8, ptr %1, i64 36
+ store i64 0, ptr %2, align 4
+ %3 = load float, ptr inttoptr (i64 12 to ptr), align 4
+ %4 = fptosi float %3 to i32
+ %5 = trunc i32 %4 to i1
+ br i1 %5, label %6, label %_ZNK4pbrt3SOAINS_10RaySamplesEEixEi.exit
- call void @llvm.lifetime.start.p0(i64 8, ptr %src)
- call void @llvm.lifetime.start.p0(i64 8, ptr %dst)
+6: ; preds = %0
+ %7 = load volatile { <2 x float>, <2 x float> }, ptr null, align 8
+ %8 = extractvalue { <2 x float>, <2 x float> } %7, 0
+ %9 = extractvalue { <2 x float>, <2 x float> } %7, 1
+ store float 0.000000e+00, ptr %1, align 4
+ %bc.i = bitcast <2 x float> %8 to <2 x i32>
+ %10 = extractelement <2 x i32> %bc.i, i64 1
+ %bc2.i = bitcast <2 x float> %9 to <2 x i32>
+ %11 = extractelement <2 x i32> %bc2.i, i64 0
+ store i32 %10, ptr %2, align 4
+ %.sroa_idx1.i = getelementptr i8, ptr %1, i64 40
+ store i32 %11, ptr %.sroa_idx1.i, align 4
+ br label %_ZNK4pbrt3SOAINS_10RaySamplesEEixEi.exit
- ; Only intrinsic uses - no scalar loads/stores to establish common type
- call void @llvm.memset.p0.i64(ptr %src, i8 42, i64 8, i1 false)
- call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 8, i1 false)
- call void @llvm.memcpy.p0.p0.i64(ptr %src, ptr %dst, i64 8, i1 false)
-
- call void @llvm.lifetime.end.p0(i64 8, ptr %dst)
- call void @llvm.lifetime.end.p0(i64 8, ptr %src)
- ret void
+_ZNK4pbrt3SOAINS_10RaySamplesEEixEi.exit: ; preds = %0, %6
+ %12 = getelementptr inbounds nuw i8, ptr %1, i64 36
+ %.sroa.01.0.copyload = load <2 x float>, ptr %12, align 4
+ ret <2 x float> %.sroa.01.0.copyload
}
define void @test_mixed_types() {
@@ -39,8 +74,3 @@ entry:
%val = load float, ptr %alloca
ret void
}
-
-declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
-declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
>From bab7209226a67e0dd828b3e6d6e264494d1a1f8e Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 26 Nov 2025 08:10:51 +0000
Subject: [PATCH 10/41] format
---
llvm/lib/Transforms/Scalar/SROA.cpp | 28 +++++++++++++++++++---------
1 file changed, 19 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 20d907b4e071a..a767fcdd1dc96 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5194,8 +5194,10 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
const DataLayout &DL = AI.getDataLayout();
auto ComputePartitionTy = [&]() -> std::tuple<Type *, bool, VectorType *> {
// First check if the partition is viable for vector promotion. If it is
- // via a floating-point vector, we are done because we would never prefer integer widening.
- VectorType *VecTy = isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue());
+ // via a floating-point vector, we are done because we would never prefer
+ // integer widening.
+ VectorType *VecTy =
+ isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue());
if (VecTy) {
if (VecTy->getElementType()->isFloatingPointTy()) {
return {VecTy, false, VecTy};
@@ -5213,26 +5215,34 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
if (VecTy)
return {VecTy, false, VecTy};
- return {CommonUseTy.first, isIntegerWideningViable(P, CommonUseTy.first, DL), nullptr};
+ return {CommonUseTy.first,
+ isIntegerWideningViable(P, CommonUseTy.first, DL), nullptr};
}
}
- // If not, can we find an appropriate subtype in the original allocated type?
- if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), P.beginOffset(), P.size())) {
- if (TypePartitionTy->isArrayTy() && TypePartitionTy->getArrayElementType()->isIntegerTy() && DL.isLegalInteger(P.size() * 8))
+ // If not, can we find an appropriate subtype in the original allocated
+ // type?
+ if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
+ P.beginOffset(), P.size())) {
+ if (TypePartitionTy->isArrayTy() &&
+ TypePartitionTy->getArrayElementType()->isIntegerTy() &&
+ DL.isLegalInteger(P.size() * 8))
TypePartitionTy = Type::getIntNTy(*C, P.size() * 8);
-
+
if (isIntegerWideningViable(P, TypePartitionTy, DL))
return {TypePartitionTy, true, nullptr};
if (VecTy)
return {VecTy, false, VecTy};
- if (CommonUseTy.second && DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size() && isIntegerWideningViable(P, CommonUseTy.second, DL))
+ if (CommonUseTy.second &&
+ DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size() &&
+ isIntegerWideningViable(P, CommonUseTy.second, DL))
return {CommonUseTy.second, true, nullptr};
return {TypePartitionTy, false, nullptr};
}
// If still not, can we use the largest bitwidth integer type used?
- if (CommonUseTy.second && DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size())
+ if (CommonUseTy.second &&
+ DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size())
return {CommonUseTy.second, false, nullptr};
if (DL.isLegalInteger(P.size() * 8))
>From 76ea73494276c94ef3721f26ac95b501c23806d3 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 26 Nov 2025 21:00:40 +0000
Subject: [PATCH 11/41] improvements on arm
---
.../AArch64/neon-scalar-x-indexed-elem.c | 98 +++-----
.../CodeGen/arm-bf16-convert-intrinsics.c | 237 ++++++++----------
2 files changed, 143 insertions(+), 192 deletions(-)
diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
index 9b98126500444..2b1af62789eac 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
@@ -111,8 +111,8 @@ float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[A]], i32 0
-// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <1 x double> [[B]], i32 0
-// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE4]])
+// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <1 x double> [[B]], i32 0
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE3]])
// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[A]], double [[VMULXD_F64_I]], i32 0
// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
//
@@ -196,19 +196,13 @@ float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
// CHECK-LABEL: define dso_local <1 x double> @test_vfma_lane_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
-// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
-// CHECK-NEXT: [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
// CHECK-NEXT: ret <1 x double> [[FMLA2]]
//
@@ -219,20 +213,14 @@ float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
// CHECK-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
// CHECK-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64
-// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
-// CHECK-NEXT: [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
// CHECK-NEXT: ret <1 x double> [[FMLA2]]
//
@@ -243,21 +231,16 @@ float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
// CHECK-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
-// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
-// CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP10]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP7]]
//
float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
return vfma_laneq_f64(a, b, v, 0);
@@ -266,22 +249,17 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
// CHECK-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
// CHECK-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64
-// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
-// CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP10]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP7]]
//
float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
return vfms_laneq_f64(a, b, v, 0);
@@ -555,8 +533,8 @@ int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
-// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
-// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE9]])
+// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE8]])
// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
//
diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
index 65a23dc0325c8..ee1c1af53811d 100644
--- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
@@ -26,21 +26,19 @@
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16>
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-A64-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-// CHECK-A64-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
-// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
-// CHECK-A64-NEXT: ret <4 x float> [[TMP4]]
+// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvt_f32_bf16(
// CHECK-A32-HARDFP-NEXT: entry:
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16>
// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP4]]
+// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvt_f32_bf16(
// CHECK-A32-SOFTFP-NEXT: entry:
@@ -49,11 +47,10 @@
// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <4 x i16>
// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP6]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP7]]
+// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP5]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP6]]
//
float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
return vcvt_f32_bf16(a);
@@ -64,22 +61,20 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-A64-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
-// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A64-NEXT: ret <4 x float> [[TMP4]]
+// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_f32_bf16(
// CHECK-A32-HARDFP-NEXT: entry:
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP4]]
+// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_f32_bf16(
// CHECK-A32-SOFTFP-NEXT: entry:
@@ -95,11 +90,10 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP8]] to <4 x i16>
// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP13]]
+// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = sext <4 x i16> [[TMP9]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP12]]
//
float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
return vcvtq_low_f32_bf16(a);
@@ -110,22 +104,20 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-A64-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
-// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A64-NEXT: ret <4 x float> [[TMP4]]
+// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_f32_bf16(
// CHECK-A32-HARDFP-NEXT: entry:
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP4]]
+// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvtq_high_f32_bf16(
// CHECK-A32-SOFTFP-NEXT: entry:
@@ -141,11 +133,10 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP8]] to <4 x i16>
// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP13]]
+// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = sext <4 x i16> [[TMP9]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP12]]
//
float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
return vcvtq_high_f32_bf16(a);
@@ -153,33 +144,30 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
// CHECK-A64-LABEL: @test_vcvt_bf16_f32(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x bfloat>
-// CHECK-A64-NEXT: ret <4 x bfloat> [[TMP3]]
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <16 x i8>
+// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]])
+// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
// CHECK-A32-HARDFP-NEXT: ret <4 x bfloat> [[VCVTFP2BF1_I]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvt_bf16_f32(
// CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]])
-// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[TMP4]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: ret <2 x i32> [[TMP7]]
+// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
+// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[TMP1]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: ret <2 x i32> [[TMP6]]
//
bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
return vcvt_bf16_f32(a);
@@ -187,44 +175,36 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
// CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x bfloat>
-// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP3]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]]
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <16 x i8>
+// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP2]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast i64 0 to <4 x bfloat>
-// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]])
-// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP0]], <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
+// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast i64 0 to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]])
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
+// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[TMP1]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP0]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP6]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP8]], <4 x bfloat> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <8 x bfloat> [[TMP11]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[TMP12]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP14]]
+// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> zeroinitializer to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP5]], <4 x bfloat> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <8 x bfloat> [[TMP10]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP11]]
//
bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
return vcvtq_low_bf16_f32(a);
@@ -232,23 +212,18 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
// CHECK-A64-LABEL: @test_vcvtq_high_bf16_f32(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <8 x i16>
-// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x bfloat>
-// CHECK-A64-NEXT: [[TMP5:%.*]] = shufflevector <8 x bfloat> [[TMP4]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-A64-NEXT: [[TMP7:%.*]] = fptrunc <4 x float> [[TMP6]] to <4 x bfloat>
-// CHECK-A64-NEXT: [[TMP8:%.*]] = shufflevector <4 x bfloat> [[TMP5]], <4 x bfloat> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP8]]
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[TMP4]] to <16 x i8>
+// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]])
+// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[INACTIVE:%.*]], <8 x bfloat> [[INACTIVE]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I8:%.*]] = shufflevector <4 x bfloat> [[VCVTFP2BF1_I]], <4 x bfloat> [[SHUFFLE_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I8]]
@@ -258,29 +233,27 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[INACTIVE_COERCE:%.*]] to <8 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[TMP0]] to <4 x i32>
// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]])
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP9]], <8 x bfloat> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x bfloat> [[TMP7]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x bfloat> [[TMP11]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
+// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[TMP4]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP8]], <8 x bfloat> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x bfloat> [[TMP10]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[TMP11]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP12]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[TMP14]], <4 x bfloat> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-SOFTFP-NEXT: [[TMP16:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I17]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP17:%.*]] = bitcast <4 x i32> [[TMP16]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP18:%.*]] = bitcast <8 x bfloat> [[TMP17]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP18]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP20:%.*]] = bitcast <8 x bfloat> [[TMP19]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP20]]
+// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[TMP13]], <4 x bfloat> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-SOFTFP-NEXT: [[TMP15:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I17]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP15]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP17:%.*]] = bitcast <8 x bfloat> [[TMP16]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP18:%.*]] = bitcast <4 x i32> [[TMP17]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP19:%.*]] = bitcast <8 x bfloat> [[TMP18]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP19]]
//
bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {
return vcvtq_high_bf16_f32(inactive, a);
@@ -308,7 +281,7 @@ bfloat16_t test_vcvth_bf16_f32(float32_t a) {
// CHECK-LABEL: @test_vcvtah_f32_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast bfloat [[A:%.*]] to i16
-// CHECK-NEXT: [[CONV_I:%.*]] = zext i16 [[TMP0]] to i32
+// CHECK-NEXT: [[CONV_I:%.*]] = sext i16 [[TMP0]] to i32
// CHECK-NEXT: [[SHL_I:%.*]] = shl i32 [[CONV_I]], 16
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[SHL_I]] to float
// CHECK-NEXT: ret float [[TMP1]]
>From c81564c5d06ee2442866b6547340bae4f8c6c794 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 26 Nov 2025 21:50:11 +0000
Subject: [PATCH 12/41] arm changes
---
.../AArch64/neon-scalar-x-indexed-elem.c | 120 +++++----
.../CodeGen/arm-bf16-convert-intrinsics.c | 231 ++++++++++--------
2 files changed, 199 insertions(+), 152 deletions(-)
diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
index 2b1af62789eac..a86a80a939b16 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
@@ -56,8 +56,8 @@ float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to double
// CHECK-NEXT: [[TMP1:%.*]] = fmul double [[TMP0]], [[B]]
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP2]]
+// CHECK-NEXT: [[REF_TMP_I_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double [[TMP1]], i32 0
+// CHECK-NEXT: ret <1 x double> [[REF_TMP_I_0_VEC_INSERT]]
//
float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
return vmul_n_f64(a, b);
@@ -111,8 +111,8 @@ float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[A]], i32 0
-// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <1 x double> [[B]], i32 0
-// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE3]])
+// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <1 x double> [[B]], i32 0
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE4]])
// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[A]], double [[VMULXD_F64_I]], i32 0
// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
//
@@ -196,13 +196,19 @@ float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
// CHECK-LABEL: define dso_local <1 x double> @test_vfma_lane_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
+// CHECK-NEXT: [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
// CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
// CHECK-NEXT: ret <1 x double> [[FMLA2]]
//
@@ -213,14 +219,20 @@ float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
// CHECK-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
// CHECK-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64
+// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
+// CHECK-NEXT: [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
// CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
// CHECK-NEXT: ret <1 x double> [[FMLA2]]
//
@@ -231,16 +243,21 @@ float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
// CHECK-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP7]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
+// CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP10]]
//
float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
return vfma_laneq_f64(a, b, v, 0);
@@ -249,17 +266,22 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
// CHECK-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
// CHECK-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP7]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64
+// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
+// CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP10]]
//
float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
return vfms_laneq_f64(a, b, v, 0);
@@ -530,12 +552,12 @@ int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_lane_f64_0(
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
-// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE8]])
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
+// CHECK-NEXT: [[__PROMOTE_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FD6304BC43AB5C2, i32 0
+// CHECK-NEXT: [[__PROMOTE2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FEE211E215AEEF3, i32 0
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], i32 0
+// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <1 x double> [[__PROMOTE2_SROA_0_0_VEC_INSERT]], i32 0
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE9]])
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], double [[VMULXD_F64_I]], i32 0
// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
//
float64x1_t test_vmulx_lane_f64_0() {
@@ -552,13 +574,13 @@ float64x1_t test_vmulx_lane_f64_0() {
// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_laneq_f64_2(
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
+// CHECK-NEXT: [[__PROMOTE_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FD6304BC43AB5C2, i32 0
+// CHECK-NEXT: [[__PROMOTE2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FEE211E215AEEF3, i32 0
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], <1 x double> [[__PROMOTE2_SROA_0_0_VEC_INSERT]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], i32 0
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[SHUFFLE_I]], i32 1
// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], double [[VMULXD_F64_I]], i32 0
// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
//
float64x1_t test_vmulx_laneq_f64_2() {
diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
index ee1c1af53811d..b7f961e4ce15c 100644
--- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
@@ -26,19 +26,21 @@
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16>
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A64-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
-// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+// CHECK-A64-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A64-NEXT: ret <4 x float> [[TMP4]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvt_f32_bf16(
// CHECK-A32-HARDFP-NEXT: entry:
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16>
// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP4]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvt_f32_bf16(
// CHECK-A32-SOFTFP-NEXT: entry:
@@ -47,10 +49,11 @@
// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <4 x i16>
// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP5]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP6]]
+// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP6]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP7]]
//
float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
return vcvt_f32_bf16(a);
@@ -61,20 +64,22 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A64-NEXT: ret <4 x float> [[TMP4]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_f32_bf16(
// CHECK-A32-HARDFP-NEXT: entry:
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP4]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_f32_bf16(
// CHECK-A32-SOFTFP-NEXT: entry:
@@ -90,10 +95,11 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP8]] to <4 x i16>
// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = sext <4 x i16> [[TMP9]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP12]]
+// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP13]]
//
float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
return vcvtq_low_f32_bf16(a);
@@ -104,20 +110,22 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A64-NEXT: ret <4 x float> [[TMP4]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_f32_bf16(
// CHECK-A32-HARDFP-NEXT: entry:
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP4]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvtq_high_f32_bf16(
// CHECK-A32-SOFTFP-NEXT: entry:
@@ -133,10 +141,11 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP8]] to <4 x i16>
// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = sext <4 x i16> [[TMP9]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
-// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP12]]
+// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP13]]
//
float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
return vcvtq_high_f32_bf16(a);
@@ -144,30 +153,33 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
// CHECK-A64-LABEL: @test_vcvt_bf16_f32(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <16 x i8>
-// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x bfloat>
+// CHECK-A64-NEXT: ret <4 x bfloat> [[TMP3]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
+// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]])
// CHECK-A32-HARDFP-NEXT: ret <4 x bfloat> [[VCVTFP2BF1_I]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvt_bf16_f32(
// CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
-// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[TMP1]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: ret <2 x i32> [[TMP6]]
+// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]])
+// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[TMP4]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: ret <2 x i32> [[TMP7]]
//
bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
return vcvt_bf16_f32(a);
@@ -175,36 +187,42 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
// CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <16 x i8>
-// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP2]]
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x bfloat>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP3]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
+// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]])
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
-// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[TMP1]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> zeroinitializer to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP5]], <4 x bfloat> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]])
+// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> zeroinitializer to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP4]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP6]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP7]], <4 x bfloat> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I]] to <4 x i32>
// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <8 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <8 x bfloat> [[TMP10]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP11]]
+// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <8 x bfloat> [[TMP12]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP13]]
//
bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
return vcvtq_low_bf16_f32(a);
@@ -212,18 +230,23 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
// CHECK-A64-LABEL: @test_vcvtq_high_bf16_f32(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
-// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[TMP4]] to <16 x i8>
-// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]]
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <8 x i16>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x bfloat>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = shufflevector <8 x bfloat> [[TMP4]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-A64-NEXT: [[TMP7:%.*]] = fptrunc <4 x float> [[TMP6]] to <4 x bfloat>
+// CHECK-A64-NEXT: [[TMP8:%.*]] = shufflevector <4 x bfloat> [[TMP5]], <4 x bfloat> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP8]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
+// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]])
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[INACTIVE:%.*]], <8 x bfloat> [[INACTIVE]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I8:%.*]] = shufflevector <4 x bfloat> [[VCVTFP2BF1_I]], <4 x bfloat> [[SHUFFLE_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I8]]
@@ -233,27 +256,29 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[INACTIVE_COERCE:%.*]] to <8 x bfloat>
// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[TMP0]] to <4 x i32>
// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[TMP4]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP8]], <8 x bfloat> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x bfloat> [[TMP10]] to <2 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[TMP11]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]])
+// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP9]], <8 x bfloat> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x bfloat> [[TMP7]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x bfloat> [[TMP11]] to <2 x i32>
// CHECK-A32-SOFTFP-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP12]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[TMP13]], <4 x bfloat> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-SOFTFP-NEXT: [[TMP15:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I17]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP15]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP17:%.*]] = bitcast <8 x bfloat> [[TMP16]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[TMP18:%.*]] = bitcast <4 x i32> [[TMP17]] to <8 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: [[TMP19:%.*]] = bitcast <8 x bfloat> [[TMP18]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP19]]
+// CHECK-A32-SOFTFP-NEXT: [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[TMP14]], <4 x bfloat> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-SOFTFP-NEXT: [[TMP16:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I17]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP17:%.*]] = bitcast <4 x i32> [[TMP16]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP18:%.*]] = bitcast <8 x bfloat> [[TMP17]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP18]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[TMP20:%.*]] = bitcast <8 x bfloat> [[TMP19]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP20]]
//
bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {
return vcvtq_high_bf16_f32(inactive, a);
@@ -281,7 +306,7 @@ bfloat16_t test_vcvth_bf16_f32(float32_t a) {
// CHECK-LABEL: @test_vcvtah_f32_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast bfloat [[A:%.*]] to i16
-// CHECK-NEXT: [[CONV_I:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK-NEXT: [[CONV_I:%.*]] = zext i16 [[TMP0]] to i32
// CHECK-NEXT: [[SHL_I:%.*]] = shl i32 [[CONV_I]], 16
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[SHL_I]] to float
// CHECK-NEXT: ret float [[TMP1]]
>From 182b53576cb9a84a4357c108b0e38f90cbdda464 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 27 Nov 2025 00:53:50 +0000
Subject: [PATCH 13/41] julia fix
---
llvm/lib/Transforms/Scalar/SROA.cpp | 36 ++++++++++++++++----------
llvm/test/Transforms/SROA/basictest.ll | 12 ++++-----
2 files changed, 28 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index a767fcdd1dc96..c59acc599e5ce 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1482,12 +1482,14 @@ LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
/// Walk the range of a partitioning looking for a common type to cover this
/// sequence of slices.
-static std::pair<Type *, IntegerType *>
+/// Returns: {CommonType, LargestIntegerType, OnlyIntrinsicUsers}
+static std::tuple<Type *, IntegerType *, bool>
findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
uint64_t EndOffset) {
Type *Ty = nullptr;
bool TyIsCommon = true;
IntegerType *ITy = nullptr;
+ bool OnlyIntrinsicUsers = true;
// Note that we need to look at *every* alloca slice's Use to ensure we
// always get consistent results regardless of the order of slices.
@@ -1495,6 +1497,8 @@ findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
Use *U = I->getUse();
if (isa<IntrinsicInst>(*U->getUser()))
continue;
+ // We found a non-intrinsic user
+ OnlyIntrinsicUsers = false;
if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
continue;
@@ -1528,7 +1532,7 @@ findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
Ty = UserTy;
}
- return {TyIsCommon ? Ty : nullptr, ITy};
+ return {TyIsCommon ? Ty : nullptr, ITy, OnlyIntrinsicUsers};
}
/// PHI instructions that use an alloca and are subsequently loaded can be
@@ -5206,20 +5210,24 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// Otherwise, check if there is a common type that all slices of the
// partition use. Collect the largest integer type used as a backup.
- auto CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset());
+ auto [CommonUseTy, LargestIntTy, OnlyIntrinsicUsers] =
+ findCommonType(P.begin(), P.end(), P.endOffset());
// If there is a common type that spans the partition, use it.
- if (CommonUseTy.first) {
- TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first);
+ if (CommonUseTy) {
+ TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy);
if (CommonUseSize.isFixed() &&
CommonUseSize.getFixedValue() >= P.size()) {
if (VecTy)
return {VecTy, false, VecTy};
- return {CommonUseTy.first,
- isIntegerWideningViable(P, CommonUseTy.first, DL), nullptr};
+ return {CommonUseTy, isIntegerWideningViable(P, CommonUseTy, DL),
+ nullptr};
}
}
+ if (OnlyIntrinsicUsers && DL.isLegalInteger(P.size() * 8))
+ return {Type::getIntNTy(*C, P.size() * 8), false, nullptr};
+
// If not, can we find an appropriate subtype in the original allocated
// type?
if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
@@ -5233,17 +5241,17 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
return {TypePartitionTy, true, nullptr};
if (VecTy)
return {VecTy, false, VecTy};
- if (CommonUseTy.second &&
- DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size() &&
- isIntegerWideningViable(P, CommonUseTy.second, DL))
- return {CommonUseTy.second, true, nullptr};
+ if (LargestIntTy &&
+ DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size() &&
+ isIntegerWideningViable(P, LargestIntTy, DL))
+ return {LargestIntTy, true, nullptr};
return {TypePartitionTy, false, nullptr};
}
// If still not, can we use the largest bitwidth integer type used?
- if (CommonUseTy.second &&
- DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size())
- return {CommonUseTy.second, false, nullptr};
+ if (LargestIntTy &&
+ DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size())
+ return {LargestIntTy, false, nullptr};
if (DL.isLegalInteger(P.size() * 8))
return {Type::getIntNTy(*C, P.size() * 8), false, nullptr};
diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll
index 15803f7b5a25b..b530742bd66ac 100644
--- a/llvm/test/Transforms/SROA/basictest.ll
+++ b/llvm/test/Transforms/SROA/basictest.ll
@@ -785,7 +785,7 @@ define i64 @test19(ptr %x) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[X:%.*]], align 1
; CHECK-NEXT: [[A_SROA_2_0_X_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 8
-; CHECK-NEXT: [[A_SROA_2_0_COPYLOAD:%.*]] = load ptr, ptr [[A_SROA_2_0_X_SROA_IDX]], align 1
+; CHECK-NEXT: [[A_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[A_SROA_2_0_X_SROA_IDX]], align 1
; CHECK-NEXT: ret i64 [[A_SROA_0_0_COPYLOAD]]
;
entry:
@@ -809,7 +809,7 @@ define i64 @test19_addrspacecast(ptr %x) {
; CHECK-NEXT: [[CAST1:%.*]] = addrspacecast ptr [[X:%.*]] to ptr addrspace(1)
; CHECK-NEXT: [[A_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr addrspace(1) [[CAST1]], align 1
; CHECK-NEXT: [[A_SROA_2_0_CAST1_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[CAST1]], i16 8
-; CHECK-NEXT: [[A_SROA_2_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(1) [[A_SROA_2_0_CAST1_SROA_IDX]], align 1
+; CHECK-NEXT: [[A_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(1) [[A_SROA_2_0_CAST1_SROA_IDX]], align 1
; CHECK-NEXT: ret i64 [[A_SROA_0_0_COPYLOAD]]
;
entry:
@@ -1332,10 +1332,10 @@ define void @PR15674(ptr %data, ptr %src, i32 %size) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP_SROA_0:%.*]] = alloca i32, align 4
; CHECK-NEXT: switch i32 [[SIZE:%.*]], label [[END:%.*]] [
-; CHECK-NEXT: i32 4, label [[BB4:%.*]]
-; CHECK-NEXT: i32 3, label [[BB3:%.*]]
-; CHECK-NEXT: i32 2, label [[BB2:%.*]]
-; CHECK-NEXT: i32 1, label [[BB1:%.*]]
+; CHECK-NEXT: i32 4, label [[BB4:%.*]]
+; CHECK-NEXT: i32 3, label [[BB3:%.*]]
+; CHECK-NEXT: i32 2, label [[BB2:%.*]]
+; CHECK-NEXT: i32 1, label [[BB1:%.*]]
; CHECK-NEXT: ]
; CHECK: bb4:
; CHECK-NEXT: [[SRC_GEP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i32 3
>From bc7240ff8aa3d4921ccdcd4d30ed95a67a1c892c Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 27 Nov 2025 00:56:01 +0000
Subject: [PATCH 14/41] julia test
---
.../SROA/prefer-integer-partition.ll | 22 +++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
index 5b639169cc207..bf0d2562a8745 100644
--- a/llvm/test/Transforms/SROA/prefer-integer-partition.ll
+++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
@@ -62,6 +62,28 @@ _ZNK4pbrt3SOAINS_10RaySamplesEEixEi.exit: ; preds = %0, %6
ret <2 x float> %.sroa.01.0.copyload
}
+define void @test_float_array_only_intrinsics() {
+; CHECK-LABEL: @test_float_array_only_intrinsics(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret void
+;
+entry:
+ %src = alloca [2 x float], align 4
+ %dst = alloca [2 x float], align 4
+
+ call void @llvm.lifetime.start.p0(i64 8, ptr %src)
+ call void @llvm.lifetime.start.p0(i64 8, ptr %dst)
+
+ ; Only intrinsic uses - no scalar loads/stores to establish common type
+ call void @llvm.memset.p0.i64(ptr %src, i8 42, i64 8, i1 false)
+ call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 8, i1 false)
+ call void @llvm.memcpy.p0.p0.i64(ptr %src, ptr %dst, i64 8, i1 false)
+
+ call void @llvm.lifetime.end.p0(i64 8, ptr %dst)
+ call void @llvm.lifetime.end.p0(i64 8, ptr %src)
+ ret void
+}
+
define void @test_mixed_types() {
; CHECK-LABEL: @test_mixed_types(
; CHECK-NEXT: entry:
>From 1af3e2093955e07bbf7e03b9aae351ba80462413 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 27 Nov 2025 01:12:11 +0000
Subject: [PATCH 15/41] adding comments
---
llvm/lib/Transforms/Scalar/SROA.cpp | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index c59acc599e5ce..7322b2005461f 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5196,7 +5196,10 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
Partition &P) {
const DataLayout &DL = AI.getDataLayout();
- auto ComputePartitionTy = [&]() -> std::tuple<Type *, bool, VectorType *> {
+ // Try to compute a friendly type for this partition of the alloca. This
+ // won't always succeed, in which case we fall back to a legal integer type
+ // or an i8 array of an appropriate size.
+ auto SelectPartitionTy = [&]() -> std::tuple<Type *, bool, VectorType *> {
// First check if the partition is viable for vetor promotion. If it is
// via a floating-point vector, we are done because we would never prefer
// integer widening.
@@ -5207,36 +5210,35 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
return {VecTy, false, VecTy};
}
}
-
// Otherwise, check if there is a common type that all slices of the
- // partition use. Collect the largest integer type used as a backup.
+ // partition use that spans the partition.
auto [CommonUseTy, LargestIntTy, OnlyIntrinsicUsers] =
findCommonType(P.begin(), P.end(), P.endOffset());
- // If there is a common type that spans the partition, use it.
if (CommonUseTy) {
TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy);
if (CommonUseSize.isFixed() &&
CommonUseSize.getFixedValue() >= P.size()) {
-
+ // Prefer vector promotion here because we already calculated it.
if (VecTy)
return {VecTy, false, VecTy};
return {CommonUseTy, isIntegerWideningViable(P, CommonUseTy, DL),
nullptr};
}
}
-
+ // If there are only intrinsic users, try to represent as a legal integer type
+ // because we are probably just copying data around and the integer can be promoted.
if (OnlyIntrinsicUsers && DL.isLegalInteger(P.size() * 8))
return {Type::getIntNTy(*C, P.size() * 8), false, nullptr};
-
- // If not, can we find an appropriate subtype in the original allocated
+ // Can we find an appropriate subtype in the original allocated
// type?
if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
P.beginOffset(), P.size())) {
+ // If the partition is an integer array that can be spanned by a legal integer type,
+ // prefer to represent it as a legal integer type because it's more likely to be promotable.
if (TypePartitionTy->isArrayTy() &&
TypePartitionTy->getArrayElementType()->isIntegerTy() &&
DL.isLegalInteger(P.size() * 8))
TypePartitionTy = Type::getIntNTy(*C, P.size() * 8);
-
if (isIntegerWideningViable(P, TypePartitionTy, DL))
return {TypePartitionTy, true, nullptr};
if (VecTy)
@@ -5259,7 +5261,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
return {ArrayType::get(Type::getInt8Ty(*C), P.size()), false, nullptr};
};
- auto [PartitionTy, IsIntegerPromotable, VecTy] = ComputePartitionTy();
+ auto [PartitionTy, IsIntegerPromotable, VecTy] = SelectPartitionTy();
// Check for the case where we're going to rewrite to a new alloca of the
// exact same type as the original, and with the same access offsets. In that
>From 3097439971c0ebeeb84a7cdcc7ac01edc4048a32 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 27 Nov 2025 01:51:35 +0000
Subject: [PATCH 16/41] add comments
---
llvm/lib/Transforms/Scalar/SROA.cpp | 35 +++++++++++++++++++----------
1 file changed, 23 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 7322b2005461f..a3975e9a041f0 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5200,35 +5200,41 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// won't always succeed, in which case we fall back to a legal integer type
// or an i8 array of an appropriate size.
auto SelectPartitionTy = [&]() -> std::tuple<Type *, bool, VectorType *> {
- // First check if the partition is viable for vetor promotion. If it is
- // via a floating-point vector, we are done because we would never prefer
- // integer widening.
+ // First check if the partition is viable for vetor promotion.
+ // We prefer vector promotion over integer widening promotion when:
+ // - The vector element type is a floating-point type.
+ // - All the loads/stores to the alloca are vector loads/stores to the entire alloca.
+ // Otherwise when there is a integer vector with mixed loads/stores we prefer integer widening
+ // promotion because it's more likely the user is doing bitwise arithmetic and we
+ // generate better code.
VectorType *VecTy =
isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue());
- if (VecTy) {
- if (VecTy->getElementType()->isFloatingPointTy()) {
- return {VecTy, false, VecTy};
- }
- }
- // Otherwise, check if there is a common type that all slices of the
- // partition use that spans the partition.
+ // If the vector element type is a floating-point type, we prefer vector promotion.
+ if (VecTy && VecTy->getElementType()->isFloatingPointTy())
+ return {VecTy, false, VecTy};
+
+ // Check if there is a common type that all slices of the partition use that spans the partition.
auto [CommonUseTy, LargestIntTy, OnlyIntrinsicUsers] =
findCommonType(P.begin(), P.end(), P.endOffset());
if (CommonUseTy) {
TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy);
if (CommonUseSize.isFixed() &&
CommonUseSize.getFixedValue() >= P.size()) {
- // Prefer vector promotion here because we already calculated it.
+ // We prefer vector promotion here because if vector promotion is viable and
+ // there is a common type used, then it implies the second listed condition for prefering
+ // vector promotion is true.
if (VecTy)
return {VecTy, false, VecTy};
return {CommonUseTy, isIntegerWideningViable(P, CommonUseTy, DL),
nullptr};
}
}
+
// If there are only intrinsic users, try to represent as a legal integer type
// because we are probably just copying data around and the integer can be promoted.
if (OnlyIntrinsicUsers && DL.isLegalInteger(P.size() * 8))
return {Type::getIntNTy(*C, P.size() * 8), false, nullptr};
+
// Can we find an appropriate subtype in the original allocated
// type?
if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
@@ -5239,25 +5245,30 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
TypePartitionTy->getArrayElementType()->isIntegerTy() &&
DL.isLegalInteger(P.size() * 8))
TypePartitionTy = Type::getIntNTy(*C, P.size() * 8);
+ // There was no common type used, so we prefer integer widening promotion.
if (isIntegerWideningViable(P, TypePartitionTy, DL))
return {TypePartitionTy, true, nullptr};
if (VecTy)
return {VecTy, false, VecTy};
+ // If we couldn't promotion with TypePartitionTy, try with the largest integer type used.
if (LargestIntTy &&
DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size() &&
isIntegerWideningViable(P, LargestIntTy, DL))
return {LargestIntTy, true, nullptr};
+ // Fallback to TypePartitionTy and we probably won't promote.
return {TypePartitionTy, false, nullptr};
}
- // If still not, can we use the largest bitwidth integer type used?
+ // Select the largest integer type used if it spans the partition.
if (LargestIntTy &&
DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size())
return {LargestIntTy, false, nullptr};
+ // Select a legal integer type if it spans the partition.
if (DL.isLegalInteger(P.size() * 8))
return {Type::getIntNTy(*C, P.size() * 8), false, nullptr};
+ // Fallback to an i8 array.
return {ArrayType::get(Type::getInt8Ty(*C), P.size()), false, nullptr};
};
>From 8dfec91cc86f3f3b087a4847bf5181fd244c6897 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 27 Nov 2025 01:51:45 +0000
Subject: [PATCH 17/41] format'
---
llvm/lib/Transforms/Scalar/SROA.cpp | 33 +++++++++++++++++------------
1 file changed, 19 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index a3975e9a041f0..c3fc2234e5391 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5203,26 +5203,28 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// First check if the partition is viable for vetor promotion.
// We prefer vector promotion over integer widening promotion when:
// - The vector element type is a floating-point type.
- // - All the loads/stores to the alloca are vector loads/stores to the entire alloca.
- // Otherwise when there is a integer vector with mixed loads/stores we prefer integer widening
- // promotion because it's more likely the user is doing bitwise arithmetic and we
- // generate better code.
+ // - All the loads/stores to the alloca are vector loads/stores to the
+ // entire alloca. Otherwise when there is a integer vector with mixed
+ // loads/stores we prefer integer widening promotion because it's more
+ // likely the user is doing bitwise arithmetic and we generate better code.
VectorType *VecTy =
isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue());
- // If the vector element type is a floating-point type, we prefer vector promotion.
+ // If the vector element type is a floating-point type, we prefer vector
+ // promotion.
if (VecTy && VecTy->getElementType()->isFloatingPointTy())
return {VecTy, false, VecTy};
- // Check if there is a common type that all slices of the partition use that spans the partition.
+ // Check if there is a common type that all slices of the partition use that
+ // spans the partition.
auto [CommonUseTy, LargestIntTy, OnlyIntrinsicUsers] =
findCommonType(P.begin(), P.end(), P.endOffset());
if (CommonUseTy) {
TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy);
if (CommonUseSize.isFixed() &&
CommonUseSize.getFixedValue() >= P.size()) {
- // We prefer vector promotion here because if vector promotion is viable and
- // there is a common type used, then it implies the second listed condition for prefering
- // vector promotion is true.
+ // We prefer vector promotion here because if vector promotion is viable
+ // and there is a common type used, then it implies the second listed
+ // condition for preferring vector promotion is true.
if (VecTy)
return {VecTy, false, VecTy};
return {CommonUseTy, isIntegerWideningViable(P, CommonUseTy, DL),
@@ -5230,8 +5232,9 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
}
}
- // If there are only intrinsic users, try to represent as a legal integer type
- // because we are probably just copying data around and the integer can be promoted.
+ // If there are only intrinsic users, try to represent as a legal integer
+ // type because we are probably just copying data around and the integer can
+ // be promoted.
if (OnlyIntrinsicUsers && DL.isLegalInteger(P.size() * 8))
return {Type::getIntNTy(*C, P.size() * 8), false, nullptr};
@@ -5239,8 +5242,9 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// type?
if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
P.beginOffset(), P.size())) {
- // If the partition is an integer array that can be spanned by a legal integer type,
- // prefer to represent it as a legal integer type because it's more likely to be promotable.
+ // If the partition is an integer array that can be spanned by a legal
+ // integer type, prefer to represent it as a legal integer type because
+ // it's more likely to be promotable.
if (TypePartitionTy->isArrayTy() &&
TypePartitionTy->getArrayElementType()->isIntegerTy() &&
DL.isLegalInteger(P.size() * 8))
@@ -5250,7 +5254,8 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
return {TypePartitionTy, true, nullptr};
if (VecTy)
return {VecTy, false, VecTy};
- // If we couldn't promotion with TypePartitionTy, try with the largest integer type used.
+ // If we couldn't promote with TypePartitionTy, try with the largest
+ // integer type used.
if (LargestIntTy &&
DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size() &&
isIntegerWideningViable(P, LargestIntTy, DL))
>From 874e50b5789255adb75b0a910ad2780798505cc7 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 27 Nov 2025 01:52:54 +0000
Subject: [PATCH 18/41] format
---
llvm/lib/Transforms/Scalar/SROA.cpp | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index c3fc2234e5391..4841fa2b822b6 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5201,10 +5201,13 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// or an i8 array of an appropriate size.
auto SelectPartitionTy = [&]() -> std::tuple<Type *, bool, VectorType *> {
// First check if the partition is viable for vetor promotion.
+ //
// We prefer vector promotion over integer widening promotion when:
// - The vector element type is a floating-point type.
// - All the loads/stores to the alloca are vector loads/stores to the
- // entire alloca. Otherwise when there is a integer vector with mixed
+ // entire alloca.
+ //
+ // Otherwise when there is a integer vector with mixed
// loads/stores we prefer integer widening promotion because it's more
// likely the user is doing bitwise arithmetic and we generate better code.
VectorType *VecTy =
>From 8804ace57f1bb1194cdb16b67a540d95d91d9e95 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 27 Nov 2025 01:53:02 +0000
Subject: [PATCH 19/41] format
---
llvm/lib/Transforms/Scalar/SROA.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 4841fa2b822b6..52e39d4ca19e6 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5206,7 +5206,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// - The vector element type is a floating-point type.
// - All the loads/stores to the alloca are vector loads/stores to the
// entire alloca.
- //
+ //
// Otherwise when there is a integer vector with mixed
// loads/stores we prefer integer widening promotion because it's more
// likely the user is doing bitwise arithmetic and we generate better code.
>From 7269b0b53ac00b28fda45aca347e66ed9a811749 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 27 Nov 2025 03:37:45 +0000
Subject: [PATCH 20/41] ptx test update
---
llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 84 ++++++++-------------
1 file changed, 31 insertions(+), 53 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index ca2914a2e8043..16ca96e5fbe84 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -455,64 +455,42 @@ define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly
; PTX-NEXT: .local .align 8 .b8 __local_depot9[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
-; PTX-NEXT: .reg .b32 %r<3>;
-; PTX-NEXT: .reg .b64 %rd<47>;
+; PTX-NEXT: .reg .b64 %rd<30>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %entry
; PTX-NEXT: mov.b64 %SPL, __local_depot9;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: ld.param.b64 %rd1, [memcpy_to_param_param_0];
-; PTX-NEXT: add.u64 %rd2, %SPL, 0;
-; PTX-NEXT: ld.param.b32 %r1, [memcpy_to_param_param_1+4];
-; PTX-NEXT: st.local.b32 [%rd2+4], %r1;
-; PTX-NEXT: ld.param.b32 %r2, [memcpy_to_param_param_1];
-; PTX-NEXT: st.local.b32 [%rd2], %r2;
-; PTX-NEXT: ld.volatile.b8 %rd3, [%rd1];
-; PTX-NEXT: ld.volatile.b8 %rd4, [%rd1+1];
-; PTX-NEXT: shl.b64 %rd5, %rd4, 8;
-; PTX-NEXT: or.b64 %rd6, %rd5, %rd3;
-; PTX-NEXT: ld.volatile.b8 %rd7, [%rd1+2];
-; PTX-NEXT: shl.b64 %rd8, %rd7, 16;
-; PTX-NEXT: ld.volatile.b8 %rd9, [%rd1+3];
-; PTX-NEXT: shl.b64 %rd10, %rd9, 24;
-; PTX-NEXT: or.b64 %rd11, %rd10, %rd8;
-; PTX-NEXT: or.b64 %rd12, %rd11, %rd6;
-; PTX-NEXT: ld.volatile.b8 %rd13, [%rd1+4];
-; PTX-NEXT: ld.volatile.b8 %rd14, [%rd1+5];
-; PTX-NEXT: shl.b64 %rd15, %rd14, 8;
-; PTX-NEXT: or.b64 %rd16, %rd15, %rd13;
-; PTX-NEXT: ld.volatile.b8 %rd17, [%rd1+6];
-; PTX-NEXT: shl.b64 %rd18, %rd17, 16;
-; PTX-NEXT: ld.volatile.b8 %rd19, [%rd1+7];
-; PTX-NEXT: shl.b64 %rd20, %rd19, 24;
-; PTX-NEXT: or.b64 %rd21, %rd20, %rd18;
-; PTX-NEXT: or.b64 %rd22, %rd21, %rd16;
-; PTX-NEXT: shl.b64 %rd23, %rd22, 32;
-; PTX-NEXT: or.b64 %rd24, %rd23, %rd12;
-; PTX-NEXT: st.volatile.b64 [%SP], %rd24;
-; PTX-NEXT: ld.volatile.b8 %rd25, [%rd1+8];
-; PTX-NEXT: ld.volatile.b8 %rd26, [%rd1+9];
-; PTX-NEXT: shl.b64 %rd27, %rd26, 8;
-; PTX-NEXT: or.b64 %rd28, %rd27, %rd25;
-; PTX-NEXT: ld.volatile.b8 %rd29, [%rd1+10];
-; PTX-NEXT: shl.b64 %rd30, %rd29, 16;
-; PTX-NEXT: ld.volatile.b8 %rd31, [%rd1+11];
-; PTX-NEXT: shl.b64 %rd32, %rd31, 24;
-; PTX-NEXT: or.b64 %rd33, %rd32, %rd30;
-; PTX-NEXT: or.b64 %rd34, %rd33, %rd28;
-; PTX-NEXT: ld.volatile.b8 %rd35, [%rd1+12];
-; PTX-NEXT: ld.volatile.b8 %rd36, [%rd1+13];
-; PTX-NEXT: shl.b64 %rd37, %rd36, 8;
-; PTX-NEXT: or.b64 %rd38, %rd37, %rd35;
-; PTX-NEXT: ld.volatile.b8 %rd39, [%rd1+14];
-; PTX-NEXT: shl.b64 %rd40, %rd39, 16;
-; PTX-NEXT: ld.volatile.b8 %rd41, [%rd1+15];
-; PTX-NEXT: shl.b64 %rd42, %rd41, 24;
-; PTX-NEXT: or.b64 %rd43, %rd42, %rd40;
-; PTX-NEXT: or.b64 %rd44, %rd43, %rd38;
-; PTX-NEXT: shl.b64 %rd45, %rd44, 32;
-; PTX-NEXT: or.b64 %rd46, %rd45, %rd34;
-; PTX-NEXT: st.volatile.b64 [%SP+8], %rd46;
+; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT: ld.param.b32 %rd3, [memcpy_to_param_param_1+4];
+; PTX-NEXT: shl.b64 %rd4, %rd3, 32;
+; PTX-NEXT: ld.param.b32 %rd5, [memcpy_to_param_param_1];
+; PTX-NEXT: or.b64 %rd6, %rd4, %rd5;
+; PTX-NEXT: st.b64 [%SP], %rd6;
+; PTX-NEXT: ld.volatile.global.b8 %rd7, [%rd2];
+; PTX-NEXT: ld.volatile.global.b8 %rd8, [%rd2+1];
+; PTX-NEXT: shl.b64 %rd9, %rd8, 8;
+; PTX-NEXT: or.b64 %rd10, %rd9, %rd7;
+; PTX-NEXT: ld.volatile.global.b8 %rd11, [%rd2+2];
+; PTX-NEXT: shl.b64 %rd12, %rd11, 16;
+; PTX-NEXT: ld.volatile.global.b8 %rd13, [%rd2+3];
+; PTX-NEXT: shl.b64 %rd14, %rd13, 24;
+; PTX-NEXT: or.b64 %rd15, %rd14, %rd12;
+; PTX-NEXT: or.b64 %rd16, %rd15, %rd10;
+; PTX-NEXT: ld.volatile.global.b8 %rd17, [%rd2+4];
+; PTX-NEXT: ld.volatile.global.b8 %rd18, [%rd2+5];
+; PTX-NEXT: shl.b64 %rd19, %rd18, 8;
+; PTX-NEXT: or.b64 %rd20, %rd19, %rd17;
+; PTX-NEXT: ld.volatile.global.b8 %rd21, [%rd2+6];
+; PTX-NEXT: shl.b64 %rd22, %rd21, 16;
+; PTX-NEXT: ld.volatile.global.b8 %rd23, [%rd2+7];
+; PTX-NEXT: shl.b64 %rd24, %rd23, 24;
+; PTX-NEXT: or.b64 %rd25, %rd24, %rd22;
+; PTX-NEXT: or.b64 %rd26, %rd25, %rd20;
+; PTX-NEXT: shl.b64 %rd27, %rd26, 32;
+; PTX-NEXT: or.b64 %rd28, %rd27, %rd16;
+; PTX-NEXT: add.u64 %rd29, %SPL, 0;
+; PTX-NEXT: st.local.b64 [%rd29], %rd28;
; PTX-NEXT: ret;
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true)
>From 0c37e001785e372113b385861a83253852aebc39 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 27 Nov 2025 04:20:22 +0000
Subject: [PATCH 21/41] fix debug info test
---
llvm/test/DebugInfo/X86/sroasplit-5.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/DebugInfo/X86/sroasplit-5.ll b/llvm/test/DebugInfo/X86/sroasplit-5.ll
index 34aa30f55728e..7500bc97efbc8 100644
--- a/llvm/test/DebugInfo/X86/sroasplit-5.ll
+++ b/llvm/test/DebugInfo/X86/sroasplit-5.ll
@@ -23,7 +23,7 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK-NOT: DW_OP_LLVM_fragment, 56
; CHECK: DIExpression(DW_OP_LLVM_fragment, 0, 32)
; CHECK-NOT: DW_OP_LLVM_fragment, 56
-; CHECK: DIExpression(DW_OP_LLVM_fragment, 32, 24)
+; CHECK: DIExpression(DW_OP_LLVM_fragment, 32, 32)
; CHECK-NOT: DW_OP_LLVM_fragment, 56
%struct.prog_src_register = type { i32, i24 }
>From 1717d8d38988d5e7116d624f4c08b98334474388 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 3 Dec 2025 02:43:55 +0000
Subject: [PATCH 22/41] improvement
---
llvm/lib/Transforms/Scalar/SROA.cpp | 13 +++++++------
llvm/test/Transforms/SROA/basictest.ll | 4 ++--
2 files changed, 9 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 52e39d4ca19e6..7f1018960f1aa 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5235,12 +5235,6 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
}
}
- // If there are only intrinsic users, try to represent as a legal integer
- // type because we are probably just copying data around and the integer can
- // be promoted.
- if (OnlyIntrinsicUsers && DL.isLegalInteger(P.size() * 8))
- return {Type::getIntNTy(*C, P.size() * 8), false, nullptr};
-
// Can we find an appropriate subtype in the original allocated
// type?
if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
@@ -5263,6 +5257,13 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size() &&
isIntegerWideningViable(P, LargestIntTy, DL))
return {LargestIntTy, true, nullptr};
+
+ // If there are only intrinsic users, try to represent as a legal integer
+ // type because we are probably just copying data around and the integer can
+ // be promoted.
+ if (OnlyIntrinsicUsers && DL.isLegalInteger(P.size() * 8))
+ return {Type::getIntNTy(*C, P.size() * 8), isIntegerWideningViable(P, Type::getIntNTy(*C, P.size() * 8), DL), nullptr};
+
// Fallback to TypePartitionTy and we probably won't promote.
return {TypePartitionTy, false, nullptr};
}
diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll
index b530742bd66ac..b16940f6ffdb2 100644
--- a/llvm/test/Transforms/SROA/basictest.ll
+++ b/llvm/test/Transforms/SROA/basictest.ll
@@ -785,7 +785,7 @@ define i64 @test19(ptr %x) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[X:%.*]], align 1
; CHECK-NEXT: [[A_SROA_2_0_X_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 8
-; CHECK-NEXT: [[A_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[A_SROA_2_0_X_SROA_IDX]], align 1
+; CHECK-NEXT: [[A_SROA_2_0_COPYLOAD:%.*]] = load ptr, ptr [[A_SROA_2_0_X_SROA_IDX]], align 1
; CHECK-NEXT: ret i64 [[A_SROA_0_0_COPYLOAD]]
;
entry:
@@ -809,7 +809,7 @@ define i64 @test19_addrspacecast(ptr %x) {
; CHECK-NEXT: [[CAST1:%.*]] = addrspacecast ptr [[X:%.*]] to ptr addrspace(1)
; CHECK-NEXT: [[A_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr addrspace(1) [[CAST1]], align 1
; CHECK-NEXT: [[A_SROA_2_0_CAST1_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[CAST1]], i16 8
-; CHECK-NEXT: [[A_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(1) [[A_SROA_2_0_CAST1_SROA_IDX]], align 1
+; CHECK-NEXT: [[A_SROA_2_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(1) [[A_SROA_2_0_CAST1_SROA_IDX]], align 1
; CHECK-NEXT: ret i64 [[A_SROA_0_0_COPYLOAD]]
;
entry:
>From a53896bb097e3eb23f9ecad107bca85b89c355e7 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 3 Dec 2025 02:44:12 +0000
Subject: [PATCH 23/41] format
---
llvm/lib/Transforms/Scalar/SROA.cpp | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 7f1018960f1aa..9b85eebebfc9d 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5259,10 +5259,13 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
return {LargestIntTy, true, nullptr};
// If there are only intrinsic users, try to represent as a legal integer
- // type because we are probably just copying data around and the integer can
- // be promoted.
+ // type because we are probably just copying data around and the integer
+ // can be promoted.
if (OnlyIntrinsicUsers && DL.isLegalInteger(P.size() * 8))
- return {Type::getIntNTy(*C, P.size() * 8), isIntegerWideningViable(P, Type::getIntNTy(*C, P.size() * 8), DL), nullptr};
+ return {
+ Type::getIntNTy(*C, P.size() * 8),
+ isIntegerWideningViable(P, Type::getIntNTy(*C, P.size() * 8), DL),
+ nullptr};
// Fallback to TypePartitionTy and we probably won't promote.
return {TypePartitionTy, false, nullptr};
>From 92fe65f49e2fad7c72922ef300d8d290a96bca92 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 3 Dec 2025 02:47:28 +0000
Subject: [PATCH 24/41] improvement
---
llvm/lib/Transforms/Scalar/SROA.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 9b85eebebfc9d..060c8d8c89eff 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5258,10 +5258,10 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
isIntegerWideningViable(P, LargestIntTy, DL))
return {LargestIntTy, true, nullptr};
- // If there are only intrinsic users, try to represent as a legal integer
+ // If there are only intrinsic users of an aggregate type, try to represent as a legal integer
// type because we are probably just copying data around and the integer
// can be promoted.
- if (OnlyIntrinsicUsers && DL.isLegalInteger(P.size() * 8))
+ if (OnlyIntrinsicUsers && DL.isLegalInteger(P.size() * 8) && TypePartitionTy->isAggregateType())
return {
Type::getIntNTy(*C, P.size() * 8),
isIntegerWideningViable(P, Type::getIntNTy(*C, P.size() * 8), DL),
>From 6517a4dfb45362ccc50b5e6468eccba2b92999c6 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 3 Dec 2025 02:47:39 +0000
Subject: [PATCH 25/41] format
---
llvm/lib/Transforms/Scalar/SROA.cpp | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 060c8d8c89eff..9c5fe41eec619 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5258,10 +5258,11 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
isIntegerWideningViable(P, LargestIntTy, DL))
return {LargestIntTy, true, nullptr};
- // If there are only intrinsic users of an aggregate type, try to represent as a legal integer
- // type because we are probably just copying data around and the integer
- // can be promoted.
- if (OnlyIntrinsicUsers && DL.isLegalInteger(P.size() * 8) && TypePartitionTy->isAggregateType())
+ // If there are only intrinsic users of an aggregate type, try to
+ // represent as a legal integer type because we are probably just copying
+ // data around and the integer can be promoted.
+ if (OnlyIntrinsicUsers && DL.isLegalInteger(P.size() * 8) &&
+ TypePartitionTy->isAggregateType())
return {
Type::getIntNTy(*C, P.size() * 8),
isIntegerWideningViable(P, Type::getIntNTy(*C, P.size() * 8), DL),
>From 7a4b70fd1578920f133c171425c4a8aab7bdf670 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 3 Dec 2025 02:51:51 +0000
Subject: [PATCH 26/41] revert sroasplit
---
llvm/test/DebugInfo/X86/sroasplit-5.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/DebugInfo/X86/sroasplit-5.ll b/llvm/test/DebugInfo/X86/sroasplit-5.ll
index 7500bc97efbc8..34aa30f55728e 100644
--- a/llvm/test/DebugInfo/X86/sroasplit-5.ll
+++ b/llvm/test/DebugInfo/X86/sroasplit-5.ll
@@ -23,7 +23,7 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK-NOT: DW_OP_LLVM_fragment, 56
; CHECK: DIExpression(DW_OP_LLVM_fragment, 0, 32)
; CHECK-NOT: DW_OP_LLVM_fragment, 56
-; CHECK: DIExpression(DW_OP_LLVM_fragment, 32, 32)
+; CHECK: DIExpression(DW_OP_LLVM_fragment, 32, 24)
; CHECK-NOT: DW_OP_LLVM_fragment, 56
%struct.prog_src_register = type { i32, i24 }
>From d54b040729e80423f6a6ba9b9332c3c3e37b509e Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Thu, 4 Dec 2025 17:33:18 +0000
Subject: [PATCH 27/41] remove julia fix
---
llvm/lib/Transforms/Scalar/SROA.cpp | 10 ----------
1 file changed, 10 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 9c5fe41eec619..c54ef04dde7ba 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5258,16 +5258,6 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
isIntegerWideningViable(P, LargestIntTy, DL))
return {LargestIntTy, true, nullptr};
- // If there are only intrinsic users of an aggregate type, try to
- // represent as a legal integer type because we are probably just copying
- // data around and the integer can be promoted.
- if (OnlyIntrinsicUsers && DL.isLegalInteger(P.size() * 8) &&
- TypePartitionTy->isAggregateType())
- return {
- Type::getIntNTy(*C, P.size() * 8),
- isIntegerWideningViable(P, Type::getIntNTy(*C, P.size() * 8), DL),
- nullptr};
-
// Fallback to TypePartitionTy and we probably won't promote.
return {TypePartitionTy, false, nullptr};
}
>From b212468630047348432cab640499183fe68b6a7d Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Mon, 8 Dec 2025 20:38:05 +0000
Subject: [PATCH 28/41] comment
---
llvm/lib/Transforms/Scalar/SROA.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index c54ef04dde7ba..63fb321401e35 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5199,6 +5199,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// Try to compute a friendly type for this partition of the alloca. This
// won't always succeed, in which case we fall back to a legal integer type
// or an i8 array of an appropriate size.
+ // Returns a tuple: <PartitionType, IsIntegerWideningViable (true if integer widening promotion is used), VectorType (if vector promotion is used, otherwise nullptr)>.
auto SelectPartitionTy = [&]() -> std::tuple<Type *, bool, VectorType *> {
// First check if the partition is viable for vetor promotion.
//
@@ -5275,7 +5276,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
return {ArrayType::get(Type::getInt8Ty(*C), P.size()), false, nullptr};
};
- auto [PartitionTy, IsIntegerPromotable, VecTy] = SelectPartitionTy();
+ auto [PartitionTy, IsIntegerWideningViable, VecTy] = SelectPartitionTy();
// Check for the case where we're going to rewrite to a new alloca of the
// exact same type as the original, and with the same access offsets. In that
@@ -5317,7 +5318,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
SmallSetVector<SelectInst *, 8> SelectUsers;
AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
- P.endOffset(), IsIntegerPromotable, VecTy,
+ P.endOffset(), IsIntegerWideningViable, VecTy,
PHIUsers, SelectUsers);
bool Promotable = true;
// Check whether we can have tree-structured merge.
>From 2f94f0de19a70fa77bacf6938dc3b2ccbc09c13c Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Mon, 8 Dec 2025 20:40:12 +0000
Subject: [PATCH 29/41] comment
---
llvm/lib/Transforms/Scalar/SROA.cpp | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 63fb321401e35..d1e501dd69175 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5199,7 +5199,11 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// Try to compute a friendly type for this partition of the alloca. This
// won't always succeed, in which case we fall back to a legal integer type
// or an i8 array of an appropriate size.
- // Returns a tuple: <PartitionType, IsIntegerWideningViable (true if integer widening promotion is used), VectorType (if vector promotion is used, otherwise nullptr)>.
+ //
+ // Returns a tuple with the following elements:
+ // - PartitionType: The computed type for this partition.
+ // - IsIntegerWideningViable: True if integer widening promotion is used.
+ // - VectorType: The vector type if vector promotion is used, otherwise nullptr.
auto SelectPartitionTy = [&]() -> std::tuple<Type *, bool, VectorType *> {
// First check if the partition is viable for vetor promotion.
//
>From c53787f92b0bdc5aed30252f617e54c8942fa4b5 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Mon, 8 Dec 2025 20:40:21 +0000
Subject: [PATCH 30/41] comment
---
llvm/lib/Transforms/Scalar/SROA.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index d1e501dd69175..45dc65b9a7288 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5203,7 +5203,8 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// Returns a tuple with the following elements:
// - PartitionType: The computed type for this partition.
// - IsIntegerWideningViable: True if integer widening promotion is used.
- // - VectorType: The vector type if vector promotion is used, otherwise nullptr.
+ // - VectorType: The vector type if vector promotion is used, otherwise
+ // nullptr.
auto SelectPartitionTy = [&]() -> std::tuple<Type *, bool, VectorType *> {
// First check if the partition is viable for vetor promotion.
//
>From c17f912d128f2bdf1de9c530330d437afdb5af59 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Mon, 8 Dec 2025 20:48:52 +0000
Subject: [PATCH 31/41] test
---
llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 141 +++++++++++-------
.../SROA/prefer-integer-partition.ll | 22 ---
2 files changed, 91 insertions(+), 72 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 16ca96e5fbe84..21257e21bea9f 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -455,42 +455,64 @@ define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly
; PTX-NEXT: .local .align 8 .b8 __local_depot9[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
-; PTX-NEXT: .reg .b64 %rd<30>;
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<47>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %entry
; PTX-NEXT: mov.b64 %SPL, __local_depot9;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: ld.param.b64 %rd1, [memcpy_to_param_param_0];
-; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
-; PTX-NEXT: ld.param.b32 %rd3, [memcpy_to_param_param_1+4];
-; PTX-NEXT: shl.b64 %rd4, %rd3, 32;
-; PTX-NEXT: ld.param.b32 %rd5, [memcpy_to_param_param_1];
-; PTX-NEXT: or.b64 %rd6, %rd4, %rd5;
-; PTX-NEXT: st.b64 [%SP], %rd6;
-; PTX-NEXT: ld.volatile.global.b8 %rd7, [%rd2];
-; PTX-NEXT: ld.volatile.global.b8 %rd8, [%rd2+1];
-; PTX-NEXT: shl.b64 %rd9, %rd8, 8;
-; PTX-NEXT: or.b64 %rd10, %rd9, %rd7;
-; PTX-NEXT: ld.volatile.global.b8 %rd11, [%rd2+2];
-; PTX-NEXT: shl.b64 %rd12, %rd11, 16;
-; PTX-NEXT: ld.volatile.global.b8 %rd13, [%rd2+3];
-; PTX-NEXT: shl.b64 %rd14, %rd13, 24;
-; PTX-NEXT: or.b64 %rd15, %rd14, %rd12;
-; PTX-NEXT: or.b64 %rd16, %rd15, %rd10;
-; PTX-NEXT: ld.volatile.global.b8 %rd17, [%rd2+4];
-; PTX-NEXT: ld.volatile.global.b8 %rd18, [%rd2+5];
-; PTX-NEXT: shl.b64 %rd19, %rd18, 8;
-; PTX-NEXT: or.b64 %rd20, %rd19, %rd17;
-; PTX-NEXT: ld.volatile.global.b8 %rd21, [%rd2+6];
-; PTX-NEXT: shl.b64 %rd22, %rd21, 16;
-; PTX-NEXT: ld.volatile.global.b8 %rd23, [%rd2+7];
-; PTX-NEXT: shl.b64 %rd24, %rd23, 24;
-; PTX-NEXT: or.b64 %rd25, %rd24, %rd22;
-; PTX-NEXT: or.b64 %rd26, %rd25, %rd20;
-; PTX-NEXT: shl.b64 %rd27, %rd26, 32;
-; PTX-NEXT: or.b64 %rd28, %rd27, %rd16;
-; PTX-NEXT: add.u64 %rd29, %SPL, 0;
-; PTX-NEXT: st.local.b64 [%rd29], %rd28;
+; PTX-NEXT: add.u64 %rd2, %SPL, 0;
+; PTX-NEXT: ld.param.b32 %r1, [memcpy_to_param_param_1+4];
+; PTX-NEXT: st.local.b32 [%rd2+4], %r1;
+; PTX-NEXT: ld.param.b32 %r2, [memcpy_to_param_param_1];
+; PTX-NEXT: st.local.b32 [%rd2], %r2;
+; PTX-NEXT: ld.volatile.b8 %rd3, [%rd1];
+; PTX-NEXT: ld.volatile.b8 %rd4, [%rd1+1];
+; PTX-NEXT: shl.b64 %rd5, %rd4, 8;
+; PTX-NEXT: or.b64 %rd6, %rd5, %rd3;
+; PTX-NEXT: ld.volatile.b8 %rd7, [%rd1+2];
+; PTX-NEXT: shl.b64 %rd8, %rd7, 16;
+; PTX-NEXT: ld.volatile.b8 %rd9, [%rd1+3];
+; PTX-NEXT: shl.b64 %rd10, %rd9, 24;
+; PTX-NEXT: or.b64 %rd11, %rd10, %rd8;
+; PTX-NEXT: or.b64 %rd12, %rd11, %rd6;
+; PTX-NEXT: ld.volatile.b8 %rd13, [%rd1+4];
+; PTX-NEXT: ld.volatile.b8 %rd14, [%rd1+5];
+; PTX-NEXT: shl.b64 %rd15, %rd14, 8;
+; PTX-NEXT: or.b64 %rd16, %rd15, %rd13;
+; PTX-NEXT: ld.volatile.b8 %rd17, [%rd1+6];
+; PTX-NEXT: shl.b64 %rd18, %rd17, 16;
+; PTX-NEXT: ld.volatile.b8 %rd19, [%rd1+7];
+; PTX-NEXT: shl.b64 %rd20, %rd19, 24;
+; PTX-NEXT: or.b64 %rd21, %rd20, %rd18;
+; PTX-NEXT: or.b64 %rd22, %rd21, %rd16;
+; PTX-NEXT: shl.b64 %rd23, %rd22, 32;
+; PTX-NEXT: or.b64 %rd24, %rd23, %rd12;
+; PTX-NEXT: st.volatile.b64 [%SP], %rd24;
+; PTX-NEXT: ld.volatile.b8 %rd25, [%rd1+8];
+; PTX-NEXT: ld.volatile.b8 %rd26, [%rd1+9];
+; PTX-NEXT: shl.b64 %rd27, %rd26, 8;
+; PTX-NEXT: or.b64 %rd28, %rd27, %rd25;
+; PTX-NEXT: ld.volatile.b8 %rd29, [%rd1+10];
+; PTX-NEXT: shl.b64 %rd30, %rd29, 16;
+; PTX-NEXT: ld.volatile.b8 %rd31, [%rd1+11];
+; PTX-NEXT: shl.b64 %rd32, %rd31, 24;
+; PTX-NEXT: or.b64 %rd33, %rd32, %rd30;
+; PTX-NEXT: or.b64 %rd34, %rd33, %rd28;
+; PTX-NEXT: ld.volatile.b8 %rd35, [%rd1+12];
+; PTX-NEXT: ld.volatile.b8 %rd36, [%rd1+13];
+; PTX-NEXT: shl.b64 %rd37, %rd36, 8;
+; PTX-NEXT: or.b64 %rd38, %rd37, %rd35;
+; PTX-NEXT: ld.volatile.b8 %rd39, [%rd1+14];
+; PTX-NEXT: shl.b64 %rd40, %rd39, 16;
+; PTX-NEXT: ld.volatile.b8 %rd41, [%rd1+15];
+; PTX-NEXT: shl.b64 %rd42, %rd41, 24;
+; PTX-NEXT: or.b64 %rd43, %rd42, %rd40;
+; PTX-NEXT: or.b64 %rd44, %rd43, %rd38;
+; PTX-NEXT: shl.b64 %rd45, %rd44, 32;
+; PTX-NEXT: or.b64 %rd46, %rd45, %rd34;
+; PTX-NEXT: st.volatile.b64 [%SP+8], %rd46;
; PTX-NEXT: ret;
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true)
@@ -562,25 +584,44 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
; COPY-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4
; COPY-NEXT: ret void
;
-; PTX-LABEL: test_select(
-; PTX: {
-; PTX-NEXT: .reg .pred %p<2>;
-; PTX-NEXT: .reg .b16 %rs<3>;
-; PTX-NEXT: .reg .b32 %r<2>;
-; PTX-NEXT: .reg .b64 %rd<6>;
-; PTX-EMPTY:
-; PTX-NEXT: // %bb.0: // %bb
-; PTX-NEXT: ld.param.b8 %rs1, [test_select_param_3];
-; PTX-NEXT: and.b16 %rs2, %rs1, 1;
-; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0;
-; PTX-NEXT: mov.b64 %rd1, test_select_param_0;
-; PTX-NEXT: ld.param.b64 %rd2, [test_select_param_2];
-; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
-; PTX-NEXT: mov.b64 %rd4, test_select_param_1;
-; PTX-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1;
-; PTX-NEXT: ld.param.b32 %r1, [%rd5];
-; PTX-NEXT: st.global.b32 [%rd3], %r1;
-; PTX-NEXT: ret;
+; PTX_60-LABEL: test_select(
+; PTX_60: {
+; PTX_60-NEXT: .reg .pred %p<2>;
+; PTX_60-NEXT: .reg .b16 %rs<3>;
+; PTX_60-NEXT: .reg .b32 %r<4>;
+; PTX_60-NEXT: .reg .b64 %rd<3>;
+; PTX_60-EMPTY:
+; PTX_60-NEXT: // %bb.0: // %bb
+; PTX_60-NEXT: ld.param.b8 %rs1, [test_select_param_3];
+; PTX_60-NEXT: and.b16 %rs2, %rs1, 1;
+; PTX_60-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; PTX_60-NEXT: ld.param.b64 %rd1, [test_select_param_2];
+; PTX_60-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX_60-NEXT: ld.param.b32 %r1, [test_select_param_1];
+; PTX_60-NEXT: ld.param.b32 %r2, [test_select_param_0];
+; PTX_60-NEXT: selp.b32 %r3, %r2, %r1, %p1;
+; PTX_60-NEXT: st.global.b32 [%rd2], %r3;
+; PTX_60-NEXT: ret;
+;
+; PTX_70-LABEL: test_select(
+; PTX_70: {
+; PTX_70-NEXT: .reg .pred %p<2>;
+; PTX_70-NEXT: .reg .b16 %rs<3>;
+; PTX_70-NEXT: .reg .b32 %r<2>;
+; PTX_70-NEXT: .reg .b64 %rd<6>;
+; PTX_70-EMPTY:
+; PTX_70-NEXT: // %bb.0: // %bb
+; PTX_70-NEXT: ld.param.b8 %rs1, [test_select_param_3];
+; PTX_70-NEXT: and.b16 %rs2, %rs1, 1;
+; PTX_70-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; PTX_70-NEXT: mov.b64 %rd1, test_select_param_0;
+; PTX_70-NEXT: ld.param.b64 %rd2, [test_select_param_2];
+; PTX_70-NEXT: cvta.to.global.u64 %rd3, %rd2;
+; PTX_70-NEXT: mov.b64 %rd4, test_select_param_1;
+; PTX_70-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1;
+; PTX_70-NEXT: ld.param.b32 %r1, [%rd5];
+; PTX_70-NEXT: st.global.b32 [%rd3], %r1;
+; PTX_70-NEXT: ret;
bb:
%ptrnew = select i1 %cond, ptr %input1, ptr %input2
%valloaded = load i32, ptr %ptrnew, align 4
diff --git a/llvm/test/Transforms/SROA/prefer-integer-partition.ll b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
index bf0d2562a8745..5b639169cc207 100644
--- a/llvm/test/Transforms/SROA/prefer-integer-partition.ll
+++ b/llvm/test/Transforms/SROA/prefer-integer-partition.ll
@@ -62,28 +62,6 @@ _ZNK4pbrt3SOAINS_10RaySamplesEEixEi.exit: ; preds = %0, %6
ret <2 x float> %.sroa.01.0.copyload
}
-define void @test_float_array_only_intrinsics() {
-; CHECK-LABEL: @test_float_array_only_intrinsics(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: ret void
-;
-entry:
- %src = alloca [2 x float], align 4
- %dst = alloca [2 x float], align 4
-
- call void @llvm.lifetime.start.p0(i64 8, ptr %src)
- call void @llvm.lifetime.start.p0(i64 8, ptr %dst)
-
- ; Only intrinsic uses - no scalar loads/stores to establish common type
- call void @llvm.memset.p0.i64(ptr %src, i8 42, i64 8, i1 false)
- call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 8, i1 false)
- call void @llvm.memcpy.p0.p0.i64(ptr %src, ptr %dst, i64 8, i1 false)
-
- call void @llvm.lifetime.end.p0(i64 8, ptr %dst)
- call void @llvm.lifetime.end.p0(i64 8, ptr %src)
- ret void
-}
-
define void @test_mixed_types() {
; CHECK-LABEL: @test_mixed_types(
; CHECK-NEXT: entry:
>From 4db31f89ef3504e5b694dae5838cad3e40c93350 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Mon, 8 Dec 2025 20:52:20 +0000
Subject: [PATCH 32/41] test
---
llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 57 +++++++--------------
1 file changed, 19 insertions(+), 38 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 21257e21bea9f..ca2914a2e8043 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -584,44 +584,25 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
; COPY-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4
; COPY-NEXT: ret void
;
-; PTX_60-LABEL: test_select(
-; PTX_60: {
-; PTX_60-NEXT: .reg .pred %p<2>;
-; PTX_60-NEXT: .reg .b16 %rs<3>;
-; PTX_60-NEXT: .reg .b32 %r<4>;
-; PTX_60-NEXT: .reg .b64 %rd<3>;
-; PTX_60-EMPTY:
-; PTX_60-NEXT: // %bb.0: // %bb
-; PTX_60-NEXT: ld.param.b8 %rs1, [test_select_param_3];
-; PTX_60-NEXT: and.b16 %rs2, %rs1, 1;
-; PTX_60-NEXT: setp.ne.b16 %p1, %rs2, 0;
-; PTX_60-NEXT: ld.param.b64 %rd1, [test_select_param_2];
-; PTX_60-NEXT: cvta.to.global.u64 %rd2, %rd1;
-; PTX_60-NEXT: ld.param.b32 %r1, [test_select_param_1];
-; PTX_60-NEXT: ld.param.b32 %r2, [test_select_param_0];
-; PTX_60-NEXT: selp.b32 %r3, %r2, %r1, %p1;
-; PTX_60-NEXT: st.global.b32 [%rd2], %r3;
-; PTX_60-NEXT: ret;
-;
-; PTX_70-LABEL: test_select(
-; PTX_70: {
-; PTX_70-NEXT: .reg .pred %p<2>;
-; PTX_70-NEXT: .reg .b16 %rs<3>;
-; PTX_70-NEXT: .reg .b32 %r<2>;
-; PTX_70-NEXT: .reg .b64 %rd<6>;
-; PTX_70-EMPTY:
-; PTX_70-NEXT: // %bb.0: // %bb
-; PTX_70-NEXT: ld.param.b8 %rs1, [test_select_param_3];
-; PTX_70-NEXT: and.b16 %rs2, %rs1, 1;
-; PTX_70-NEXT: setp.ne.b16 %p1, %rs2, 0;
-; PTX_70-NEXT: mov.b64 %rd1, test_select_param_0;
-; PTX_70-NEXT: ld.param.b64 %rd2, [test_select_param_2];
-; PTX_70-NEXT: cvta.to.global.u64 %rd3, %rd2;
-; PTX_70-NEXT: mov.b64 %rd4, test_select_param_1;
-; PTX_70-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1;
-; PTX_70-NEXT: ld.param.b32 %r1, [%rd5];
-; PTX_70-NEXT: st.global.b32 [%rd3], %r1;
-; PTX_70-NEXT: ret;
+; PTX-LABEL: test_select(
+; PTX: {
+; PTX-NEXT: .reg .pred %p<2>;
+; PTX-NEXT: .reg .b16 %rs<3>;
+; PTX-NEXT: .reg .b32 %r<2>;
+; PTX-NEXT: .reg .b64 %rd<6>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0: // %bb
+; PTX-NEXT: ld.param.b8 %rs1, [test_select_param_3];
+; PTX-NEXT: and.b16 %rs2, %rs1, 1;
+; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; PTX-NEXT: mov.b64 %rd1, test_select_param_0;
+; PTX-NEXT: ld.param.b64 %rd2, [test_select_param_2];
+; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT: mov.b64 %rd4, test_select_param_1;
+; PTX-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1;
+; PTX-NEXT: ld.param.b32 %r1, [%rd5];
+; PTX-NEXT: st.global.b32 [%rd3], %r1;
+; PTX-NEXT: ret;
bb:
%ptrnew = select i1 %cond, ptr %input1, ptr %input2
%valloaded = load i32, ptr %ptrnew, align 4
>From 78f6d3676b0fb0d6ec35b9bac78d36a3f44e1a12 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 9 Dec 2025 00:27:47 +0000
Subject: [PATCH 33/41] remove only intrinsic users
---
llvm/lib/Transforms/Scalar/SROA.cpp | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 45dc65b9a7288..4a68aba03ece2 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1483,13 +1483,12 @@ LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
/// Walk the range of a partitioning looking for a common type to cover this
/// sequence of slices.
/// Returns: {CommonType, LargestIntegerType, OnlyIntrinsicUsers}
-static std::tuple<Type *, IntegerType *, bool>
+static std::pair<Type *, IntegerType *>
findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
uint64_t EndOffset) {
Type *Ty = nullptr;
bool TyIsCommon = true;
IntegerType *ITy = nullptr;
- bool OnlyIntrinsicUsers = true;
// Note that we need to look at *every* alloca slice's Use to ensure we
// always get consistent results regardless of the order of slices.
@@ -1497,8 +1496,6 @@ findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
Use *U = I->getUse();
if (isa<IntrinsicInst>(*U->getUser()))
continue;
- // We found a non-intrinsic user
- OnlyIntrinsicUsers = false;
if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
continue;
@@ -1532,7 +1529,7 @@ findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
Ty = UserTy;
}
- return {TyIsCommon ? Ty : nullptr, ITy, OnlyIntrinsicUsers};
+ return {TyIsCommon ? Ty : nullptr, ITy};
}
/// PHI instructions that use an alloca and are subsequently loaded can be
@@ -5225,7 +5222,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// Check if there is a common type that all slices of the partition use that
// spans the partition.
- auto [CommonUseTy, LargestIntTy, OnlyIntrinsicUsers] =
+ auto CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset());
findCommonType(P.begin(), P.end(), P.endOffset());
if (CommonUseTy) {
TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy);
>From 4fe2720fb81525ef01cba4cdaf2c1cfb60702d66 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 9 Dec 2025 00:33:20 +0000
Subject: [PATCH 34/41] slight change
---
llvm/lib/Transforms/Scalar/SROA.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 4a68aba03ece2..88c0f22e6fecd 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5217,12 +5217,12 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue());
// If the vector element type is a floating-point type, we prefer vector
// promotion.
- if (VecTy && VecTy->getElementType()->isFloatingPointTy())
+ if (VecTy && VecTy->getElementType()->isFloatingPointTy() && VecTy->getElementCount().getFixedValue() > 1)
return {VecTy, false, VecTy};
// Check if there is a common type that all slices of the partition use that
// spans the partition.
- auto CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset());
+ auto [CommonUseTy, LargestIntTy] = findCommonType(P.begin(), P.end(), P.endOffset());
findCommonType(P.begin(), P.end(), P.endOffset());
if (CommonUseTy) {
TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy);
>From a3de8f81e48ac6cf4755bbac5e673731a4f1c14d Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 9 Dec 2025 00:53:26 +0000
Subject: [PATCH 35/41] singleton vectors
---
llvm/lib/Transforms/Scalar/SROA.cpp | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 88c0f22e6fecd..30cd094a760b0 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5217,8 +5217,12 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue());
// If the vector element type is a floating-point type, we prefer vector
// promotion.
- if (VecTy && VecTy->getElementType()->isFloatingPointTy() && VecTy->getElementCount().getFixedValue() > 1)
+ if (VecTy && VecTy->getElementType()->isFloatingPointTy()) {
+ // If the vector has one element we prefer to promote via the element type.
+ if (VecTy->getElementCount().getFixedValue() == 1)
+ return {VecTy->getElementType(), false, nullptr};
return {VecTy, false, VecTy};
+ }
// Check if there is a common type that all slices of the partition use that
// spans the partition.
>From 20cf8c5916e4d6fcc934b15393e4341f0e7daf22 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 9 Dec 2025 00:54:00 +0000
Subject: [PATCH 36/41] format
---
llvm/lib/Transforms/Scalar/SROA.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 30cd094a760b0..90bd4d982fd62 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5218,7 +5218,8 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// If the vector element type is a floating-point type, we prefer vector
// promotion.
if (VecTy && VecTy->getElementType()->isFloatingPointTy()) {
- // If the vector has one element we prefer to promote via the element type.
+ // If the vector has one element we prefer to promote via the element
+ // type.
if (VecTy->getElementCount().getFixedValue() == 1)
return {VecTy->getElementType(), false, nullptr};
return {VecTy, false, VecTy};
@@ -5226,7 +5227,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// Check if there is a common type that all slices of the partition use that
// spans the partition.
- auto [CommonUseTy, LargestIntTy] = findCommonType(P.begin(), P.end(), P.endOffset());
+ auto [CommonUseTy, LargestIntTy] =
findCommonType(P.begin(), P.end(), P.endOffset());
if (CommonUseTy) {
TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy);
>From 09bef910325448c7f9483252ac7c50022b665d02 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Tue, 9 Dec 2025 01:39:34 +0000
Subject: [PATCH 37/41] arm
---
.../AArch64/neon-scalar-x-indexed-elem.c | 40 +++++++++++--------
1 file changed, 24 insertions(+), 16 deletions(-)
diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
index a86a80a939b16..e6cd282131ca9 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
@@ -56,8 +56,8 @@ float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to double
// CHECK-NEXT: [[TMP1:%.*]] = fmul double [[TMP0]], [[B]]
-// CHECK-NEXT: [[REF_TMP_I_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double [[TMP1]], i32 0
-// CHECK-NEXT: ret <1 x double> [[REF_TMP_I_0_VEC_INSERT]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP2]]
//
float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
return vmul_n_f64(a, b);
@@ -210,7 +210,9 @@ float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
// CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CHECK-NEXT: ret <1 x double> [[FMLA2]]
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x double> [[FMLA2]] to double
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast double [[TMP7]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP8]]
//
float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
return vfma_lane_f64(a, b, v, 0);
@@ -234,7 +236,9 @@ float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
// CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CHECK-NEXT: ret <1 x double> [[FMLA2]]
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x double> [[FMLA2]] to double
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast double [[TMP7]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP8]]
//
float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
return vfms_lane_f64(a, b, v, 0);
@@ -257,7 +261,9 @@ float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
// CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
// CHECK-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP10]]
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x double> [[TMP10]] to double
+// CHECK-NEXT: [[TMP12:%.*]] = bitcast double [[TMP11]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP12]]
//
float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
return vfma_laneq_f64(a, b, v, 0);
@@ -281,7 +287,9 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
// CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
// CHECK-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP10]]
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x double> [[TMP10]] to double
+// CHECK-NEXT: [[TMP12:%.*]] = bitcast double [[TMP11]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP12]]
//
float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
return vfms_laneq_f64(a, b, v, 0);
@@ -552,12 +560,12 @@ int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_lane_f64_0(
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__PROMOTE_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FD6304BC43AB5C2, i32 0
-// CHECK-NEXT: [[__PROMOTE2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FEE211E215AEEF3, i32 0
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], i32 0
-// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <1 x double> [[__PROMOTE2_SROA_0_0_VEC_INSERT]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast double 0x3FD6304BC43AB5C2 to <1 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast double 0x3FEE211E215AEEF3 to <1 x double>
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
+// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE9]])
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], double [[VMULXD_F64_I]], i32 0
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
//
float64x1_t test_vmulx_lane_f64_0() {
@@ -574,13 +582,13 @@ float64x1_t test_vmulx_lane_f64_0() {
// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_laneq_f64_2(
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__PROMOTE_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FD6304BC43AB5C2, i32 0
-// CHECK-NEXT: [[__PROMOTE2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x double> undef, double 0x3FEE211E215AEEF3, i32 0
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], <1 x double> [[__PROMOTE2_SROA_0_0_VEC_INSERT]], <2 x i32> <i32 0, i32 1>
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast double 0x3FD6304BC43AB5C2 to <1 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast double 0x3FEE211E215AEEF3 to <1 x double>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[SHUFFLE_I]], i32 1
// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[__PROMOTE_SROA_0_0_VEC_INSERT]], double [[VMULXD_F64_I]], i32 0
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
//
float64x1_t test_vmulx_laneq_f64_2() {
>From 51b68525328498d0a660dc6c95998e95ce07b928 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 10 Dec 2025 00:39:53 +0000
Subject: [PATCH 38/41] singleton
---
clang/test/CodeGen/arm-bf16-convert-intrinsics.c | 1 -
llvm/lib/Transforms/Scalar/SROA.cpp | 10 +++-------
2 files changed, 3 insertions(+), 8 deletions(-)
diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
index b7f961e4ce15c..8a1ef2441b39d 100644
--- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
@@ -314,4 +314,3 @@ bfloat16_t test_vcvth_bf16_f32(float32_t a) {
float32_t test_vcvtah_f32_bf16(bfloat16_t a) {
return vcvtah_f32_bf16(a);
}
-
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 90bd4d982fd62..0a2ab33275e8a 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5216,14 +5216,10 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
VectorType *VecTy =
isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue());
// If the vector element type is a floating-point type, we prefer vector
- // promotion.
- if (VecTy && VecTy->getElementType()->isFloatingPointTy()) {
- // If the vector has one element we prefer to promote via the element
- // type.
- if (VecTy->getElementCount().getFixedValue() == 1)
- return {VecTy->getElementType(), false, nullptr};
+ // promotion. If the vector has one element, let the below code select whether we promote
+ // with the vector or scalar.
+ if (VecTy && VecTy->getElementType()->isFloatingPointTy() && VecTy->getElementCount().getFixedValue() > 1)
return {VecTy, false, VecTy};
- }
// Check if there is a common type that all slices of the partition use that
// spans the partition.
>From 6481b285b18cab7247a05738f93c9d4af7065800 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 10 Dec 2025 00:40:16 +0000
Subject: [PATCH 39/41] format
---
llvm/lib/Transforms/Scalar/SROA.cpp | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 0a2ab33275e8a..ef7b498789a44 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5216,9 +5216,10 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
VectorType *VecTy =
isVectorPromotionViable(P, DL, AI.getFunction()->getVScaleValue());
// If the vector element type is a floating-point type, we prefer vector
- // promotion. If the vector has one element, let the below code select whether we promote
- // with the vector or scalar.
- if (VecTy && VecTy->getElementType()->isFloatingPointTy() && VecTy->getElementCount().getFixedValue() > 1)
+ // promotion. If the vector has one element, let the below code select
+ // whether we promote with the vector or scalar.
+ if (VecTy && VecTy->getElementType()->isFloatingPointTy() &&
+ VecTy->getElementCount().getFixedValue() > 1)
return {VecTy, false, VecTy};
// Check if there is a common type that all slices of the partition use that
>From 2d4b683cd651aaa821107692955146af66adb3e4 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 10 Dec 2025 00:43:17 +0000
Subject: [PATCH 40/41] test
---
.../AArch64/neon-scalar-x-indexed-elem.c | 24 +++++++------------
1 file changed, 8 insertions(+), 16 deletions(-)
diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
index e6cd282131ca9..9b98126500444 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
@@ -210,9 +210,7 @@ float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
// CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x double> [[FMLA2]] to double
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast double [[TMP7]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP8]]
+// CHECK-NEXT: ret <1 x double> [[FMLA2]]
//
float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
return vfma_lane_f64(a, b, v, 0);
@@ -236,9 +234,7 @@ float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
// CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x double> [[FMLA2]] to double
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast double [[TMP7]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP8]]
+// CHECK-NEXT: ret <1 x double> [[FMLA2]]
//
float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
return vfms_lane_f64(a, b, v, 0);
@@ -261,9 +257,7 @@ float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
// CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
// CHECK-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x double> [[TMP10]] to double
-// CHECK-NEXT: [[TMP12:%.*]] = bitcast double [[TMP11]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP12]]
+// CHECK-NEXT: ret <1 x double> [[TMP10]]
//
float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
return vfma_laneq_f64(a, b, v, 0);
@@ -287,9 +281,7 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
// CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
// CHECK-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x double> [[TMP10]] to double
-// CHECK-NEXT: [[TMP12:%.*]] = bitcast double [[TMP11]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP12]]
+// CHECK-NEXT: ret <1 x double> [[TMP10]]
//
float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
return vfms_laneq_f64(a, b, v, 0);
@@ -560,8 +552,8 @@ int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_lane_f64_0(
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast double 0x3FD6304BC43AB5C2 to <1 x double>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast double 0x3FEE211E215AEEF3 to <1 x double>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE9]])
@@ -582,8 +574,8 @@ float64x1_t test_vmulx_lane_f64_0() {
// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_laneq_f64_2(
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast double 0x3FD6304BC43AB5C2 to <1 x double>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast double 0x3FEE211E215AEEF3 to <1 x double>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[SHUFFLE_I]], i32 1
>From b0db4563c263003af59c1d47c882111d92c8e346 Mon Sep 17 00:00:00 2001
From: Yonah Goldberg <ygoldberg at nvidia.com>
Date: Wed, 10 Dec 2025 00:46:35 +0000
Subject: [PATCH 41/41] basictest
---
llvm/test/Transforms/SROA/basictest.ll | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll
index b16940f6ffdb2..15803f7b5a25b 100644
--- a/llvm/test/Transforms/SROA/basictest.ll
+++ b/llvm/test/Transforms/SROA/basictest.ll
@@ -1332,10 +1332,10 @@ define void @PR15674(ptr %data, ptr %src, i32 %size) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP_SROA_0:%.*]] = alloca i32, align 4
; CHECK-NEXT: switch i32 [[SIZE:%.*]], label [[END:%.*]] [
-; CHECK-NEXT: i32 4, label [[BB4:%.*]]
-; CHECK-NEXT: i32 3, label [[BB3:%.*]]
-; CHECK-NEXT: i32 2, label [[BB2:%.*]]
-; CHECK-NEXT: i32 1, label [[BB1:%.*]]
+; CHECK-NEXT: i32 4, label [[BB4:%.*]]
+; CHECK-NEXT: i32 3, label [[BB3:%.*]]
+; CHECK-NEXT: i32 2, label [[BB2:%.*]]
+; CHECK-NEXT: i32 1, label [[BB1:%.*]]
; CHECK-NEXT: ]
; CHECK: bb4:
; CHECK-NEXT: [[SRC_GEP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i32 3
More information about the llvm-commits
mailing list