[clang] [AArch64][SME] Remove immediate argument restriction for svldr and svstr (PR #68565)
Sam Tebbs via cfe-commits
cfe-commits at lists.llvm.org
Mon Oct 9 01:49:16 PDT 2023
https://github.com/SamTebbs33 created https://github.com/llvm/llvm-project/pull/68565
The svldr_vnum_za and svstr_vnum_za builtins/intrinsics currently require that the vnum argument be an immediate, since the instructions take an immediate vector number. However, we emit 0 as the immediate for the instruction no matter what, and instead modify the base register.
This patch removes that restriction on the argument, so that the argument can be a non-immediate. If an appropriate immediate was passed to the builtin then CGBuiltin passes that directly to the LLVM intrinsic, otherwise it modifies the base register as is existing behaviour.
>From f57f952989ee64d419dc51e9ecf9786720ece3ff Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Fri, 6 Oct 2023 17:09:36 +0100
Subject: [PATCH] [AArch64][SME] Remove immediate argument restriction for
svldr and svstr
The svldr_vnum_za and svstr_vnum_za builtins/intrinsics currently
require that the vnum argument be an immediate, since the instructions
take an immediate vector number. However, we emit 0 as the immediate
for the instruction no matter what, and instead modify the base register.
This patch removes that restriction on the argument, so that the
argument can be a non-immediate. If an appropriate immediate was
passed to the builtin then CGBuiltin passes that directly to the LLVM
intrinsic, otherwise it modifies the base register as is existing
behaviour.
---
clang/include/clang/Basic/arm_sme.td | 5 +-
clang/lib/CodeGen/CGBuiltin.cpp | 42 +++++++++----
.../aarch64-sme-intrinsics/acle_sme_ldr.c | 26 +++++---
.../aarch64-sme-intrinsics/acle_sme_str.c | 19 +++---
.../aarch64-sme-intrinsics/acle_sme_imm.cpp | 8 ---
llvm/include/llvm/IR/IntrinsicsAArch64.td | 4 +-
llvm/lib/Target/AArch64/SMEInstrFormats.td | 10 ++--
.../CostModel/ARM/unaligned_double_load.ll | 59 +++++++++++++++++++
.../CodeGen/AArch64/sme-intrinsics-loads.ll | 33 ++++++++---
9 files changed, 150 insertions(+), 56 deletions(-)
create mode 100644 llvm/test/Analysis/CostModel/ARM/unaligned_double_load.ll
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index d014900d719c338..49ef6b6b3fc4359 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -44,10 +44,9 @@ defm SVLD1_ZA32 : ZALoad<"za32", "i", "aarch64_sme_ld1w", [ImmCheck<0, ImmCheck0
defm SVLD1_ZA64 : ZALoad<"za64", "l", "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0_7>]>;
defm SVLD1_ZA128 : ZALoad<"za128", "q", "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>]>;
-def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQi", "",
+def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQn", "",
[IsOverloadNone, IsStreamingCompatible, IsSharedZA],
- MemEltTyDefault, "aarch64_sme_ldr",
- [ImmCheck<2, ImmCheck0_15>]>;
+ MemEltTyDefault, "aarch64_sme_ldr">;
def SVLDR_ZA : MInst<"svldr_za", "vmQ", "",
[IsOverloadNone, IsStreamingCompatible, IsSharedZA],
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d14cf0dccb09982..ca4bf498cab9535 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9606,7 +9606,7 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
}
Value *CodeGenFunction::EmitTileslice(Value *Offset, Value *Base) {
- llvm::Value *CastOffset = Builder.CreateIntCast(Offset, Int32Ty, false);
+ llvm::Value *CastOffset = Builder.CreateIntCast(Offset, Int64Ty, false);
return Builder.CreateAdd(Base, CastOffset, "tileslice");
}
@@ -9665,18 +9665,34 @@ Value *CodeGenFunction::EmitSMEZero(const SVETypeFlags &TypeFlags,
Value *CodeGenFunction::EmitSMELdrStr(const SVETypeFlags &TypeFlags,
SmallVectorImpl<Value *> &Ops,
unsigned IntID) {
- if (Ops.size() == 3) {
- Function *Cntsb = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb);
- llvm::Value *CntsbCall = Builder.CreateCall(Cntsb, {}, "svlb");
- llvm::Value *MulVL = Builder.CreateMul(
- CntsbCall,
- Builder.getInt64(cast<llvm::ConstantInt>(Ops[2])->getZExtValue()),
- "mulvl");
-
- Ops[1] = Builder.CreateGEP(Int8Ty, Ops[1], MulVL);
- Ops[0] = EmitTileslice(Ops[0], Ops[2]);
- Ops.erase(&Ops[2]);
- }
+ if (Ops.size() == 2) {
+ // Intrinsics without a vecnum also use this function, so just provide 0
+ Ops.push_back(Ops[1]);
+ Ops[1] = Builder.getInt32(0);
+ } else {
+ int Imm = -1;
+ if (ConstantInt* C = dyn_cast<ConstantInt>(Ops[2]))
+ if (C->getZExtValue() <= 15)
+ Imm = C->getZExtValue();
+
+ if (Imm != -1) {
+ Ops[2] = Ops[1];
+ Ops[1] = Builder.getInt32(Imm);
+ } else {
+ Function *Cntsb = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb);
+ llvm::Value *CntsbCall = Builder.CreateCall(Cntsb, {}, "svlb");
+
+ llvm::Value *VecNum = Ops[2];
+ llvm::Value *MulVL = Builder.CreateMul(
+ CntsbCall,
+ VecNum,
+ "mulvl");
+
+ Ops[2] = Builder.CreateGEP(Int8Ty, Ops[1], MulVL);
+ Ops[1] = Builder.getInt32(0);
+ Ops[0] = Builder.CreateIntCast(EmitTileslice(Ops[0], VecNum), Int32Ty, false);
+ }
+ }
Function *F = CGM.getIntrinsic(IntID, {});
return Builder.CreateCall(F, Ops);
}
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
index acddc2ef50a3ddf..df7ff4ca995b544 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
@@ -8,7 +8,7 @@
// CHECK-C-LABEL: @test_svldr_vnum_za(
// CHECK-CXX-LABEL: @_Z18test_svldr_vnum_zajPKv(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], i32 0, ptr [[PTR]])
// CHECK-NEXT: ret void
//
void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) {
@@ -18,22 +18,34 @@ void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) {
// CHECK-C-LABEL: @test_svldr_vnum_za_1(
// CHECK-CXX-LABEL: @_Z20test_svldr_vnum_za_1jPKv(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]]
-// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 15
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], i32 15, ptr [[PTR]])
// CHECK-NEXT: ret void
//
void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) {
svldr_vnum_za(slice_base, ptr, 15);
}
+// CHECK-C-LABEL: @test_svldr_vnum_za_var(
+// CHECK-CXX-LABEL: @_Z22test_svldr_vnum_za_varjPKvm(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[SLICE_BASE]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TMP2]], i32 0, ptr [[TMP0]])
+// CHECK-NEXT: ret void
+//
+void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, uint64_t vnum) {
+ svldr_vnum_za(slice_base, ptr, vnum);
+}
+
// CHECK-C-LABEL: @test_svldr_za(
// CHECK-CXX-LABEL: @_Z13test_svldr_zajPKv(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], i32 0, ptr [[PTR]])
// CHECK-NEXT: ret void
+//
void test_svldr_za(uint32_t slice_base, const void *ptr) {
svldr_za(slice_base, ptr);
}
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
index 2728f9ac0cd12d3..f384bd76899b0fd 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
@@ -8,31 +8,28 @@
// CHECK-C-LABEL: @test_svstr_vnum_za(
// CHECK-CXX-LABEL: @_Z18test_svstr_vnum_zajPv(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], i32 0, ptr [[PTR]])
// CHECK-NEXT: ret void
//
void test_svstr_vnum_za(uint32_t slice_base, void *ptr) {
svstr_vnum_za(slice_base, ptr, 0);
}
-// CHECK-C-LABEL: @test_svstr_vnum_za_1(
-// CHECK-CXX-LABEL: @_Z20test_svstr_vnum_za_1jPv(
+// CHECK-C-LABEL: define dso_local void @test_svstr_vnum_za_1(
+// CHECK-CXX-LABEL: define dso_local void @_Z20test_svstr_vnum_za_1jPv(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]]
-// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 15
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], i32 15, ptr [[PTR]])
// CHECK-NEXT: ret void
//
void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) {
svstr_vnum_za(slice_base, ptr, 15);
}
-// CHECK-C-LABEL: @test_svstr_za(
-// CHECK-CXX-LABEL: @_Z13test_svstr_zajPv(
+// CHECK-C-LABEL: define dso_local void @test_svstr_za(
+// CHECK-CXX-LABEL: define dso_local void @_Z13test_svstr_zajPv(
+// CHECK-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], i32 0, ptr [[PTR]])
// CHECK-NEXT: ret void
//
void test_svstr_za(uint32_t slice_base, void *ptr) {
diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
index 7475fd53b80ba2b..1faa5638c801c2d 100644
--- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
+++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
@@ -143,11 +143,6 @@ void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) {
// expected-error at +1 {{argument value 16 is outside the valid range [0, 15]}}
SVE_ACLE_FUNC(svst1_ver_vnum_za128,,,)(16, slice, pg, ptr, 1);
- // expected-error at +1 {{argument value 16 is outside the valid range [0, 15]}}
- SVE_ACLE_FUNC(svldr_vnum_za,,,)(-1, ptr, 16);
- // expected-error at +1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
- SVE_ACLE_FUNC(svstr_vnum_za,,,)(-1, ptr, -1);
-
// expected-error at +1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
SVE_ACLE_FUNC(svread_hor_za128, _s8, _m,)(svundef_s8(), pg, -1, slice);
// expected-error at +1 {{argument value 16 is outside the valid range [0, 15]}}
@@ -171,9 +166,6 @@ void test_constant(uint64_t u64, svbool_t pg, void *ptr) {
SVE_ACLE_FUNC(svld1_hor_vnum_za8,,,)(u64, 0, pg, ptr, u64); // expected-error {{argument to 'svld1_hor_vnum_za8' must be a constant integer}}
SVE_ACLE_FUNC(svst1_hor_vnum_za32,,,)(u64, 0, pg, ptr, u64); // expected-error {{argument to 'svst1_hor_vnum_za32' must be a constant integer}}
- SVE_ACLE_FUNC(svldr_vnum_za,,,)(u64, ptr, u64); // expected-error {{argument to 'svldr_vnum_za' must be a constant integer}}
- SVE_ACLE_FUNC(svstr_vnum_za,,,)(u64, ptr, u64); // expected-error {{argument to 'svstr_vnum_za' must be a constant integer}}
-
SVE_ACLE_FUNC(svread_ver_za16, _s16, _m,)(svundef_s16(), pg, u64, 0); // expected-error-re {{argument to 'svread_ver_za16{{.*}}_m' must be a constant integer}}
SVE_ACLE_FUNC(svwrite_ver_za64, _s64, _m,)(u64, 0, pg, svundef_s64()); // expected-error-re {{argument to 'svwrite_ver_za64{{.*}}_m' must be a constant integer}}
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 557063c8813268e..26827cf6110d497 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2680,9 +2680,9 @@ let TargetPrefix = "aarch64" in {
// Spill + fill
def int_aarch64_sme_ldr : DefaultAttrsIntrinsic<
- [], [llvm_i32_ty, llvm_ptr_ty]>;
+ [], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<ArgIndex<1>>]>;
def int_aarch64_sme_str : DefaultAttrsIntrinsic<
- [], [llvm_i32_ty, llvm_ptr_ty]>;
+ [], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<ArgIndex<1>>]>;
class SME_TileToVector_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index edd24b4a849b547..5b5b6a31705df33 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -794,8 +794,8 @@ multiclass sme_spill<string opcodestr> {
(!cast<Instruction>(NAME) MatrixOp:$ZAt,
MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>;
// base
- def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base),
- (!cast<Instruction>(NAME) ZA, $idx, 0, $base, 0)>;
+ def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm, GPR64sp:$base),
+ (!cast<Instruction>(NAME) ZA, $idx, $imm, $base, 0)>;
}
multiclass sme_fill<string opcodestr> {
@@ -805,7 +805,7 @@ multiclass sme_fill<string opcodestr> {
MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>;
def NAME # _PSEUDO
: Pseudo<(outs),
- (ins MatrixIndexGPR32Op12_15:$idx, imm0_15:$imm4,
+ (ins MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm4,
GPR64sp:$base), []>,
Sched<[]> {
// Translated to actual instruction in AArch64ISelLowering.cpp
@@ -813,8 +813,8 @@ multiclass sme_fill<string opcodestr> {
let mayLoad = 1;
}
// base
- def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base),
- (!cast<Instruction>(NAME # _PSEUDO) $idx, 0, $base)>;
+ def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm, GPR64sp:$base),
+ (!cast<Instruction>(NAME # _PSEUDO) $idx, $imm, $base)>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/Analysis/CostModel/ARM/unaligned_double_load.ll b/llvm/test/Analysis/CostModel/ARM/unaligned_double_load.ll
new file mode 100644
index 000000000000000..8d457220ea9c5ae
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/unaligned_double_load.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv6m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-NOVEC
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-FP
+
+define float @f(ptr %x) {
+; CHECK-NOVEC-LABEL: 'f'
+; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a.0.copyload = load float, ptr %x, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float %a.0.copyload
+;
+; CHECK-FP-LABEL: 'f'
+; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a.0.copyload = load float, ptr %x, align 1
+; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float %a.0.copyload
+;
+entry:
+ %a.0.copyload = load float, ptr %x, align 1
+ ret float %a.0.copyload
+}
+
+define float @ff(ptr %x, float %f) {
+; CHECK-NOVEC-LABEL: 'ff'
+; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float %f, ptr %x, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float undef
+;
+; CHECK-FP-LABEL: 'ff'
+; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store float %f, ptr %x, align 1
+; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float undef
+;
+entry:
+ store float %f, ptr %x, align 1
+ ret float undef
+}
+
+define double @d(ptr %x) {
+; CHECK-NOVEC-LABEL: 'd'
+; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a.0.copyload = load double, ptr %x, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double %a.0.copyload
+;
+; CHECK-FP-LABEL: 'd'
+; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a.0.copyload = load double, ptr %x, align 1
+; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double %a.0.copyload
+;
+entry:
+ %a.0.copyload = load double, ptr %x, align 1
+ ret double %a.0.copyload
+}
+
+define double @dd(ptr %x, double %f) {
+; CHECK-NOVEC-LABEL: 'dd'
+; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store double %f, ptr %x, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double undef
+;
+; CHECK-FP-LABEL: 'dd'
+; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store double %f, ptr %x, align 1
+; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double undef
+;
+entry:
+ store double %f, ptr %x, align 1
+ ret double undef
+}
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
index c96aca366ed43f2..f5d25a3229a7f82 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
@@ -252,10 +252,28 @@ define void @ldr(ptr %ptr) {
; CHECK-NEXT: mov w12, wzr
; CHECK-NEXT: ldr za[w12, 0], [x0]
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr)
+ call void @llvm.aarch64.sme.ldr(i32 0, i32 0, ptr %ptr)
ret void;
}
+define void @ldr_vnum(i32 %tile_slice, ptr %ptr, i64 %vnum) {
+; CHECK-LABEL: ldr_vnum:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: add w12, w2, w0
+; CHECK-NEXT: madd x8, x8, x2, x1
+; CHECK-NEXT: ldr za[w12, 0], [x8]
+; CHECK-NEXT: ret
+entry:
+ %svlb = tail call i64 @llvm.aarch64.sme.cntsb()
+ %mulvl = mul i64 %svlb, %vnum
+ %0 = getelementptr i8, ptr %ptr, i64 %mulvl
+ %1 = trunc i64 %vnum to i32
+ %2 = add i32 %1, %tile_slice
+ tail call void @llvm.aarch64.sme.ldr(i32 %2, i32 0, ptr %0)
+ ret void
+}
+
define void @ldr_with_off_15(ptr %ptr) {
; CHECK-LABEL: ldr_with_off_15:
; CHECK: // %bb.0:
@@ -264,7 +282,7 @@ define void @ldr_with_off_15(ptr %ptr) {
; CHECK-NEXT: ldr za[w12, 0], [x8]
; CHECK-NEXT: ret
%base = getelementptr i8, ptr %ptr, i64 15
- call void @llvm.aarch64.sme.ldr(i32 15, ptr %base)
+ call void @llvm.aarch64.sme.ldr(i32 15, i32 0, ptr %base)
ret void;
}
@@ -278,7 +296,7 @@ define void @ldr_with_off_15mulvl(ptr %ptr) {
%vscale = call i64 @llvm.vscale.i64()
%mulvl = mul i64 %vscale, 240
%base = getelementptr i8, ptr %ptr, i64 %mulvl
- call void @llvm.aarch64.sme.ldr(i32 15, ptr %base)
+ call void @llvm.aarch64.sme.ldr(i32 15, i32 0, ptr %base)
ret void;
}
@@ -292,7 +310,7 @@ define void @ldr_with_off_16mulvl(ptr %ptr) {
%vscale = call i64 @llvm.vscale.i64()
%mulvl = mul i64 %vscale, 256
%base = getelementptr i8, ptr %ptr, i64 %mulvl
- call void @llvm.aarch64.sme.ldr(i32 16, ptr %base)
+ call void @llvm.aarch64.sme.ldr(i32 16, i32 0, ptr %base)
ret void;
}
@@ -302,13 +320,13 @@ define void @test_ld1_sink_tile0_offset_operand(<vscale x 4 x i1> %pg, ptr %src,
; CHECK-LABEL: test_ld1_sink_tile0_offset_operand:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov w12, w1
-; CHECK-NEXT: .LBB14_1: // %for.body
+; CHECK-NEXT: .LBB15_1: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1w {za0h.s[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: subs w2, w2, #1
; CHECK-NEXT: ld1w {za0h.s[w12, 1]}, p0/z, [x0]
; CHECK-NEXT: ld1w {za0h.s[w12, 2]}, p0/z, [x0]
-; CHECK-NEXT: b.ne .LBB14_1
+; CHECK-NEXT: b.ne .LBB15_1
; CHECK-NEXT: // %bb.2: // %exit
; CHECK-NEXT: ret
entry:
@@ -341,5 +359,6 @@ declare void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1>, ptr, i32, i32)
-declare void @llvm.aarch64.sme.ldr(i32, ptr)
+declare void @llvm.aarch64.sme.ldr(i32, i32, ptr)
declare i64 @llvm.vscale.i64()
+declare i64 @llvm.aarch64.sme.cntsb()
More information about the cfe-commits
mailing list