[llvm] [AArch64] Improve operand sinking for mul instructions (PR #116604)
Hari Limaye via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 5 08:44:40 PST 2024
https://github.com/hazzlim updated https://github.com/llvm/llvm-project/pull/116604
>From f3a58f2c449aa12ecbaa2f46a50010d3573e2d36 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Fri, 15 Nov 2024 12:49:19 +0000
Subject: [PATCH 1/4] [AArch64] Add tests for sinking exts into muls (NFC)
---
llvm/test/CodeGen/AArch64/sink-mul-exts.ll | 248 +++++++++++++++++++++
1 file changed, 248 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/sink-mul-exts.ll
diff --git a/llvm/test/CodeGen/AArch64/sink-mul-exts.ll b/llvm/test/CodeGen/AArch64/sink-mul-exts.ll
new file mode 100644
index 00000000000000..f5ac7cc3102220
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sink-mul-exts.ll
@@ -0,0 +1,248 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+
+define <8 x i16> @mul_splat_sext_v8i16(ptr %x, ptr %y) {
+; CHECK-LABEL: mul_splat_sext_v8i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: dup v1.8b, v1.b[3]
+; CHECK-NEXT: .LBB0_1: // %l1
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d2, [x1, x8]
+; CHECK-NEXT: add x8, x8, #4
+; CHECK-NEXT: cmp w8, #4
+; CHECK-NEXT: smlal v0.8h, v2.8b, v1.8b
+; CHECK-NEXT: b.eq .LBB0_1
+; CHECK-NEXT: // %bb.2: // %l2
+; CHECK-NEXT: ret
+entry:
+ %x.val = load <8 x i8>, ptr %x
+ %x.ext = sext <8 x i8> %x.val to <8 x i16>
+ %a = shufflevector <8 x i16> %x.ext, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ br label %l1
+
+l1:
+ %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
+ %q = phi <8 x i16> [ zeroinitializer, %entry ], [ %c, %l1 ]
+ %y.idx = mul nuw nsw i32 %p, 4
+ %y.ptr = getelementptr i8, ptr %y, i32 %y.idx
+ %y.val = load <8 x i8>, ptr %y.ptr
+ %y.ext = sext <8 x i8> %y.val to <8 x i16>
+ %b = mul <8 x i16> %y.ext, %a
+ %c = add <8 x i16> %q, %b
+ %pa = add i32 %p, 1
+ %c1 = icmp eq i32 %p, 0
+ br i1 %c1, label %l1, label %l2
+
+l2:
+ ret <8 x i16> %c
+}
+
+define <4 x i32> @mul_splat_sext_v4i32(ptr %x, ptr %y) {
+; CHECK-LABEL: mul_splat_sext_v4i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: .LBB1_1: // %l1
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d2, [x1, x8]
+; CHECK-NEXT: add x8, x8, #8
+; CHECK-NEXT: cmp w8, #8
+; CHECK-NEXT: smlal v0.4s, v2.4h, v1.h[3]
+; CHECK-NEXT: b.eq .LBB1_1
+; CHECK-NEXT: // %bb.2: // %l2
+; CHECK-NEXT: ret
+entry:
+ %x.val = load <4 x i16>, ptr %x
+ %x.ext = sext <4 x i16> %x.val to <4 x i32>
+ %a = shufflevector <4 x i32> %x.ext, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ br label %l1
+
+l1:
+ %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
+ %q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ]
+ %y.idx = mul nuw nsw i32 %p, 4
+ %y.ptr = getelementptr i16, ptr %y, i32 %y.idx
+ %y.val = load <4 x i16>, ptr %y.ptr
+ %y.ext = sext <4 x i16> %y.val to <4 x i32>
+ %b = mul <4 x i32> %y.ext, %a
+ %c = add <4 x i32> %q, %b
+ %pa = add i32 %p, 1
+ %c1 = icmp eq i32 %p, 0
+ br i1 %c1, label %l1, label %l2
+
+l2:
+ ret <4 x i32> %c
+}
+
+define <2 x i64> @mul_splat_sext_v2i64(ptr %x, ptr %y) {
+; CHECK-LABEL: mul_splat_sext_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: .LBB2_1: // %l1
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d2, [x1, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp w8, #16
+; CHECK-NEXT: smlal v0.2d, v2.2s, v1.s[1]
+; CHECK-NEXT: b.eq .LBB2_1
+; CHECK-NEXT: // %bb.2: // %l2
+; CHECK-NEXT: ret
+entry:
+ %x.val = load <2 x i32>, ptr %x
+ %x.ext = sext <2 x i32> %x.val to <2 x i64>
+ %a = shufflevector <2 x i64> %x.ext, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+ br label %l1
+
+l1:
+ %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
+ %q = phi <2 x i64> [ zeroinitializer, %entry ], [ %c, %l1 ]
+ %y.idx = mul nuw nsw i32 %p, 4
+ %y.ptr = getelementptr i32, ptr %y, i32 %y.idx
+ %y.val = load <2 x i32>, ptr %y.ptr
+ %y.ext = sext <2 x i32> %y.val to <2 x i64>
+ %b = mul <2 x i64> %y.ext, %a
+ %c = add <2 x i64> %q, %b
+ %pa = add i32 %p, 1
+ %c1 = icmp eq i32 %p, 0
+ br i1 %c1, label %l1, label %l2
+
+l2:
+ ret <2 x i64> %c
+}
+
+define <8 x i16> @mul_sext_splat_v8i16(ptr %x, ptr %y) {
+; CHECK-LABEL: mul_sext_splat_v8i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: dup v1.8b, v0.b[3]
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: .LBB3_1: // %l1
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d2, [x1, x8]
+; CHECK-NEXT: add x8, x8, #4
+; CHECK-NEXT: cmp w8, #4
+; CHECK-NEXT: sshll v2.8h, v2.8b, #0
+; CHECK-NEXT: mla v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: b.eq .LBB3_1
+; CHECK-NEXT: // %bb.2: // %l2
+; CHECK-NEXT: ret
+entry:
+ %x.val = load <8 x i8>, ptr %x
+ %x.spt = shufflevector <8 x i8> %x.val, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %a = sext <8 x i8> %x.spt to <8 x i16>
+ br label %l1
+
+l1:
+ %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
+ %q = phi <8 x i16> [ zeroinitializer, %entry ], [ %c, %l1 ]
+ %y.idx = mul nuw nsw i32 %p, 4
+ %y.ptr = getelementptr i8, ptr %y, i32 %y.idx
+ %y.val = load <8 x i8>, ptr %y.ptr
+ %y.ext = sext <8 x i8> %y.val to <8 x i16>
+ %b = mul <8 x i16> %y.ext, %a
+ %c = add <8 x i16> %q, %b
+ %pa = add i32 %p, 1
+ %c1 = icmp eq i32 %p, 0
+ br i1 %c1, label %l1, label %l2
+
+l2:
+ ret <8 x i16> %c
+}
+
+define <4 x i32> @mul_sext_splat_v4i32(ptr %x, ptr %y) {
+; CHECK-LABEL: mul_sext_splat_v4i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: dup v1.4h, v0.h[3]
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-NEXT: .LBB4_1: // %l1
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d2, [x1, x8]
+; CHECK-NEXT: add x8, x8, #8
+; CHECK-NEXT: cmp w8, #8
+; CHECK-NEXT: sshll v2.4s, v2.4h, #0
+; CHECK-NEXT: mla v0.4s, v2.4s, v1.4s
+; CHECK-NEXT: b.eq .LBB4_1
+; CHECK-NEXT: // %bb.2: // %l2
+; CHECK-NEXT: ret
+entry:
+ %x.val = load <4 x i16>, ptr %x
+ %x.spt = shufflevector <4 x i16> %x.val, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %a = sext <4 x i16> %x.spt to <4 x i32>
+ br label %l1
+
+l1:
+ %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
+ %q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ]
+ %y.idx = mul nuw nsw i32 %p, 4
+ %y.ptr = getelementptr i16, ptr %y, i32 %y.idx
+ %y.val = load <4 x i16>, ptr %y.ptr
+ %y.ext = sext <4 x i16> %y.val to <4 x i32>
+ %b = mul <4 x i32> %y.ext, %a
+ %c = add <4 x i32> %q, %b
+ %pa = add i32 %p, 1
+ %c1 = icmp eq i32 %p, 0
+ br i1 %c1, label %l1, label %l2
+
+l2:
+ ret <4 x i32> %c
+}
+
+define <2 x i64> @mul_sext_splat_v2i64(ptr %x, ptr %y) {
+; CHECK-LABEL: mul_sext_splat_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: dup v0.2s, v0.s[1]
+; CHECK-NEXT: sshll v1.2d, v0.2s, #0
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: mov x9, v1.d[1]
+; CHECK-NEXT: fmov x10, d1
+; CHECK-NEXT: .LBB5_1: // %l1
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d1, [x1, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp w8, #16
+; CHECK-NEXT: sshll v1.2d, v1.2s, #0
+; CHECK-NEXT: fmov x12, d1
+; CHECK-NEXT: mov x11, v1.d[1]
+; CHECK-NEXT: smull x12, w12, w10
+; CHECK-NEXT: smull x11, w11, w9
+; CHECK-NEXT: fmov d1, x12
+; CHECK-NEXT: mov v1.d[1], x11
+; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: b.eq .LBB5_1
+; CHECK-NEXT: // %bb.2: // %l2
+; CHECK-NEXT: ret
+entry:
+ %x.val = load <2 x i32>, ptr %x
+ %x.spt = shufflevector <2 x i32> %x.val, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %a = sext <2 x i32> %x.spt to <2 x i64>
+ br label %l1
+
+l1:
+ %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
+ %q = phi <2 x i64> [ zeroinitializer, %entry ], [ %c, %l1 ]
+ %y.idx = mul nuw nsw i32 %p, 4
+ %y.ptr = getelementptr i32, ptr %y, i32 %y.idx
+ %y.val = load <2 x i32>, ptr %y.ptr
+ %y.ext = sext <2 x i32> %y.val to <2 x i64>
+ %b = mul <2 x i64> %y.ext, %a
+ %c = add <2 x i64> %q, %b
+ %pa = add i32 %p, 1
+ %c1 = icmp eq i32 %p, 0
+ br i1 %c1, label %l1, label %l2
+
+l2:
+ ret <2 x i64> %c
+}
>From 343168a2ad212b7df6c422aa53773044b23edca5 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Fri, 15 Nov 2024 00:25:39 +0000
Subject: [PATCH 2/4] [AArch64] Improve operand sinking for mul instructions
- Sink splat operands to mul instructions for types where we can use the
lane-indexed variants.
- When sinking operands for [su]mull, also sink the ext instruction.
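For illustration (a minimal sketch, not taken from the patch; the function
name is hypothetical), the first point targets a splat defined outside a
loop that feeds a plain mul inside it:

define void @mul_by_splat(ptr %p, <4 x i32> %x, i32 %n) {
entry:
  ; Splat of lane 1, defined outside the loop.
  %splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %q = getelementptr <4 x i32>, ptr %p, i32 %i
  %v = load <4 x i32>, ptr %q
  ; With this change the splat is sunk next to the mul, so ISel sees the
  ; whole pattern in one block and can select the lane-indexed form
  ; (mul v.4s, v.4s, v.s[1]) instead of materialising the dup.
  %m = mul <4 x i32> %v, %splat
  store <4 x i32> %m, ptr %q
  %i.next = add i32 %i, 1
  %done = icmp eq i32 %i.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret void
}

For the second point, the sink-mul-exts.ll tests in this PR show sext/zext
operands being sunk together with their splat inputs so that lane-indexed
smlal/umlal can be formed.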
---
.../AArch64/AArch64TargetTransformInfo.cpp | 48 +++--
.../CodeGen/AArch64/aarch64-dup-ext-crash.ll | 16 +-
.../AArch64/aarch64-matrix-umull-smull.ll | 172 +++++++++---------
llvm/test/CodeGen/AArch64/sink-mul-exts.ll | 38 ++--
llvm/test/CodeGen/AArch64/sinksplat.ll | 3 +-
5 files changed, 143 insertions(+), 134 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a97b0d3b1db92a..615c2854852824 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5168,26 +5168,41 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
return false;
}
case Instruction::Mul: {
+ auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
+ auto VT = MVT::getVT(V->getType(), /*HandleUnknown=*/true);
+ return (VT == MVT::v4i16 || VT == MVT::v8i16 || VT == MVT::v2i32 ||
+ VT == MVT::v4i32);
+ };
+
int NumZExts = 0, NumSExts = 0;
for (auto &Op : I->operands()) {
// Make sure we are not already sinking this operand
if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
continue;
- if (match(&Op, m_SExt(m_Value()))) {
- NumSExts++;
- continue;
- } else if (match(&Op, m_ZExt(m_Value()))) {
- NumZExts++;
+ if (match(&Op, m_ZExtOrSExt(m_Value()))) {
+ auto *Ext = cast<Instruction>(Op);
+ auto *ExtOp = Ext->getOperand(0);
+ if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
+ Ops.push_back(&Ext->getOperandUse(0));
+ Ops.push_back(&Op);
+
+ if (isa<SExtInst>(Ext))
+ NumSExts++;
+ else
+ NumZExts++;
+
continue;
}
ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
+ if (!Shuffle)
+ continue;
// If the Shuffle is a splat and the operand is a zext/sext, sinking the
// operand and the s/zext can help create indexed s/umull. This is
// especially useful to prevent i64 mul being scalarized.
- if (Shuffle && isSplatShuffle(Shuffle) &&
+ if (isSplatShuffle(Shuffle) &&
match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
Ops.push_back(&Shuffle->getOperandUse(0));
Ops.push_back(&Op);
@@ -5198,9 +5213,6 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
continue;
}
- if (!Shuffle)
- continue;
-
Value *ShuffleOperand = Shuffle->getOperand(0);
InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
if (!Insert)
@@ -5232,12 +5244,26 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
NumZExts++;
}
+ Ops.push_back(&Insert->getOperandUse(1));
Ops.push_back(&Shuffle->getOperandUse(0));
Ops.push_back(&Op);
}
- // Is it profitable to sink if we found two of the same type of extends.
- return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
+ // It is profitable to sink if we found two of the same type of extends.
+ if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
+ return true;
+
+ // Otherwise, see if we should sink splats for indexed variants.
+ if (!ShouldSinkSplatForIndexedVariant(I))
+ return false;
+
+ Ops.clear();
+ if (isSplatShuffle(I->getOperand(0)))
+ Ops.push_back(&I->getOperandUse(0));
+ if (isSplatShuffle(I->getOperand(1)))
+ Ops.push_back(&I->getOperandUse(1));
+
+ return !Ops.empty();
}
default:
return false;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-crash.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-crash.ll
index ef54cc4bbf7180..482135b721da49 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-crash.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-crash.ll
@@ -10,14 +10,18 @@ target triple = "aarch64-unknown-linux-gnu"
define dso_local i32 @dupext_crashtest(i32 %e) local_unnamed_addr {
; CHECK-LABEL: dupext_crashtest:
; CHECK: // %bb.0: // %for.body.lr.ph
-; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: dup v0.2s, w8
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr d1, [x8]
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: str d1, [x8]
+; CHECK-NEXT: ldr d0, [x8]
+; CHECK-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: mul w9, w0, w9
+; CHECK-NEXT: mul w8, w0, w8
+; CHECK-NEXT: fmov d0, x9
+; CHECK-NEXT: mov v0.d[1], x8
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: str d0, [x8]
; CHECK-NEXT: b .LBB0_1
for.body.lr.ph:
%conv314 = zext i32 %e to i64
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 0c7a61739695fb..3432b15abfbf22 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -5,9 +5,8 @@
define void @matrix_mul_unsigned(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) {
; CHECK-SD-LABEL: matrix_mul_unsigned:
; CHECK-SD: // %bb.0: // %vector.header
-; CHECK-SD-NEXT: and w8, w3, #0xffff
+; CHECK-SD-NEXT: dup v0.4h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-SD-NEXT: dup v0.4h, w8
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
; CHECK-SD-NEXT: .LBB0_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
@@ -91,9 +90,8 @@ for.end12: ; preds = %vector.body
define void @matrix_mul_signed(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) {
; CHECK-SD-LABEL: matrix_mul_signed:
; CHECK-SD: // %bb.0: // %vector.header
-; CHECK-SD-NEXT: sxth w8, w3
+; CHECK-SD-NEXT: dup v0.4h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-SD-NEXT: dup v0.4h, w8
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
; CHECK-SD-NEXT: .LBB1_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
@@ -179,9 +177,8 @@ for.end12: ; preds = %vector.body
define void @matrix_mul_double_shuffle(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) {
; CHECK-SD-LABEL: matrix_mul_double_shuffle:
; CHECK-SD: // %bb.0: // %vector.header
-; CHECK-SD-NEXT: and w8, w3, #0xffff
+; CHECK-SD-NEXT: dup v0.4h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-SD-NEXT: dup v0.4h, w8
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 killed $x0 def $x0
; CHECK-SD-NEXT: .LBB2_1: // %vector.body
@@ -261,44 +258,44 @@ define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
; CHECK-SD-NEXT: cmp w3, #1
; CHECK-SD-NEXT: b.lt .LBB3_8
; CHECK-SD-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-SD-NEXT: sxth w8, w1
; CHECK-SD-NEXT: cmp w3, #15
-; CHECK-SD-NEXT: mov w9, w3
+; CHECK-SD-NEXT: mov w8, w3
; CHECK-SD-NEXT: b.hi .LBB3_3
; CHECK-SD-NEXT: // %bb.2:
-; CHECK-SD-NEXT: mov x10, xzr
+; CHECK-SD-NEXT: mov x9, xzr
; CHECK-SD-NEXT: b .LBB3_6
; CHECK-SD-NEXT: .LBB3_3: // %vector.ph
-; CHECK-SD-NEXT: dup v0.8h, w8
-; CHECK-SD-NEXT: and x10, x9, #0xfffffff0
-; CHECK-SD-NEXT: add x11, x2, #32
-; CHECK-SD-NEXT: add x12, x0, #16
-; CHECK-SD-NEXT: mov x13, x10
+; CHECK-SD-NEXT: dup v0.8h, w1
+; CHECK-SD-NEXT: and x9, x8, #0xfffffff0
+; CHECK-SD-NEXT: add x10, x2, #32
+; CHECK-SD-NEXT: add x11, x0, #16
+; CHECK-SD-NEXT: mov x12, x9
; CHECK-SD-NEXT: .LBB3_4: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-SD-NEXT: ldp q1, q2, [x12, #-16]
-; CHECK-SD-NEXT: subs x13, x13, #16
-; CHECK-SD-NEXT: add x12, x12, #32
+; CHECK-SD-NEXT: ldp q1, q2, [x11, #-16]
+; CHECK-SD-NEXT: subs x12, x12, #16
+; CHECK-SD-NEXT: add x11, x11, #32
; CHECK-SD-NEXT: smull2 v3.4s, v0.8h, v1.8h
; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: smull2 v4.4s, v0.8h, v2.8h
; CHECK-SD-NEXT: smull v2.4s, v0.4h, v2.4h
-; CHECK-SD-NEXT: stp q1, q3, [x11, #-32]
-; CHECK-SD-NEXT: stp q2, q4, [x11], #64
+; CHECK-SD-NEXT: stp q1, q3, [x10, #-32]
+; CHECK-SD-NEXT: stp q2, q4, [x10], #64
; CHECK-SD-NEXT: b.ne .LBB3_4
; CHECK-SD-NEXT: // %bb.5: // %middle.block
-; CHECK-SD-NEXT: cmp x10, x9
+; CHECK-SD-NEXT: cmp x9, x8
; CHECK-SD-NEXT: b.eq .LBB3_8
; CHECK-SD-NEXT: .LBB3_6: // %for.body.preheader1
-; CHECK-SD-NEXT: add x11, x2, x10, lsl #2
-; CHECK-SD-NEXT: add x12, x0, x10, lsl #1
-; CHECK-SD-NEXT: sub x9, x9, x10
+; CHECK-SD-NEXT: sxth w10, w1
+; CHECK-SD-NEXT: add x11, x2, x9, lsl #2
+; CHECK-SD-NEXT: add x12, x0, x9, lsl #1
+; CHECK-SD-NEXT: sub x8, x8, x9
; CHECK-SD-NEXT: .LBB3_7: // %for.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-SD-NEXT: ldrsh w10, [x12], #2
-; CHECK-SD-NEXT: subs x9, x9, #1
-; CHECK-SD-NEXT: mul w10, w10, w8
-; CHECK-SD-NEXT: str w10, [x11], #4
+; CHECK-SD-NEXT: ldrsh w9, [x12], #2
+; CHECK-SD-NEXT: subs x8, x8, #1
+; CHECK-SD-NEXT: mul w9, w9, w10
+; CHECK-SD-NEXT: str w9, [x11], #4
; CHECK-SD-NEXT: b.ne .LBB3_7
; CHECK-SD-NEXT: .LBB3_8: // %for.cond.cleanup
; CHECK-SD-NEXT: ret
@@ -424,43 +421,43 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
; CHECK-SD-NEXT: b.lt .LBB4_8
; CHECK-SD-NEXT: // %bb.1: // %for.body.preheader
; CHECK-SD-NEXT: cmp w3, #15
-; CHECK-SD-NEXT: and w8, w1, #0xffff
-; CHECK-SD-NEXT: mov w9, w3
+; CHECK-SD-NEXT: mov w8, w3
; CHECK-SD-NEXT: b.hi .LBB4_3
; CHECK-SD-NEXT: // %bb.2:
-; CHECK-SD-NEXT: mov x10, xzr
+; CHECK-SD-NEXT: mov x9, xzr
; CHECK-SD-NEXT: b .LBB4_6
; CHECK-SD-NEXT: .LBB4_3: // %vector.ph
-; CHECK-SD-NEXT: dup v0.8h, w8
-; CHECK-SD-NEXT: and x10, x9, #0xfffffff0
-; CHECK-SD-NEXT: add x11, x2, #32
-; CHECK-SD-NEXT: add x12, x0, #16
-; CHECK-SD-NEXT: mov x13, x10
+; CHECK-SD-NEXT: dup v0.8h, w1
+; CHECK-SD-NEXT: and x9, x8, #0xfffffff0
+; CHECK-SD-NEXT: add x10, x2, #32
+; CHECK-SD-NEXT: add x11, x0, #16
+; CHECK-SD-NEXT: mov x12, x9
; CHECK-SD-NEXT: .LBB4_4: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-SD-NEXT: ldp q1, q2, [x12, #-16]
-; CHECK-SD-NEXT: subs x13, x13, #16
-; CHECK-SD-NEXT: add x12, x12, #32
+; CHECK-SD-NEXT: ldp q1, q2, [x11, #-16]
+; CHECK-SD-NEXT: subs x12, x12, #16
+; CHECK-SD-NEXT: add x11, x11, #32
; CHECK-SD-NEXT: umull2 v3.4s, v0.8h, v1.8h
; CHECK-SD-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: umull2 v4.4s, v0.8h, v2.8h
; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h
-; CHECK-SD-NEXT: stp q1, q3, [x11, #-32]
-; CHECK-SD-NEXT: stp q2, q4, [x11], #64
+; CHECK-SD-NEXT: stp q1, q3, [x10, #-32]
+; CHECK-SD-NEXT: stp q2, q4, [x10], #64
; CHECK-SD-NEXT: b.ne .LBB4_4
; CHECK-SD-NEXT: // %bb.5: // %middle.block
-; CHECK-SD-NEXT: cmp x10, x9
+; CHECK-SD-NEXT: cmp x9, x8
; CHECK-SD-NEXT: b.eq .LBB4_8
; CHECK-SD-NEXT: .LBB4_6: // %for.body.preheader1
-; CHECK-SD-NEXT: add x11, x2, x10, lsl #2
-; CHECK-SD-NEXT: add x12, x0, x10, lsl #1
-; CHECK-SD-NEXT: sub x9, x9, x10
+; CHECK-SD-NEXT: add x10, x2, x9, lsl #2
+; CHECK-SD-NEXT: add x11, x0, x9, lsl #1
+; CHECK-SD-NEXT: and w12, w1, #0xffff
+; CHECK-SD-NEXT: sub x8, x8, x9
; CHECK-SD-NEXT: .LBB4_7: // %for.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-SD-NEXT: ldrh w10, [x12], #2
-; CHECK-SD-NEXT: subs x9, x9, #1
-; CHECK-SD-NEXT: mul w10, w10, w8
-; CHECK-SD-NEXT: str w10, [x11], #4
+; CHECK-SD-NEXT: ldrh w9, [x11], #2
+; CHECK-SD-NEXT: subs x8, x8, #1
+; CHECK-SD-NEXT: mul w9, w9, w12
+; CHECK-SD-NEXT: str w9, [x10], #4
; CHECK-SD-NEXT: b.ne .LBB4_7
; CHECK-SD-NEXT: .LBB4_8: // %for.cond.cleanup
; CHECK-SD-NEXT: ret
@@ -470,47 +467,48 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
; CHECK-GI-NEXT: cmp w3, #0
; CHECK-GI-NEXT: b.le .LBB4_7
; CHECK-GI-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-GI-NEXT: mov x9, xzr
+; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: cmp w3, #16
-; CHECK-GI-NEXT: and w8, w1, #0xffff
-; CHECK-GI-NEXT: mov w10, w3
+; CHECK-GI-NEXT: mov w9, w3
; CHECK-GI-NEXT: b.lo .LBB4_5
; CHECK-GI-NEXT: // %bb.2: // %vector.ph
-; CHECK-GI-NEXT: dup v0.4s, w8
-; CHECK-GI-NEXT: and x9, x10, #0xfffffff0
-; CHECK-GI-NEXT: add x11, x2, #32
-; CHECK-GI-NEXT: add x12, x0, #16
-; CHECK-GI-NEXT: mov x13, x9
+; CHECK-GI-NEXT: and x8, x9, #0xfffffff0
+; CHECK-GI-NEXT: add x10, x2, #32
+; CHECK-GI-NEXT: add x11, x0, #16
+; CHECK-GI-NEXT: mov x12, x8
; CHECK-GI-NEXT: .LBB4_3: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-GI-NEXT: ldp q1, q2, [x12, #-16]
-; CHECK-GI-NEXT: mov x14, x11
-; CHECK-GI-NEXT: subs x13, x13, #16
-; CHECK-GI-NEXT: add x12, x12, #32
-; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT: ldp q0, q1, [x11, #-16]
+; CHECK-GI-NEXT: and w13, w1, #0xffff
+; CHECK-GI-NEXT: dup v2.4s, w13
+; CHECK-GI-NEXT: mov x13, x10
+; CHECK-GI-NEXT: subs x12, x12, #16
+; CHECK-GI-NEXT: add x11, x11, #32
+; CHECK-GI-NEXT: ushll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll v4.4s, v1.4h, #0
; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT: ushll v4.4s, v2.4h, #0
-; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0
-; CHECK-GI-NEXT: mul v3.4s, v0.4s, v3.4s
-; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: mul v4.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT: stp q3, q1, [x14, #-32]!
-; CHECK-GI-NEXT: stp q4, q2, [x11], #64
+; CHECK-GI-NEXT: mul v3.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT: mul v4.4s, v2.4s, v4.4s
+; CHECK-GI-NEXT: mul v1.4s, v2.4s, v1.4s
+; CHECK-GI-NEXT: stp q3, q0, [x13, #-32]!
+; CHECK-GI-NEXT: stp q4, q1, [x10], #64
; CHECK-GI-NEXT: b.ne .LBB4_3
; CHECK-GI-NEXT: // %bb.4: // %middle.block
-; CHECK-GI-NEXT: cmp x9, x10
+; CHECK-GI-NEXT: cmp x8, x9
; CHECK-GI-NEXT: b.eq .LBB4_7
; CHECK-GI-NEXT: .LBB4_5: // %for.body.preheader1
-; CHECK-GI-NEXT: add x11, x2, x9, lsl #2
-; CHECK-GI-NEXT: add x12, x0, x9, lsl #1
-; CHECK-GI-NEXT: sub x9, x10, x9
+; CHECK-GI-NEXT: add x10, x2, x8, lsl #2
+; CHECK-GI-NEXT: add x11, x0, x8, lsl #1
+; CHECK-GI-NEXT: and w12, w1, #0xffff
+; CHECK-GI-NEXT: sub x8, x9, x8
; CHECK-GI-NEXT: .LBB4_6: // %for.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-GI-NEXT: ldrh w10, [x12], #2
-; CHECK-GI-NEXT: subs x9, x9, #1
-; CHECK-GI-NEXT: mul w10, w10, w8
-; CHECK-GI-NEXT: str w10, [x11], #4
+; CHECK-GI-NEXT: ldrh w9, [x11], #2
+; CHECK-GI-NEXT: subs x8, x8, #1
+; CHECK-GI-NEXT: mul w9, w9, w12
+; CHECK-GI-NEXT: str w9, [x10], #4
; CHECK-GI-NEXT: b.ne .LBB4_6
; CHECK-GI-NEXT: .LBB4_7: // %for.cond.cleanup
; CHECK-GI-NEXT: ret
@@ -600,7 +598,7 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-NEXT: and x11, x10, #0xfffffff0
-; CHECK-SD-NEXT: dup v2.8h, w9
+; CHECK-SD-NEXT: fmov s2, w9
; CHECK-SD-NEXT: add x8, x0, #8
; CHECK-SD-NEXT: mov x12, x11
; CHECK-SD-NEXT: .LBB5_5: // %vector.body
@@ -610,8 +608,8 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
; CHECK-SD-NEXT: add x8, x8, #16
; CHECK-SD-NEXT: ushll v3.8h, v3.8b, #0
; CHECK-SD-NEXT: ushll v4.8h, v4.8b, #0
-; CHECK-SD-NEXT: mla v0.8h, v2.8h, v3.8h
-; CHECK-SD-NEXT: mla v1.8h, v2.8h, v4.8h
+; CHECK-SD-NEXT: mla v0.8h, v3.8h, v2.h[0]
+; CHECK-SD-NEXT: mla v1.8h, v4.8h, v2.h[0]
; CHECK-SD-NEXT: b.ne .LBB5_5
; CHECK-SD-NEXT: // %bb.6: // %middle.block
; CHECK-SD-NEXT: add v0.8h, v1.8h, v0.8h
@@ -1025,9 +1023,8 @@ exit:
define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) {
; CHECK-SD-LABEL: matrix_mul_unsigned_and:
; CHECK-SD: // %bb.0: // %vector.header
-; CHECK-SD-NEXT: and w8, w3, #0xffff
+; CHECK-SD-NEXT: dup v0.4h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-SD-NEXT: dup v0.4h, w8
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
; CHECK-SD-NEXT: .LBB10_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
@@ -1111,9 +1108,8 @@ for.end12: ; preds = %vector.body
define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) {
; CHECK-SD-LABEL: matrix_mul_unsigned_and_double:
; CHECK-SD: // %bb.0: // %vector.header
-; CHECK-SD-NEXT: and w8, w3, #0xffff
+; CHECK-SD-NEXT: dup v0.8h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-SD-NEXT: dup v0.8h, w8
; CHECK-SD-NEXT: and x8, x0, #0xfffffff0
; CHECK-SD-NEXT: .LBB11_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
@@ -1207,10 +1203,10 @@ for.end12: ; preds = %vector.body
define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) {
; CHECK-SD-LABEL: matrix_mul_signed_and:
; CHECK-SD: // %bb.0: // %vector.header
-; CHECK-SD-NEXT: and w8, w3, #0xffff
+; CHECK-SD-NEXT: and w9, w3, #0xffff
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-SD-NEXT: dup v0.4s, w8
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
+; CHECK-SD-NEXT: fmov s0, w9
; CHECK-SD-NEXT: .LBB12_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
@@ -1220,8 +1216,8 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-SD-NEXT: add w0, w0, #8
; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-SD-NEXT: mul v1.4s, v0.4s, v1.4s
-; CHECK-SD-NEXT: mul v2.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: mul v1.4s, v1.4s, v0.s[0]
+; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0]
; CHECK-SD-NEXT: stp q1, q2, [x9]
; CHECK-SD-NEXT: b.ne .LBB12_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
diff --git a/llvm/test/CodeGen/AArch64/sink-mul-exts.ll b/llvm/test/CodeGen/AArch64/sink-mul-exts.ll
index f5ac7cc3102220..d52ac7847f8146 100644
--- a/llvm/test/CodeGen/AArch64/sink-mul-exts.ll
+++ b/llvm/test/CodeGen/AArch64/sink-mul-exts.ll
@@ -119,18 +119,16 @@ l2:
define <8 x i16> @mul_sext_splat_v8i16(ptr %x, ptr %y) {
; CHECK-LABEL: mul_sext_splat_v8i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: dup v1.8b, v0.b[3]
+; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: dup v1.8b, v1.b[3]
; CHECK-NEXT: .LBB3_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr d2, [x1, x8]
; CHECK-NEXT: add x8, x8, #4
; CHECK-NEXT: cmp w8, #4
-; CHECK-NEXT: sshll v2.8h, v2.8b, #0
-; CHECK-NEXT: mla v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: smlal v0.8h, v2.8b, v1.8b
; CHECK-NEXT: b.eq .LBB3_1
; CHECK-NEXT: // %bb.2: // %l2
; CHECK-NEXT: ret
@@ -160,18 +158,15 @@ l2:
define <4 x i32> @mul_sext_splat_v4i32(ptr %x, ptr %y) {
; CHECK-LABEL: mul_sext_splat_v4i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: dup v1.4h, v0.h[3]
; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB4_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr d2, [x1, x8]
; CHECK-NEXT: add x8, x8, #8
; CHECK-NEXT: cmp w8, #8
-; CHECK-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-NEXT: mla v0.4s, v2.4s, v1.4s
+; CHECK-NEXT: smlal v0.4s, v2.4h, v1.h[3]
; CHECK-NEXT: b.eq .LBB4_1
; CHECK-NEXT: // %bb.2: // %l2
; CHECK-NEXT: ret
@@ -201,26 +196,15 @@ l2:
define <2 x i64> @mul_sext_splat_v2i64(ptr %x, ptr %y) {
; CHECK-LABEL: mul_sext_splat_v2i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: dup v0.2s, v0.s[1]
-; CHECK-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: mov x9, v1.d[1]
-; CHECK-NEXT: fmov x10, d1
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB5_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr d1, [x1, x8]
+; CHECK-NEXT: ldr d2, [x1, x8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp w8, #16
-; CHECK-NEXT: sshll v1.2d, v1.2s, #0
-; CHECK-NEXT: fmov x12, d1
-; CHECK-NEXT: mov x11, v1.d[1]
-; CHECK-NEXT: smull x12, w12, w10
-; CHECK-NEXT: smull x11, w11, w9
-; CHECK-NEXT: fmov d1, x12
-; CHECK-NEXT: mov v1.d[1], x11
-; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: smlal v0.2d, v2.2s, v1.s[1]
; CHECK-NEXT: b.eq .LBB5_1
; CHECK-NEXT: // %bb.2: // %l2
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll
index d156ec079ae941..4cee60f2d82658 100644
--- a/llvm/test/CodeGen/AArch64/sinksplat.ll
+++ b/llvm/test/CodeGen/AArch64/sinksplat.ll
@@ -204,10 +204,9 @@ define <4 x i32> @mlal(<4 x i32> %x, ptr %y) {
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: ldr q2, [x0]
; CHECK-NEXT: mov w8, #1 // =0x1
-; CHECK-NEXT: dup v1.4s, v1.s[3]
; CHECK-NEXT: .LBB6_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mla v0.4s, v2.4s, v1.4s
+; CHECK-NEXT: mla v0.4s, v2.4s, v1.s[3]
; CHECK-NEXT: subs w8, w8, #1
; CHECK-NEXT: b.eq .LBB6_1
; CHECK-NEXT: // %bb.2: // %l2
>From 265694a116883b7119289f9c93a90b078c6b7704 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Mon, 2 Dec 2024 12:24:11 +0000
Subject: [PATCH 3/4] Also sink vector types with different element counts
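For illustration (a minimal sketch, not taken from the patch; the function
name is hypothetical), the predicate now also accepts a splat feeding a
<16 x i16> mul, since only the i16/i32 element type matters; type
legalization later splits the wide vector into 128-bit halves that can
each use the lane-indexed mul:

define <16 x i16> @wide_mul_by_splat(<16 x i16> %v, <8 x i16> %x) {
  ; A <16 x i16> splat: not one of the previously whitelisted MVTs
  ; (v4i16/v8i16/v2i32/v4i32), but its element type is still i16.
  %splat = shufflevector <8 x i16> %x, <8 x i16> poison, <16 x i32> zeroinitializer
  %m = mul <16 x i16> %v, %splat
  ret <16 x i16> %m
}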
---
.../Target/AArch64/AArch64TargetTransformInfo.cpp | 11 ++++++++---
.../CodeGen/AArch64/aarch64-matrix-umull-smull.ll | 12 ++++++------
2 files changed, 14 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 615c2854852824..3df24e9f565717 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5169,9 +5169,14 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
}
case Instruction::Mul: {
auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
- auto VT = MVT::getVT(V->getType(), /*HandleUnknown=*/true);
- return (VT == MVT::v4i16 || VT == MVT::v8i16 || VT == MVT::v2i32 ||
- VT == MVT::v4i32);
+ auto *Ty = cast<VectorType>(V->getType());
+ // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
+ if (Ty->isScalableTy())
+ return false;
+
+ // Indexed variants of Mul exist for i16 and i32 element types only.
+ auto ElemVT = MVT::getVT(Ty->getElementType(), /*HandleUnknown=*/true);
+ return (ElemVT == MVT::i16 || ElemVT == MVT::i32);
};
int NumZExts = 0, NumSExts = 0;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 3432b15abfbf22..fb6575cc0ee83b 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -1291,10 +1291,10 @@ for.end12: ; preds = %vector.body
define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) {
; CHECK-SD-LABEL: matrix_mul_signed_and_double:
; CHECK-SD: // %bb.0: // %vector.header
-; CHECK-SD-NEXT: and w8, w3, #0xffff
+; CHECK-SD-NEXT: and w9, w3, #0xffff
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-SD-NEXT: dup v0.4s, w8
; CHECK-SD-NEXT: and x8, x0, #0xfffffff0
+; CHECK-SD-NEXT: fmov s0, w9
; CHECK-SD-NEXT: .LBB13_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
@@ -1307,10 +1307,10 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: sshll2 v4.4s, v2.8h, #0
; CHECK-SD-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-SD-NEXT: mul v3.4s, v0.4s, v3.4s
-; CHECK-SD-NEXT: mul v1.4s, v0.4s, v1.4s
-; CHECK-SD-NEXT: mul v4.4s, v0.4s, v4.4s
-; CHECK-SD-NEXT: mul v2.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: mul v3.4s, v3.4s, v0.s[0]
+; CHECK-SD-NEXT: mul v1.4s, v1.4s, v0.s[0]
+; CHECK-SD-NEXT: mul v4.4s, v4.4s, v0.s[0]
+; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0]
; CHECK-SD-NEXT: stp q1, q3, [x9]
; CHECK-SD-NEXT: stp q2, q4, [x9, #32]
; CHECK-SD-NEXT: b.ne .LBB13_1
>From 9c82902707d38790ce0222b99fd85a1896c87dee Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Wed, 4 Dec 2024 10:10:05 +0000
Subject: [PATCH 4/4] Avoid call to MVT::getVT()
---
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 3df24e9f565717..0f3577468a8b46 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5175,8 +5175,7 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
return false;
// Indexed variants of Mul exist for i16 and i32 element types only.
- auto ElemVT = MVT::getVT(Ty->getElementType(), /*HandleUnknown=*/true);
- return (ElemVT == MVT::i16 || ElemVT == MVT::i32);
+ return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
};
int NumZExts = 0, NumSExts = 0;