[llvm] [AArch64][Codegen] Transform saturating smull to sqdmulh (PR #143671)
Nashe Mncube via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 11 08:16:32 PDT 2025
https://github.com/nasherm updated https://github.com/llvm/llvm-project/pull/143671
From 30c41930cd0556682df4d3f90ac2910aa9e5ee35 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Tue, 10 Jun 2025 16:20:42 +0100
Subject: [PATCH] [AArch64][Codegen] Transform saturating smull to sqdmulh
This patch adds a pattern for recognizing saturating vector
smull operations. Prior to this patch they were lowered to a
combination of smull + smull2 + uzp2 + smin, which can instead
be done with sqdmulh + ushr.
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 7 +++
.../CodeGen/AArch64/saturating-vec-smull.ll | 45 +++++++++++++++++++
2 files changed, 52 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
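For context, the equivalence the new pattern relies on: sqdmulh computes
the saturated high half of 2*a*b, so a logical right shift by one recovers
the clamped high half of a*b, up to bit 31 (see the note after the sketch).
Below is a minimal scalar sketch of this reasoning, assuming a plain-C
model of sqdmulh; the helper names and the checker itself are illustrative
and not part of the patch. It checks the full sequence from the test
further down, including the final doubling shift.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

/* Scalar model of AArch64 SQDMULH on a 32-bit lane: the high half of
   2*a*b, saturated. Only INT32_MIN * INT32_MIN overflows the doubled
   64-bit product. (My own model, for illustration.) */
static int32_t sqdmulh_s32(int32_t a, int32_t b) {
  if (a == INT32_MIN && b == INT32_MIN)
    return INT32_MAX;
  return (int32_t)(((int64_t)a * (int64_t)b * 2) >> 32);
}

/* The sequence from the test: high half of a*b, smin with 0x3FFFFFFF,
   then shl #1. */
static uint32_t reference(int32_t a, int32_t b) {
  int32_t hi = (int32_t)(((int64_t)a * (int64_t)b) >> 32);
  int32_t clamped = hi < 0x3FFFFFFF ? hi : 0x3FFFFFFF;
  return (uint32_t)clamped << 1;
}

/* The rewritten sequence: sqdmulh, ushr #1, shl #1. */
static uint32_t rewritten(int32_t a, int32_t b) {
  uint32_t u = (uint32_t)sqdmulh_s32(a, b) >> 1;
  return u << 1;
}

int main(void) {
  /* Spot-check corner cases plus random values. */
  int32_t corners[] = {0, 1, -1, INT32_MIN, INT32_MAX,
                       0x3FFFFFFF, -0x40000000};
  int n = sizeof(corners) / sizeof(corners[0]);
  for (int i = 0; i < n; i++)
    for (int j = 0; j < n; j++)
      assert(reference(corners[i], corners[j]) ==
             rewritten(corners[i], corners[j]));
  for (int k = 0; k < 1000000; k++) {
    int32_t a = (int32_t)((uint32_t)rand() * 2654435761u);
    int32_t b = (int32_t)((uint32_t)rand() * 2654435761u);
    assert(reference(a, b) == rewritten(a, b));
  }
  return 0;
}

As far as I can tell, the two forms agree bit-for-bit only once the final
shl #1 discards bit 31: for negative products the ushr form clears that
bit while the smin form keeps it set.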
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 727831896737d..3984c9b0cc1cd 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -9349,6 +9349,13 @@ def : Pat<(v4i32 (mulhs V128:$Rn, V128:$Rm)),
           (SMULLv2i32_v2i64 (EXTRACT_SUBREG V128:$Rn, dsub),
                             (EXTRACT_SUBREG V128:$Rm, dsub)),
(SMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;
+// Saturating vector multiplications of signed integers are
+// lowered to a smull + smull2 + uzp2 + smin sequence. It is
+// more efficient to use sqdmulh + ushr instead.
+def : Pat<(v4i32 (smin (mulhs V128:$Rn, V128:$Rm),
+ (v4i32 (AArch64mvni_shift (i32 192), (i32 24))))),
+ (USHRv4i32_shift (SQDMULHv4i32 V128:$Rn, V128:$Rm), (i32 1))>;
+
def : Pat<(v16i8 (mulhu V128:$Rn, V128:$Rm)),
(UZP2v16i8
(UMULLv8i8_v8i16 (EXTRACT_SUBREG V128:$Rn, dsub),
diff --git a/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll b/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
new file mode 100644
index 0000000000000..0bc454f06743d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-none-elf < %s | FileCheck %s
+define void @arm_mult_q31(ptr %0, ptr %1, ptr %2){
+; CHECK-LABEL: arm_mult_q31:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  .LBB0_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr q0, [x0, x8]
+; CHECK-NEXT:    ldr q1, [x1, x8]
+; CHECK-NEXT:    sqdmulh v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #1
+; CHECK-NEXT:    add v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    str q0, [x2, x8]
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    cmp x8, #512
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.2:
+; CHECK-NEXT:    ret
+ br label %4
+
+4:
+ %5 = phi i64 [ 0, %3 ], [ %21, %4 ]
+ %6 = shl i64 %5, 2
+ %7 = getelementptr i8, ptr %0, i64 %6
+ %8 = shl i64 %5, 2
+ %9 = getelementptr i8, ptr %1, i64 %8
+ %10 = shl i64 %5, 2
+ %11 = getelementptr i8, ptr %2, i64 %10
+ %12 = load <4 x i32>, ptr %7, align 4
+ %13 = sext <4 x i32> %12 to <4 x i64>
+ %14 = load <4 x i32>, ptr %9, align 4
+ %15 = sext <4 x i32> %14 to <4 x i64>
+ %16 = mul nsw <4 x i64> %15, %13
+ %17 = lshr <4 x i64> %16, splat (i64 32)
+ %18 = trunc nuw <4 x i64> %17 to <4 x i32>
+ %19 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %18, <4 x i32> splat (i32 1073741823))
+ %20 = shl <4 x i32> %19, splat (i32 1)
+ store <4 x i32> %20, ptr %11, align 4
+ %21 = add nuw i64 %5, 4
+ %22 = icmp eq i64 %21, 128
+ br i1 %22, label %23, label %4
+
+23:
+ ret void
+}
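One note on the immediate in the new pattern (my reading, not spelled out
in the patch): AArch64mvni_shift (i32 192), (i32 24) corresponds to
mvni v.4s, #192, lsl #24, which writes the bitwise NOT of (192 << 24) to
every lane, i.e. the splat of 1073741823 (0x3FFFFFFF) that the smin in
the test compares against. A quick standalone check:

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* mvni #192, lsl #24 materializes ~(192 << 24) in each 32-bit lane
     (the reading stated above; the check itself is illustrative). */
  uint32_t lane = ~(192u << 24);        /* 0xC0000000 inverted */
  assert(lane == 0x3FFFFFFFu);          /* the smin bound */
  assert((int32_t)lane == 1073741823);  /* splat (i32 1073741823) in the test */
  return 0;
}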