[llvm] [AArch64][Codegen] Transform saturating smull to sqdmulh (PR #143671)

Nashe Mncube via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 11 08:16:32 PDT 2025


https://github.com/nasherm updated https://github.com/llvm/llvm-project/pull/143671

From 30c41930cd0556682df4d3f90ac2910aa9e5ee35 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Tue, 10 Jun 2025 16:20:42 +0100
Subject: [PATCH] [AArch64][Codegen] Transform saturating smull to sqdmulh

This patch adds a pattern for recognizing a saturating vector
smull. Prior to this patch it was lowered to a combination of
smull + smull2 + uzp2 + smin, which can instead be done with a
single sqdmulh followed by a ushr.
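
The smin bound is 0x3FFFFFFF, and mulhs only exceeds that bound in
the single case a == b == INT32_MIN, which is also the only input
sqdmulh saturates on. A minimal scalar sketch of that corner case
(plain C with illustrative helper names, not code from this patch):

  #include <stdint.h>
  #include <stdio.h>

  /* High half of the widened signed product, i.e. mulhs. */
  static int32_t mulhs32(int32_t a, int32_t b) {
    return (int32_t)(((int64_t)a * b) >> 32);
  }

  /* sqdmulh: saturating doubling multiply returning the high half.
     (2*a*b) >> 32 equals (a*b) >> 31, and only INT32_MIN * INT32_MIN
     overflows i32, so that is the only input that saturates. */
  static int32_t sqdmulh32(int32_t a, int32_t b) {
    int64_t hi = ((int64_t)a * b) >> 31;
    return hi > INT32_MAX ? INT32_MAX : (int32_t)hi;
  }

  int main(void) {
    int32_t a = INT32_MIN, b = INT32_MIN;
    int32_t hi = mulhs32(a, b);                            /* 0x40000000 */
    int32_t clamped = hi < 0x3FFFFFFF ? hi : 0x3FFFFFFF;   /* smin */
    int32_t folded = (int32_t)((uint32_t)sqdmulh32(a, b) >> 1); /* ushr #1 */
    printf("%#x %#x\n", (unsigned)clamped, (unsigned)folded);
    return 0;                        /* prints 0x3fffffff 0x3fffffff */
  }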

Change-Id: Ib7d4d5284d1bd3fdd0907365f9e2f37f4da14671
---
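Note on the immediate in the new pattern below: (AArch64mvni_shift
(i32 192), (i32 24)) is MVNI #192, LSL #24, which writes the bitwise
NOT of the shifted immediate to every lane: ~(192 << 24) = 0x3FFFFFFF,
i.e. the splat (i32 1073741823) bound seen in the test. A quick check
of that arithmetic (plain C, not part of the patch):

  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    /* MVNI v.4s, #192, LSL #24: each lane = ~(imm << shift). */
    uint32_t lane = ~(192u << 24);          /* ~0xC0000000 */
    printf("%#x = %u\n", lane, lane);       /* 0x3fffffff = 1073741823 */
    return 0;
  }
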
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  7 +++
 .../CodeGen/AArch64/saturating-vec-smull.ll   | 45 +++++++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/saturating-vec-smull.ll

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 727831896737d..3984c9b0cc1cd 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -9349,6 +9349,13 @@ def : Pat<(v4i32 (mulhs V128:$Rn, V128:$Rm)),
                              (EXTRACT_SUBREG V128:$Rm, dsub)),
            (SMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;
 
+// Saturating vector multiplications of signed integers would
+// otherwise lower to a smull + smull2 + uzp2 + smin sequence.
+// It is more efficient to use a sqdmulh instruction instead.
+def : Pat<(v4i32 (smin (mulhs V128:$Rn, V128:$Rm),
+                       (v4i32 (AArch64mvni_shift (i32 192), (i32 24))))),
+          (USHRv4i32_shift (SQDMULHv4i32 V128:$Rn, V128:$Rm), (i32 1))>;
+
 def : Pat<(v16i8 (mulhu V128:$Rn, V128:$Rm)),
           (UZP2v16i8
            (UMULLv8i8_v8i16 (EXTRACT_SUBREG V128:$Rn, dsub),
diff --git a/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll b/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
new file mode 100644
index 0000000000000..0bc454f06743d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-none-elf < %s | FileCheck %s
+define void @arm_mult_q31(ptr %0, ptr %1, ptr %2) {
+; CHECK-LABEL: arm_mult_q31:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  .LBB0_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr q0, [x0, x8]
+; CHECK-NEXT:    ldr q1, [x1, x8]
+; CHECK-NEXT:    sqdmulh v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #1
+; CHECK-NEXT:    add v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    str q0, [x2, x8]
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    cmp x8, #512
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.2:
+; CHECK-NEXT:    ret
+  br label %4
+
+4:
+  %5 = phi i64 [ 0, %3 ], [ %21, %4 ]
+  %6 = shl i64 %5, 2
+  %7 = getelementptr i8, ptr %0, i64 %6
+  %8 = shl i64 %5, 2
+  %9 = getelementptr i8, ptr %1, i64 %8
+  %10 = shl i64 %5, 2
+  %11 = getelementptr i8, ptr %2, i64 %10
+  %12 = load <4 x i32>, ptr %7, align 4
+  %13 = sext <4 x i32> %12 to <4 x i64>
+  %14 = load <4 x i32>, ptr %9, align 4
+  %15 = sext <4 x i32> %14 to <4 x i64>
+  %16 = mul nsw <4 x i64> %15, %13
+  %17 = lshr <4 x i64> %16, splat (i64 32)
+  %18 = trunc nuw <4 x i64> %17 to <4 x i32>
+  %19 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %18, <4 x i32> splat (i32 1073741823))
+  %20 = shl <4 x i32> %19, splat (i32 1)
+  store <4 x i32> %20, ptr %11, align 4
+  %21 = add nuw i64 %5, 4
+  %22 = icmp eq i64 %21, 128
+  br i1 %22, label %23, label %4
+
+23:
+  ret void
+}



More information about the llvm-commits mailing list