[llvm] [AArch64][Codegen] Transform saturating smull to sqdmull (PR #143671)
Nashe Mncube via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 11 02:51:18 PDT 2025
https://github.com/nasherm created https://github.com/llvm/llvm-project/pull/143671
This patch adds patterns for recognizing saturating vector smull. Prior to this patch, these were lowered to a combination of smull + smull2 + uzp2 + smin. The sqdmull instruction performs the saturation itself, removing the need for the smin.
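For illustration, here is a minimal standalone sketch (not part of the patch) of the IR shape the new patterns target, reduced from the arm_mult_q15 test added below; the function name q15_mulh_sat is hypothetical. The sign-extended multiply, logical shift right by the element width, and truncate are matched as a vector mulhs, and the trailing smin against 16383 (0x3FFF) is what allows the lowering to use sqdmull/sqdmull2 + uzp2 instead of smull/smull2 + uzp2 + smin:

; Q15-style multiply-high with saturation to 0x3FFF.
define <8 x i16> @q15_mulh_sat(<8 x i16> %a, <8 x i16> %b) {
  %sa = sext <8 x i16> %a to <8 x i32>
  %sb = sext <8 x i16> %b to <8 x i32>
  %mul = mul nsw <8 x i32> %sa, %sb
  ; Take the high halves of the widened products (selected as mulhs).
  %hi = lshr <8 x i32> %mul, splat (i32 16)
  %tr = trunc <8 x i32> %hi to <8 x i16>
  ; Saturate against 16383 (0x3FFF); with this patch the sequence is
  ; expected to lower to sqdmull2 + sqdmull + uzp2 with no smin.
  %res = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> %tr, <8 x i16> splat (i16 16383))
  ret <8 x i16> %res
}

declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>)

Compiled with llc -mtriple=aarch64-none-elf, this should now select the sqdmull forms rather than smull + smin, matching the CHECK lines in the new test.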
From fdef5a6e60782d853ab44d37afd6f0383283aba7 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Tue, 10 Jun 2025 16:20:42 +0100
Subject: [PATCH] [AArch64][Codegen]Transform saturating smull to sqdmull
This patch adds patterns for recognizing saturating vector
smull. Prior to this patch, these were lowered to a
combination of smull + smull2 + uzp2 + smin. The sqdmull
instruction performs the saturation itself, removing the
need for the smin.
Change-Id: Ib7d4d5284d1bd3fdd0907365f9e2f37f4da14671
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 19 ++
.../CodeGen/AArch64/saturating-vec-smull.ll | 202 ++++++++++++++++++
2 files changed, 221 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 727831896737d..3ab3e6fda524c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -9349,6 +9349,25 @@ def : Pat<(v4i32 (mulhs V128:$Rn, V128:$Rm)),
(EXTRACT_SUBREG V128:$Rm, dsub)),
(SMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;
+// Saturating vector multiplications on signed integers are
+// lowered to a smull + smull2 + uzp2 + smin sequence. It is
+// more efficient to use sqdmull instructions, which perform
+// the saturation themselves and remove the need for the smin.
+def : Pat<(v8i16(smin (mulhs V128:$Rn, V128:$Rm),
+ (v8i16 (AArch64mvni_shift (i32 192), (i32 8))))),
+ (UZP2v8i16
+ (SQDMULLv4i16_v4i32 (EXTRACT_SUBREG V128:$Rn, dsub),
+ (EXTRACT_SUBREG V128:$Rm, dsub)),
+ (SQDMULLv8i16_v4i32 V128:$Rn, V128:$Rm))>;
+
+
+def : Pat<(v4i32 (smin (mulhs V128:$Rn, V128:$Rm),
+ (v4i32 (AArch64mvni_shift (i32 192), (i32 24))))),
+ (UZP2v4i32
+ (SQDMULLv2i32_v2i64 (EXTRACT_SUBREG V128:$Rn, dsub),
+ (EXTRACT_SUBREG V128:$Rm, dsub)),
+ (SQDMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;
+
def : Pat<(v16i8 (mulhu V128:$Rn, V128:$Rm)),
(UZP2v16i8
(UMULLv8i8_v8i16 (EXTRACT_SUBREG V128:$Rn, dsub),
diff --git a/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll b/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
new file mode 100644
index 0000000000000..8cdb71a41eb4c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
@@ -0,0 +1,202 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-none-elf < %s | FileCheck %s
+
+define void @arm_mult_q31(ptr %0, ptr %1, ptr %2, i32 %3) {
+; CHECK-LABEL: arm_mult_q31:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cbz w3, .LBB0_4
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: cmp w3, #8
+; CHECK-NEXT: b.lo .LBB0_4
+; CHECK-NEXT: // %bb.2:
+; CHECK-NEXT: mov w8, w3
+; CHECK-NEXT: add x9, x2, #16
+; CHECK-NEXT: add x10, x1, #16
+; CHECK-NEXT: and x8, x8, #0xfffffff8
+; CHECK-NEXT: add x11, x0, #16
+; CHECK-NEXT: .LBB0_3: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldp q0, q3, [x10, #-16]
+; CHECK-NEXT: subs x8, x8, #8
+; CHECK-NEXT: ldp q1, q2, [x11, #-16]
+; CHECK-NEXT: add x10, x10, #32
+; CHECK-NEXT: add x11, x11, #32
+; CHECK-NEXT: sqdmull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: sqdmull2 v1.2d, v3.4s, v2.4s
+; CHECK-NEXT: sqdmull v2.2d, v3.2s, v2.2s
+; CHECK-NEXT: uzp2 v0.4s, v0.4s, v4.4s
+; CHECK-NEXT: uzp2 v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: stp q0, q1, [x9, #-16]
+; CHECK-NEXT: add x9, x9, #32
+; CHECK-NEXT: b.ne .LBB0_3
+; CHECK-NEXT: .LBB0_4:
+; CHECK-NEXT: ret
+ %5 = icmp eq i32 %3, 0
+ br i1 %5, label %48, label %6
+
+6:
+ %7 = zext i32 %3 to i64
+ %8 = icmp ult i32 %3, 8
+ br i1 %8, label %48, label %9
+
+9:
+ %10 = and i64 %7, 4294967288
+ %11 = shl nuw nsw i64 %10, 2
+ %12 = getelementptr i8, ptr %0, i64 %11
+ %13 = trunc nuw i64 %10 to i32
+ %14 = sub i32 %3, %13
+ %15 = shl nuw nsw i64 %10, 2
+ %16 = getelementptr i8, ptr %1, i64 %15
+ %17 = shl nuw nsw i64 %10, 2
+ %18 = getelementptr i8, ptr %2, i64 %17
+ br label %19
+
+19:
+ %20 = phi i64 [ 0, %9 ], [ %46, %19 ]
+ %21 = shl i64 %20, 2
+ %22 = getelementptr i8, ptr %0, i64 %21
+ %23 = shl i64 %20, 2
+ %24 = getelementptr i8, ptr %1, i64 %23
+ %25 = shl i64 %20, 2
+ %26 = getelementptr i8, ptr %2, i64 %25
+ %27 = getelementptr i8, ptr %22, i64 16
+ %28 = load <4 x i32>, ptr %22, align 4
+ %29 = load <4 x i32>, ptr %27, align 4
+ %30 = sext <4 x i32> %28 to <4 x i64>
+ %31 = sext <4 x i32> %29 to <4 x i64>
+ %32 = getelementptr i8, ptr %24, i64 16
+ %33 = load <4 x i32>, ptr %24, align 4
+ %34 = load <4 x i32>, ptr %32, align 4
+ %35 = sext <4 x i32> %33 to <4 x i64>
+ %36 = sext <4 x i32> %34 to <4 x i64>
+ %37 = mul nsw <4 x i64> %35, %30
+ %38 = mul nsw <4 x i64> %36, %31
+ %39 = lshr <4 x i64> %37, splat (i64 32)
+ %40 = lshr <4 x i64> %38, splat (i64 32)
+ %41 = trunc nuw <4 x i64> %39 to <4 x i32>
+ %42 = trunc nuw <4 x i64> %40 to <4 x i32>
+ %43 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %41, <4 x i32> splat (i32 1073741823))
+ %44 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %42, <4 x i32> splat (i32 1073741823))
+ %45 = getelementptr i8, ptr %26, i64 16
+ store <4 x i32> %43, ptr %26, align 4
+ store <4 x i32> %44, ptr %45, align 4
+ %46 = add nuw i64 %20, 8
+ %47 = icmp eq i64 %46, %10
+ br i1 %47, label %48, label %19
+
+48:
+ ret void
+}
+
+define void @arm_mult_q15(ptr %0, ptr %1, ptr %2, i16 %3) {
+; CHECK-LABEL: arm_mult_q15:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w3, #0xffff
+; CHECK-NEXT: cmp w8, #4
+; CHECK-NEXT: b.lo .LBB1_7
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: ubfx w8, w3, #2, #14
+; CHECK-NEXT: sub w8, w8, #1
+; CHECK-NEXT: cmp w8, #3
+; CHECK-NEXT: b.lo .LBB1_7
+; CHECK-NEXT: // %bb.2:
+; CHECK-NEXT: sub x9, x2, x0
+; CHECK-NEXT: cmp x9, #32
+; CHECK-NEXT: b.lo .LBB1_7
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: sub x9, x2, x1
+; CHECK-NEXT: cmp x9, #32
+; CHECK-NEXT: b.lo .LBB1_7
+; CHECK-NEXT: // %bb.4:
+; CHECK-NEXT: cmp w8, #15
+; CHECK-NEXT: b.lo .LBB1_7
+; CHECK-NEXT: // %bb.5:
+; CHECK-NEXT: and x8, x8, #0xffff
+; CHECK-NEXT: add x9, x2, #16
+; CHECK-NEXT: add x10, x1, #16
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: add x11, x0, #16
+; CHECK-NEXT: and x8, x8, #0x1fff0
+; CHECK-NEXT: .LBB1_6: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldp q0, q3, [x10, #-16]
+; CHECK-NEXT: subs x8, x8, #16
+; CHECK-NEXT: ldp q1, q2, [x11, #-16]
+; CHECK-NEXT: add x10, x10, #32
+; CHECK-NEXT: add x11, x11, #32
+; CHECK-NEXT: sqdmull2 v4.4s, v0.8h, v1.8h
+; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: sqdmull2 v1.4s, v3.8h, v2.8h
+; CHECK-NEXT: sqdmull v2.4s, v3.4h, v2.4h
+; CHECK-NEXT: uzp2 v0.8h, v0.8h, v4.8h
+; CHECK-NEXT: uzp2 v1.8h, v2.8h, v1.8h
+; CHECK-NEXT: stp q0, q1, [x9, #-16]
+; CHECK-NEXT: add x9, x9, #32
+; CHECK-NEXT: b.ne .LBB1_6
+; CHECK-NEXT: .LBB1_7:
+; CHECK-NEXT: ret
+ %5 = ptrtoint ptr %1 to i64
+ %6 = ptrtoint ptr %0 to i64
+ %7 = ptrtoint ptr %2 to i64
+ %8 = icmp ult i16 %3, 4
+ br i1 %8, label %54, label %9
+
+9:
+ %10 = lshr i16 %3, 2
+ %11 = add nsw i16 %10, -1
+ %12 = zext i16 %11 to i64
+ %13 = add nuw nsw i64 %12, 1
+ %14 = icmp ult i16 %11, 3
+ br i1 %14, label %54, label %15
+
+15:
+ %16 = sub i64 %7, %6
+ %17 = icmp ult i64 %16, 32
+ %18 = sub i64 %7, %5
+ %19 = icmp ult i64 %18, 32
+ %20 = or i1 %17, %19
+ br i1 %20, label %54, label %21
+
+21:
+ %22 = icmp ult i16 %11, 15
+ br i1 %22, label %54, label %23
+
+23:
+ %24 = and i64 %13, 131056
+ br label %25
+
+25:
+ %26 = phi i64 [ 0, %23 ], [ %52, %25 ]
+ %27 = shl i64 %26, 1
+ %28 = getelementptr i8, ptr %0, i64 %27
+ %29 = shl i64 %26, 1
+ %30 = getelementptr i8, ptr %1, i64 %29
+ %31 = shl i64 %26, 1
+ %32 = getelementptr i8, ptr %2, i64 %31
+ %33 = getelementptr i8, ptr %28, i64 16
+ %34 = load <8 x i16>, ptr %28, align 2
+ %35 = load <8 x i16>, ptr %33, align 2
+ %36 = sext <8 x i16> %34 to <8 x i32>
+ %37 = sext <8 x i16> %35 to <8 x i32>
+ %38 = getelementptr i8, ptr %30, i64 16
+ %39 = load <8 x i16>, ptr %30, align 2
+ %40 = load <8 x i16>, ptr %38, align 2
+ %41 = sext <8 x i16> %39 to <8 x i32>
+ %42 = sext <8 x i16> %40 to <8 x i32>
+ %43 = mul nsw <8 x i32> %41, %36
+ %44 = mul nsw <8 x i32> %42, %37
+ %45 = lshr <8 x i32> %43, splat (i32 16)
+ %46 = lshr <8 x i32> %44, splat (i32 16)
+ %47 = trunc nuw <8 x i32> %45 to <8 x i16>
+ %48 = trunc nuw <8 x i32> %46 to <8 x i16>
+ %49 = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> %47, <8 x i16> splat (i16 16383))
+ %50 = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> %48, <8 x i16> splat (i16 16383))
+ %51 = getelementptr i8, ptr %32, i64 16
+ store <8 x i16> %49, ptr %32, align 2
+ store <8 x i16> %50, ptr %51, align 2
+ %52 = add nuw i64 %26, 16
+ %53 = icmp eq i64 %52, %24
+ br i1 %53, label %54, label %25
+
+54:
+ ret void
+}