[llvm] [AArch64] Mark [usp]mull as commutative (PR #152158)
Cullen Rhodes via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 7 07:21:26 PDT 2025
https://github.com/c-rhodes updated https://github.com/llvm/llvm-project/pull/152158
>From 18daf9049db88d5cff79d7a63ade1a90152c2437 Mon Sep 17 00:00:00 2001
From: Cullen Rhodes <cullen.rhodes at arm.com>
Date: Tue, 5 Aug 2025 14:14:01 +0000
Subject: [PATCH 1/2] [AArch64] Add tests for commutable [usp]mull, [us]addl,
[us]abdl
Precommit tests for PR #152158.
---
.../test/CodeGen/AArch64/arm64-neon-3vdiff.ll | 52 ++++++++++++-
.../CodeGen/AArch64/arm64-neon-mul-div.ll | 13 ++++
llvm/test/CodeGen/AArch64/arm64-vabs.ll | 60 +++++++++++++++
llvm/test/CodeGen/AArch64/arm64-vmul.ll | 77 +++++++++++++++++--
4 files changed, 192 insertions(+), 10 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
index 256ff94830113..557fcbbb41961 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
@@ -70,6 +70,30 @@ entry:
ret <2 x i64> %add.i
}
+define void @test_commutable_vaddl_s8(<8 x i8> %a, <8 x i8> %b, ptr %c) {
+; CHECK-SD-LABEL: test_commutable_vaddl_s8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: stp q0, q0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_commutable_vaddl_s8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: saddl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: saddl v0.8h, v1.8b, v0.8b
+; CHECK-GI-NEXT: stp q2, q0, [x0]
+; CHECK-GI-NEXT: ret
+entry:
+ %vmovl.i.i = sext <8 x i8> %a to <8 x i16>
+ %vmovl.i2.i = sext <8 x i8> %b to <8 x i16>
+ %add.i = add <8 x i16> %vmovl.i.i, %vmovl.i2.i
+ store <8 x i16> %add.i, ptr %c
+ %add.i2 = add <8 x i16> %vmovl.i2.i, %vmovl.i.i
+ %c.gep.1 = getelementptr i8, ptr %c, i64 16
+ store <8 x i16> %add.i2, ptr %c.gep.1
+ ret void
+}
+
define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vaddl_u8:
; CHECK: // %bb.0: // %entry
@@ -106,6 +130,30 @@ entry:
ret <2 x i64> %add.i
}
+define void @test_commutable_vaddl_u8(<8 x i8> %a, <8 x i8> %b, ptr %c) {
+; CHECK-SD-LABEL: test_commutable_vaddl_u8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: stp q0, q0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_commutable_vaddl_u8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: uaddl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: uaddl v0.8h, v1.8b, v0.8b
+; CHECK-GI-NEXT: stp q2, q0, [x0]
+; CHECK-GI-NEXT: ret
+entry:
+ %vmovl.i.i = zext <8 x i8> %a to <8 x i16>
+ %vmovl.i2.i = zext <8 x i8> %b to <8 x i16>
+ %add.i = add <8 x i16> %vmovl.i.i, %vmovl.i2.i
+ store <8 x i16> %add.i, ptr %c
+ %add.i2 = add <8 x i16> %vmovl.i2.i, %vmovl.i.i
+ %c.gep.1 = getelementptr i8, ptr %c, i64 16
+ store <8 x i16> %add.i2, ptr %c.gep.1
+ ret void
+}
+
define <8 x i16> @test_vaddl_a8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-SD-LABEL: test_vaddl_a8:
; CHECK-SD: // %bb.0: // %entry
@@ -2892,9 +2940,9 @@ define <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %scale.coer
; CHECK-GI-LABEL: cmplx_mul_combined_re_im:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: lsr x9, x0, #16
-; CHECK-GI-NEXT: adrp x8, .LCPI196_0
+; CHECK-GI-NEXT: adrp x8, .LCPI198_0
; CHECK-GI-NEXT: rev32 v4.8h, v0.8h
-; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI196_0]
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI198_0]
; CHECK-GI-NEXT: fmov d1, x9
; CHECK-GI-NEXT: dup v2.8h, v1.h[0]
; CHECK-GI-NEXT: sqneg v1.8h, v2.8h
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll
index ecf3f69825c0e..e3515124a6d34 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll
@@ -1608,6 +1608,19 @@ define <16 x i8> @poly_mulv16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
ret <16 x i8> %prod
}
+define <16 x i8> @commutable_poly_mul(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK-LABEL: commutable_poly_mul:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pmul v2.16b, v0.16b, v1.16b
+; CHECK-NEXT: pmul v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: add v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: ret
+ %1 = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+ %2 = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %rhs, <16 x i8> %lhs)
+ %3 = add <16 x i8> %1, %2
+ ret <16 x i8> %3
+}
+
declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
index 78881c80ccc10..8e3611d312612 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -44,6 +44,36 @@ define <2 x i64> @sabdl2d(ptr %A, ptr %B) nounwind {
ret <2 x i64> %tmp4
}
+define void @commutable_sabdl(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-SD-LABEL: commutable_sabdl:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ldr d1, [x1]
+; CHECK-SD-NEXT: sabdl.8h v0, v1, v0
+; CHECK-SD-NEXT: str q0, [x2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: commutable_sabdl:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: sabdl.8h v2, v0, v1
+; CHECK-GI-NEXT: sabdl.8h v0, v1, v0
+; CHECK-GI-NEXT: str q2, [x2]
+; CHECK-GI-NEXT: str q0, [x2]
+; CHECK-GI-NEXT: ret
+ %tmp1 = load <8 x i8>, ptr %A
+ %tmp2 = load <8 x i8>, ptr %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+ store <8 x i16> %tmp4, ptr %C
+ %tmp5 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp2, <8 x i8> %tmp1)
+ %tmp6 = zext <8 x i8> %tmp5 to <8 x i16>
+ %tmp7 = getelementptr i8, ptr %C, i64 16
+ store <8 x i16> %tmp6, ptr %C
+ ret void
+}
+
define <8 x i16> @sabdl2_8h(ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: sabdl2_8h:
; CHECK-SD: // %bb.0:
@@ -155,6 +185,36 @@ define <2 x i64> @uabdl2d(ptr %A, ptr %B) nounwind {
ret <2 x i64> %tmp4
}
+define void @commutable_uabdl(ptr %A, ptr %B, ptr %C) nounwind {
+; CHECK-SD-LABEL: commutable_uabdl:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ldr d1, [x1]
+; CHECK-SD-NEXT: uabdl.8h v0, v1, v0
+; CHECK-SD-NEXT: str q0, [x2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: commutable_uabdl:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: uabdl.8h v2, v0, v1
+; CHECK-GI-NEXT: uabdl.8h v0, v1, v0
+; CHECK-GI-NEXT: str q2, [x2]
+; CHECK-GI-NEXT: str q0, [x2]
+; CHECK-GI-NEXT: ret
+ %tmp1 = load <8 x i8>, ptr %A
+ %tmp2 = load <8 x i8>, ptr %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+ store <8 x i16> %tmp4, ptr %C
+ %tmp5 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp2, <8 x i8> %tmp1)
+ %tmp6 = zext <8 x i8> %tmp5 to <8 x i16>
+ %tmp7 = getelementptr i8, ptr %C, i64 16
+ store <8 x i16> %tmp6, ptr %C
+ ret void
+}
+
define <8 x i16> @uabdl2_8h(ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: uabdl2_8h:
; CHECK-SD: // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
index 07400bbb2f58c..cac980fd99b69 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=aarch64-none-elf -mattr=+aes -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
; CHECK-GI: warning: Instruction selection used fallback path for pmull8h
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for commutable_pmull8h
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmulh_1s
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2s
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_4s
@@ -78,6 +79,21 @@ define <2 x i64> @smull2d(ptr %A, ptr %B) nounwind {
ret <2 x i64> %tmp3
}
+define void @commutable_smull(<2 x i32> %A, <2 x i32> %B, ptr %C) {
+; CHECK-LABEL: commutable_smull:
+; CHECK: // %bb.0:
+; CHECK-NEXT: smull v2.2d, v0.2s, v1.2s
+; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
+; CHECK-NEXT: stp q2, q0, [x0]
+; CHECK-NEXT: ret
+ %1 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %B)
+ %2 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %B, <2 x i32> %A)
+ store <2 x i64> %1, ptr %C
+ %3 = getelementptr i8, ptr %C, i64 16
+ store <2 x i64> %2, ptr %3
+ ret void
+}
+
declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
@@ -121,6 +137,21 @@ define <2 x i64> @umull2d(ptr %A, ptr %B) nounwind {
ret <2 x i64> %tmp3
}
+define void @commutable_umull(<2 x i32> %A, <2 x i32> %B, ptr %C) {
+; CHECK-LABEL: commutable_umull:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umull v2.2d, v0.2s, v1.2s
+; CHECK-NEXT: umull v0.2d, v1.2s, v0.2s
+; CHECK-NEXT: stp q2, q0, [x0]
+; CHECK-NEXT: ret
+ %1 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %B)
+ %2 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %B, <2 x i32> %A)
+ store <2 x i64> %1, ptr %C
+ %3 = getelementptr i8, ptr %C, i64 16
+ store <2 x i64> %2, ptr %3
+ ret void
+}
+
declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
@@ -212,6 +243,21 @@ define <8 x i16> @pmull8h(ptr %A, ptr %B) nounwind {
ret <8 x i16> %tmp3
}
+define void @commutable_pmull8h(<8 x i8> %A, <8 x i8> %B, ptr %C) {
+; CHECK-LABEL: commutable_pmull8h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pmull v2.8h, v0.8b, v1.8b
+; CHECK-NEXT: pmull v0.8h, v1.8b, v0.8b
+; CHECK-NEXT: stp q2, q0, [x0]
+; CHECK-NEXT: ret
+ %1 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %A, <8 x i8> %B)
+ %2 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %B, <8 x i8> %A)
+ store <8 x i16> %1, ptr %C
+ %3 = getelementptr i8, ptr %C, i8 16
+ store <8 x i16> %2, ptr %3
+ ret void
+}
+
declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
define <4 x i16> @sqdmulh_4h(ptr %A, ptr %B) nounwind {
@@ -487,10 +533,10 @@ define void @smlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2,
; CHECK-GI-LABEL: smlal2d_chain_with_constant:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mvn v3.8b, v2.8b
-; CHECK-GI-NEXT: adrp x8, .LCPI27_0
+; CHECK-GI-NEXT: adrp x8, .LCPI30_0
; CHECK-GI-NEXT: smull v1.2d, v1.2s, v3.2s
; CHECK-GI-NEXT: smlal v1.2d, v0.2s, v2.2s
-; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI27_0]
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI30_0]
; CHECK-GI-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-NEXT: str q0, [x0]
; CHECK-GI-NEXT: ret
@@ -566,8 +612,8 @@ define void @smlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2,
;
; CHECK-GI-LABEL: smlsl2d_chain_with_constant:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: adrp x8, .LCPI31_0
-; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI31_0]
+; CHECK-GI-NEXT: adrp x8, .LCPI34_0
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI34_0]
; CHECK-GI-NEXT: smlsl v3.2d, v0.2s, v2.2s
; CHECK-GI-NEXT: mvn v0.8b, v2.8b
; CHECK-GI-NEXT: smlsl v3.2d, v1.2s, v0.2s
@@ -829,10 +875,10 @@ define void @umlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2,
; CHECK-GI-LABEL: umlal2d_chain_with_constant:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mvn v3.8b, v2.8b
-; CHECK-GI-NEXT: adrp x8, .LCPI43_0
+; CHECK-GI-NEXT: adrp x8, .LCPI46_0
; CHECK-GI-NEXT: umull v1.2d, v1.2s, v3.2s
; CHECK-GI-NEXT: umlal v1.2d, v0.2s, v2.2s
-; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI43_0]
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI46_0]
; CHECK-GI-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-NEXT: str q0, [x0]
; CHECK-GI-NEXT: ret
@@ -908,8 +954,8 @@ define void @umlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2,
;
; CHECK-GI-LABEL: umlsl2d_chain_with_constant:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: adrp x8, .LCPI47_0
-; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI47_0]
+; CHECK-GI-NEXT: adrp x8, .LCPI50_0
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI50_0]
; CHECK-GI-NEXT: umlsl v3.2d, v0.2s, v2.2s
; CHECK-GI-NEXT: mvn v0.8b, v2.8b
; CHECK-GI-NEXT: umlsl v3.2d, v1.2s, v0.2s
@@ -3222,6 +3268,21 @@ define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
ret <16 x i8> %val
}
+define <16 x i8> @test_commutable_pmull_64(i64 %l, i64 %r) nounwind {
+; CHECK-LABEL: test_commutable_pmull_64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d0, x1
+; CHECK-NEXT: fmov d1, x0
+; CHECK-NEXT: pmull v2.1q, v1.1d, v0.1d
+; CHECK-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT: add v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: ret
+ %1 = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
+ %2 = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %r, i64 %l)
+ %3 = add <16 x i8> %1, %2
+ ret <16 x i8> %3
+}
+
declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind {
>From fec5c49aed75303326e6ff329f5335785935def5 Mon Sep 17 00:00:00 2001
From: Cullen Rhodes <cullen.rhodes at arm.com>
Date: Tue, 5 Aug 2025 14:14:01 +0000
Subject: [PATCH 2/2] [AArch64] Mark [usp]mull as commutative
Fixes #61461.
---
llvm/include/llvm/IR/IntrinsicsAArch64.td | 32 ++++++++---------
.../lib/Target/AArch64/AArch64InstrFormats.td | 10 +++---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 4 +++
.../test/CodeGen/AArch64/arm64-neon-3vdiff.ll | 34 ++++++-------------
.../CodeGen/AArch64/arm64-neon-mul-div.ll | 5 ++-
llvm/test/CodeGen/AArch64/arm64-vabs.ll | 10 +++---
llvm/test/CodeGen/AArch64/arm64-vmul.ll | 20 +++++------
7 files changed, 49 insertions(+), 66 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index ca6e2128812f7..6d53bf8b172d8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -296,22 +296,22 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
def int_aarch64_neon_sqrdmlah : AdvSIMD_3IntArg_Intrinsic;
def int_aarch64_neon_sqrdmlsh : AdvSIMD_3IntArg_Intrinsic;
- // Vector Polynominal Multiply
- def int_aarch64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic;
-
- // Vector Long Multiply
- def int_aarch64_neon_smull : AdvSIMD_2VectorArg_Long_Intrinsic;
- def int_aarch64_neon_umull : AdvSIMD_2VectorArg_Long_Intrinsic;
- def int_aarch64_neon_pmull : AdvSIMD_2VectorArg_Long_Intrinsic;
-
- // 64-bit polynomial multiply really returns an i128, which is not legal. Fake
- // it with a v16i8.
- def int_aarch64_neon_pmull64 :
- DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
-
- // Vector Extending Multiply
- def int_aarch64_neon_fmulx : AdvSIMD_2FloatArg_Intrinsic {
- let IntrProperties = [IntrNoMem, Commutative];
+ let IntrProperties = [IntrNoMem, Commutative] in {
+ // Vector Polynomial Multiply
+ def int_aarch64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic;
+
+ // Vector Long Multiply
+ def int_aarch64_neon_smull : AdvSIMD_2VectorArg_Long_Intrinsic;
+ def int_aarch64_neon_umull : AdvSIMD_2VectorArg_Long_Intrinsic;
+ def int_aarch64_neon_pmull : AdvSIMD_2VectorArg_Long_Intrinsic;
+
+ // 64-bit polynomial multiply really returns an i128, which is not legal.
+ // Fake it with a v16i8.
+ def int_aarch64_neon_pmull64
+ : DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_i64_ty, llvm_i64_ty]>;
+
+ // Vector Extending Multiply
+ def int_aarch64_neon_fmulx : AdvSIMD_2FloatArg_Intrinsic;
}
// Vector Saturating Doubling Long Multiply
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index ba7cbccc0bcd6..d1684970775cf 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -7359,7 +7359,9 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
[(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
def v16i8 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
V128, V128, V128,
- asm#"2", ".8h", ".16b", ".16b", []>;
+ asm#"2", ".8h", ".16b", ".16b",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))),
+ (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm)))))]>;
let Predicates = [HasAES] in {
def v1i64 : BaseSIMDDifferentThreeVector<U, 0b110, opc,
V128, V64, V64,
@@ -7371,10 +7373,6 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
[(set (v16i8 V128:$Rd), (OpNode (extract_high_v2i64 (v2i64 V128:$Rn)),
(extract_high_v2i64 (v2i64 V128:$Rm))))]>;
}
-
- def : Pat<(v8i16 (OpNode (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))),
- (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm))))),
- (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>;
}
multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
@@ -7399,6 +7397,7 @@ multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
(extract_high_v4i32 (v4i32 V128:$Rm))))]>;
}
+let isCommutable = 1 in
multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
@@ -7480,6 +7479,7 @@ multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc,
(extract_high_v4i32 (v4i32 V128:$Rm)))))))]>;
}
+let isCommutable = 1 in
multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ac31236d8f2cf..8cfbff938a395 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6055,6 +6055,7 @@ defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>;
defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>;
defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
+let isCommutable = 1 in
defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
TriOpFrag<(add node:$LHS, (abds node:$MHS, node:$RHS))> >;
@@ -6806,6 +6807,7 @@ defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>
defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>;
defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
+let isCommutable = 1 in
defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>;
defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", abds>;
defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", abds>;
@@ -6822,6 +6824,7 @@ defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", saddsat>;
defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", ssubsat>;
defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
int_aarch64_neon_sqdmull>;
+let isCommutable = 0 in
defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
@@ -6836,6 +6839,7 @@ defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", AArch64umull>;
+let isCommutable = 0 in
defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>;
defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
index 557fcbbb41961..9a1b6a0661e6b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
@@ -71,18 +71,11 @@ entry:
}
define void @test_commutable_vaddl_s8(<8 x i8> %a, <8 x i8> %b, ptr %c) {
-; CHECK-SD-LABEL: test_commutable_vaddl_s8:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v1.8b
-; CHECK-SD-NEXT: stp q0, q0, [x0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_commutable_vaddl_s8:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: saddl v2.8h, v0.8b, v1.8b
-; CHECK-GI-NEXT: saddl v0.8h, v1.8b, v0.8b
-; CHECK-GI-NEXT: stp q2, q0, [x0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_commutable_vaddl_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: saddl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: stp q0, q0, [x0]
+; CHECK-NEXT: ret
entry:
%vmovl.i.i = sext <8 x i8> %a to <8 x i16>
%vmovl.i2.i = sext <8 x i8> %b to <8 x i16>
@@ -131,18 +124,11 @@ entry:
}
define void @test_commutable_vaddl_u8(<8 x i8> %a, <8 x i8> %b, ptr %c) {
-; CHECK-SD-LABEL: test_commutable_vaddl_u8:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b
-; CHECK-SD-NEXT: stp q0, q0, [x0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_commutable_vaddl_u8:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: uaddl v2.8h, v0.8b, v1.8b
-; CHECK-GI-NEXT: uaddl v0.8h, v1.8b, v0.8b
-; CHECK-GI-NEXT: stp q2, q0, [x0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_commutable_vaddl_u8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: stp q0, q0, [x0]
+; CHECK-NEXT: ret
entry:
%vmovl.i.i = zext <8 x i8> %a to <8 x i16>
%vmovl.i2.i = zext <8 x i8> %b to <8 x i16>
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll
index e3515124a6d34..0d427c05e3b77 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll
@@ -1611,9 +1611,8 @@ define <16 x i8> @poly_mulv16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
define <16 x i8> @commutable_poly_mul(<16 x i8> %lhs, <16 x i8> %rhs) {
; CHECK-LABEL: commutable_poly_mul:
; CHECK: // %bb.0:
-; CHECK-NEXT: pmul v2.16b, v0.16b, v1.16b
-; CHECK-NEXT: pmul v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: add v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: pmul v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: add v0.16b, v0.16b, v0.16b
; CHECK-NEXT: ret
%1 = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
%2 = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %rhs, <16 x i8> %lhs)
diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
index 8e3611d312612..ede5a7c2ae42b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -57,9 +57,8 @@ define void @commutable_sabdl(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr d0, [x0]
; CHECK-GI-NEXT: ldr d1, [x1]
-; CHECK-GI-NEXT: sabdl.8h v2, v0, v1
-; CHECK-GI-NEXT: sabdl.8h v0, v1, v0
-; CHECK-GI-NEXT: str q2, [x2]
+; CHECK-GI-NEXT: sabdl.8h v0, v0, v1
+; CHECK-GI-NEXT: str q0, [x2]
; CHECK-GI-NEXT: str q0, [x2]
; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
@@ -198,9 +197,8 @@ define void @commutable_uabdl(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ldr d0, [x0]
; CHECK-GI-NEXT: ldr d1, [x1]
-; CHECK-GI-NEXT: uabdl.8h v2, v0, v1
-; CHECK-GI-NEXT: uabdl.8h v0, v1, v0
-; CHECK-GI-NEXT: str q2, [x2]
+; CHECK-GI-NEXT: uabdl.8h v0, v0, v1
+; CHECK-GI-NEXT: str q0, [x2]
; CHECK-GI-NEXT: str q0, [x2]
; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
index cac980fd99b69..d12f7ce33e246 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -82,9 +82,8 @@ define <2 x i64> @smull2d(ptr %A, ptr %B) nounwind {
define void @commutable_smull(<2 x i32> %A, <2 x i32> %B, ptr %C) {
; CHECK-LABEL: commutable_smull:
; CHECK: // %bb.0:
-; CHECK-NEXT: smull v2.2d, v0.2s, v1.2s
-; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
-; CHECK-NEXT: stp q2, q0, [x0]
+; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: stp q0, q0, [x0]
; CHECK-NEXT: ret
%1 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %B)
%2 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %B, <2 x i32> %A)
@@ -140,9 +139,8 @@ define <2 x i64> @umull2d(ptr %A, ptr %B) nounwind {
define void @commutable_umull(<2 x i32> %A, <2 x i32> %B, ptr %C) {
; CHECK-LABEL: commutable_umull:
; CHECK: // %bb.0:
-; CHECK-NEXT: umull v2.2d, v0.2s, v1.2s
-; CHECK-NEXT: umull v0.2d, v1.2s, v0.2s
-; CHECK-NEXT: stp q2, q0, [x0]
+; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: stp q0, q0, [x0]
; CHECK-NEXT: ret
%1 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %B)
%2 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %B, <2 x i32> %A)
@@ -246,9 +244,8 @@ define <8 x i16> @pmull8h(ptr %A, ptr %B) nounwind {
define void @commutable_pmull8h(<8 x i8> %A, <8 x i8> %B, ptr %C) {
; CHECK-LABEL: commutable_pmull8h:
; CHECK: // %bb.0:
-; CHECK-NEXT: pmull v2.8h, v0.8b, v1.8b
-; CHECK-NEXT: pmull v0.8h, v1.8b, v0.8b
-; CHECK-NEXT: stp q2, q0, [x0]
+; CHECK-NEXT: pmull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: stp q0, q0, [x0]
; CHECK-NEXT: ret
%1 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %A, <8 x i8> %B)
%2 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %B, <8 x i8> %A)
@@ -3273,9 +3270,8 @@ define <16 x i8> @test_commutable_pmull_64(i64 %l, i64 %r) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, x1
; CHECK-NEXT: fmov d1, x0
-; CHECK-NEXT: pmull v2.1q, v1.1d, v0.1d
-; CHECK-NEXT: pmull v0.1q, v0.1d, v1.1d
-; CHECK-NEXT: add v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT: add v0.16b, v0.16b, v0.16b
; CHECK-NEXT: ret
%1 = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
%2 = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %r, i64 %l)
More information about the llvm-commits
mailing list