[llvm] 3e6d1a6 - [NFC][AArch64] Precommit test to optimize instruction selection for aarch64_neon_pmull64 intrinsic.
Mingming Liu via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 19 13:18:07 PDT 2022
Author: Mingming Liu
Date: 2022-08-19T13:17:13-07:00
New Revision: 3e6d1a6f54264bb50216069c68845b57428fc932
URL: https://github.com/llvm/llvm-project/commit/3e6d1a6f54264bb50216069c68845b57428fc932
DIFF: https://github.com/llvm/llvm-project/commit/3e6d1a6f54264bb50216069c68845b57428fc932.diff
LOG: [NFC][AArch64] Precommit test to optimize instruction selection for aarch64_neon_pmull64 intrinsic.
Differential Revision: https://reviews.llvm.org/D131045
Added:
llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
Modified:
llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll b/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
new file mode 100644
index 000000000000..86112fd934b6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s| FileCheck %s --check-prefixes=CHECK
+
+; User code intends to execute {pmull, pmull2} instructions directly on the {lower, higher} halves of the same vector registers.
+; Test that PMULL2 is generated for higher-half operands.
+; The current suboptimal code generation fails to use the higher-half contents in place; instead, it moves them to the lower lane
+; so that PMULL can be used everywhere, which generates unnecessary moves.
+define void @test1(ptr %0, ptr %1) {
+; CHECK-LABEL: test1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q1, [x1]
+; CHECK-NEXT: mov w8, #56824
+; CHECK-NEXT: mov w9, #61186
+; CHECK-NEXT: movk w8, #40522, lsl #16
+; CHECK-NEXT: movk w9, #29710, lsl #16
+; CHECK-NEXT: mov x10, v0.d[1]
+; CHECK-NEXT: fmov d2, x9
+; CHECK-NEXT: mov x11, v1.d[1]
+; CHECK-NEXT: fmov d3, x8
+; CHECK-NEXT: fmov d4, x10
+; CHECK-NEXT: pmull v0.1q, v0.1d, v2.1d
+; CHECK-NEXT: fmov d5, x11
+; CHECK-NEXT: pmull v1.1q, v1.1d, v2.1d
+; CHECK-NEXT: pmull v2.1q, v4.1d, v3.1d
+; CHECK-NEXT: pmull v3.1q, v5.1d, v3.1d
+; CHECK-NEXT: eor v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: eor v1.16b, v1.16b, v3.16b
+; CHECK-NEXT: stp q0, q1, [x1]
+; CHECK-NEXT: ret
+ %3 = load <2 x i64>, ptr %1
+ %4 = getelementptr inbounds <2 x i64>, ptr %1, i64 1
+ %5 = load <2 x i64>, ptr %4
+ %6 = extractelement <2 x i64> %3, i64 1
+ %7 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %6, i64 2655706616)
+ %8 = extractelement <2 x i64> %5, i64 1
+ %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %8, i64 2655706616)
+ %10 = load <2 x i64>, ptr %0
+ %11 = getelementptr inbounds i8, ptr %0, i64 16
+ %12 = load <2 x i64>, ptr %11
+ %13 = extractelement <2 x i64> %3, i64 0
+ %14 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %13, i64 1947135746)
+ %15 = extractelement <2 x i64> %5, i64 0
+ %16 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %15, i64 1947135746)
+ %17 = xor <16 x i8> %14, %7
+ %18 = xor <16 x i8> %16, %9
+ store <16 x i8> %17, ptr %1
+ store <16 x i8> %18, ptr %4
+ ret void
+}
+
+; One operand is the higher half of a SIMD register, and the other operand is the lower half of another SIMD register.
+; Tests that codegen doesn't generate unnecessary moves.
+define void @test2(ptr %0, <2 x i64> %1, <2 x i64> %2) {
+; CHECK-LABEL: test2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %4 = extractelement <2 x i64> %1, i64 1
+ %5 = extractelement <2 x i64> %2, i64 0
+ %6 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %4, i64 %5)
+ store <16 x i8> %6, ptr %0, align 16
+ ret void
+}
+
+declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
diff --git a/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll b/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
index 4ba9ea4e328e..a8127c300047 100644
--- a/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
+++ b/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
@@ -1,9 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s| FileCheck %s --check-prefixes=CHECK
-; Tests that scalar i64 arguments of llvm.aarch64.neon.pmull64 are
-; loaded into SIMD registers, as opposed to being loaded into GPR followed by a mov.
-
+; Two operands are in scalar form.
+; Tests that both operands are loaded into SIMD registers directly as opposed to being loaded into GPR followed by a fmov.
define void @test1(ptr %0, i64 %1, i64 %2) {
; CHECK-LABEL: test1:
; CHECK: // %bb.0:
@@ -12,7 +11,7 @@ define void @test1(ptr %0, i64 %1, i64 %2) {
; CHECK-NEXT: ldr d0, [x8, #8]
; CHECK-NEXT: ldr d1, [x9, #8]
; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d
-; CHECK-NEXT: str q0, [x9]
+; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
%4 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
%5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
@@ -20,10 +19,12 @@ define void @test1(ptr %0, i64 %1, i64 %2) {
%7 = getelementptr inbounds <2 x i64>, ptr %0, i64 %2, i64 1
%8 = load i64, ptr %7, align 8
%9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %6, i64 %8)
- store <16 x i8> %9, ptr %4, align 16
+ store <16 x i8> %9, ptr %0, align 16
ret void
}
; Operand %8 is the higher half of a v2i64, and operand %7 is a scalar load.
; Tests that %7 is loaded into a SIMD register directly as opposed to being loaded into a GPR followed by a fmov.
define void @test2(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
; CHECK-LABEL: test2:
; CHECK: // %bb.0:
@@ -31,18 +32,20 @@ define void @test2(ptr %0, i64 %1, i64 %2, <2 x i64> %3) {
; CHECK-NEXT: add x8, x0, x1, lsl #4
; CHECK-NEXT: ldr d0, [x8, #8]
; CHECK-NEXT: fmov d1, x9
-; CHECK-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d
; CHECK-NEXT: str q0, [x8]
; CHECK-NEXT: ret
%5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1
%6 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1, i64 1
%7 = load i64, ptr %6, align 8
%8 = extractelement <2 x i64> %3, i64 1
- %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %7, i64 %8)
+ %9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %8, i64 %7)
store <16 x i8> %9, ptr %5, align 16
ret void
}
; Operand %7 is a scalar load, and operand %3 is an input parameter of function `test3`.
+; Test that %7 is loaded into SIMD registers.
define void @test3(ptr %0, i64 %1, i64 %2, i64 %3) {
; CHECK-LABEL: test3:
; CHECK: // %bb.0:
@@ -60,4 +63,21 @@ define void @test3(ptr %0, i64 %1, i64 %2, i64 %3) {
ret void
}
+; Operand %4 is the higher half of a v2i64, and operand %2 is an i64 input parameter.
+; Test that %2 is duplicated into the proper lane of a SIMD register directly for optimal codegen.
+define void @test4(ptr %0, <2 x i64> %1, i64 %2) {
+; CHECK-LABEL: test4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: fmov d0, x1
+; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %4 = extractelement <2 x i64> %1, i64 1
+ %5 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %4, i64 %2)
+ store <16 x i8> %5, ptr %0, align 16
+ ret void
+}
+
declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
More information about the llvm-commits
mailing list