[llvm] bb84dd8 - [AArch64] Add a tablegen pattern for RADDHN/RADDHN2.

Alexandros Lamprineas via llvm-commits <llvm-commits at lists.llvm.org>
Fri Dec 24 03:31:28 PST 2021


Author: Alexandros Lamprineas
Date: 2021-12-24T11:13:25Z
New Revision: bb84dd81590bbcaa2277f7f15700ac842af8932e

URL: https://github.com/llvm/llvm-project/commit/bb84dd81590bbcaa2277f7f15700ac842af8932e
DIFF: https://github.com/llvm/llvm-project/commit/bb84dd81590bbcaa2277f7f15700ac842af8932e.diff

LOG: [AArch64] Add a tablegen pattern for RADDHN/RADDHN2.

Converts RSHRN/RSHRN2 to RADDHN/RADDHN2 when the shift amount is half
the width of the vector element. The latter has twice the throughput
and half the latency on Arm out-of-order cores. Setting up the zero
register adds no latency.
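
The two forms compute the same value: for a 16-bit source element x,
RSHRN #8 produces (x + 128) >> 8 narrowed to 8 bits, while RADDHN with
a zero second operand produces (x + 0 + 128) >> 8 narrowed, i.e. the
same rounded high half. A minimal sketch of both forms using ACLE NEON
intrinsics (illustrative only, not part of the patch; the function
names are made up):

  #include <arm_neon.h>

  // rshrn v0.8b, v0.8h, #8: rounding shift right by half the
  // element width, then narrow.
  int8x8_t narrow_shift(int16x8_t x) {
      return vrshrn_n_s16(x, 8);
  }

  // raddhn v0.8b, v0.8h, v1.8h with v1 == 0: rounding add against
  // zero, then take the high half; bit-identical to narrow_shift.
  int8x8_t narrow_addhn(int16x8_t x) {
      return vraddhn_s16(x, vdupq_n_s16(0));
  }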

Differential Revision: https://reviews.llvm.org/D116166

Added: 
    llvm/test/CodeGen/AArch64/arm64-raddhn-combine.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64InstrInfo.td

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 27013b51d9668..ebccc07edc7aa 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6603,6 +6603,34 @@ defm USHR    : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
 defm USRA    : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
                 TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
 
+// RADDHN patterns for when RSHRN shifts by half the size of the vector element
+def : Pat<(v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))),
+          (RADDHNv8i16_v8i8 V128:$Vn, (v8i16 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v4i16 (int_aarch64_neon_rshrn (v4i32 V128:$Vn), (i32 16))),
+          (RADDHNv4i32_v4i16 V128:$Vn, (v4i32 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))),
+          (RADDHNv2i64_v2i32 V128:$Vn, (v2i64 (MOVIv2d_ns (i32 0))))>;
+
+// RADDHN2 patterns for when RSHRN shifts by half the size of the vector element
+def : Pat<(v16i8 (concat_vectors
+                 (v8i8 V64:$Vd),
+                 (v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))))),
+          (RADDHNv8i16_v16i8
+                 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
+                 (v8i16 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v8i16 (concat_vectors
+                 (v4i16 V64:$Vd),
+                 (v4i16 (int_aarch64_neon_rshrn (v4i32 V128:$Vn), (i32 16))))),
+          (RADDHNv4i32_v8i16
+                 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
+                 (v4i32 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v4i32 (concat_vectors
+                 (v2i32 V64:$Vd),
+                 (v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))))),
+          (RADDHNv2i64_v4i32
+                 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
+                 (v2i64 (MOVIv2d_ns (i32 0))))>;
+
 // SHRN patterns for when a logical right shift was used instead of arithmetic
 // (the immediate guarantees no sign bits actually end up in the result so it
 // doesn't matter).

diff --git a/llvm/test/CodeGen/AArch64/arm64-raddhn-combine.ll b/llvm/test/CodeGen/AArch64/arm64-raddhn-combine.ll
new file mode 100644
index 0000000000000..a2547782f6b2a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-raddhn-combine.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck %s
+
+define <16 x i8> @test_combine_v8i16_to_v16i8(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: test_combine_v8i16_to_v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    raddhn v0.8b, v0.8h, v2.8h
+; CHECK-NEXT:    raddhn2 v0.16b, v1.8h, v2.8h
+; CHECK-NEXT:    ret
+entry:
+  %res = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %x, i32 8)
+  %res2 = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %y, i32 8)
+  %shuffle = shufflevector <8 x i8> %res, <8 x i8> %res2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @test_combine_v4i32_to_v8i16(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_combine_v4i32_to_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    raddhn v0.4h, v0.4s, v2.4s
+; CHECK-NEXT:    raddhn2 v0.8h, v1.4s, v2.4s
+; CHECK-NEXT:    ret
+entry:
+  %res = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %x, i32 16)
+  %res2 = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %y, i32 16)
+  %shuffle = shufflevector <4 x i16> %res, <4 x i16> %res2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @test_combine_v2i64_to_v4i32(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: test_combine_v2i64_to_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    raddhn v0.2s, v0.2d, v2.2d
+; CHECK-NEXT:    raddhn2 v0.4s, v1.2d, v2.2d
+; CHECK-NEXT:    ret
+entry:
+  %res = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %x, i32 32)
+  %res2 = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %y, i32 32)
+  %shuffle = shufflevector <2 x i32> %res, <2 x i32> %res2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %shuffle
+}
+
+declare <8 x i8>  @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
+declare <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32>, i32)
+declare <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64>, i32)
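
For illustration, C source like the following (a sketch using ACLE
intrinsics; the function name is made up) has the same intrinsic plus
concat_vectors shape as the tests above, and with this patch it should
now select raddhn/raddhn2 rather than rshrn/rshrn2:

  #include <arm_neon.h>

  // The inner vrshrn_n_s16 narrows the low half; vrshrn_high_n_s16
  // narrows into the upper half of the 128-bit result, which matches
  // the shufflevector in the tests.
  int8x16_t narrow_both_halves(int16x8_t lo, int16x8_t hi) {
      return vrshrn_high_n_s16(vrshrn_n_s16(lo, 8), hi, 8);
  }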
