[llvm] [AArch64] Avoid GPR trip when moving truncated i32 vector elements (PR #114541)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 1 07:11:24 PDT 2024
https://github.com/SpencerAbson created https://github.com/llvm/llvm-project/pull/114541
This patch introduces ISEL patterns that enable the use of [INS](https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en) (element), operating at the truncated element size, when moving an i64 vector element that has been truncated to i32 into another vector.
For example,
```
define <4 x i32> @test(<4 x i32> %a, <2 x i64> %b) {
%c = extractelement <2 x i64> %b, i32 1
%d = trunc i64 %c to i32
%e = insertelement <4 x i32> %a, i32 %d, i64 3
ret <4 x i32> %e
}
```
Can use
```
mov v0.s[3], v1.s[2]
ret
```
Instead of
```
mov x8, v1.d[1]
mov v0.s[3], w8
```
>From 5cd15cb492a6c023e424ef232ebf1ae6ccfe9def Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Thu, 31 Oct 2024 14:46:09 +0000
Subject: [PATCH 1/3] [AArch64] Avoid GPR trip when moving truncated i32 vector
elements
This patch introduces ISEL patterns to enable the use of INS (element, of truncated size) when moving a
truncated (from i64) i32 element between vectors.
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 53 +++++++--
.../CodeGen/AArch64/neon-ins-trunc-elt.ll | 106 ++++++++++++++++++
2 files changed, 149 insertions(+), 10 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 250d6144f75318..f92532c27fe154 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -683,6 +683,22 @@ def topbitsallzero64: PatLeaf<(i64 GPR64:$src), [{
CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(64, 63));
}]>;
+def VectorIndexStoH : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+
+def VectorIndexDtoH : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
+}]>;
+
+def VectorIndexStoB : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
+}]>;
+
+def VectorIndexHtoB : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+
// Node definitions.
def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
def AArch64adr : SDNode<"AArch64ISD::ADR", SDTIntUnaryOp, []>;
@@ -7281,6 +7297,33 @@ defm : Neon_INS_elt_pattern<v8i16, v4i16, nxv8i16, i32, VectorIndexH, INSvi1
defm : Neon_INS_elt_pattern<v4i32, v2i32, nxv4i32, i32, VectorIndexS, INSvi32lane>;
defm : Neon_INS_elt_pattern<v2i64, v1i64, nxv2i64, i64, VectorIndexD, INSvi64lane>;
+// Remove GPR trip when inserting extracted truncated i64 into vector of i32.
+// From another NEON vector
+def : Pat<(v4i32 (vector_insert v4i32:$src,
+ (i32 (trunc (i64 (vector_extract v2i64:$Rn, (i64 imm:$Immn))))),
+ (i64 imm:$Immd))),
+ (INSvi32lane V128:$src, imm:$Immd, V128:$Rn, (VectorIndexHtoB imm:$Immn))>;
+
+def : Pat<(v2i32 (vector_insert v2i32:$src,
+ (i32 (trunc (i64 (vector_extract v2i64:$Rn, (i64 imm:$Immn))))),
+ (i64 imm:$Immd))),
+ (EXTRACT_SUBREG (INSvi32lane (SUBREG_TO_REG (i64 0), V64:$src, dsub),
+ imm:$Immd, V128:$Rn, (VectorIndexHtoB imm:$Immn)),
+ dsub)>;
+// From the bottom 128b of an SVE vector
+def : Pat<(v4i32 (vector_insert v4i32:$Rn,
+ (i32 (trunc (i64 (vector_extract nxv2i64:$Rm, (i64 VectorIndexD:$Immn))))),
+ (i64 imm:$Immd))),
+ (INSvi32lane V128:$Rn, imm:$Immd, (EXTRACT_SUBREG nxv2i64:$Rm, zsub), (VectorIndexHtoB VectorIndexD:$Immn))>;
+
+def : Pat<(v2i32 (vector_insert v2i32:$Rn,
+ (i32 (trunc (i64 (vector_extract nxv2i64:$Rm, (i64 VectorIndexD:$Immn))))),
+ (i64 imm:$Immd))),
+ (EXTRACT_SUBREG
+ (INSvi32lane (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immd,
+ (EXTRACT_SUBREG nxv2i64:$Rm, zsub), (VectorIndexHtoB VectorIndexD:$Immn)),
+ dsub)>;
+
// Insert from bitcast
// vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0)
def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), (i64 imm:$Immd))),
@@ -8700,16 +8743,6 @@ class Ld1Lane64IdxOpPat<SDPatternOperator scalar_load, Operand VecIndex,
(IdxOp VecIndex:$idx), GPR64sp:$Rn),
dsub)>;
-def VectorIndexStoH : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
-}]>;
-def VectorIndexStoB : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
-}]>;
-def VectorIndexHtoB : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
-}]>;
-
def : Ld1Lane128IdxOpPat<extloadi16, VectorIndexS, v4i32, i32, LD1i16, VectorIndexStoH>;
def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexS, v4i32, i32, LD1i8, VectorIndexStoB>;
def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexH, v8i16, i32, LD1i8, VectorIndexHtoB>;
diff --git a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
new file mode 100644
index 00000000000000..ab11f12d53790f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+neon < %s | FileCheck %s
+
+; Inserting a truncated (i64 to i32) element from the bottom 128-bits of any vector type into a NEON vector should use INS (element) of the
+; truncated size to avoid pointless GPR trips.
+
+
+define <2 x i32> @test_s_trunc_d_lane0(<2 x i32> %a, <1 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_d_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <1 x i64> %b, i32 0
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 0
+ ret <2 x i32> %e
+}
+
+define <2 x i32> @test_s_trunc_d_qlane1(<2 x i32> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_d_qlane1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[0], v1.s[2]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <2 x i64> %b, i32 1
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 0
+ ret <2 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_d_lane0(<4 x i32> %a, <1 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_d_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: ret
+ %c = extractelement <1 x i64> %b, i32 0
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 0
+ ret <4 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_d_qlane1(<4 x i32> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_d_qlane1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.s[3], v1.s[2]
+; CHECK-NEXT: ret
+ %c = extractelement <2 x i64> %b, i32 1
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 3
+ ret <4 x i32> %e
+}
+
+; ---- From the bottom 128b of an SVE vector
+
+define <2 x i32> @test_s_trunc_dsve_lane0(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 0
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 0
+ ret <2 x i32> %e
+}
+
+define <2 x i32> @test_s_trunc_dsve_lane1(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[1], v1.s[2]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 1
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 1
+ ret <2 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_dsve_lane0(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 0
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 0
+ ret <4 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_dsve_lane1(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.s[3], v1.s[2]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 1
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 3
+ ret <4 x i32> %e
+}
>From cb225b4b2704af18acc4ce8e78ac430b5b27cfc9 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Fri, 1 Nov 2024 13:25:07 +0000
Subject: [PATCH 2/3] [NFC] Remove unused node operator
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 6 ------
1 file changed, 6 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f92532c27fe154..4f00a7a187723b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -686,15 +686,9 @@ def topbitsallzero64: PatLeaf<(i64 GPR64:$src), [{
def VectorIndexStoH : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
}]>;
-
-def VectorIndexDtoH : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
-}]>;
-
def VectorIndexStoB : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
}]>;
-
def VectorIndexHtoB : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
}]>;
>From d25df8c9c8adabf741f476d988b7d0b19ef32519 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Fri, 1 Nov 2024 13:54:01 +0000
Subject: [PATCH 3/3] [NFC] Add negative tests for bound of SVE extraction
---
.../CodeGen/AArch64/neon-ins-trunc-elt.ll | 30 +++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
index ab11f12d53790f..4c189cd2886e8a 100644
--- a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
+++ b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
@@ -83,6 +83,22 @@ define <2 x i32> @test_s_trunc_dsve_lane1(<2 x i32> %a, <vscale x 2 x i64> %b) {
ret <2 x i32> %e
}
+; (negative test) Extracted element is not within V-register.
+define <2 x i32> @test_s_trunc_dsve_lane2(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, z1.d[2]
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: mov v0.s[1], w8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 2
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 1
+ ret <2 x i32> %e
+}
+
define <4 x i32> @test_qs_trunc_dsve_lane0(<4 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_dsve_lane0:
; CHECK: // %bb.0:
@@ -104,3 +120,17 @@ define <4 x i32> @test_qs_trunc_dsve_lane1(<4 x i32> %a, <vscale x 2 x i64> %b)
%e = insertelement <4 x i32> %a, i32 %d, i64 3
ret <4 x i32> %e
}
+
+; (negative test) Extracted element is not within V-register.
+define <4 x i32> @test_qs_trunc_dsve_lane2(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, z1.d[2]
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: mov v0.s[3], w8
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 2
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 3
+ ret <4 x i32> %e
+}
More information about the llvm-commits
mailing list