[llvm] [AArch64] Avoid GPR trip when moving truncated i32 vector elements (PR #114541)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 1 07:11:56 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: None (SpencerAbson)
<details>
<summary>Changes</summary>
This patch introduces ISEL patterns to enable the use of [INS](https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en) (element (of truncated size)) when moving a truncated i32 element between vectors.
For example,
```
define <4 x i32> @<!-- -->test(<4 x i32> %a, <2 x i64> %b) {
%c = extractelement <2 x i64> %b, i32 1
%d = trunc i64 %c to i32
%e = insertelement <4 x i32> %a, i32 %d, i64 3
ret <4 x i32> %e
}
```
Can use
```
mov v0.s[3], v1.s[2]
ret
```
Instead of
```
mov x8, v1.d[1]
mov v0.s[3], w8
```
---
Full diff: https://github.com/llvm/llvm-project/pull/114541.diff
2 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+37-10)
- (added) llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll (+136)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 250d6144f75318..4f00a7a187723b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -683,6 +683,16 @@ def topbitsallzero64: PatLeaf<(i64 GPR64:$src), [{
CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(64, 63));
}]>;
+def VectorIndexStoH : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+def VectorIndexStoB : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
+}]>;
+def VectorIndexHtoB : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+
// Node definitions.
def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
def AArch64adr : SDNode<"AArch64ISD::ADR", SDTIntUnaryOp, []>;
@@ -7281,6 +7291,33 @@ defm : Neon_INS_elt_pattern<v8i16, v4i16, nxv8i16, i32, VectorIndexH, INSvi1
defm : Neon_INS_elt_pattern<v4i32, v2i32, nxv4i32, i32, VectorIndexS, INSvi32lane>;
defm : Neon_INS_elt_pattern<v2i64, v1i64, nxv2i64, i64, VectorIndexD, INSvi64lane>;
+// Remove the GPR trip when inserting a truncated i64 element (extracted from a vector) into a vector of i32.
+// From another NEON vector
+def : Pat<(v4i32 (vector_insert v4i32:$src,
+ (i32 (trunc (i64 (vector_extract v2i64:$Rn, (i64 imm:$Immn))))),
+ (i64 imm:$Immd))),
+ (INSvi32lane V128:$src, imm:$Immd, V128:$Rn, (VectorIndexHtoB imm:$Immn))>;
+
+def : Pat<(v2i32 (vector_insert v2i32:$src,
+ (i32 (trunc (i64 (vector_extract v2i64:$Rn, (i64 imm:$Immn))))),
+ (i64 imm:$Immd))),
+ (EXTRACT_SUBREG (INSvi32lane (SUBREG_TO_REG (i64 0), V64:$src, dsub),
+ imm:$Immd, V128:$Rn, (VectorIndexHtoB imm:$Immn)),
+ dsub)>;
+// From the bottom 128b of an SVE vector
+def : Pat<(v4i32 (vector_insert v4i32:$Rn,
+ (i32 (trunc (i64 (vector_extract nxv2i64:$Rm, (i64 VectorIndexD:$Immn))))),
+ (i64 imm:$Immd))),
+ (INSvi32lane V128:$Rn, imm:$Immd, (EXTRACT_SUBREG nxv2i64:$Rm, zsub), (VectorIndexHtoB VectorIndexD:$Immn))>;
+
+def : Pat<(v2i32 (vector_insert v2i32:$Rn,
+ (i32 (trunc (i64 (vector_extract nxv2i64:$Rm, (i64 VectorIndexD:$Immn))))),
+ (i64 imm:$Immd))),
+ (EXTRACT_SUBREG
+ (INSvi32lane (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immd,
+ (EXTRACT_SUBREG nxv2i64:$Rm, zsub), (VectorIndexHtoB VectorIndexD:$Immn)),
+ dsub)>;
+
// Insert from bitcast
// vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0)
def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), (i64 imm:$Immd))),
@@ -8700,16 +8737,6 @@ class Ld1Lane64IdxOpPat<SDPatternOperator scalar_load, Operand VecIndex,
(IdxOp VecIndex:$idx), GPR64sp:$Rn),
dsub)>;
-def VectorIndexStoH : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
-}]>;
-def VectorIndexStoB : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
-}]>;
-def VectorIndexHtoB : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
-}]>;
-
def : Ld1Lane128IdxOpPat<extloadi16, VectorIndexS, v4i32, i32, LD1i16, VectorIndexStoH>;
def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexS, v4i32, i32, LD1i8, VectorIndexStoB>;
def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexH, v8i16, i32, LD1i8, VectorIndexHtoB>;
diff --git a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
new file mode 100644
index 00000000000000..4c189cd2886e8a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+neon < %s | FileCheck %s
+
+; Inserting a truncated (i64 to i32) element from the bottom 128-bits of any vector type into a NEON vector should use INS (element) of the
+; truncated size to avoid pointless GPR trips.
+
+
+define <2 x i32> @test_s_trunc_d_lane0(<2 x i32> %a, <1 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_d_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <1 x i64> %b, i32 0
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 0
+ ret <2 x i32> %e
+}
+
+define <2 x i32> @test_s_trunc_d_qlane1(<2 x i32> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_d_qlane1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[0], v1.s[2]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <2 x i64> %b, i32 1
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 0
+ ret <2 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_d_lane0(<4 x i32> %a, <1 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_d_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: ret
+ %c = extractelement <1 x i64> %b, i32 0
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 0
+ ret <4 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_d_qlane1(<4 x i32> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_d_qlane1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.s[3], v1.s[2]
+; CHECK-NEXT: ret
+ %c = extractelement <2 x i64> %b, i32 1
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 3
+ ret <4 x i32> %e
+}
+
+; ---- From the bottom 128b of an SVE vector
+
+define <2 x i32> @test_s_trunc_dsve_lane0(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 0
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 0
+ ret <2 x i32> %e
+}
+
+define <2 x i32> @test_s_trunc_dsve_lane1(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[1], v1.s[2]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 1
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 1
+ ret <2 x i32> %e
+}
+
+; (negative test) Extracted element is not within V-register.
+define <2 x i32> @test_s_trunc_dsve_lane2(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, z1.d[2]
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: mov v0.s[1], w8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 2
+ %d = trunc i64 %c to i32
+ %e = insertelement <2 x i32> %a, i32 %d, i64 1
+ ret <2 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_dsve_lane0(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 0
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 0
+ ret <4 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_dsve_lane1(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.s[3], v1.s[2]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 1
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 3
+ ret <4 x i32> %e
+}
+
+; (negative test) Extracted element is not within V-register.
+define <4 x i32> @test_qs_trunc_dsve_lane2(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, z1.d[2]
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: mov v0.s[3], w8
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 2
+ %d = trunc i64 %c to i32
+ %e = insertelement <4 x i32> %a, i32 %d, i64 3
+ ret <4 x i32> %e
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/114541
More information about the llvm-commits
mailing list