[llvm] [AArch64] Avoid GPR trip when moving truncated i32 vector elements (PR #114541)

via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 1 07:11:24 PDT 2024


https://github.com/SpencerAbson created https://github.com/llvm/llvm-project/pull/114541

This patch introduces ISel patterns that enable the use of [INS](https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en) (element), operating at the truncated element size, when moving an i32 element truncated from i64 between vectors.

For example,
```
define <4 x i32> @test(<4 x i32> %a, <2 x i64> %b) {
    %c = extractelement <2 x i64> %b, i32 1
    %d = trunc i64 %c to i32
    %e = insertelement <4 x i32> %a, i32 %d, i64 3
    ret <4 x i32> %e
}
```
can now be lowered to
```
mov v0.s[3], v1.s[2]
ret
```
instead of
```
mov x8, v1.d[1]
mov v0.s[3], w8
```

>From 5cd15cb492a6c023e424ef232ebf1ae6ccfe9def Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Thu, 31 Oct 2024 14:46:09 +0000
Subject: [PATCH 1/3] [AArch64] Avoid GPR trip when moving truncated i32 vector
 elements

This patch introduces ISEL patterns to enable the use of INS (element, of truncated size) when moving a
truncated (from i64) i32 element between vectors.
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  53 +++++++--
 .../CodeGen/AArch64/neon-ins-trunc-elt.ll     | 106 ++++++++++++++++++
 2 files changed, 149 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 250d6144f75318..f92532c27fe154 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -683,6 +683,22 @@ def topbitsallzero64: PatLeaf<(i64 GPR64:$src), [{
          CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(64, 63));
   }]>;
 
+def VectorIndexStoH : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+
+def VectorIndexDtoH : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
+}]>;
+
+def VectorIndexStoB : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
+}]>;
+
+def VectorIndexHtoB : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+
 // Node definitions.
 def AArch64adrp          : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
 def AArch64adr           : SDNode<"AArch64ISD::ADR", SDTIntUnaryOp, []>;
@@ -7281,6 +7297,33 @@ defm : Neon_INS_elt_pattern<v8i16,  v4i16,  nxv8i16,  i32,  VectorIndexH, INSvi1
 defm : Neon_INS_elt_pattern<v4i32,  v2i32,  nxv4i32,  i32,  VectorIndexS, INSvi32lane>;
 defm : Neon_INS_elt_pattern<v2i64,  v1i64,  nxv2i64,  i64,  VectorIndexD, INSvi64lane>;
 
+// Remove GPR trip when inserting extracted truncated i64 into vector of i32.
+// From another NEON vector
+def : Pat<(v4i32 (vector_insert v4i32:$src,
+                    (i32 (trunc (i64 (vector_extract v2i64:$Rn, (i64 imm:$Immn))))),
+                    (i64 imm:$Immd))),
+          (INSvi32lane V128:$src, imm:$Immd,  V128:$Rn, (VectorIndexHtoB imm:$Immn))>;
+
+def : Pat<(v2i32 (vector_insert v2i32:$src,
+                    (i32 (trunc (i64 (vector_extract v2i64:$Rn, (i64 imm:$Immn))))),
+                    (i64 imm:$Immd))),
+          (EXTRACT_SUBREG (INSvi32lane (SUBREG_TO_REG (i64 0), V64:$src, dsub),
+                                       imm:$Immd,  V128:$Rn, (VectorIndexHtoB imm:$Immn)),
+                          dsub)>;
+// From the bottom 128b of an SVE vector
+def : Pat<(v4i32 (vector_insert v4i32:$Rn,
+                    (i32 (trunc (i64 (vector_extract nxv2i64:$Rm, (i64 VectorIndexD:$Immn))))),
+                    (i64 imm:$Immd))),
+          (INSvi32lane V128:$Rn, imm:$Immd, (EXTRACT_SUBREG nxv2i64:$Rm, zsub), (VectorIndexHtoB VectorIndexD:$Immn))>;
+
+def : Pat<(v2i32 (vector_insert v2i32:$Rn,
+                    (i32 (trunc (i64 (vector_extract nxv2i64:$Rm, (i64 VectorIndexD:$Immn))))),
+                    (i64 imm:$Immd))),
+          (EXTRACT_SUBREG
+              (INSvi32lane (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immd,
+                    (EXTRACT_SUBREG nxv2i64:$Rm, zsub), (VectorIndexHtoB VectorIndexD:$Immn)),
+              dsub)>;
+
 // Insert from bitcast
 // vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0)
 def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), (i64 imm:$Immd))),
@@ -8700,16 +8743,6 @@ class Ld1Lane64IdxOpPat<SDPatternOperator scalar_load, Operand VecIndex,
                 (IdxOp VecIndex:$idx), GPR64sp:$Rn),
             dsub)>;
 
-def VectorIndexStoH : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
-}]>;
-def VectorIndexStoB : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
-}]>;
-def VectorIndexHtoB : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
-}]>;
-
 def : Ld1Lane128IdxOpPat<extloadi16, VectorIndexS, v4i32, i32, LD1i16, VectorIndexStoH>;
 def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexS, v4i32, i32, LD1i8, VectorIndexStoB>;
 def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexH, v8i16, i32, LD1i8, VectorIndexHtoB>;
diff --git a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
new file mode 100644
index 00000000000000..ab11f12d53790f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+neon < %s | FileCheck %s
+
+; Inserting a truncated (i64 to i32) element from the bottom 128-bits of any vector type into a NEON vector should use INS (element) of the
+; truncated size to avoid pointless GPR trips.
+
+
+define <2 x i32> @test_s_trunc_d_lane0(<2 x i32> %a, <1 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_d_lane0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+    %c = extractelement <1 x i64> %b, i32 0
+    %d = trunc i64 %c to i32
+    %e = insertelement <2 x i32> %a, i32 %d, i64 0
+    ret <2 x i32> %e
+}
+
+define <2 x i32> @test_s_trunc_d_qlane1(<2 x i32> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_d_qlane1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v0.s[0], v1.s[2]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+    %c = extractelement <2 x i64> %b, i32 1
+    %d = trunc i64 %c to i32
+    %e = insertelement <2 x i32> %a, i32 %d, i64 0
+    ret <2 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_d_lane0(<4 x i32> %a, <1 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_d_lane0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-NEXT:    ret
+    %c = extractelement <1 x i64> %b, i32 0
+    %d = trunc i64 %c to i32
+    %e = insertelement <4 x i32> %a, i32 %d, i64 0
+    ret <4 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_d_qlane1(<4 x i32> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_d_qlane1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.s[3], v1.s[2]
+; CHECK-NEXT:    ret
+    %c = extractelement <2 x i64> %b, i32 1
+    %d = trunc i64 %c to i32
+    %e = insertelement <4 x i32> %a, i32 %d, i64 3
+    ret <4 x i32> %e
+}
+
+; ---- From the bottom 128b of an SVE vector
+
+define <2 x i32> @test_s_trunc_dsve_lane0(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+    %c = extractelement <vscale x 2 x i64> %b, i32 0
+    %d = trunc i64 %c to i32
+    %e = insertelement <2 x i32> %a, i32 %d, i64 0
+    ret <2 x i32> %e
+}
+
+define <2 x i32> @test_s_trunc_dsve_lane1(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v0.s[1], v1.s[2]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+    %c = extractelement <vscale x 2 x i64> %b, i32 1
+    %d = trunc i64 %c to i32
+    %e = insertelement <2 x i32> %a, i32 %d, i64 1
+    ret <2 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_dsve_lane0(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-NEXT:    ret
+    %c = extractelement <vscale x 2 x i64> %b, i32 0
+    %d = trunc i64 %c to i32
+    %e = insertelement <4 x i32> %a, i32 %d, i64 0
+    ret <4 x i32> %e
+}
+
+define <4 x i32> @test_qs_trunc_dsve_lane1(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.s[3], v1.s[2]
+; CHECK-NEXT:    ret
+    %c = extractelement <vscale x 2 x i64> %b, i32 1
+    %d = trunc i64 %c to i32
+    %e = insertelement <4 x i32> %a, i32 %d, i64 3
+    ret <4 x i32> %e
+}

>From cb225b4b2704af18acc4ce8e78ac430b5b27cfc9 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Fri, 1 Nov 2024 13:25:07 +0000
Subject: [PATCH 2/3] [NFC] Remove unused node operator

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f92532c27fe154..4f00a7a187723b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -686,15 +686,9 @@ def topbitsallzero64: PatLeaf<(i64 GPR64:$src), [{
 def VectorIndexStoH : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
 }]>;
-
-def VectorIndexDtoH : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
-}]>;
-
 def VectorIndexStoB : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
 }]>;
-
 def VectorIndexHtoB : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
 }]>;

>From d25df8c9c8adabf741f476d988b7d0b19ef32519 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Fri, 1 Nov 2024 13:54:01 +0000
Subject: [PATCH 3/3] [NFC] Add negative tests for bound of SVE extraction

---
 .../CodeGen/AArch64/neon-ins-trunc-elt.ll     | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
index ab11f12d53790f..4c189cd2886e8a 100644
--- a/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
+++ b/llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
@@ -83,6 +83,22 @@ define <2 x i32> @test_s_trunc_dsve_lane1(<2 x i32> %a, <vscale x 2 x i64> %b) {
     ret <2 x i32> %e
 }
 
+; (negative test) Extracted element is not within V-register.
+define <2 x i32> @test_s_trunc_dsve_lane2(<2 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_s_trunc_dsve_lane2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, z1.d[2]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+    %c = extractelement <vscale x 2 x i64> %b, i32 2
+    %d = trunc i64 %c to i32
+    %e = insertelement <2 x i32> %a, i32 %d, i64 1
+    ret <2 x i32> %e
+}
+
 define <4 x i32> @test_qs_trunc_dsve_lane0(<4 x i32> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: test_qs_trunc_dsve_lane0:
 ; CHECK:       // %bb.0:
@@ -104,3 +120,17 @@ define <4 x i32> @test_qs_trunc_dsve_lane1(<4 x i32> %a, <vscale x 2 x i64> %b)
     %e = insertelement <4 x i32> %a, i32 %d, i64 3
     ret <4 x i32> %e
 }
+
+; (negative test) Extracted element is not within V-register.
+define <4 x i32> @test_qs_trunc_dsve_lane2(<4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_qs_trunc_dsve_lane2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, z1.d[2]
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    mov v0.s[3], w8
+; CHECK-NEXT:    ret
+    %c = extractelement <vscale x 2 x i64> %b, i32 2
+    %d = trunc i64 %c to i32
+    %e = insertelement <4 x i32> %a, i32 %d, i64 3
+    ret <4 x i32> %e
+}



More information about the llvm-commits mailing list