[llvm] 8ac66d6 - [AArch64] Generate LD1 for anyext i8 or i16 vector load

Andrew Savonichev via llvm-commits llvm-commits at lists.llvm.org
Wed May 26 04:49:20 PDT 2021


Author: Andrew Savonichev
Date: 2021-05-26T14:44:21+03:00
New Revision: 8ac66d61eab3dd44defa5755b884eca71a19431c

URL: https://github.com/llvm/llvm-project/commit/8ac66d61eab3dd44defa5755b884eca71a19431c
DIFF: https://github.com/llvm/llvm-project/commit/8ac66d61eab3dd44defa5755b884eca71a19431c.diff

LOG: [AArch64] Generate LD1 for anyext i8 or i16 vector load

The existing LD1 patterns do not cover cases where the result type does
not match the memory type. This happens when illegal vector types are
extended and scalarized, for example:

  load <2 x i16>* %v2i16

is lowered into:

  // first element
  (v4i32 (insert_subvector (v2i32 (scalar_to_vector (load anyext from i16)))))
  // other elements
  (v4i32 (insert_vector_elt (i32 (load anyext from i16)) idx))

Before this patch these patterns were compiled into LDR + INS.
Now they are compiled into LD1.

The problem was reported in
PR24820: LLVM Generates abysmal code in simple situation.

Differential Revision: https://reviews.llvm.org/D102938

Added: 
    llvm/test/CodeGen/AArch64/aarch64-load-ext.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
    llvm/test/CodeGen/AArch64/ssub_sat_vec.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 7ebd339e979a..52c5900a9d41 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6729,6 +6729,46 @@ def : Ld1Lane128Pat<load,       VectorIndexD, v2f64, f64, LD1i64>;
 def : Ld1Lane128Pat<load,       VectorIndexH, v8f16, f16, LD1i16>;
 def : Ld1Lane128Pat<load,       VectorIndexH, v8bf16, bf16, LD1i16>;
 
+// Generate LD1 for extload if memory type does not match the
+// destination type, for example:
+//
+//   (v4i32 (insert_vector_elt (load anyext from i8) idx))
+//
+// In this case, the index must be adjusted to match LD1 type.
+//
+class Ld1Lane128IdxOpPat<SDPatternOperator scalar_load, Operand
+                    VecIndex, ValueType VTy, ValueType STy,
+                    Instruction LD1, SDNodeXForm IdxOp>
+  : Pat<(vector_insert (VTy VecListOne128:$Rd),
+                       (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+        (LD1 VecListOne128:$Rd, (IdxOp VecIndex:$idx), GPR64sp:$Rn)>;
+
+def VectorIndexStoH : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+def VectorIndexStoB : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64);
+}]>;
+def VectorIndexHtoB : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
+}]>;
+
+def : Ld1Lane128IdxOpPat<extloadi16, VectorIndexS, v4i32, i32, LD1i16, VectorIndexStoH>;
+def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexS, v4i32, i32, LD1i8, VectorIndexStoB>;
+def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexH, v8i16, i32, LD1i8, VectorIndexHtoB>;
+
+// Same as above, but the first element is populated using
+// scalar_to_vector + insert_subvector instead of insert_vector_elt.
+class Ld1Lane128FirstElm<ValueType ResultTy, ValueType VecTy,
+                        SDPatternOperator ExtLoad, Instruction LD1>
+  : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))),
+          (ResultTy (EXTRACT_SUBREG
+            (LD1 (VecTy (IMPLICIT_DEF)), 0, GPR64sp:$Rn), dsub))>;
+
+def : Ld1Lane128FirstElm<v2i32, v8i16, extloadi16, LD1i16>;
+def : Ld1Lane128FirstElm<v2i32, v16i8, extloadi8, LD1i8>;
+def : Ld1Lane128FirstElm<v4i16, v16i8, extloadi8, LD1i8>;
+
 class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
                    ValueType VTy, ValueType STy, Instruction LD1>
   : Pat<(vector_insert (VTy VecListOne64:$Rd),

diff --git a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
new file mode 100644
index 000000000000..308352e3e227
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-LE
+; RUN: llc -mtriple=aarch64_be-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-BE
+
+define <2 x i16> @test0(i16* %i16_ptr, i64 %inc) {
+; CHECK-LE-LABEL: test0:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    ld1 { v0.h }[0], [x0]
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test0:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    ld1 { v0.h }[0], [x0]
+; CHECK-BE-NEXT:    rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT:    ret
+  %i_0 = load i16, i16* %i16_ptr
+  %v0 = insertelement <2 x i16> undef, i16 %i_0, i32 0
+  ret <2 x i16> %v0
+}
+
+define <2 x i16> @test1(<2 x i16>* %v2i16_ptr) {
+; CHECK-LE-LABEL: test1:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    ld1 { v0.h }[0], [x0]
+; CHECK-LE-NEXT:    add x8, x0, #2 // =2
+; CHECK-LE-NEXT:    ld1 { v0.h }[2], [x8]
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test1:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    ld1 { v0.h }[0], [x0]
+; CHECK-BE-NEXT:    add x8, x0, #2 // =2
+; CHECK-BE-NEXT:    ld1 { v0.h }[2], [x8]
+; CHECK-BE-NEXT:    rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT:    ret
+  %v2i16 = load <2 x i16>, <2 x i16>* %v2i16_ptr
+  ret <2 x i16> %v2i16
+}
+
+define <2 x i16> @test2(i16* %i16_ptr, i64 %inc) {
+; CHECK-LE-LABEL: test2:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    ld1 { v0.h }[0], [x0]
+; CHECK-LE-NEXT:    add x8, x0, x1, lsl #1
+; CHECK-LE-NEXT:    ld1 { v0.h }[2], [x8]
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test2:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    ld1 { v0.h }[0], [x0]
+; CHECK-BE-NEXT:    add x8, x0, x1, lsl #1
+; CHECK-BE-NEXT:    ld1 { v0.h }[2], [x8]
+; CHECK-BE-NEXT:    rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT:    ret
+  %i_0 = load i16, i16* %i16_ptr
+  %i16_ptr_inc = getelementptr i16, i16* %i16_ptr, i64 %inc
+  %i_1 = load i16, i16* %i16_ptr_inc
+  %v0 = insertelement <2 x i16> undef, i16 %i_0, i32 0
+  %v1 = insertelement <2 x i16> %v0, i16 %i_1, i32 1
+  ret <2 x i16> %v1
+}
+
+define <2 x i8> @test3(<2 x i8>* %v2i8_ptr) {
+; CHECK-LE-LABEL: test3:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-LE-NEXT:    add x8, x0, #1 // =1
+; CHECK-LE-NEXT:    ld1 { v0.b }[4], [x8]
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test3:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-BE-NEXT:    add x8, x0, #1 // =1
+; CHECK-BE-NEXT:    ld1 { v0.b }[4], [x8]
+; CHECK-BE-NEXT:    rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT:    ret
+  %v2i8 = load <2 x i8>, <2 x i8>* %v2i8_ptr
+  ret <2 x i8> %v2i8
+}
+
+define <4 x i8> @test4(<4 x i8>* %v4i8_ptr) {
+; CHECK-LE-LABEL: test4:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-LE-NEXT:    add x8, x0, #1 // =1
+; CHECK-LE-NEXT:    ld1 { v0.b }[2], [x8]
+; CHECK-LE-NEXT:    add x8, x0, #2 // =2
+; CHECK-LE-NEXT:    ld1 { v0.b }[4], [x8]
+; CHECK-LE-NEXT:    add x8, x0, #3 // =3
+; CHECK-LE-NEXT:    ld1 { v0.b }[6], [x8]
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test4:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-BE-NEXT:    add x8, x0, #1 // =1
+; CHECK-BE-NEXT:    ld1 { v0.b }[2], [x8]
+; CHECK-BE-NEXT:    add x8, x0, #2 // =2
+; CHECK-BE-NEXT:    ld1 { v0.b }[4], [x8]
+; CHECK-BE-NEXT:    add x8, x0, #3 // =3
+; CHECK-BE-NEXT:    ld1 { v0.b }[6], [x8]
+; CHECK-BE-NEXT:    rev64 v0.4h, v0.4h
+; CHECK-BE-NEXT:    ret
+  %v4i8 = load <4 x i8>, <4 x i8>* %v4i8_ptr
+  ret <4 x i8> %v4i8
+}

diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 16561fe8502c..cefd4758b374 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -145,17 +145,15 @@ define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind {
 define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
 ; CHECK-LABEL: v2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    ldrb w9, [x1]
-; CHECK-NEXT:    ldrb w10, [x0, #1]
-; CHECK-NEXT:    ldrb w11, [x1, #1]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    mov v1.s[1], w11
-; CHECK-NEXT:    shl v1.2s, v1.2s, #24
+; CHECK-NEXT:    ld1 { v0.b }[0], [x1]
+; CHECK-NEXT:    ld1 { v1.b }[0], [x0]
+; CHECK-NEXT:    add x8, x0, #1 // =1
+; CHECK-NEXT:    add x9, x1, #1 // =1
+; CHECK-NEXT:    ld1 { v0.b }[4], [x9]
+; CHECK-NEXT:    ld1 { v1.b }[4], [x8]
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #24
-; CHECK-NEXT:    sqadd v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    shl v1.2s, v1.2s, #24
+; CHECK-NEXT:    sqadd v0.2s, v1.2s, v0.2s
 ; CHECK-NEXT:    ushr v0.2s, v0.2s, #24
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
@@ -187,17 +185,15 @@ define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
 define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
 ; CHECK-LABEL: v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    ldrh w10, [x0, #2]
-; CHECK-NEXT:    ldrh w11, [x1, #2]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    mov v1.s[1], w11
-; CHECK-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-NEXT:    ld1 { v0.h }[0], [x1]
+; CHECK-NEXT:    ld1 { v1.h }[0], [x0]
+; CHECK-NEXT:    add x8, x0, #2 // =2
+; CHECK-NEXT:    add x9, x1, #2 // =2
+; CHECK-NEXT:    ld1 { v0.h }[2], [x9]
+; CHECK-NEXT:    ld1 { v1.h }[2], [x8]
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #16
-; CHECK-NEXT:    sqadd v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-NEXT:    sqadd v0.2s, v1.2s, v0.2s
 ; CHECK-NEXT:    ushr v0.2s, v0.2s, #16
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0

diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index bfc3858965fe..17af8a11aeee 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -146,17 +146,15 @@ define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind {
 define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
 ; CHECK-LABEL: v2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    ldrb w9, [x1]
-; CHECK-NEXT:    ldrb w10, [x0, #1]
-; CHECK-NEXT:    ldrb w11, [x1, #1]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    mov v1.s[1], w11
-; CHECK-NEXT:    shl v1.2s, v1.2s, #24
+; CHECK-NEXT:    ld1 { v0.b }[0], [x1]
+; CHECK-NEXT:    ld1 { v1.b }[0], [x0]
+; CHECK-NEXT:    add x8, x0, #1 // =1
+; CHECK-NEXT:    add x9, x1, #1 // =1
+; CHECK-NEXT:    ld1 { v0.b }[4], [x9]
+; CHECK-NEXT:    ld1 { v1.b }[4], [x8]
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #24
-; CHECK-NEXT:    sqsub v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    shl v1.2s, v1.2s, #24
+; CHECK-NEXT:    sqsub v0.2s, v1.2s, v0.2s
 ; CHECK-NEXT:    ushr v0.2s, v0.2s, #24
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
@@ -188,17 +186,15 @@ define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
 define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
 ; CHECK-LABEL: v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    ldrh w10, [x0, #2]
-; CHECK-NEXT:    ldrh w11, [x1, #2]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    mov v1.s[1], w11
-; CHECK-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-NEXT:    ld1 { v0.h }[0], [x1]
+; CHECK-NEXT:    ld1 { v1.h }[0], [x0]
+; CHECK-NEXT:    add x8, x0, #2 // =2
+; CHECK-NEXT:    add x9, x1, #2 // =2
+; CHECK-NEXT:    ld1 { v0.h }[2], [x9]
+; CHECK-NEXT:    ld1 { v1.h }[2], [x8]
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #16
-; CHECK-NEXT:    sqsub v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-NEXT:    sqsub v0.2s, v1.2s, v0.2s
 ; CHECK-NEXT:    ushr v0.2s, v0.2s, #16
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0


        


More information about the llvm-commits mailing list