[llvm] abd0d5d - Reland: [AArch64][GlobalISel] Adopt dup(load) -> LD1R patterns from SelectionDAG

Vladislav Dzhidzhoev via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 17 08:50:23 PDT 2023


Author: Vladislav Dzhidzhoev
Date: 2023-10-17T17:40:05+02:00
New Revision: abd0d5d2626022d835c784b1fed557caf90e793f

URL: https://github.com/llvm/llvm-project/commit/abd0d5d2626022d835c784b1fed557caf90e793f
DIFF: https://github.com/llvm/llvm-project/commit/abd0d5d2626022d835c784b1fed557caf90e793f.diff

LOG: Reland: [AArch64][GlobalISel] Adopt dup(load) -> LD1R patterns from SelectionDAG

This relands commit fb8f59156f0f208f6192ed808fc223eda6c0e7ec and makes the
isAArch64FrameOffsetLegal function recognize LD1R instructions. LD1R accepts
only a base register (no immediate offset), so frame offsets have to be
materialized into a scratch register rather than folded into the instruction,
as the dup_ld1_from_stack test below demonstrates.

Original PR: https://github.com/llvm/llvm-project/pull/66914
PR of the fix: https://github.com/llvm/llvm-project/pull/69003
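
For illustration, the IR shape these patterns match is a scalar load
broadcast to all vector lanes. A minimal sketch (the function name is
hypothetical, and it assumes the loaded value feeds only the splat):

  define <4 x i32> @dup_of_load(ptr %p) {
    ; scalar load ...
    %v = load i32, ptr %p
    ; ... splatted to all four lanes via insertelement + shufflevector
    %ins = insertelement <4 x i32> poison, i32 %v, i32 0
    %dup = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
    ; with the v4i32 pattern added below, this selects to
    ; "ld1r.4s { v0 }, [x0]" instead of a separate ldr + dup
    ret <4 x i32> %dup
  }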

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64InstrGISel.td
    llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
    llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
    llvm/test/CodeGen/AArch64/arm64-ld1.ll
    llvm/test/CodeGen/AArch64/arm64-st1.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index c6ff7bea4bd2c92..27338bd24393325 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -511,3 +511,20 @@ let AddedComplexity = 19 in {
   defm : VecROStoreLane64_0Pat<ro16, store, v4i16, i16, hsub, STRHroW, STRHroX>;
   defm : VecROStoreLane64_0Pat<ro32, store, v2i32, i32, ssub, STRSroW, STRSroX>;
 }
+
+def : Pat<(v8i8 (AArch64dup (i8 (load (am_indexed8 GPR64sp:$Rn))))),
+          (LD1Rv8b GPR64sp:$Rn)>;
+def : Pat<(v16i8 (AArch64dup (i8 (load GPR64sp:$Rn)))),
+          (LD1Rv16b GPR64sp:$Rn)>;
+def : Pat<(v4i16 (AArch64dup (i16 (load GPR64sp:$Rn)))),
+          (LD1Rv4h GPR64sp:$Rn)>;
+def : Pat<(v8i16 (AArch64dup (i16 (load GPR64sp:$Rn)))),
+          (LD1Rv8h GPR64sp:$Rn)>;
+def : Pat<(v2i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
+          (LD1Rv2s GPR64sp:$Rn)>;
+def : Pat<(v4i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
+          (LD1Rv4s GPR64sp:$Rn)>;
+def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
+          (LD1Rv2d GPR64sp:$Rn)>;
+def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
+          (LD1Rv1d GPR64sp:$Rn)>;

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 8f0e272a6fac788..05c79b610cb36c5 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5584,6 +5584,14 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
   switch (MI.getOpcode()) {
   default:
     break;
+  case AArch64::LD1Rv1d:
+  case AArch64::LD1Rv2s:
+  case AArch64::LD1Rv2d:
+  case AArch64::LD1Rv4h:
+  case AArch64::LD1Rv4s:
+  case AArch64::LD1Rv8b:
+  case AArch64::LD1Rv8h:
+  case AArch64::LD1Rv16b:
   case AArch64::LD1Twov2d:
   case AArch64::LD1Threev2d:
   case AArch64::LD1Fourv2d:

diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 1b9583464edea76..2cab4932def0724 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s --check-prefixes=CHECK,SDAG
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-GISEL
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=arm64-apple-ios7.0 -o - %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GISEL
 
 ; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for test_v8i8_pre_load
 ; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for test_v8i8_post_load
@@ -620,9 +620,6 @@
 ; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for load_single_extract_variable_index_i8
 ; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for load_single_extract_variable_index_i16
 ; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for load_single_extract_variable_index_i32
-; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for load_single_extract_variable_index_v3i32_small_align
-; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for load_single_extract_variable_index_v3i32_default_align
-; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for load_single_extract_valid_const_index_v3i32
 ; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for load_single_extract_variable_index_masked_i32
 ; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for load_single_extract_variable_index_masked2_i32
 
@@ -13786,11 +13783,18 @@ define ptr @test_v1f64_post_reg_st4lane(ptr %A, ptr %ptr, <1 x double> %B, <1 x
 declare void @llvm.aarch64.neon.st4lane.v1f64.p0(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, ptr)
 
 define <16 x i8> @test_v16i8_post_imm_ld1r(ptr %bar, ptr %ptr) {
-; CHECK-LABEL: test_v16i8_post_imm_ld1r:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ld1r.16b { v0 }, [x0], #1
-; CHECK-NEXT:    str x0, [x1]
-; CHECK-NEXT:    ret
+; SDAG-LABEL: test_v16i8_post_imm_ld1r:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    ld1r.16b { v0 }, [x0], #1
+; SDAG-NEXT:    str x0, [x1]
+; SDAG-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: test_v16i8_post_imm_ld1r:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    ld1r.16b { v0 }, [x0]
+; CHECK-GISEL-NEXT:    add x8, x0, #1
+; CHECK-GISEL-NEXT:    str x8, [x1]
+; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i8, ptr %bar
   %tmp2 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
   %tmp3 = insertelement <16 x i8> %tmp2, i8 %tmp1, i32 1
@@ -13814,11 +13818,18 @@ define <16 x i8> @test_v16i8_post_imm_ld1r(ptr %bar, ptr %ptr) {
 }
 
 define <16 x i8> @test_v16i8_post_reg_ld1r(ptr %bar, ptr %ptr, i64 %inc) {
-; CHECK-LABEL: test_v16i8_post_reg_ld1r:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ld1r.16b { v0 }, [x0], x2
-; CHECK-NEXT:    str x0, [x1]
-; CHECK-NEXT:    ret
+; SDAG-LABEL: test_v16i8_post_reg_ld1r:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    ld1r.16b { v0 }, [x0], x2
+; SDAG-NEXT:    str x0, [x1]
+; SDAG-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: test_v16i8_post_reg_ld1r:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    ld1r.16b { v0 }, [x0]
+; CHECK-GISEL-NEXT:    add x8, x0, x2
+; CHECK-GISEL-NEXT:    str x8, [x1]
+; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i8, ptr %bar
   %tmp2 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
   %tmp3 = insertelement <16 x i8> %tmp2, i8 %tmp1, i32 1
@@ -13842,11 +13853,18 @@ define <16 x i8> @test_v16i8_post_reg_ld1r(ptr %bar, ptr %ptr, i64 %inc) {
 }
 
 define <8 x i8> @test_v8i8_post_imm_ld1r(ptr %bar, ptr %ptr) {
-; CHECK-LABEL: test_v8i8_post_imm_ld1r:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ld1r.8b { v0 }, [x0], #1
-; CHECK-NEXT:    str x0, [x1]
-; CHECK-NEXT:    ret
+; SDAG-LABEL: test_v8i8_post_imm_ld1r:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    ld1r.8b { v0 }, [x0], #1
+; SDAG-NEXT:    str x0, [x1]
+; SDAG-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: test_v8i8_post_imm_ld1r:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    ld1r.8b { v0 }, [x0]
+; CHECK-GISEL-NEXT:    add x8, x0, #1
+; CHECK-GISEL-NEXT:    str x8, [x1]
+; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i8, ptr %bar
   %tmp2 = insertelement <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
   %tmp3 = insertelement <8 x i8> %tmp2, i8 %tmp1, i32 1
@@ -13862,11 +13880,18 @@ define <8 x i8> @test_v8i8_post_imm_ld1r(ptr %bar, ptr %ptr) {
 }
 
 define <8 x i8> @test_v8i8_post_reg_ld1r(ptr %bar, ptr %ptr, i64 %inc) {
-; CHECK-LABEL: test_v8i8_post_reg_ld1r:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ld1r.8b { v0 }, [x0], x2
-; CHECK-NEXT:    str x0, [x1]
-; CHECK-NEXT:    ret
+; SDAG-LABEL: test_v8i8_post_reg_ld1r:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    ld1r.8b { v0 }, [x0], x2
+; SDAG-NEXT:    str x0, [x1]
+; SDAG-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: test_v8i8_post_reg_ld1r:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    ld1r.8b { v0 }, [x0]
+; CHECK-GISEL-NEXT:    add x8, x0, x2
+; CHECK-GISEL-NEXT:    str x8, [x1]
+; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i8, ptr %bar
   %tmp2 = insertelement <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
   %tmp3 = insertelement <8 x i8> %tmp2, i8 %tmp1, i32 1

diff --git a/llvm/test/CodeGen/AArch64/arm64-ld1.ll b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
index 96468b2cfa8ace8..54b96520dce41d9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc < %s -global-isel=1 -global-isel-abort=2 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc < %s -global-isel=1 -global-isel-abort=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 %struct.__neon_int8x8x2_t = type { <8 x i8>,  <8 x i8> }
 %struct.__neon_int8x8x3_t = type { <8 x i8>,  <8 x i8>,  <8 x i8> }
@@ -1712,3 +1712,30 @@ define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(ptr %addr) {
   %val = call %struct.__neon_float64x2x4_t @llvm.aarch64.neon.ld1x4.v2f64.p0(ptr %addr)
   ret %struct.__neon_float64x2x4_t %val
 }
+
+define <8 x i8> @dup_ld1_from_stack(ptr %__ret) {
+; CHECK-SD-LABEL: dup_ld1_from_stack:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    add x8, sp, #15
+; CHECK-SD-NEXT:    ld1r.8b { v0 }, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: dup_ld1_from_stack:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    .cfi_offset w29, -16
+; CHECK-GI-NEXT:    add x8, sp, #15
+; CHECK-GI-NEXT:    ld1r.8b { v0 }, [x8]
+; CHECK-GI-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %item = alloca i8, align 1
+  %0 = load i8, ptr %item, align 1
+  %1 = insertelement <8 x i8> poison, i8 %0, i32 0
+  %lane = shufflevector <8 x i8> %1, <8 x i8> %1, <8 x i32> zeroinitializer
+  ret <8 x i8> %lane
+}

diff --git a/llvm/test/CodeGen/AArch64/arm64-st1.ll b/llvm/test/CodeGen/AArch64/arm64-st1.ll
index 121ca69bee21dd6..6f87c66c873451a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-st1.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-st1.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -global-isel -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
 ; The instruction latencies of Exynos-M3 trigger the transform we see under the Exynos check.
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs -mcpu=exynos-m3 | FileCheck --check-prefix=EXYNOS %s
 

