[llvm] c3fe025 - [AArch64][SelectionDAG] Refactor to support more scalable vector extending loads

via llvm-commits <llvm-commits at lists.llvm.org>
Sun Mar 27 06:20:35 PDT 2022


Author: zhongyunde
Date: 2022-03-27T21:18:01+08:00
New Revision: c3fe025bd4a180d3898e3d6cfd1a8a34c8fe6aa8

URL: https://github.com/llvm/llvm-project/commit/c3fe025bd4a180d3898e3d6cfd1a8a34c8fe6aa8
DIFF: https://github.com/llvm/llvm-project/commit/c3fe025bd4a180d3898e3d6cfd1a8a34c8fe6aa8.diff

LOG: [AArch64][SelectionDAG] Refactor to support more scalable vector extending loads

Following the discussion in D120953, we should first exclude all scalable vector
extending loads and then selectively enable those we directly support.

This patch implements that refactoring (truncating stores are not touched), so that
more scalable vector types reduce the number of masked loads in favour of more
unpklo/hi instructions.

Reviewed By: paulwalker-arm

Differential Revision: https://reviews.llvm.org/D122281
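
For illustration, here is a minimal IR reproducer of the masked pattern this change
improves (it mirrors the masked_sload_nxv16i8 test updated below; the function name
is made up for this note). With the nxv16i32-from-nxv16i8 combination now marked
Expand, the masked load plus sign extend is lowered as one predicated ld1b followed
by sunpklo/sunpkhi unpacks, instead of four masked ld1sb loads with predicate
unpacks:

    declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)

    ; One masked load at the natural element type, then widen with unpacks.
    define <vscale x 16 x i32> @example_masked_sext(<vscale x 16 x i8>* %base, <vscale x 16 x i1> %mask) {
      %load = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* %base, i32 2, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
      %ext = sext <vscale x 16 x i8> %load to <vscale x 16 x i32>
      ret <vscale x 16 x i32> %ext
    }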

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/sve-intrinsics-ldst-ext.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-mask-ldst-ext.ll
    llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c4828999aab31..0341f24d07c1f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1231,11 +1231,25 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       }
     }
 
-    // SVE supports unpklo/hi instructions to reduce the number of loads.
-    for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
-      setLoadExtAction(Op, MVT::nxv16i64, MVT::nxv16i8, Expand);
-      setLoadExtAction(Op, MVT::nxv8i64, MVT::nxv8i16, Expand);
-      setLoadExtAction(Op, MVT::nxv4i64, MVT::nxv4i32, Expand);
+    // Firstly, exclude all scalable vector extending loads/truncating stores.
+    for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
+      for (MVT InnerVT : MVT::integer_scalable_vector_valuetypes()) {
+        // TODO: truncating stores should also be excluded
+        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+      }
+    }
+
+    // Then, selectively enable those which we directly support.
+    for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
+      setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
+      setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
+      setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
+      setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
+      setLoadExtAction(Op, MVT::nxv2i32, MVT::nxv2i16, Legal);
+      setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
+      setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
     }
 
     // SVE supports truncating stores of 64 and 128-bit vectors

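By contrast, the combinations marked Legal above keep folding the extend into a
single extending load. A small sketch (the function name is hypothetical, assuming
-mattr=+sve as in the tests below): a zero-extending load from nxv4i8 to nxv4i32 is
one of the Legal entries, so one would expect it to select to a single predicated
ld1b { z0.s } rather than an unpack sequence:

    define <vscale x 4 x i32> @example_zext_load(<vscale x 4 x i8>* %base) {
      ; Expected to match the Legal ZEXTLOAD nxv4i32 <- nxv4i8 entry and
      ; select to a single ld1b under an all-true predicate.
      %load = load <vscale x 4 x i8>, <vscale x 4 x i8>* %base
      %ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
      ret <vscale x 4 x i32> %ext
    }
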
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldst-ext.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldst-ext.ll
index d4771c5b610ba..d44bc171b7b6f 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldst-ext.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldst-ext.ll
@@ -2,9 +2,37 @@
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -asm-verbose=1 < %s | FileCheck %s
 
 ;
-; LD1B
+; LD1SB/LD1B
 ;
 
+define <vscale x 16 x i32> @ld1b_i8_sext_i32(<vscale x 16 x i8> *%base) {
+; CHECK-LABEL: ld1b_i8_sext_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1sb { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ld1sb { z2.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    ld1sb { z3.s }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT:    ret
+  %wide.load = load <vscale x 16 x i8>, <vscale x 16 x i8>* %base
+  %res = sext <vscale x 16 x i8> %wide.load to <vscale x 16 x i32>
+  ret <vscale x 16 x i32> %res
+}
+
+define <vscale x 16 x i32> @ld1b_i8_zext_i32(<vscale x 16 x i8> *%base) {
+; CHECK-LABEL: ld1b_i8_zext_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ld1b { z2.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    ld1b { z3.s }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT:    ret
+  %wide.load = load <vscale x 16 x i8>, <vscale x 16 x i8>* %base
+  %res = zext <vscale x 16 x i8> %wide.load to <vscale x 16 x i32>
+  ret <vscale x 16 x i32> %res
+}
+
 define <vscale x 16 x i64> @ld1b_i8_sext(<vscale x 16 x i8> *%base) {
 ; CHECK-LABEL: ld1b_i8_sext:
 ; CHECK:       // %bb.0:

diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-mask-ldst-ext.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-mask-ldst-ext.ll
index 38f2a3b2c829b..127bbf5c139ec 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-mask-ldst-ext.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-mask-ldst-ext.ll
@@ -5,6 +5,38 @@
 ; LD1B
 ;
 
+define <vscale x 16 x i32> @masked_ld1b_i8_sext_i32(<vscale x 16 x i8> *%base, <vscale x 16 x i1> %mask) {
+; CHECK-LABEL: masked_ld1b_i8_sext_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    sunpklo z1.h, z0.b
+; CHECK-NEXT:    sunpkhi z3.h, z0.b
+; CHECK-NEXT:    sunpklo z0.s, z1.h
+; CHECK-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-NEXT:    sunpklo z2.s, z3.h
+; CHECK-NEXT:    sunpkhi z3.s, z3.h
+; CHECK-NEXT:    ret
+  %wide.masked.load = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* %base, i32 2, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
+  %res = sext <vscale x 16 x i8> %wide.masked.load to <vscale x 16 x i32>
+  ret <vscale x 16 x i32> %res
+}
+
+define <vscale x 16 x i32> @masked_ld1b_i8_zext_i32(<vscale x 16 x i8> *%base, <vscale x 16 x i1> %mask) {
+; CHECK-LABEL: masked_ld1b_i8_zext_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    uunpklo z1.h, z0.b
+; CHECK-NEXT:    uunpkhi z3.h, z0.b
+; CHECK-NEXT:    uunpklo z0.s, z1.h
+; CHECK-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-NEXT:    uunpklo z2.s, z3.h
+; CHECK-NEXT:    uunpkhi z3.s, z3.h
+; CHECK-NEXT:    ret
+  %wide.masked.load = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* %base, i32 2, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
+  %res = zext <vscale x 16 x i8> %wide.masked.load to <vscale x 16 x i32>
+  ret <vscale x 16 x i32> %res
+}
+
 define <vscale x 16 x i64> @masked_ld1b_i8_sext(<vscale x 16 x i8> *%base, <vscale x 16 x i1> %mask) {
 ; CHECK-LABEL: masked_ld1b_i8_sext:
 ; CHECK:       // %bb.0:

diff --git a/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll b/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll
index 4d3c4fa3616ef..f53b77c8da536 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll
@@ -73,16 +73,13 @@ define <vscale x 2 x i64> @masked_sload_passthru(<vscale x 2 x i32> *%a, <vscale
 ; Return type requires splitting
 define <vscale x 16 x i32> @masked_sload_nxv16i8(<vscale x 16 x i8>* %a, <vscale x 16 x i1> %mask) {
 ; CHECK-LABEL: masked_sload_nxv16i8:
-; CHECK:         punpklo p1.h, p0.b
-; CHECK-NEXT:    punpkhi p0.h, p0.b
-; CHECK-NEXT:    punpklo p2.h, p1.b
-; CHECK-NEXT:    punpkhi p1.h, p1.b
-; CHECK-NEXT:    ld1sb { z0.s }, p2/z, [x0]
-; CHECK-NEXT:    punpklo p2.h, p0.b
-; CHECK-NEXT:    punpkhi p0.h, p0.b
-; CHECK-NEXT:    ld1sb { z1.s }, p1/z, [x0, #1, mul vl]
-; CHECK-NEXT:    ld1sb { z2.s }, p2/z, [x0, #2, mul vl]
-; CHECK-NEXT:    ld1sb { z3.s }, p0/z, [x0, #3, mul vl]
+; CHECK:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    sunpklo z1.h, z0.b
+; CHECK-NEXT:    sunpkhi z3.h, z0.b
+; CHECK-NEXT:    sunpklo z0.s, z1.h
+; CHECK-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-NEXT:    sunpklo z2.s, z3.h
+; CHECK-NEXT:    sunpkhi z3.s, z3.h
 ; CHECK-NEXT:    ret
   %load = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>* %a, i32 2, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
   %ext = sext <vscale x 16 x i8> %load to <vscale x 16 x i32>
@@ -92,14 +89,13 @@ define <vscale x 16 x i32> @masked_sload_nxv16i8(<vscale x 16 x i8>* %a, <vscale
 ; Masked load requires promotion
 define <vscale x 4 x double> @masked_sload_4i8_4f32(<vscale x 4 x i8>* noalias %in, <vscale x 4 x i1> %mask) {
 ; CHECK-LABEL: masked_sload_4i8_4f32:
-; CHECK:       punpkhi p2.h, p0.b
-; CHECK-NEXT:  punpklo p0.h, p0.b
-; CHECK-NEXT:  ld1sb { z1.d }, p2/z, [x0, #1, mul vl]
-; CHECK-NEXT:  ld1sb { z0.d }, p0/z, [x0]
-; CHECK-NEXT:  ptrue p1.d
-; CHECK-NEXT:  scvtf z0.d, p1/m, z0.d
-; CHECK-NEXT:  scvtf z1.d, p1/m, z1.d
-; CHECK-NEXT:  ret
+; CHECK:    ld1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sunpkhi z1.d, z0.s
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    scvtf z0.d, p1/m, z0.d
+; CHECK-NEXT:    scvtf z1.d, p1/m, z1.d
+; CHECK-NEXT:    ret
   %wide.load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %in, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
   %sext = sext <vscale x 4 x i8> %wide.load to <vscale x 4 x i64>
   %res = sitofp <vscale x 4 x i64> %sext to <vscale x 4 x double>


        

