[llvm] 2762da0 - [SVE][CodeGen] Legalisation of masked loads and stores

Kerry McLaughlin via llvm-commits <llvm-commits at lists.llvm.org>
Thu Jul 16 02:57:43 PDT 2020


Author: Kerry McLaughlin
Date: 2020-07-16T10:55:45+01:00
New Revision: 2762da0a16a763654254e3320a3f46be2bb742b4

URL: https://github.com/llvm/llvm-project/commit/2762da0a16a763654254e3320a3f46be2bb742b4
DIFF: https://github.com/llvm/llvm-project/commit/2762da0a16a763654254e3320a3f46be2bb742b4.diff

LOG: [SVE][CodeGen] Legalisation of masked loads and stores

Summary:
This patch modifies IncrementMemoryAddress to scale the increment by
vscale when calculating the new address if the data type is scalable.
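
For illustration, here is a minimal stand-alone example in the style
of the tests updated below (it mirrors masked_load_split_32i8 from
sve-split-load.ll; the function name is purely illustrative). Once the
wide load is split, the address of the second half is the base plus
vscale * 16 bytes, i.e. the known-minimum store size of one
<vscale x 16 x i8> half, which the AArch64 backend prints using the
"#1, mul vl" addressing form:

  ; Build with: llc -mtriple=aarch64-linux-gnu -mattr=+sve
  ; Expected lowering (see the updated tests): two predicated ld1b
  ; loads from [x0] and [x0, #1, mul vl].
  define <vscale x 32 x i8> @split_masked_load(<vscale x 32 x i8>* %a, <vscale x 32 x i1> %pg) {
    %load = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8(<vscale x 32 x i8>* %a, i32 1, <vscale x 32 x i1> %pg, <vscale x 32 x i8> undef)
    ret <vscale x 32 x i8> %load
  }
  declare <vscale x 32 x i8> @llvm.masked.load.nxv32i8(<vscale x 32 x i8>*, i32, <vscale x 32 x i1>, <vscale x 32 x i8>)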

It also adds TableGen patterns that match an extract_subvector of a
legal predicate type to zip1/zip2 instructions.
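
As a concrete example of the new patterns (again in the style of the
tests below, equivalent to masked_load_split_8i32; the function name
is illustrative), splitting an <vscale x 8 x i1> mask into two
<vscale x 4 x i1> halves interleaves it with an all-false predicate,
so the low half comes from zip1 and the high half from zip2:

  ; Expected lowering (see the updated tests):
  ;   pfalse p1.b
  ;   zip1   p2.h, p0.h, p1.h   ; low half:  extract_subvector index 0
  ;   zip2   p0.h, p0.h, p1.h   ; high half: extract_subvector index 4
  ; followed by two predicated ld1w loads.
  define <vscale x 8 x i32> @split_predicate(<vscale x 8 x i32>* %a, <vscale x 8 x i1> %pg) {
    %load = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32(<vscale x 8 x i32>* %a, i32 1, <vscale x 8 x i1> %pg, <vscale x 8 x i32> undef)
    ret <vscale x 8 x i32> %load
  }
  declare <vscale x 8 x i32> @llvm.masked.load.nxv8i32(<vscale x 8 x i32>*, i32, <vscale x 8 x i1>, <vscale x 8 x i32>)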

Reviewers: sdesmalen, efriedma, david-arm

Reviewed By: efriedma, david-arm

Subscribers: tschuett, hiraditya, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D83137

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
    llvm/test/CodeGen/AArch64/sve-split-load.ll
    llvm/test/CodeGen/AArch64/sve-split-store.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 96df20039b15..3c989a933c48 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7153,6 +7153,9 @@ TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask,
   assert(DataVT.getVectorNumElements() == MaskVT.getVectorNumElements() &&
          "Incompatible types of Data and Mask");
   if (IsCompressedMemory) {
+    if (DataVT.isScalableVector())
+      report_fatal_error(
+          "Cannot currently handle compressed memory with scalable vectors");
     // Incrementing the pointer according to number of '1's in the mask.
     EVT MaskIntVT = EVT::getIntegerVT(*DAG.getContext(), MaskVT.getSizeInBits());
     SDValue MaskInIntReg = DAG.getBitcast(MaskIntVT, Mask);
@@ -7168,6 +7171,10 @@ TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask,
     SDValue Scale = DAG.getConstant(DataVT.getScalarSizeInBits() / 8, DL,
                                     AddrVT);
     Increment = DAG.getNode(ISD::MUL, DL, AddrVT, Increment, Scale);
+  } else if (DataVT.isScalableVector()) {
+    Increment = DAG.getVScale(DL, AddrVT,
+                              APInt(AddrVT.getSizeInBits().getFixedSize(),
+                                    DataVT.getStoreSize().getKnownMinSize()));
   } else
     Increment = DAG.getConstant(DataVT.getStoreSize(), DL, AddrVT);
 

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 28a54e6f7d79..5b1990e49262 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1109,6 +1109,20 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
   defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1", AArch64trn1>;
   defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>;
 
+  // Extract lo/hi halves of legal predicate types.
+  def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))),
+            (ZIP1_PPP_S PPR:$Ps, (PFALSE))>;
+  def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))),
+            (ZIP2_PPP_S PPR:$Ps, (PFALSE))>;
+  def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))),
+            (ZIP1_PPP_H PPR:$Ps, (PFALSE))>;
+  def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))),
+            (ZIP2_PPP_H PPR:$Ps, (PFALSE))>;
+  def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))),
+            (ZIP1_PPP_B PPR:$Ps, (PFALSE))>;
+  def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
+            (ZIP2_PPP_B PPR:$Ps, (PFALSE))>;
+
   defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
   defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;
   defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>;

diff --git a/llvm/test/CodeGen/AArch64/sve-split-load.ll b/llvm/test/CodeGen/AArch64/sve-split-load.ll
index a76b27e63557..5e64ff981286 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-load.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
 
-; LOAD
+; UNPREDICATED
 
-define <vscale x 4 x i16> @load_promote_4i8(<vscale x 4 x i16>* %a) {
-; CHECK-LABEL: load_promote_4i8:
+define <vscale x 4 x i16> @load_promote_4i16(<vscale x 4 x i16>* %a) {
+; CHECK-LABEL: load_promote_4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
@@ -53,3 +53,82 @@ define <vscale x 16 x i64> @load_split_16i64(<vscale x 16 x i64>* %a) {
   %load = load <vscale x 16 x i64>, <vscale x 16 x i64>* %a
   ret <vscale x 16 x i64> %load
 }
+
+; MASKED
+
+define <vscale x 2 x i32> @masked_load_promote_2i32(<vscale x 2 x i32> *%a, <vscale x 2 x i1> %pg) {
+; CHECK-LABEL: masked_load_promote_2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32> *%a, i32 1, <vscale x 2 x i1> %pg, <vscale x 2 x i32> undef)
+  ret <vscale x 2 x i32> %load
+}
+
+define <vscale x 32 x i8> @masked_load_split_32i8(<vscale x 32 x i8> *%a, <vscale x 32 x i1> %pg) {
+; CHECK-LABEL: masked_load_split_32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+  %load = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8(<vscale x 32 x i8> *%a, i32 1, <vscale x 32 x i1> %pg, <vscale x 32 x i8> undef)
+  ret <vscale x 32 x i8> %load
+}
+
+define <vscale x 32 x i16> @masked_load_split_32i16(<vscale x 32 x i16> *%a, <vscale x 32 x i1> %pg) {
+; CHECK-LABEL: masked_load_split_32i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p2.b
+; CHECK-NEXT:    zip1 p3.b, p0.b, p2.b
+; CHECK-NEXT:    zip2 p0.b, p0.b, p2.b
+; CHECK-NEXT:    ld1h { z0.h }, p3/z, [x0]
+; CHECK-NEXT:    zip1 p3.b, p1.b, p2.b
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    zip2 p0.b, p1.b, p2.b
+; CHECK-NEXT:    ld1h { z2.h }, p3/z, [x0, #2, mul vl]
+; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT:    ret
+  %load = call <vscale x 32 x i16> @llvm.masked.load.nxv32i16(<vscale x 32 x i16> *%a, i32 1, <vscale x 32 x i1> %pg, <vscale x 32 x i16> undef)
+  ret <vscale x 32 x i16> %load
+}
+
+define <vscale x 8 x i32> @masked_load_split_8i32(<vscale x 8 x i32> *%a, <vscale x 8 x i1> %pg) {
+; CHECK-LABEL: masked_load_split_8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    zip1 p2.h, p0.h, p1.h
+; CHECK-NEXT:    zip2 p0.h, p0.h, p1.h
+; CHECK-NEXT:    ld1w { z0.s }, p2/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+  %load = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32(<vscale x 8 x i32> *%a, i32 1, <vscale x 8 x i1> %pg, <vscale x 8 x i32> undef)
+  ret <vscale x 8 x i32> %load
+}
+
+define <vscale x 8 x i64> @masked_load_split_8i64(<vscale x 8 x i64> *%a, <vscale x 8 x i1> %pg) {
+; CHECK-LABEL: masked_load_split_8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    zip1 p2.h, p0.h, p1.h
+; CHECK-NEXT:    zip2 p0.h, p0.h, p1.h
+; CHECK-NEXT:    zip1 p3.s, p2.s, p1.s
+; CHECK-NEXT:    zip2 p2.s, p2.s, p1.s
+; CHECK-NEXT:    ld1d { z0.d }, p3/z, [x0]
+; CHECK-NEXT:    ld1d { z1.d }, p2/z, [x0, #1, mul vl]
+; CHECK-NEXT:    zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1d { z2.d }, p2/z, [x0, #2, mul vl]
+; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT:    ret
+  %load = call <vscale x 8 x i64> @llvm.masked.load.nxv8i64(<vscale x 8 x i64> *%a, i32 1, <vscale x 8 x i1> %pg, <vscale x 8 x i64> undef)
+  ret <vscale x 8 x i64> %load
+}
+
+declare <vscale x 32 x i8> @llvm.masked.load.nxv32i8(<vscale x 32 x i8>*, i32, <vscale x 32 x i1>, <vscale x 32 x i8>)
+
+declare <vscale x 32 x i16> @llvm.masked.load.nxv32i16(<vscale x 32 x i16>*, i32, <vscale x 32 x i1>, <vscale x 32 x i16>)
+
+declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>*, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 8 x i32> @llvm.masked.load.nxv8i32(<vscale x 8 x i32>*, i32, <vscale x 8 x i1>, <vscale x 8 x i32>)
+
+declare <vscale x 8 x i64> @llvm.masked.load.nxv8i64(<vscale x 8 x i64>*, i32, <vscale x 8 x i1>, <vscale x 8 x i64>)

diff --git a/llvm/test/CodeGen/AArch64/sve-split-store.ll b/llvm/test/CodeGen/AArch64/sve-split-store.ll
index 2fba0404ef34..a3a9b8b53ec7 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-store.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
 
+; UNPREDICATED
+
 define void @store_promote_4i8(<vscale x 4 x i8> %data, <vscale x 4 x i8>* %a) {
 ; CHECK-LABEL: store_promote_4i8:
 ; CHECK:       // %bb.0:
@@ -51,3 +53,82 @@ define void @store_split_16i64(<vscale x 16 x i64> %data, <vscale x 16 x i64>* %
   store <vscale x 16 x i64> %data, <vscale x 16 x i64>* %a
   ret void
 }
+
+; MASKED
+
+define void @masked_store_promote_2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8> *%a, <vscale x 2 x i1> %pg) {
+; CHECK-LABEL: masked_store_promote_2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1b { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8> *%a, i32 1, <vscale x 2 x i1> %pg)
+  ret void
+}
+
+define void @masked_store_split_32i8(<vscale x 32 x i8> %data, <vscale x 32 x i8> *%a, <vscale x 32 x i1> %pg) {
+; CHECK-LABEL: masked_store_split_32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1b { z1.b }, p1, [x0, #1, mul vl]
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.store.nxv32i8(<vscale x 32 x i8> %data, <vscale x 32 x i8> *%a, i32 1, <vscale x 32 x i1> %pg)
+  ret void
+}
+
+define void @masked_store_split_32i16(<vscale x 32 x i16> %data, <vscale x 32 x i16> *%a, <vscale x 32 x i1> %pg) {
+; CHECK-LABEL: masked_store_split_32i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p2.b
+; CHECK-NEXT:    zip2 p3.b, p1.b, p2.b
+; CHECK-NEXT:    zip1 p1.b, p1.b, p2.b
+; CHECK-NEXT:    st1h { z3.h }, p3, [x0, #3, mul vl]
+; CHECK-NEXT:    zip2 p3.b, p0.b, p2.b
+; CHECK-NEXT:    zip1 p0.b, p0.b, p2.b
+; CHECK-NEXT:    st1h { z2.h }, p1, [x0, #2, mul vl]
+; CHECK-NEXT:    st1h { z1.h }, p3, [x0, #1, mul vl]
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.store.nxv32i16(<vscale x 32 x i16> %data, <vscale x 32 x i16> *%a, i32 1, <vscale x 32 x i1> %pg)
+  ret void
+}
+
+define void @masked_store_split_8i32(<vscale x 8 x i32> %data, <vscale x 8 x i32> *%a, <vscale x 8 x i1> %pg) {
+; CHECK-LABEL: masked_store_split_8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    zip2 p2.h, p0.h, p1.h
+; CHECK-NEXT:    zip1 p0.h, p0.h, p1.h
+; CHECK-NEXT:    st1w { z1.s }, p2, [x0, #1, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.store.nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x i32> *%a, i32 1, <vscale x 8 x i1> %pg)
+  ret void
+}
+
+define void @masked_store_split_8i64(<vscale x 8 x i64> %data, <vscale x 8 x i64> *%a, <vscale x 8 x i1> %pg) {
+; CHECK-LABEL: masked_store_split_8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    zip2 p2.h, p0.h, p1.h
+; CHECK-NEXT:    zip1 p0.h, p0.h, p1.h
+; CHECK-NEXT:    zip2 p3.s, p2.s, p1.s
+; CHECK-NEXT:    zip1 p2.s, p2.s, p1.s
+; CHECK-NEXT:    st1d { z2.d }, p2, [x0, #2, mul vl]
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    st1d { z3.d }, p3, [x0, #3, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p2, [x0, #1, mul vl]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.store.nxv8i64(<vscale x 8 x i64> %data, <vscale x 8 x i64> *%a, i32 1, <vscale x 8 x i1> %pg)
+  ret void
+}
+
+declare void @llvm.masked.store.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>*, i32, <vscale x 32 x i1>)
+
+declare void @llvm.masked.store.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>*, i32, <vscale x 32 x i1>)
+
+declare void @llvm.masked.store.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>*, i32, <vscale x 8 x i1>)
+
+declare void @llvm.masked.store.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>*, i32, <vscale x 8 x i1>)


        

