[llvm] 7a34145 - Reland "[AArch64][SVE][InstCombine] Combine contiguous gather/scatter to load/store"

Peter Waller via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 3 06:45:03 PDT 2021


Author: Peter Waller
Date: 2021-11-03T13:42:14Z
New Revision: 7a34145f407e4ec652111f5e9483a36f816a6a3a

URL: https://github.com/llvm/llvm-project/commit/7a34145f407e4ec652111f5e9483a36f816a6a3a
DIFF: https://github.com/llvm/llvm-project/commit/7a34145f407e4ec652111f5e9483a36f816a6a3a.diff

LOG: Reland "[AArch64][SVE][InstCombine] Combine contiguous gather/scatter to load/store"

This reverts commit 753eba64213ef20195644994df53d564f30eb65f.

Contiguous gather => masked load:

  (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
  => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)

Contiguous scatter => masked store:

  (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
  => (masked.store Value (gep BasePtr IndexBase) Align Mask)
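
For illustration, a minimal before/after IR sketch of the gather case
(mirroring the stride-1 test added below; %pred, %x, %base, %ptr and
%vptr are placeholder names; the scatter case is analogous):

  ; Before: gather through a unit-stride index vector.
  %idx = call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 %base, i64 1)
  %ld = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1> %pred, double* %x, <vscale x 2 x i64> %idx)

  ; After: a plain GEP feeding a masked load.
  %ptr = getelementptr double, double* %x, i64 %base
  %vptr = bitcast double* %ptr to <vscale x 2 x double>*
  %ld = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>* %vptr, i32 1, <vscale x 2 x i1> %pred, <vscale x 2 x double> zeroinitializer)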

Tests with <vscale x 2 x double>:

For each of gather and scatter: a positive test (stride 1), a negative
test (stride 2), and an alignment-propagation test.

Differential Revision: https://reviews.llvm.org/D112076

Added: 
    llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-gatherscatter.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 52429ef94b0ba..ae4b7e596a0b8 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -864,6 +864,74 @@ static Optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
   return None;
 }
 
+static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC,
+                                                         IntrinsicInst &II) {
+  Value *Mask = II.getOperand(0);
+  Value *BasePtr = II.getOperand(1);
+  Value *Index = II.getOperand(2);
+  Type *Ty = II.getType();
+  Type *BasePtrTy = BasePtr->getType();
+  Value *PassThru = ConstantAggregateZero::get(Ty);
+
+  // Contiguous gather => masked load.
+  // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
+  // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
+  Value *IndexBase;
+  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
+                       m_Value(IndexBase), m_SpecificInt(1)))) {
+    IRBuilder<> Builder(II.getContext());
+    Builder.SetInsertPoint(&II);
+
+    Align Alignment =
+        BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
+
+    Type *VecPtrTy = PointerType::getUnqual(Ty);
+    Value *Ptr = Builder.CreateGEP(BasePtrTy->getPointerElementType(), BasePtr,
+                                   IndexBase);
+    Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
+    CallInst *MaskedLoad =
+        Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
+    MaskedLoad->takeName(&II);
+    return IC.replaceInstUsesWith(II, MaskedLoad);
+  }
+
+  return None;
+}
+
+static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC,
+                                                          IntrinsicInst &II) {
+  Value *Val = II.getOperand(0);
+  Value *Mask = II.getOperand(1);
+  Value *BasePtr = II.getOperand(2);
+  Value *Index = II.getOperand(3);
+  Type *Ty = Val->getType();
+  Type *BasePtrTy = BasePtr->getType();
+
+  // Contiguous scatter => masked store.
+// (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
+  // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
+  Value *IndexBase;
+  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
+                       m_Value(IndexBase), m_SpecificInt(1)))) {
+    IRBuilder<> Builder(II.getContext());
+    Builder.SetInsertPoint(&II);
+
+    Align Alignment =
+        BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
+
+    Value *Ptr = Builder.CreateGEP(BasePtrTy->getPointerElementType(), BasePtr,
+                                   IndexBase);
+    Type *VecPtrTy = PointerType::getUnqual(Ty);
+    Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
+
+    (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
+
+    return IC.eraseInstFromFunction(II);
+  }
+
+  return None;
+}
+
 Optional<Instruction *>
 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                      IntrinsicInst &II) const {
@@ -915,6 +983,10 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
   case Intrinsic::aarch64_sve_zip1:
   case Intrinsic::aarch64_sve_zip2:
     return instCombineSVEZip(IC, II);
+  case Intrinsic::aarch64_sve_ld1_gather_index:
+    return instCombineLD1GatherIndex(IC, II);
+  case Intrinsic::aarch64_sve_st1_scatter_index:
+    return instCombineST1ScatterIndex(IC, II);
   }
 
   return None;
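
Why a step of 1 makes the access contiguous (an informal derivation, not
part of the patch): sve.index(IndexBase, Step) produces the vector
<IndexBase, IndexBase+Step, IndexBase+2*Step, ...>, so lane i of the
gather/scatter addresses BasePtr[IndexBase + i*Step]:

  ; Step == 1:  &BasePtr[IndexBase + i]    ==  (&BasePtr[IndexBase])[i]   ; contiguous
  ; Step == 2:  &BasePtr[IndexBase + 2*i]                                 ; strided: combine does not fire (negative tests below)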

diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-gatherscatter.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-gatherscatter.ll
new file mode 100644
index 0000000000000..2f665d5d6610f
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-gatherscatter.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine -dce < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;;
+;; Gathers.
+;;
+
+define <vscale x 2 x double> @test_ld1_gather_index_nxv2f64_stride1(<vscale x 2 x i1> %pred, double* %x, i64 %base) #0 {
+; CHECK-LABEL: @test_ld1_gather_index_nxv2f64_stride1(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[BASE:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[TMP1]] to <vscale x 2 x double>*
+; CHECK-NEXT:    [[LD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>* [[TMP2]], i32 1, <vscale x 2 x i1> [[PRED:%.*]], <vscale x 2 x double> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 2 x double> [[LD]]
+;
+  %idx = tail call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 %base, i64 1)
+  %ld = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1> %pred, double* %x, <vscale x 2 x i64> %idx)
+  ret <vscale x 2 x double> %ld
+}
+
+define <vscale x 2 x double> @test_ld1_gather_index_nxv2f64_stride2_negtest(<vscale x 2 x i1> %pred, double* %x, i64 %base) #0 {
+; CHECK-LABEL: @test_ld1_gather_index_nxv2f64_stride2_negtest(
+; CHECK-NEXT:    [[IDX:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 [[BASE:%.*]], i64 2)
+; CHECK-NEXT:    [[LD:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1> [[PRED:%.*]], double* [[X:%.*]], <vscale x 2 x i64> [[IDX]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[LD]]
+;
+  %idx = tail call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 %base, i64 2)
+  %ld = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1> %pred, double* %x, <vscale x 2 x i64> %idx)
+  ret <vscale x 2 x double> %ld
+}
+
+define <vscale x 2 x double> @test_ld1_gather_index_nxv2f64_stride1_align8(<vscale x 2 x i1> %pred, double* align 8 %x, i64 %base) #0 {
+; CHECK-LABEL: @test_ld1_gather_index_nxv2f64_stride1_align8(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[BASE:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[TMP1]] to <vscale x 2 x double>*
+; CHECK-NEXT:    [[LD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>* [[TMP2]], i32 8, <vscale x 2 x i1> [[PRED:%.*]], <vscale x 2 x double> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 2 x double> [[LD]]
+;
+  %idx = tail call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 %base, i64 1)
+  %ld = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1> %pred, double* %x, <vscale x 2 x i64> %idx)
+  ret <vscale x 2 x double> %ld
+}
+
+;;
+;; Scatters.
+;;
+
+define void @test_st1_scatter_index_nxv2f64_stride1(<vscale x 2 x i1> %pred, double* %x, i64 %base, <vscale x 2 x double> %val) #0 {
+; CHECK-LABEL: @test_st1_scatter_index_nxv2f64_stride1(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[BASE:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[TMP1]] to <vscale x 2 x double>*
+; CHECK-NEXT:    call void @llvm.masked.store.nxv2f64.p0nxv2f64(<vscale x 2 x double> [[VAL:%.*]], <vscale x 2 x double>* [[TMP2]], i32 1, <vscale x 2 x i1> [[PRED:%.*]])
+; CHECK-NEXT:    ret void
+;
+  %idx = tail call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 %base, i64 1)
+  tail call void @llvm.aarch64.sve.st1.scatter.index.nxv2f64(<vscale x 2 x double> %val, <vscale x 2 x i1> %pred, double* %x, <vscale x 2 x i64> %idx)
+  ret void
+}
+
+define void @test_st1_scatter_index_nxv2f64_stride2_negtest(<vscale x 2 x i1> %pred, double* %x, i64 %base, <vscale x 2 x double> %val) #0 {
+; CHECK-LABEL: @test_st1_scatter_index_nxv2f64_stride2_negtest(
+; CHECK-NEXT:    [[IDX:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 [[BASE:%.*]], i64 2)
+; CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.scatter.index.nxv2f64(<vscale x 2 x double> [[VAL:%.*]], <vscale x 2 x i1> [[PRED:%.*]], double* [[X:%.*]], <vscale x 2 x i64> [[IDX]])
+; CHECK-NEXT:    ret void
+;
+  %idx = tail call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 %base, i64 2)
+  tail call void @llvm.aarch64.sve.st1.scatter.index.nxv2f64(<vscale x 2 x double> %val, <vscale x 2 x i1> %pred, double* %x, <vscale x 2 x i64> %idx)
+  ret void
+}
+
+define void @test_st1_scatter_index_nxv2f64_stride1_align8(<vscale x 2 x i1> %pred, double* align 8 %x, i64 %base, <vscale x 2 x double> %val) #0 {
+; CHECK-LABEL: @test_st1_scatter_index_nxv2f64_stride1_align8(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[BASE:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[TMP1]] to <vscale x 2 x double>*
+; CHECK-NEXT:    call void @llvm.masked.store.nxv2f64.p0nxv2f64(<vscale x 2 x double> [[VAL:%.*]], <vscale x 2 x double>* [[TMP2]], i32 8, <vscale x 2 x i1> [[PRED:%.*]])
+; CHECK-NEXT:    ret void
+;
+  %idx = tail call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 %base, i64 1)
+  tail call void @llvm.aarch64.sve.st1.scatter.index.nxv2f64(<vscale x 2 x double> %val, <vscale x 2 x i1> %pred, double* %x, <vscale x 2 x i64> %idx)
+  ret void
+}
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64, i64)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1.scatter.index.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*, <vscale x 2 x i64>)
+
+attributes #0 = { "target-features"="+sve" }
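
To run just this test after building LLVM (assuming llvm-lit from your
build tree is on PATH):

  llvm-lit llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-gatherscatter.ll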