[llvm] r262269 - [x86, InstCombine] transform x86 AVX masked loads to LLVM intrinsics
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 29 15:16:49 PST 2016
Author: spatel
Date: Mon Feb 29 17:16:48 2016
New Revision: 262269
URL: http://llvm.org/viewvc/llvm-project?rev=262269&view=rev
Log:
[x86, InstCombine] transform x86 AVX masked loads to LLVM intrinsics
The intended effect of this patch in conjunction with:
http://reviews.llvm.org/rL259392
http://reviews.llvm.org/rL260145
is that customers using the AVX intrinsics in C will benefit from combines when
the load mask is constant:
__m128 mload_zeros(float *f) {
return _mm_maskload_ps(f, _mm_set1_epi32(0));
}
__m128 mload_fakeones(float *f) {
return _mm_maskload_ps(f, _mm_set1_epi32(1));
}
__m128 mload_ones(float *f) {
return _mm_maskload_ps(f, _mm_set1_epi32(0x80000000));
}
__m128 mload_oneset(float *f) {
return _mm_maskload_ps(f, _mm_set_epi32(0x80000000, 0, 0, 0));
}
...so none of the above will actually generate a masked load for optimized code.
This is the masked load counterpart to:
http://reviews.llvm.org/rL262064
Modified:
llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
llvm/trunk/test/Transforms/InstCombine/x86-masked-memops.ll
Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp?rev=262269&r1=262268&r2=262269&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp Mon Feb 29 17:16:48 2016
@@ -831,6 +831,39 @@ static Instruction *simplifyMaskedScatte
// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
+static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
+ Value *Ptr = II.getOperand(0);
+ Value *Mask = II.getOperand(1);
+
+ // Special case a zero mask since that's not a ConstantDataVector.
+ // This masked load instruction does nothing, so return an undef.
+ if (isa<ConstantAggregateZero>(Mask))
+ return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
+
+ auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
+ if (!ConstMask)
+ return nullptr;
+
+ // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
+ // to allow target-independent optimizations.
+
+ // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
+ // the LLVM intrinsic definition for the pointer argument.
+ unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
+ PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
+ Value *PtrCast = IC.Builder->CreateBitCast(Ptr, VecPtrTy, "castvec");
+
+ // Second, convert the x86 XMM integer vector mask to a vector of bools based
+ // on each element's most significant bit (the sign bit).
+ Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);
+
+ CallInst *NewMaskedLoad = IC.Builder->CreateMaskedLoad(PtrCast, 1, BoolMask);
+ return IC.replaceInstUsesWith(II, NewMaskedLoad);
+}
+
+// TODO: If the x86 backend knew how to convert a bool vector mask back to an
+// XMM register mask efficiently, we could transform all x86 masked intrinsics
+// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
Value *Ptr = II.getOperand(0);
Value *Mask = II.getOperand(1);
@@ -854,7 +887,6 @@ static bool simplifyX86MaskedStore(Intri
// the LLVM intrinsic definition for the pointer argument.
unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
-
Value *PtrCast = IC.Builder->CreateBitCast(Ptr, VecPtrTy, "castvec");
// Second, convert the x86 XMM integer vector mask to a vector of bools based
@@ -1630,6 +1662,12 @@ Instruction *InstCombiner::visitCallInst
return replaceInstUsesWith(*II, V);
break;
+ case Intrinsic::x86_avx_maskload_ps:
+ // TODO: Add the other masked load variants.
+ if (Instruction *I = simplifyX86MaskedLoad(*II, *this))
+ return I;
+ break;
+
case Intrinsic::x86_avx_maskstore_ps:
case Intrinsic::x86_avx_maskstore_pd:
case Intrinsic::x86_avx_maskstore_ps_256:
Modified: llvm/trunk/test/Transforms/InstCombine/x86-masked-memops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/x86-masked-memops.ll?rev=262269&r1=262268&r2=262269&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/x86-masked-memops.ll (original)
+++ llvm/trunk/test/Transforms/InstCombine/x86-masked-memops.ll Mon Feb 29 17:16:48 2016
@@ -1,5 +1,64 @@
; RUN: opt < %s -instcombine -S | FileCheck %s
+;; MASKED LOADS
+
+; If the mask isn't constant, do nothing.
+
+define <4 x float> @mload(i8* %f, <4 x i32> %mask) {
+ %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> %mask)
+ ret <4 x float> %ld
+
+; CHECK-LABEL: @mload(
+; CHECK-NEXT: %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> %mask)
+; CHECK-NEXT: ret <4 x float> %ld
+}
+
+; Zero mask is a nop.
+
+define <4 x float> @mload_zeros(i8* %f) {
+ %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> zeroinitializer)
+ ret <4 x float> %ld
+
+; CHECK-LABEL: @mload_zeros(
+; CHECK-NEXT: ret <4 x float> undef
+}
+
+; Only the sign bit matters.
+
+define <4 x float> @mload_fake_ones(i8* %f) {
+ %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 1, i32 2, i32 3, i32 2147483647>)
+ ret <4 x float> %ld
+
+; CHECK-LABEL: @mload_fake_ones(
+; CHECK-NEXT: ret <4 x float> undef
+}
+
+; All mask bits are set, so this is just a vector load.
+
+define <4 x float> @mload_real_ones(i8* %f) {
+ %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 -1, i32 -2, i32 -3, i32 2147483648>)
+ ret <4 x float> %ld
+
+; CHECK-LABEL: @mload_real_ones(
+; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x float>*
+; CHECK-NEXT: %unmaskedload = load <4 x float>, <4 x float>* %castvec
+; CHECK-NEXT: ret <4 x float> %unmaskedload
+}
+
+; It's a constant mask, so convert to an LLVM intrinsic. The backend should optimize further.
+
+define <4 x float> @mload_one_one(i8* %f) {
+ %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>)
+ ret <4 x float> %ld
+
+; CHECK-LABEL: @mload_one_one(
+; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x float>*
+; CHECK-NEXT: %1 = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %castvec, i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> undef)
+; CHECK-NEXT: ret <4 x float> %1
+}
+
+;; MASKED STORES
+
; If the mask isn't constant, do nothing.
define void @mstore(i8* %f, <4 x i32> %mask, <4 x float> %v) {
@@ -131,6 +190,10 @@ define void @mstore_v4i64(i8* %f, <4 x i
; CHECK-NEXT: ret void
}
+declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>)
+declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>)
+declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>)
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>)
declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>)
declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>)
More information about the llvm-commits
mailing list