[llvm] d030aad - [InstCombine][X86] Add tests for masked load/stores with comparisons.

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sat Sep 12 06:32:48 PDT 2020


Author: Simon Pilgrim
Date: 2020-09-12T14:32:27+01:00
New Revision: d030aad7893a8cf7a68877b8b55eed1cd632411a

URL: https://github.com/llvm/llvm-project/commit/d030aad7893a8cf7a68877b8b55eed1cd632411a
DIFF: https://github.com/llvm/llvm-project/commit/d030aad7893a8cf7a68877b8b55eed1cd632411a.diff

LOG: [InstCombine][X86] Add tests for masked load/stores with comparisons.

As detailed on PR11210, if the mask is known to come from a (sign-extended) bool vector (e.g. comparisons), then we can represent it with a generic masked load/store without losing anything.
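
For context (not part of this commit, which only adds test coverage): a minimal sketch of the fold that the TODO comments below anticipate, assuming the same lowering shape the existing constant-mask fold uses (pointer bitcast, align 1, zero passthru, since the x86 maskload already zeroes disabled lanes). The function names load_before/load_after are illustrative only.

  ; Before: the mask operand is a sign-extended compare result.
  define <4 x float> @load_before(i8* %f, <4 x i32> %src) {
    %icmp = icmp ne <4 x i32> %src, zeroinitializer
    %mask = sext <4 x i1> %icmp to <4 x i32>
    %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> %mask)
    ret <4 x float> %ld
  }

  ; After the intended fold: the <4 x i1> compare drives the generic masked
  ; load directly, so no information is lost and the backend can still pick
  ; the best lowering.
  define <4 x float> @load_after(i8* %f, <4 x i32> %src) {
    %icmp = icmp ne <4 x i32> %src, zeroinitializer
    %ptr = bitcast i8* %f to <4 x float>*
    %ld = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %ptr, i32 1, <4 x i1> %icmp, <4 x float> zeroinitializer)
    ret <4 x float> %ld
  }

  declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>)
  declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)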

Added: 
    

Modified: 
    llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll b/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll
index d845dcb5cac4..2975b1c27479 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll
@@ -12,7 +12,21 @@ define <4 x float> @mload(i8* %f, <4 x i32> %mask) {
 ;
   %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> %mask)
   ret <4 x float> %ld
+}
+
+; TODO: If the mask comes from a comparison, convert to an LLVM intrinsic. The backend should optimize further.
 
+define <4 x float> @mload_v4f32_cmp(i8* %f, <4 x i32> %src) {
+; CHECK-LABEL: @mload_v4f32_cmp(
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp ne <4 x i32> [[SRC:%.*]], zeroinitializer
+; CHECK-NEXT:    [[MASK:%.*]] = sext <4 x i1> [[ICMP]] to <4 x i32>
+; CHECK-NEXT:    [[LD:%.*]] = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* [[F:%.*]], <4 x i32> [[MASK]])
+; CHECK-NEXT:    ret <4 x float> [[LD]]
+;
+  %icmp = icmp ne <4 x i32> %src, zeroinitializer
+  %mask = sext <4 x i1> %icmp to <4 x i32>
+  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> %mask)
+  ret <4 x float> %ld
 }
 
 ; Zero mask returns a zero vector.
@@ -23,7 +37,6 @@ define <4 x float> @mload_zeros(i8* %f) {
 ;
   %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> zeroinitializer)
   ret <4 x float> %ld
-
 }
 
 ; Only the sign bit matters.
@@ -34,7 +47,6 @@ define <4 x float> @mload_fake_ones(i8* %f) {
 ;
   %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 1, i32 2, i32 3, i32 2147483647>)
   ret <4 x float> %ld
-
 }
 
 ; All mask bits are set, so this is just a vector load.
@@ -47,7 +59,6 @@ define <4 x float> @mload_real_ones(i8* %f) {
 ;
   %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 -1, i32 -2, i32 -3, i32 2147483648>)
   ret <4 x float> %ld
-
 }
 
 ; It's a constant mask, so convert to an LLVM intrinsic. The backend should optimize further.
@@ -60,7 +71,6 @@ define <4 x float> @mload_one_one(i8* %f) {
 ;
   %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>)
   ret <4 x float> %ld
-
 }
 
 ; Try doubles.
@@ -73,7 +83,6 @@ define <2 x double> @mload_one_one_double(i8* %f) {
 ;
   %ld = tail call <2 x double> @llvm.x86.avx.maskload.pd(i8* %f, <2 x i64> <i64 -1, i64 0>)
   ret <2 x double> %ld
-
 }
 
 ; Try 256-bit FP ops.
@@ -86,7 +95,24 @@ define <8 x float> @mload_v8f32(i8* %f) {
 ;
   %ld = tail call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %f, <8 x i32> <i32 0, i32 0, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0>)
   ret <8 x float> %ld
+}
 
+define <8 x float> @mload_v8f32_cmp(i8* %f, <8 x float> %src0, <8 x float> %src1) {
+; CHECK-LABEL: @mload_v8f32_cmp(
+; CHECK-NEXT:    [[ICMP0:%.*]] = fcmp one <8 x float> [[SRC0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[ICMP1:%.*]] = fcmp one <8 x float> [[SRC1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[MASK1:%.*]] = and <8 x i1> [[ICMP0]], [[ICMP1]]
+; CHECK-NEXT:    [[MASK:%.*]] = sext <8 x i1> [[MASK1]] to <8 x i32>
+; CHECK-NEXT:    [[LD:%.*]] = tail call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* [[F:%.*]], <8 x i32> [[MASK]])
+; CHECK-NEXT:    ret <8 x float> [[LD]]
+;
+  %icmp0 = fcmp one <8 x float> %src0, zeroinitializer
+  %icmp1 = fcmp one <8 x float> %src1, zeroinitializer
+  %ext0 = sext <8 x i1> %icmp0 to <8 x i32>
+  %ext1 = sext <8 x i1> %icmp1 to <8 x i32>
+  %mask = and <8 x i32> %ext0, %ext1
+  %ld = tail call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %f, <8 x i32> %mask)
+  ret <8 x float> %ld
 }
 
 define <4 x double> @mload_v4f64(i8* %f) {
@@ -97,7 +123,6 @@ define <4 x double> @mload_v4f64(i8* %f) {
 ;
   %ld = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 0, i64 0>)
   ret <4 x double> %ld
-
 }
 
 ; Try the AVX2 variants.
@@ -110,7 +135,6 @@ define <4 x i32> @mload_v4i32(i8* %f) {
 ;
   %ld = tail call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>)
   ret <4 x i32> %ld
-
 }
 
 define <2 x i64> @mload_v2i64(i8* %f) {
@@ -121,7 +145,6 @@ define <2 x i64> @mload_v2i64(i8* %f) {
 ;
   %ld = tail call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %f, <2 x i64> <i64 -1, i64 0>)
   ret <2 x i64> %ld
-
 }
 
 define <8 x i32> @mload_v8i32(i8* %f) {
@@ -132,7 +155,6 @@ define <8 x i32> @mload_v8i32(i8* %f) {
 ;
   %ld = tail call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %f, <8 x i32> <i32 0, i32 0, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0>)
   ret <8 x i32> %ld
-
 }
 
 define <4 x i64> @mload_v4i64(i8* %f) {
@@ -143,9 +165,20 @@ define <4 x i64> @mload_v4i64(i8* %f) {
 ;
   %ld = tail call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 0, i64 0>)
   ret <4 x i64> %ld
-
 }
 
+define <4 x i64> @mload_v4i64_cmp(i8* %f, <4 x i64> %src) {
+; CHECK-LABEL: @mload_v4i64_cmp(
+; CHECK-NEXT:    [[SRC_LOBIT:%.*]] = ashr <4 x i64> [[SRC:%.*]], <i64 63, i64 63, i64 63, i64 63>
+; CHECK-NEXT:    [[SRC_LOBIT_NOT:%.*]] = xor <4 x i64> [[SRC_LOBIT]], <i64 -1, i64 -1, i64 -1, i64 -1>
+; CHECK-NEXT:    [[LD:%.*]] = tail call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* [[F:%.*]], <4 x i64> [[SRC_LOBIT_NOT]])
+; CHECK-NEXT:    ret <4 x i64> [[LD]]
+;
+  %icmp = icmp sge <4 x i64> %src, zeroinitializer
+  %mask = sext <4 x i1> %icmp to <4 x i64>
+  %ld = tail call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %f, <4 x i64> %mask)
+  ret <4 x i64> %ld
+}
 
 ;; MASKED STORES
 
@@ -158,7 +191,21 @@ define void @mstore(i8* %f, <4 x i32> %mask, <4 x float> %v) {
 ;
   tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> %mask, <4 x float> %v)
   ret void
+}
 
+; TODO: If the mask comes from a comparison, convert to an LLVM intrinsic. The backend should optimize further.
+
+define void @mstore_v4f32_cmp(i8* %f, <4 x i32> %src, <4 x float> %v) {
+; CHECK-LABEL: @mstore_v4f32_cmp(
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp eq <4 x i32> [[SRC:%.*]], zeroinitializer
+; CHECK-NEXT:    [[MASK:%.*]] = sext <4 x i1> [[ICMP]] to <4 x i32>
+; CHECK-NEXT:    tail call void @llvm.x86.avx.maskstore.ps(i8* [[F:%.*]], <4 x i32> [[MASK]], <4 x float> [[V:%.*]])
+; CHECK-NEXT:    ret void
+;
+  %icmp = icmp eq <4 x i32> %src, zeroinitializer
+  %mask = sext <4 x i1> %icmp to <4 x i32>
+  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> %mask, <4 x float> %v)
+  ret void
 }
 
 ; Zero mask is a nop.
@@ -169,7 +216,6 @@ define void @mstore_zeros(i8* %f, <4 x float> %v)  {
 ;
   tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> zeroinitializer, <4 x float> %v)
   ret void
-
 }
 
 ; Only the sign bit matters.
@@ -180,7 +226,6 @@ define void @mstore_fake_ones(i8* %f, <4 x float> %v) {
 ;
   tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> <i32 1, i32 2, i32 3, i32 2147483647>, <4 x float> %v)
   ret void
-
 }
 
 ; All mask bits are set, so this is just a vector store.
@@ -193,7 +238,6 @@ define void @mstore_real_ones(i8* %f, <4 x float> %v) {
 ;
   tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> <i32 -1, i32 -2, i32 -3, i32 -2147483648>, <4 x float> %v)
   ret void
-
 }
 
 ; It's a constant mask, so convert to an LLVM intrinsic. The backend should optimize further.
@@ -206,7 +250,6 @@ define void @mstore_one_one(i8* %f, <4 x float> %v) {
 ;
   tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>, <4 x float> %v)
   ret void
-
 }
 
 ; Try doubles.
@@ -219,7 +262,6 @@ define void @mstore_one_one_double(i8* %f, <2 x double> %v) {
 ;
   tail call void @llvm.x86.avx.maskstore.pd(i8* %f, <2 x i64> <i64 -1, i64 0>, <2 x double> %v)
   ret void
-
 }
 
 ; Try 256-bit FP ops.
@@ -232,7 +274,6 @@ define void @mstore_v8f32(i8* %f, <8 x float> %v) {
 ;
   tail call void @llvm.x86.avx.maskstore.ps.256(i8* %f, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 -1, i32 -2, i32 -3, i32 -4>, <8 x float> %v)
   ret void
-
 }
 
 define void @mstore_v4f64(i8* %f, <4 x double> %v) {
@@ -243,7 +284,20 @@ define void @mstore_v4f64(i8* %f, <4 x double> %v) {
 ;
   tail call void @llvm.x86.avx.maskstore.pd.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 1, i64 2>, <4 x double> %v)
   ret void
+}
 
+define void @mstore_v4f64_cmp(i8* %f, <4 x i32> %src, <4 x double> %v) {
+; CHECK-LABEL: @mstore_v4f64_cmp(
+; CHECK-NEXT:    [[SRC_LOBIT:%.*]] = ashr <4 x i32> [[SRC:%.*]], <i32 31, i32 31, i32 31, i32 31>
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i32> [[SRC_LOBIT]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[DOTNOT:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    tail call void @llvm.x86.avx.maskstore.pd.256(i8* [[F:%.*]], <4 x i64> [[DOTNOT]], <4 x double> [[V:%.*]])
+; CHECK-NEXT:    ret void
+;
+  %icmp = icmp sge <4 x i32> %src, zeroinitializer
+  %mask = sext <4 x i1> %icmp to <4 x i64>
+  tail call void @llvm.x86.avx.maskstore.pd.256(i8* %f, <4 x i64> %mask, <4 x double> %v)
+  ret void
 }
 
 ; Try the AVX2 variants.
@@ -256,7 +310,6 @@ define void @mstore_v4i32(i8* %f, <4 x i32> %v) {
 ;
   tail call void @llvm.x86.avx2.maskstore.d(i8* %f, <4 x i32> <i32 0, i32 1, i32 -1, i32 -2>, <4 x i32> %v)
   ret void
-
 }
 
 define void @mstore_v2i64(i8* %f, <2 x i64> %v) {
@@ -278,7 +331,6 @@ define void @mstore_v8i32(i8* %f, <8 x i32> %v) {
 ;
   tail call void @llvm.x86.avx2.maskstore.d.256(i8* %f, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 -1, i32 -2, i32 -3, i32 -4>, <8 x i32> %v)
   ret void
-
 }
 
 define void @mstore_v4i64(i8* %f, <4 x i64> %v) {
@@ -289,7 +341,24 @@ define void @mstore_v4i64(i8* %f, <4 x i64> %v) {
 ;
   tail call void @llvm.x86.avx2.maskstore.q.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 1, i64 2>, <4 x i64> %v)
   ret void
+}
 
+define void @mstore_v4i64_cmp(i8* %f, <4 x i64> %src0, <4 x i64> %src1, <4 x i64> %v) {
+; CHECK-LABEL: @mstore_v4i64_cmp(
+; CHECK-NEXT:    [[ICMP0:%.*]] = icmp eq <4 x i64> [[SRC0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[ICMP1:%.*]] = icmp ne <4 x i64> [[SRC1:%.*]], zeroinitializer
+; CHECK-NEXT:    [[MASK1:%.*]] = and <4 x i1> [[ICMP0]], [[ICMP1]]
+; CHECK-NEXT:    [[MASK:%.*]] = sext <4 x i1> [[MASK1]] to <4 x i64>
+; CHECK-NEXT:    tail call void @llvm.x86.avx2.maskstore.q.256(i8* [[F:%.*]], <4 x i64> [[MASK]], <4 x i64> [[V:%.*]])
+; CHECK-NEXT:    ret void
+;
+  %icmp0 = icmp eq <4 x i64> %src0, zeroinitializer
+  %icmp1 = icmp ne <4 x i64> %src1, zeroinitializer
+  %ext0 = sext <4 x i1> %icmp0 to <4 x i64>
+  %ext1 = sext <4 x i1> %icmp1 to <4 x i64>
+  %mask = and <4 x i64> %ext0, %ext1
+  tail call void @llvm.x86.avx2.maskstore.q.256(i8* %f, <4 x i64> %mask, <4 x i64> %v)
+  ret void
 }
 
 ; The original SSE2 masked store variant.
@@ -300,10 +369,8 @@ define void @mstore_v16i8_sse2_zeros(<16 x i8> %d, i8* %p) {
 ;
   tail call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %d, <16 x i8> zeroinitializer, i8* %p)
   ret void
-
 }
 
-
 declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>)
 declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>)
 declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>)


        

