[llvm] [X86] Generate `kmov` for masking integers (PR #120593)

Simon Pilgrim via llvm-commits <llvm-commits at lists.llvm.org>
Fri Dec 20 03:00:16 PST 2024


================
@@ -0,0 +1,205 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s
+
+define dso_local void @foo_16_ne(ptr nocapture noundef writeonly %c, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %mask) {
+; CHECK-LABEL: foo_16_ne:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    kmovw %ecx, %k1
+; CHECK-NEXT:    vmovups (%rdx), %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovups (%rsi), %zmm1 {%k1} {z}
+; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vmovups %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
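+  ; Broadcast the low 16 mask bits and AND with per-lane powers of two,
+  ; so lane i is active iff bit i of %mask is set.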
+  %0 = and i32 %mask, 65535
+  %.splatinsert = insertelement <16 x i32> poison, i32 %0, i64 0
+  %.splat = shufflevector <16 x i32> %.splatinsert, <16 x i32> poison, <16 x i32> zeroinitializer
+  %1 = and <16 x i32> %.splat, <i32 1, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 512, i32 1024, i32 2048, i32 4096, i32 8192, i32 16384, i32 32768>
+  %hir.cmp.45 = icmp ne <16 x i32> %1, zeroinitializer
+  %2 = tail call <16 x float> @llvm.masked.load.v16f32.p0(ptr %b, i32 4, <16 x i1> %hir.cmp.45, <16 x float> poison)
+  %3 = tail call <16 x float> @llvm.masked.load.v16f32.p0(ptr %a, i32 4, <16 x i1> %hir.cmp.45, <16 x float> poison)
+  %4 = fadd reassoc nsz arcp contract afn <16 x float> %2, %3
+  tail call void @llvm.masked.store.v16f32.p0(<16 x float> %4, ptr %c, i32 4, <16 x i1> %hir.cmp.45)
+  ret void
+}
+
+define dso_local void @foo_16_eq(ptr nocapture noundef writeonly %c, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %mask) {
+; CHECK-LABEL: foo_16_eq:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    kmovw %ecx, %k0
+; CHECK-NEXT:    knotw %k0, %k1
+; CHECK-NEXT:    vmovups (%rdx), %zmm0 {%k1} {z}
+; CHECK-NEXT:    vmovups (%rsi), %zmm1 {%k1} {z}
+; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vmovups %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
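+  ; Same bit-test idiom, but icmp eq selects lanes whose mask bit is
+  ; clear, hence the knotw in the checks above.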
+  %0 = and i32 %mask, 65535
+  %.splatinsert = insertelement <16 x i32> poison, i32 %0, i64 0
+  %.splat = shufflevector <16 x i32> %.splatinsert, <16 x i32> poison, <16 x i32> zeroinitializer
+  %1 = and <16 x i32> %.splat, <i32 1, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 512, i32 1024, i32 2048, i32 4096, i32 8192, i32 16384, i32 32768>
+  %hir.cmp.45 = icmp eq <16 x i32> %1, zeroinitializer
+  %2 = tail call <16 x float> @llvm.masked.load.v16f32.p0(ptr %b, i32 4, <16 x i1> %hir.cmp.45, <16 x float> poison)
+  %3 = tail call <16 x float> @llvm.masked.load.v16f32.p0(ptr %a, i32 4, <16 x i1> %hir.cmp.45, <16 x float> poison)
+  %4 = fadd reassoc nsz arcp contract afn <16 x float> %2, %3
+  tail call void @llvm.masked.store.v16f32.p0(<16 x float> %4, ptr %c, i32 4, <16 x i1> %hir.cmp.45)
----------------
RKSimon wrote:

Are all the masked loads/stores necessary? Aren't there simpler predicated instruction patterns that could be used?
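
For example, a plain vector select over the same bit-test idiom should still exercise the kmov lowering without any masked memory ops. A minimal sketch (hypothetical, not taken from the patch):

    define <16 x float> @foo_16_select(<16 x float> %a, <16 x float> %b, i32 noundef %mask) {
    entry:
      ; Same broadcast-and-test idiom as the tests above, feeding a select
      ; instead of masked loads/stores.
      %0 = and i32 %mask, 65535
      %.splatinsert = insertelement <16 x i32> poison, i32 %0, i64 0
      %.splat = shufflevector <16 x i32> %.splatinsert, <16 x i32> poison, <16 x i32> zeroinitializer
      %1 = and <16 x i32> %.splat, <i32 1, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 512, i32 1024, i32 2048, i32 4096, i32 8192, i32 16384, i32 32768>
      %cmp = icmp ne <16 x i32> %1, zeroinitializer
      %sel = select <16 x i1> %cmp, <16 x float> %a, <16 x float> %b
      ret <16 x float> %sel
    }

On skylake-avx512 something like this would be expected to lower to a kmovw feeding a single vblendmps (or equivalent), with no masked memory instructions involved.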

https://github.com/llvm/llvm-project/pull/120593

