[llvm] [PreISelIntrinsicLowering] Produce a memset_pattern16 libcall for llvm.experimental.memset.pattern when available (PR #120420)

Wed Dec 18 05:07:19 PST 2024

https://github.com/asb created https://github.com/llvm/llvm-project/pull/120420

This is to enable a transition of LoopIdiomRecognize to selecting the llvm.experimental.memset.pattern intrinsic as requested in #118632 (as opposed to supporting selection of the libcall or the intrinsic). As such, although it _is_ a FIXME to add costing considerations on whether to lower to the libcall (when available) or expand directly, lacking such logic is helpful at this stage in order to minimise any unexpected code gen changes in this transition.

>From 442c72fa70af7bc4cc64b8866c918e3b89a01981 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 18 Dec 2024 09:56:55 +0000
Subject: [PATCH] [PreISelIntrinsicLowering] Produce a memset_pattern16 libcall
 for llvm.experimental.memset.pattern when available

This is to enable a transition of LoopIdiomRecognize to selecting the
llvm.experimental.memset.pattern intrinsic as requested in #118632 (as
opposed to supporting selection of the libcall or the intrinsic). As
such, although it _is_ a FIXME to add costing considerations on whether
to lower to the libcall (when available) or expand directly, lacking
such logic is helpful at this stage in order to minimise any potential
code gen changes in this transition.
---
 llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp | 90 ++++++++++++++++++-
 .../X86/memset-pattern.ll                     | 69 ++++++++++++++
 2 files changed, 158 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll

diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 4a3d1673c2a7c1..9067f29faa7b0c 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
 
 using namespace llvm;
@@ -232,6 +233,59 @@ static bool canEmitLibcall(const TargetMachine *TM, Function *F,
   return TLI->getLibcallName(LC) != nullptr;
 }
 
+// Return a value appropriate for use with the memset_pattern16 libcall, if
+// possible and if we know how. (Adapted from equivalent helper in
+// LoopIdiomRecognize).
+static Constant *getMemSetPattern16Value(MemSetPatternInst *Inst,
+                                         const TargetLibraryInfo &TLI) {
+  // FIXME: This could check for UndefValue because it can be merged into any
+  // other valid pattern.
+
+  // Don't emit libcalls if a non-default address space is being used.
+  if (Inst->getRawDest()->getType()->getPointerAddressSpace() != 0)
+    return nullptr;
+
+  Value *V = Inst->getValue();
+  const DataLayout &DL = Inst->getDataLayout();
+  Module *M = Inst->getModule();
+
+  if (!isLibFuncEmittable(M, &TLI, LibFunc_memset_pattern16))
+    return nullptr;
+
+  // If the value isn't a constant, we can't promote it to being in a constant
+  // array.  We could theoretically do a store to an alloca or something, but
+  // that doesn't seem worthwhile.
+  Constant *C = dyn_cast<Constant>(V);
+  if (!C || isa<ConstantExpr>(C))
+    return nullptr;
+
+  // Only handle simple values that are a power of two bytes in size.
+  uint64_t Size = DL.getTypeSizeInBits(V->getType());
+  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
+    return nullptr;
+
+  // Don't care enough about darwin/ppc to implement this.
+  if (DL.isBigEndian())
+    return nullptr;
+
+  // Convert to size in bytes.
+  Size /= 8;
+
+  // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
+  // if the top and bottom are the same (e.g. for vectors and large integers).
+  if (Size > 16)
+    return nullptr;
+
+  // If the constant is exactly 16 bytes, just use it.
+  if (Size == 16)
+    return C;
+
+  // Otherwise, we'll use an array of the constants.
+  unsigned ArraySize = 16 / Size;
+  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
+  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
+}
+
 // TODO: Handle atomic memcpy and memcpy.inline
 // TODO: Pass ScalarEvolution
 bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
@@ -322,7 +376,41 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
     }
     case Intrinsic::experimental_memset_pattern: {
       auto *Memset = cast<MemSetPatternInst>(Inst);
-      expandMemSetPatternAsLoop(Memset);
+      const TargetLibraryInfo &TLI = LookupTLI(*Memset->getFunction());
+      if (Constant *PatternValue = getMemSetPattern16Value(Memset, TLI)) {
+        // FIXME: There is currently no profitability calculation for emitting
+        // the libcall vs expanding the memset.pattern directly.
+        IRBuilder<> Builder(Inst);
+        Module *M = Memset->getModule();
+        const DataLayout &DL = Memset->getDataLayout();
+
+        StringRef FuncName = "memset_pattern16";
+        FunctionCallee MSP = getOrInsertLibFunc(
+            M, TLI, LibFunc_memset_pattern16, Builder.getVoidTy(),
+            Memset->getRawDest()->getType(), Builder.getPtrTy(),
+            Memset->getLength()->getType());
+        inferNonMandatoryLibFuncAttrs(M, FuncName, TLI);
+
+        // Otherwise we should form a memset_pattern16.  PatternValue is known
+        // to be an constant array of 16-bytes. Put the value into a mergable
+        // global.
+        GlobalVariable *GV = new GlobalVariable(
+            *M, PatternValue->getType(), true, GlobalValue::PrivateLinkage,
+            PatternValue, ".memset_pattern");
+        GV->setUnnamedAddr(
+            GlobalValue::UnnamedAddr::Global); // Ok to merge these.
+        GV->setAlignment(Align(16));
+        Value *PatternPtr = GV;
+        Value *NumBytes = Builder.CreateMul(
+            Builder.getInt64(
+                DL.getTypeSizeInBits(Memset->getValue()->getType()) / 8),
+            Memset->getLength());
+        CallInst *MemsetPattern16Call = Builder.CreateCall(
+            MSP, {Memset->getRawDest(), PatternPtr, NumBytes});
+        MemsetPattern16Call->setAAMetadata(Memset->getAAMetadata());
+      } else {
+        expandMemSetPatternAsLoop(Memset);
+      }
       Changed = true;
       Memset->eraseFromParent();
       break;
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll
new file mode 100644
index 00000000000000..6d5f5b8a6d4fb2
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=x86_64-apple-darwin10.0.0 -passes=pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s
+
+define void @memset_pattern_i128_1_dynvalue(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_1_dynvalue(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK:       [[LOADSTORELOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 1, i1 0)
+  ret void
+}
+
+define void @memset_pattern_i128_1(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_1(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern, i64 16)
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 0)
+  ret void
+}
+
+define void @memset_pattern_i128_1_nz_as(ptr addrspace(1) %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_1_nz_as(
+; CHECK-SAME: ptr addrspace(1) [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK:       [[LOADSTORELOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i128, ptr addrspace(1) [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    store i128 -113427455635030943652277463699152839203, ptr addrspace(1) [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr addrspace(1) %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 0)
+  ret void
+}
+
+define void @memset_pattern_i128_16(ptr %a) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_16(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.1, i64 256)
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 16, i1 0)
+  ret void
+}
+
+define void @memset_pattern_i128_x(ptr %a, i64 %x) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_x(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 16, [[X]]
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.2, i64 [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 %x, i1 0)
+  ret void
+}