[llvm] [X86] For minsize, use size for alignment, rather than actual alignment (PR #87003)

Mon Jul 15 15:06:09 PDT 2024

https://github.com/AtariDreams updated https://github.com/llvm/llvm-project/pull/87003

>From 0003d438929c376a2f00421f0730116d5d2c39a7 Mon Sep 17 00:00:00 2001
From: Rose <gfunni234 at gmail.com>
Date: Fri, 5 Apr 2024 14:16:40 -0400
Subject: [PATCH 1/2] [X86] Pre-commit test (NFC)

---
 .../CodeGen/X86/memset-vs-memset-inline.ll    | 137 ++++++++++++++++++
 1 file changed, 137 insertions(+)

diff --git a/llvm/test/CodeGen/X86/memset-vs-memset-inline.ll b/llvm/test/CodeGen/X86/memset-vs-memset-inline.ll
index b8fdd936b4389..90a3f8e785e90 100644
--- a/llvm/test/CodeGen/X86/memset-vs-memset-inline.ll
+++ b/llvm/test/CodeGen/X86/memset-vs-memset-inline.ll
@@ -163,3 +163,140 @@ define void @inlined_set_doesnt_call_external_function(ptr %a, i8 %value) nounwi
   tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 1024, i1 0)
   ret void
 }
+
+define void @memset_inlined_insize(ptr %a) nounwind minsize {
+; CHECK-LABEL: memset_inlined_insize:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
+; CHECK-NEXT:    movq %rax, 1016(%rdi)
+; CHECK-NEXT:    movq %rax, 1008(%rdi)
+; CHECK-NEXT:    movq %rax, 1000(%rdi)
+; CHECK-NEXT:    movq %rax, 992(%rdi)
+; CHECK-NEXT:    movq %rax, 984(%rdi)
+; CHECK-NEXT:    movq %rax, 976(%rdi)
+; CHECK-NEXT:    movq %rax, 968(%rdi)
+; CHECK-NEXT:    movq %rax, 960(%rdi)
+; CHECK-NEXT:    movq %rax, 952(%rdi)
+; CHECK-NEXT:    movq %rax, 944(%rdi)
+; CHECK-NEXT:    movq %rax, 936(%rdi)
+; CHECK-NEXT:    movq %rax, 928(%rdi)
+; CHECK-NEXT:    movq %rax, 920(%rdi)
+; CHECK-NEXT:    movq %rax, 912(%rdi)
+; CHECK-NEXT:    movq %rax, 904(%rdi)
+; CHECK-NEXT:    movq %rax, 896(%rdi)
+; CHECK-NEXT:    movq %rax, 888(%rdi)
+; CHECK-NEXT:    movq %rax, 880(%rdi)
+; CHECK-NEXT:    movq %rax, 872(%rdi)
+; CHECK-NEXT:    movq %rax, 864(%rdi)
+; CHECK-NEXT:    movq %rax, 856(%rdi)
+; CHECK-NEXT:    movq %rax, 848(%rdi)
+; CHECK-NEXT:    movq %rax, 840(%rdi)
+; CHECK-NEXT:    movq %rax, 832(%rdi)
+; CHECK-NEXT:    movq %rax, 824(%rdi)
+; CHECK-NEXT:    movq %rax, 816(%rdi)
+; CHECK-NEXT:    movq %rax, 808(%rdi)
+; CHECK-NEXT:    movq %rax, 800(%rdi)
+; CHECK-NEXT:    movq %rax, 792(%rdi)
+; CHECK-NEXT:    movq %rax, 784(%rdi)
+; CHECK-NEXT:    movq %rax, 776(%rdi)
+; CHECK-NEXT:    movq %rax, 768(%rdi)
+; CHECK-NEXT:    movq %rax, 760(%rdi)
+; CHECK-NEXT:    movq %rax, 752(%rdi)
+; CHECK-NEXT:    movq %rax, 744(%rdi)
+; CHECK-NEXT:    movq %rax, 736(%rdi)
+; CHECK-NEXT:    movq %rax, 728(%rdi)
+; CHECK-NEXT:    movq %rax, 720(%rdi)
+; CHECK-NEXT:    movq %rax, 712(%rdi)
+; CHECK-NEXT:    movq %rax, 704(%rdi)
+; CHECK-NEXT:    movq %rax, 696(%rdi)
+; CHECK-NEXT:    movq %rax, 688(%rdi)
+; CHECK-NEXT:    movq %rax, 680(%rdi)
+; CHECK-NEXT:    movq %rax, 672(%rdi)
+; CHECK-NEXT:    movq %rax, 664(%rdi)
+; CHECK-NEXT:    movq %rax, 656(%rdi)
+; CHECK-NEXT:    movq %rax, 648(%rdi)
+; CHECK-NEXT:    movq %rax, 640(%rdi)
+; CHECK-NEXT:    movq %rax, 632(%rdi)
+; CHECK-NEXT:    movq %rax, 624(%rdi)
+; CHECK-NEXT:    movq %rax, 616(%rdi)
+; CHECK-NEXT:    movq %rax, 608(%rdi)
+; CHECK-NEXT:    movq %rax, 600(%rdi)
+; CHECK-NEXT:    movq %rax, 592(%rdi)
+; CHECK-NEXT:    movq %rax, 584(%rdi)
+; CHECK-NEXT:    movq %rax, 576(%rdi)
+; CHECK-NEXT:    movq %rax, 568(%rdi)
+; CHECK-NEXT:    movq %rax, 560(%rdi)
+; CHECK-NEXT:    movq %rax, 552(%rdi)
+; CHECK-NEXT:    movq %rax, 544(%rdi)
+; CHECK-NEXT:    movq %rax, 536(%rdi)
+; CHECK-NEXT:    movq %rax, 528(%rdi)
+; CHECK-NEXT:    movq %rax, 520(%rdi)
+; CHECK-NEXT:    movq %rax, 512(%rdi)
+; CHECK-NEXT:    movq %rax, 504(%rdi)
+; CHECK-NEXT:    movq %rax, 496(%rdi)
+; CHECK-NEXT:    movq %rax, 488(%rdi)
+; CHECK-NEXT:    movq %rax, 480(%rdi)
+; CHECK-NEXT:    movq %rax, 472(%rdi)
+; CHECK-NEXT:    movq %rax, 464(%rdi)
+; CHECK-NEXT:    movq %rax, 456(%rdi)
+; CHECK-NEXT:    movq %rax, 448(%rdi)
+; CHECK-NEXT:    movq %rax, 440(%rdi)
+; CHECK-NEXT:    movq %rax, 432(%rdi)
+; CHECK-NEXT:    movq %rax, 424(%rdi)
+; CHECK-NEXT:    movq %rax, 416(%rdi)
+; CHECK-NEXT:    movq %rax, 408(%rdi)
+; CHECK-NEXT:    movq %rax, 400(%rdi)
+; CHECK-NEXT:    movq %rax, 392(%rdi)
+; CHECK-NEXT:    movq %rax, 384(%rdi)
+; CHECK-NEXT:    movq %rax, 376(%rdi)
+; CHECK-NEXT:    movq %rax, 368(%rdi)
+; CHECK-NEXT:    movq %rax, 360(%rdi)
+; CHECK-NEXT:    movq %rax, 352(%rdi)
+; CHECK-NEXT:    movq %rax, 344(%rdi)
+; CHECK-NEXT:    movq %rax, 336(%rdi)
+; CHECK-NEXT:    movq %rax, 328(%rdi)
+; CHECK-NEXT:    movq %rax, 320(%rdi)
+; CHECK-NEXT:    movq %rax, 312(%rdi)
+; CHECK-NEXT:    movq %rax, 304(%rdi)
+; CHECK-NEXT:    movq %rax, 296(%rdi)
+; CHECK-NEXT:    movq %rax, 288(%rdi)
+; CHECK-NEXT:    movq %rax, 280(%rdi)
+; CHECK-NEXT:    movq %rax, 272(%rdi)
+; CHECK-NEXT:    movq %rax, 264(%rdi)
+; CHECK-NEXT:    movq %rax, 256(%rdi)
+; CHECK-NEXT:    movq %rax, 248(%rdi)
+; CHECK-NEXT:    movq %rax, 240(%rdi)
+; CHECK-NEXT:    movq %rax, 232(%rdi)
+; CHECK-NEXT:    movq %rax, 224(%rdi)
+; CHECK-NEXT:    movq %rax, 216(%rdi)
+; CHECK-NEXT:    movq %rax, 208(%rdi)
+; CHECK-NEXT:    movq %rax, 200(%rdi)
+; CHECK-NEXT:    movq %rax, 192(%rdi)
+; CHECK-NEXT:    movq %rax, 184(%rdi)
+; CHECK-NEXT:    movq %rax, 176(%rdi)
+; CHECK-NEXT:    movq %rax, 168(%rdi)
+; CHECK-NEXT:    movq %rax, 160(%rdi)
+; CHECK-NEXT:    movq %rax, 152(%rdi)
+; CHECK-NEXT:    movq %rax, 144(%rdi)
+; CHECK-NEXT:    movq %rax, 136(%rdi)
+; CHECK-NEXT:    movq %rax, 128(%rdi)
+; CHECK-NEXT:    movq %rax, 120(%rdi)
+; CHECK-NEXT:    movq %rax, 112(%rdi)
+; CHECK-NEXT:    movq %rax, 104(%rdi)
+; CHECK-NEXT:    movq %rax, 96(%rdi)
+; CHECK-NEXT:    movq %rax, 88(%rdi)
+; CHECK-NEXT:    movq %rax, 80(%rdi)
+; CHECK-NEXT:    movq %rax, 72(%rdi)
+; CHECK-NEXT:    movq %rax, 64(%rdi)
+; CHECK-NEXT:    movq %rax, 56(%rdi)
+; CHECK-NEXT:    movq %rax, 48(%rdi)
+; CHECK-NEXT:    movq %rax, 40(%rdi)
+; CHECK-NEXT:    movq %rax, 32(%rdi)
+; CHECK-NEXT:    movq %rax, 24(%rdi)
+; CHECK-NEXT:    movq %rax, 16(%rdi)
+; CHECK-NEXT:    movq %rax, 8(%rdi)
+; CHECK-NEXT:    movq %rax, (%rdi)
+; CHECK-NEXT:    retq
+  tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 42, i64 1024, i1 0)
+  ret void
+}

>From dbeacd1e0d95ad1603ee1715f19bd25f1c43c6b0 Mon Sep 17 00:00:00 2001
From: Rose <gfunni234 at gmail.com>
Date: Thu, 28 Mar 2024 16:19:34 -0400
Subject: [PATCH 2/2] [X86] For minsize, use size for alignment, rather than
 actual alignment

If we have minsize, then don't care about the alignment.
On x86, the CPU doesn't care and neither should you.

As long as the count is aligned, we can use less instructions.
---
 llvm/lib/Target/X86/X86SelectionDAGInfo.cpp | 65 ++++++++++++++-------
 llvm/test/CodeGen/X86/memset-minsize.ll     |  4 +-
 2 files changed, 47 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
index e5f07f230fe6c..fcc093dcef1c8 100644
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -65,11 +65,20 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
   // If not DWORD aligned or size is more than the threshold, call the library.
   // The libc version is likely to be faster for these cases. It can use the
   // address value and run time information about the CPU.
-  if (Alignment < Align(4) || !ConstantSize ||
-      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
+  if (!AlwaysInline &&
+      (Alignment < Align(4) || !ConstantSize ||
+       ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()))
     return SDValue();
 
-  uint64_t SizeVal = ConstantSize->getZExtValue();
+  // If we have minsize, then don't care about the alignment.
+  // On x86, the CPU doesn't care and neither should you.
+  // As long as the count is aligned, we can use the minimum number of
+  // instructions without always having to resort to stosb.
+  //
+  // Because this is a feature specific to x86, we must handle it here.
+  if (DAG.getMachineFunction().getFunction().hasMinSize())
+    Alignment = commonAlignment(Align(Subtarget.is64Bit() ? 8 : 4), SizeVal);
+
   SDValue InGlue;
   EVT AVT;
   SDValue Count;
@@ -102,12 +111,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
       Count = DAG.getIntPtrConstant(SizeVal, dl);
     }
 
-    if (AVT.bitsGT(MVT::i8)) {
-      unsigned UBytes = AVT.getSizeInBits() / 8;
-      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
-      BytesLeft = SizeVal % UBytes;
-    }
-
+    const uint64_t BlockBytes = AVT.getSizeInBits() / 8;
+    const uint64_t BlockCount = SizeVal / BlockBytes;
+    Count = DAG.getIntPtrConstant(BlockCount, dl);
+    BytesLeft = SizeVal % BlockBytes;
     Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
                              InGlue);
     InGlue = Chain.getValue(1);
@@ -119,11 +126,11 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
   }
 
   bool Use64BitRegs = Subtarget.isTarget64BitLP64();
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
-                           Count, InGlue);
+  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX, Count,
+                           InGlue);
   InGlue = Chain.getValue(1);
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
-                           Dst, InGlue);
+  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI, Dst,
+                           InGlue);
   InGlue = Chain.getValue(1);
 
   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
@@ -131,6 +138,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
   SDValue RepStos = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
 
   /// RepStos can process the whole length.
+  //
+  // Because we changed the alignment earlier in the function to work on size
+  // when we have the minsize attribute, this is guaranteed to be 0 when we get
+  // here.
   if (BytesLeft == 0)
     return RepStos;
 
@@ -146,7 +157,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
                     DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                 DAG.getConstant(Offset, dl, AddrVT)),
                     Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
-                    isVolatile, AlwaysInline,
+                    isVolatile, /* AlwaysInline */ true,
                     /* isTailCall */ false, DstPtrInfo.getWithOffset(Offset)));
 
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
@@ -222,13 +233,32 @@ static SDValue emitConstantSizeRepmov(
   assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
   /// We assume runtime memcpy will do a better job for unaligned copies when
   /// ERMS is not present.
-  if (!AlwaysInline && (Alignment.value() & 3) != 0)
+  if (!AlwaysInline && (Alignment < Align(4)))
     return SDValue();
 
+  // If we have minsize, then don't care about the alignment.
+  // On x86, the CPU doesn't care and neither should you.
+  // As long as the count is aligned, we can use the minimum number of
+  // instructions without always having to resort to movsb
+  //
+  // Because this is a feature specific to x86, we must handle it here.
+
+  if (DAG.getMachineFunction().getFunction().hasMinSize())
+    Alignment = commonAlignment(Align(Subtarget.is64Bit() ? 8 : 4), Size);
+
   const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
   const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
   const uint64_t BlockCount = Size / BlockBytes;
   const uint64_t BytesLeft = Size % BlockBytes;
+
+  if (DAG.getMachineFunction().getFunction().hasMinSize()) {
+    // Use the one instruction determined. Because we changed the alignment
+    // earlier in the function to work on size when we have the minsize
+    // attribute, it is guaranteed to process the entire length.
+    return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
+                       DAG.getIntPtrConstant(BlockCount, dl), BlockType);
+  }
+
   SDValue RepMovs =
       emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
                   DAG.getIntPtrConstant(BlockCount, dl), BlockType);
@@ -239,11 +269,6 @@ static SDValue emitConstantSizeRepmov(
 
   assert(BytesLeft && "We have leftover at this point");
 
-  /// In case we optimize for size we use repmovsb even if it's less efficient
-  /// so we can save the loads/stores of the leftover.
-  if (DAG.getMachineFunction().getFunction().hasMinSize())
-    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
-
   // Handle the last 1 - 7 bytes.
   SmallVector<SDValue, 4> Results;
   Results.push_back(RepMovs);
diff --git a/llvm/test/CodeGen/X86/memset-minsize.ll b/llvm/test/CodeGen/X86/memset-minsize.ll
index cc0f2156262bb..6d8fbc88e9020 100644
--- a/llvm/test/CodeGen/X86/memset-minsize.ll
+++ b/llvm/test/CodeGen/X86/memset-minsize.ll
@@ -14,10 +14,10 @@ entry:
 define void @small_memset_to_rep_stos(ptr %ptr) minsize nounwind {
 ; CHECK-LABEL: small_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq $32
+; CHECK-NEXT:    pushq $16
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    rep;stosl %eax, %es:(%rdi)
+; CHECK-NEXT:    rep;stosq %rax, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   call void @llvm.memset.p0.i32(ptr align 4 %ptr, i8 0, i32 128, i1 false)