[llvm] 0ab6a15 - [X86] Add support for using fast short rep mov for memcpy lowering.

Wed Sep 9 12:46:54 PDT 2020

Author: Hiroshi Yamauchi
Date: 2020-09-09T12:46:40-07:00
New Revision: 0ab6a1569806783fcbf6303c462f051e9b5f764b

URL: https://github.com/llvm/llvm-project/commit/0ab6a1569806783fcbf6303c462f051e9b5f764b
DIFF: https://github.com/llvm/llvm-project/commit/0ab6a1569806783fcbf6303c462f051e9b5f764b.diff

LOG: [X86] Add support for using fast short rep mov for memcpy lowering.

Disabled by default; enabled with the hidden -x86-use-fsrm-for-memcpy option on subtargets that have FSRM.

Differential Revision: https://reviews.llvm.org/D86883

Added: 
    llvm/test/CodeGen/X86/memcpy-inline-fsrm.ll

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86SelectionDAGInfo.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1cd928c1de12..ce46dd9167f1 100644

--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3109,7 +3109,7 @@ argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                          SDValue Chain, ISD::ArgFlagsTy Flags,
                                          SelectionDAG &DAG, const SDLoc &dl) {
-  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
+  SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
 
   return DAG.getMemcpy(
       Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),

diff  --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
index ce8d1d464da9..e76908ef4bc4 100644
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -24,6 +24,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "x86-selectiondag-info"
 
+static cl::opt<bool>
+    UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
+                     cl::desc("Use fast short rep mov in memcpy lowering"));
+
 bool X86SelectionDAGInfo::isBaseRegConflictPossible(
     SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
   // We cannot use TRI->hasBasePointer() until *after* we select all basic
@@ -306,6 +310,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
   const X86Subtarget &Subtarget =
       DAG.getMachineFunction().getSubtarget<X86Subtarget>();
 
+  // If enabled and available, use fast short rep mov.
+  if (UseFSRMForMemcpy && Subtarget.hasFSRM())
+    return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);
+
   /// Handle constant sizes,
   if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
     return emitConstantSizeRepmov(

diff  --git a/llvm/test/CodeGen/X86/memcpy-inline-fsrm.ll b/llvm/test/CodeGen/X86/memcpy-inline-fsrm.ll
new file mode 100644
index 000000000000..54f7973dea39
--- /dev/null
+++ b/llvm/test/CodeGen/X86/memcpy-inline-fsrm.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -x86-use-fsrm-for-memcpy -mattr=-fsrm < %s -o - | FileCheck %s --check-prefix=NOFSRM
+; RUN: llc -mtriple=x86_64-linux-gnu -x86-use-fsrm-for-memcpy -mattr=+fsrm < %s -o - | FileCheck %s --check-prefix=FSRM
+; RUN: llc -mtriple=x86_64-linux-gnu -x86-use-fsrm-for-memcpy -mcpu=haswell < %s | FileCheck %s --check-prefix=NOFSRM
+; RUN: llc -mtriple=x86_64-linux-gnu -x86-use-fsrm-for-memcpy -mcpu=icelake-client < %s | FileCheck %s --check-prefix=FSRM
+; RUN: llc -mtriple=x86_64-linux-gnu -x86-use-fsrm-for-memcpy -mcpu=icelake-server < %s | FileCheck %s --check-prefix=FSRM
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
+
+define void @test1(i8* %a, i8* %b, i64 %s) nounwind {
+; NOFSRM-LABEL: test1
+; NOFSRM:       # %bb.0:
+; NOFSRM:         jmp memcpy
+;
+; FSRM-LABEL: test1
+; FSRM:       # %bb.0:
+; FSRM-NEXT:    movq %rdx, %rcx
+; FSRM-NEXT:    rep;movsb (%rsi), %es:(%rdi)
+; FSRM-NEXT:    retq
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 %s, i1 0)
+  ret void
+}
+
+; Check that we don't crash due to a memcpy size type mismatch error ("Cannot
+; emit physreg copy instruction") in X86InstrInfo::copyPhysReg.
+%struct = type { [4096 x i8] }
+declare void @foo(%struct* byval)
+define void @test2(%struct* %x) {
+  call void @foo(%struct* byval %x)
+  ret void
+}