[llvm] acb30f6 - [X86] For 32-bit targets, emit two-byte NOP when possible

Wed Jun 17 10:45:04 PDT 2020

Author: Alexandre Ganea
Date: 2020-06-17T13:44:38-04:00
New Revision: acb30f6856c34b929b30bfd76dc938a1087af0a9

URL: https://github.com/llvm/llvm-project/commit/acb30f6856c34b929b30bfd76dc938a1087af0a9
DIFF: https://github.com/llvm/llvm-project/commit/acb30f6856c34b929b30bfd76dc938a1087af0a9.diff

LOG: [X86] For 32-bit targets, emit two-byte NOP when possible

In order to support hot-patching, we need to make sure the first emitted instruction in a function is a two-byte+ op. This is already the case on x86_64, which seems to always emit two-byte+ ops. However, on 32-bit targets this wasn't the case.

PATCHABLE_OP now lowers to an XCHG AX, AX (66 90), like MSVC does. However, when targeting pentium3 (/arch:SSE) or i386 (/arch:IA32) targets, we generate MOV EDI, EDI (8B FF), like MSVC does. This is for compatibility with older tools that rely on this two-byte pattern.

Differential Revision: https://reviews.llvm.org/D81301

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86MCInstLower.cpp
    llvm/test/CodeGen/X86/patchable-function-entry.ll
    llvm/test/CodeGen/X86/patchable-prologue.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index f966fe654769..60f29ae5e6a5 100644

--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -1083,26 +1083,23 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
 /// target cpu.  15-bytes is the longest single NOP instruction, but some
 /// platforms can't decode the longest forms efficiently.
 static unsigned maxLongNopLength(const X86Subtarget *Subtarget) {
-  uint64_t MaxNopLength = 10;
   if (Subtarget->getFeatureBits()[X86::ProcIntelSLM])
-    MaxNopLength = 7;
-  else if (Subtarget->getFeatureBits()[X86::FeatureFast15ByteNOP])
-    MaxNopLength = 15;
-  else if (Subtarget->getFeatureBits()[X86::FeatureFast11ByteNOP])
-    MaxNopLength = 11;
-  return MaxNopLength;
+    return 7;
+  if (Subtarget->getFeatureBits()[X86::FeatureFast15ByteNOP])
+    return 15;
+  if (Subtarget->getFeatureBits()[X86::FeatureFast11ByteNOP])
+    return 11;
+  if (Subtarget->getFeatureBits()[X86::FeatureNOPL] || Subtarget->is64Bit())
+    return 10;
+  if (Subtarget->is32Bit())
+    return 2;
+  return 1;
 }
 
 /// Emit the largest nop instruction smaller than or equal to \p NumBytes
 /// bytes.  Return the size of nop emitted.
 static unsigned emitNop(MCStreamer &OS, unsigned NumBytes,
                         const X86Subtarget *Subtarget) {
-  if (!Subtarget->is64Bit()) {
-    // TODO Do additional checking if the CPU supports multi-byte nops.
-    OS.emitInstruction(MCInstBuilder(X86::NOOP), *Subtarget);
-    return 1;
-  }
-
   // Cap a single nop emission at the profitable value for the target
   NumBytes = std::min(NumBytes, maxLongNopLength(Subtarget));
 
@@ -1342,7 +1339,17 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
   CodeEmitter->encodeInstruction(MCI, VecOS, Fixups, getSubtargetInfo());
 
   if (Code.size() < MinSize) {
-    if (MinSize == 2 && Opcode == X86::PUSH64r) {
+    if (MinSize == 2 && Subtarget->is32Bit() &&
+        Subtarget->isTargetWindowsMSVC() &&
+        (Subtarget->getCPU().empty() || Subtarget->getCPU() == "pentium3")) {
+      // For compatibility reasons, when targeting MSVC, it is important to
+      // generate a 'legacy' NOP in the form of a 8B FF MOV EDI, EDI. Some tools
+      // rely specifically on this pattern to be able to patch a function.
+      // This is only for 32-bit targets, when using /arch:IA32 or /arch:SSE.
+      OutStreamer->emitInstruction(
+          MCInstBuilder(X86::MOV32rr_REV).addReg(X86::EDI).addReg(X86::EDI),
+          *Subtarget);
+    } else if (MinSize == 2 && Opcode == X86::PUSH64r) {
       // This is an optimization that lets us get away without emitting a nop in
       // many cases.
       //

diff  --git a/llvm/test/CodeGen/X86/patchable-function-entry.ll b/llvm/test/CodeGen/X86/patchable-function-entry.ll
index 9be41f256fd5..8d0984327ffc 100644
--- a/llvm/test/CodeGen/X86/patchable-function-entry.ll
+++ b/llvm/test/CodeGen/X86/patchable-function-entry.ll
@@ -31,7 +31,7 @@ define void @f1() "patchable-function-entry"="1" {
 define void @f2() "patchable-function-entry"="2" {
 ; CHECK-LABEL: f2:
 ; CHECK-NEXT: .Lfunc_begin2:
-; 32-COUNT-2:  nop
+; 32:          xchgw %ax, %ax
 ; 64:          xchgw %ax, %ax
 ; CHECK-NEXT:  ret
 ; CHECK:       .section __patchable_function_entries,"awo",@progbits,f2{{$}}
@@ -46,7 +46,8 @@ $f3 = comdat any
 define void @f3() "patchable-function-entry"="3" comdat {
 ; CHECK-LABEL: f3:
 ; CHECK-NEXT: .Lfunc_begin3:
-; 32-COUNT-3:  nop
+; 32:          xchgw %ax, %ax
+; 32-NEXT:     nop
 ; 64:          nopl (%rax)
 ; CHECK:       ret
 ; CHECK:       .section __patchable_function_entries,"aGwo",@progbits,f3,comdat,f3{{$}}
@@ -61,7 +62,8 @@ $f5 = comdat any
 define void @f5() "patchable-function-entry"="5" comdat {
 ; CHECK-LABEL: f5:
 ; CHECK-NEXT: .Lfunc_begin4:
-; 32-COUNT-5:  nop
+; 32-COUNT-2:  xchgw %ax, %ax
+; 32-NEXT:     nop
 ; 64:          nopl 8(%rax,%rax)
 ; CHECK-NEXT:  ret
 ; CHECK:       .section __patchable_function_entries,"aGwo",@progbits,f5,comdat,f5{{$}}

diff  --git a/llvm/test/CodeGen/X86/patchable-prologue.ll b/llvm/test/CodeGen/X86/patchable-prologue.ll
index 38a2e8852b6b..b9537f447b41 100644
--- a/llvm/test/CodeGen/X86/patchable-prologue.ll
+++ b/llvm/test/CodeGen/X86/patchable-prologue.ll
@@ -1,5 +1,14 @@
 ; RUN: llc -verify-machineinstrs -filetype=obj -o - -mtriple=x86_64-apple-macosx < %s | llvm-objdump --triple=x86_64-apple-macosx -d - | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=x86_64-apple-macosx < %s | FileCheck %s --check-prefix=CHECK-ALIGN
+; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386 < %s | FileCheck %s --check-prefixes=32,32CFI,XCHG
+; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc < %s | FileCheck %s --check-prefixes=32,MOV
+; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc -mcpu=pentium3 < %s | FileCheck %s --check-prefixes=32,MOV
+; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc -mcpu=pentium4 < %s | FileCheck %s --check-prefixes=32,XCHG
+; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=64
+; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-unknown-linux-code16 < %s | FileCheck %s --check-prefix=16
+
+; 16-NOT: movl   %edi, %edi
+; 16-NOT: xchgw   %ax, %ax
 
 declare void @callee(i64*)
 
@@ -10,6 +19,18 @@ define void @f0() "patchable-function"="prologue-short-redirect" {
 ; CHECK-ALIGN: 	.p2align	4, 0x90
 ; CHECK-ALIGN: _f0:
 
+; 32: f0:
+; 32CFI-NEXT: .cfi_startproc
+; 32-NEXT: # %bb.0:
+; XCHG-NEXT: xchgw   %ax, %ax                # encoding: [0x66,0x90]
+; MOV-NEXT: movl    %edi, %edi              # encoding: [0x8b,0xff]
+; 32-NEXT: retl
+
+; 64: f0:
+; 64-NEXT: # %bb.0:
+; 64-NEXT: xchgw   %ax, %ax                # encoding: [0x66,0x90]
+; 64-NEXT: retq
+		
   ret void
 }
 
@@ -19,6 +40,19 @@ define void @f1() "patchable-function"="prologue-short-redirect" "frame-pointer"
 
 ; CHECK-ALIGN: 	.p2align	4, 0x90
 ; CHECK-ALIGN: _f1:
+
+; 32: f1:
+; 32CFI-NEXT: .cfi_startproc
+; 32-NEXT: # %bb.0:
+; XCHG-NEXT: xchgw   %ax, %ax                # encoding: [0x66,0x90]
+; MOV-NEXT: movl    %edi, %edi              # encoding: [0x8b,0xff]
+; 32-NEXT: pushl   %ebp
+
+; 64: f1:
+; 64-NEXT: .seh_proc f1
+; 64-NEXT: # %bb.0:
+; 64-NEXT: pushq   %rbp
+		
   ret void
 }
 
@@ -28,6 +62,19 @@ define void @f2() "patchable-function"="prologue-short-redirect" {
 
 ; CHECK-ALIGN: 	.p2align	4, 0x90
 ; CHECK-ALIGN: _f2:
+
+; 32: f2:
+; 32CFI-NEXT: .cfi_startproc
+; 32-NEXT: # %bb.0:
+; XCHG-NEXT: xchgw   %ax, %ax                # encoding: [0x66,0x90]
+; MOV-NEXT: movl    %edi, %edi              # encoding: [0x8b,0xff]
+; 32-NEXT: pushl   %ebp
+
+; 64: f2:
+; 64-NEXT: .seh_proc f2
+; 64-NEXT: # %bb.0:
+; 64-NEXT: subq    $200, %rsp
+		
   %ptr = alloca i64, i32 20
   call void @callee(i64* %ptr)
   ret void
@@ -39,6 +86,19 @@ define void @f3() "patchable-function"="prologue-short-redirect" optsize {
 
 ; CHECK-ALIGN: 	.p2align	4, 0x90
 ; CHECK-ALIGN: _f3:
+
+; 32: f3:
+; 32CFI-NEXT: .cfi_startproc
+; 32-NEXT: # %bb.0:
+; XCHG-NEXT: xchgw   %ax, %ax
+; MOV-NEXT: movl   %edi, %edi
+; 32-NEXT: retl
+
+; 64: f3:
+; 64-NEXT: # %bb.0:
+; 64-NEXT: xchgw   %ax, %ax
+; 64-NEXT: retq
+
   ret void
 }
 
@@ -47,6 +107,17 @@ define void @f3() "patchable-function"="prologue-short-redirect" optsize {
 ; patchable one.
 ; CHECK-LABEL: f4{{>?}}:
 ; CHECK-NEXT: 8b 0c 37  movl  (%rdi,%rsi), %ecx
+; 32: f4:
+; 32CFI-NEXT: .cfi_startproc
+; 32-NEXT: # %bb.0:
+; XCHG-NEXT: xchgw   %ax, %ax
+; MOV-NEXT: movl   %edi, %edi
+; 32-NEXT: pushl   %ebx
+
+; 64: f4:
+; 64-NEXT: # %bb.0:
+; 64-NOT: xchgw   %ax, %ax
+
 define i32 @f4(i8* %arg1, i64 %arg2, i32 %arg3) "patchable-function"="prologue-short-redirect" {
 bb:
   %tmp10 = getelementptr i8, i8* %arg1, i64 %arg2