[llvm] [Clang][inlineasm] Add special support for "rm" output constraints (PR #92040)
Bill Wendling via llvm-commits
llvm-commits at lists.llvm.org
Wed May 29 17:25:47 PDT 2024
https://github.com/bwendling updated https://github.com/llvm/llvm-project/pull/92040
>From 94e01760b8363ad59a860c9c036918e670cc3783 Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Mon, 29 Apr 2024 14:40:54 -0700
Subject: [PATCH] [Clang][inlineasm] Add special support for "rm" output
constraints
Clang isn't able to support multiple constraints on inputs and outputs.
Instead, it picks the "safest" one to use, i.e. the most conservative. In
the case of "rm" it picks the memory constraint. This leads to obviously
horrible code:
asm __volatile__ ("pushf\n\t"
"popq %0"
: "=rm" (x));
is converted to:
#APP
pushf
popq -8(%rsp)
#NO_APP
movq -8(%rsp), %rax
Blech!
This hack^Wchange makes a special exception for "rm" to use "r" if at
all possible. The "RegMayBeFolded" flag is then used by the register
allocators to allow for the old behavior if register pressure is too
great.
Fixes: https://github.com/llvm/llvm-project/issues/20571
Cc: Nick Desaulniers <ndesaulniers at google.com>
Cc: Kees Cook <keescook at google.com>
Cc: llvm at lists.linux.dev
---
llvm/include/llvm/CodeGen/TargetLowering.h | 5 +
llvm/include/llvm/CodeGen/TargetPassConfig.h | 2 +
.../SelectionDAG/SelectionDAGBuilder.cpp | 25 +-
.../SelectionDAG/SelectionDAGBuilder.h | 5 +-
.../CodeGen/SelectionDAG/TargetLowering.cpp | 30 +-
llvm/lib/CodeGen/TargetPassConfig.cpp | 6 +
llvm/test/CodeGen/X86/asm-constraints-rm.ll | 363 ++++++++++++++++++
llvm/test/CodeGen/X86/inlineasm-sched-bug.ll | 5 +-
8 files changed, 424 insertions(+), 17 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/asm-constraints-rm.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 50a8c7eb75af5..ff321f6aa0f62 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4939,6 +4939,11 @@ class TargetLowering : public TargetLoweringBase {
/// Memory, Other, Unknown.
TargetLowering::ConstraintType ConstraintType = TargetLowering::C_Unknown;
+ /// The register may be folded. This is used if the constraint is "rm",
+ /// where we prefer using a register, but can fall back to a memory slot
+ /// under register pressure.
+ bool MayFoldRegister = false;
+
/// If this is the result output operand or a clobber, this is null,
/// otherwise it is the incoming operand to the CallInst. This gets
/// modified as the asm is processed.
diff --git a/llvm/include/llvm/CodeGen/TargetPassConfig.h b/llvm/include/llvm/CodeGen/TargetPassConfig.h
index d00e0bed91a45..c1f4199536409 100644
--- a/llvm/include/llvm/CodeGen/TargetPassConfig.h
+++ b/llvm/include/llvm/CodeGen/TargetPassConfig.h
@@ -496,6 +496,8 @@ class TargetPassConfig : public ImmutablePass {
void registerCodeGenCallback(PassInstrumentationCallbacks &PIC,
LLVMTargetMachine &);
+bool usesGreedyOrDefaultRegisterAllocator();
+
} // end namespace llvm
#endif // LLVM_CODEGEN_TARGETPASSCONFIG_H
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ca352da5d36eb..7bc03becf1a5a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1008,7 +1008,8 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
}
void RegsForValue::AddInlineAsmOperands(InlineAsm::Kind Code, bool HasMatching,
- unsigned MatchingIdx, const SDLoc &dl,
+ unsigned MatchingIdx,
+ bool MayFoldRegister, const SDLoc &dl,
SelectionDAG &DAG,
std::vector<SDValue> &Ops) const {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -1024,7 +1025,9 @@ void RegsForValue::AddInlineAsmOperands(InlineAsm::Kind Code, bool HasMatching,
// from the def.
const MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
const TargetRegisterClass *RC = MRI.getRegClass(Regs.front());
+
Flag.setRegClass(RC->getID());
+ Flag.setRegMayBeFolded(MayFoldRegister);
}
SDValue Res = DAG.getTargetConstant(Flag, dl, MVT::i32);
@@ -9775,8 +9778,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
AsmNodeOperands.push_back(OpInfo.CallOperand);
} else {
// Otherwise, this outputs to a register (directly for C_Register /
- // C_RegisterClass, and a target-defined fashion for
- // C_Immediate/C_Other). Find a register that we can use.
+ // C_RegisterClass, and a target-defined fashion for C_Immediate /
+ // C_Other). Find a register that we can use.
if (OpInfo.AssignedRegs.Regs.empty()) {
emitInlineAsmError(
Call, "couldn't allocate output register for constraint '" +
@@ -9792,7 +9795,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
OpInfo.AssignedRegs.AddInlineAsmOperands(
OpInfo.isEarlyClobber ? InlineAsm::Kind::RegDefEarlyClobber
: InlineAsm::Kind::RegDef,
- false, 0, getCurSDLoc(), DAG, AsmNodeOperands);
+ false, 0, OpInfo.MayFoldRegister, getCurSDLoc(), DAG,
+ AsmNodeOperands);
}
break;
@@ -9834,9 +9838,9 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
SDLoc dl = getCurSDLoc();
// Use the produced MatchedRegs object to
MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Glue, &Call);
- MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind::RegUse, true,
- OpInfo.getMatchedOperand(), dl, DAG,
- AsmNodeOperands);
+ MatchedRegs.AddInlineAsmOperands(
+ InlineAsm::Kind::RegUse, true, OpInfo.getMatchedOperand(),
+ OpInfo.MayFoldRegister, dl, DAG, AsmNodeOperands);
break;
}
@@ -9965,7 +9969,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
&Call);
OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind::RegUse, false,
- 0, dl, DAG, AsmNodeOperands);
+ 0, OpInfo.MayFoldRegister, dl,
+ DAG, AsmNodeOperands);
break;
}
case InlineAsm::isClobber:
@@ -9973,8 +9978,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
// allocator is aware that the physreg got clobbered.
if (!OpInfo.AssignedRegs.Regs.empty())
OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind::Clobber,
- false, 0, getCurSDLoc(), DAG,
- AsmNodeOperands);
+ false, 0, false, getCurSDLoc(),
+ DAG, AsmNodeOperands);
break;
}
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index ae361f8c500a0..daf9cfbbe1279 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -783,8 +783,9 @@ struct RegsForValue {
/// code marker, matching input operand index (if applicable), and includes
/// the number of values added into it.
void AddInlineAsmOperands(InlineAsm::Kind Code, bool HasMatching,
- unsigned MatchingIdx, const SDLoc &dl,
- SelectionDAG &DAG, std::vector<SDValue> &Ops) const;
+ unsigned MatchingIdx, bool MayFoldRegister,
+ const SDLoc &dl, SelectionDAG &DAG,
+ std::vector<SDValue> &Ops) const;
/// Check if the total RegCount is greater than one.
bool occupiesMultipleRegs() const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 7beaeb9b7a171..cadb609ec72f5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -33,6 +34,7 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/TargetParser/Triple.h"
#include <cctype>
using namespace llvm;
@@ -5668,6 +5670,7 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
unsigned ResNo = 0; // ResNo - The result number of the next output.
unsigned LabelNo = 0; // LabelNo - CallBr indirect dest number.
+ const Triple &T = getTargetMachine().getTargetTriple();
for (InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
ConstraintOperands.emplace_back(std::move(CI));
AsmOperandInfo &OpInfo = ConstraintOperands.back();
@@ -5678,6 +5681,16 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
OpInfo.ConstraintVT = MVT::Other;
+ // Special treatment for all platforms (currently only x86) that can fold a
+ // register into a spill. This is used for the "rm" constraint, where we
+ // would vastly prefer to use 'r' over 'm', but can't because LLVM's
+ // architecture picks the most "conservative" constraint to ensure that (in
+ // the case of "rm") register pressure doesn't cause bad things to happen.
+ if (T.isX86() && !OpInfo.hasMatchingInput() && OpInfo.Codes.size() == 2 &&
+ llvm::is_contained(OpInfo.Codes, "r") &&
+ llvm::is_contained(OpInfo.Codes, "m"))
+ OpInfo.MayFoldRegister = true;
+
// Compute the value type for each operand.
switch (OpInfo.Type) {
case InlineAsm::isOutput:
@@ -5954,7 +5967,12 @@ TargetLowering::ConstraintWeight
/// 1) If there is an 'other' constraint, and if the operand is valid for
/// that constraint, use it. This makes us take advantage of 'i'
/// constraints when available.
-/// 2) Otherwise, pick the most general constraint present. This prefers
+/// 2) Special processing is done for the "rm" constraint. If specified, we
+/// opt for the 'r' constraint, but mark the operand as being "foldable."
+/// In the face of register exhaustion, the register allocator is free to
+/// choose to use a stack slot. This only applies to the greedy and default
+/// register allocators. FIXME: Support other allocators (fast?).
+/// 3) Otherwise, pick the most general constraint present. This prefers
/// 'm' over 'r', for example.
///
TargetLowering::ConstraintGroup TargetLowering::getConstraintPreferences(
@@ -5962,6 +5980,16 @@ TargetLowering::ConstraintGroup TargetLowering::getConstraintPreferences(
ConstraintGroup Ret;
Ret.reserve(OpInfo.Codes.size());
+
+ // If we can fold the register (i.e. it has an "rm" constraint), opt for the
+ // 'r' constraint, and allow the register allocator to spill if need be.
+ // Applies only to the greedy and default register allocators.
+ if (OpInfo.MayFoldRegister && usesGreedyOrDefaultRegisterAllocator()) {
+ Ret.emplace_back(ConstraintPair("r", getConstraintType("r")));
+ Ret.emplace_back(ConstraintPair("m", getConstraintType("m")));
+ return Ret;
+ }
+
for (StringRef Code : OpInfo.Codes) {
TargetLowering::ConstraintType CType = getConstraintType(Code);
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 8832b51333d91..b768cde55d79f 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1077,6 +1077,12 @@ static cl::opt<RegisterRegAlloc::FunctionPassCtor, false,
RegAlloc("regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
cl::desc("Register allocator to use"));
+bool llvm::usesGreedyOrDefaultRegisterAllocator() {
+ return RegAlloc == (RegisterRegAlloc::
+ FunctionPassCtor)&createGreedyRegisterAllocator ||
+ RegAlloc == &useDefaultRegisterAllocator;
+}
+
/// Add the complete set of target-independent postISel code generator passes.
///
/// This can be read as the standard order of major LLVM CodeGen stages. Stages
diff --git a/llvm/test/CodeGen/X86/asm-constraints-rm.ll b/llvm/test/CodeGen/X86/asm-constraints-rm.ll
new file mode 100644
index 0000000000000..f718f6b26abb3
--- /dev/null
+++ b/llvm/test/CodeGen/X86/asm-constraints-rm.ll
@@ -0,0 +1,363 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter "^\t#" --version 4
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=greedy < %s | FileCheck --check-prefix=GREEDY-X86_64 %s
+; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=greedy < %s | FileCheck --check-prefix=GREEDY-I386 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=basic < %s | FileCheck --check-prefix=BASIC-X86_64 %s
+; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=basic < %s | FileCheck --check-prefix=BASIC-I386 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=fast < %s | FileCheck --check-prefix=FAST-X86_64 %s
+; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=fast < %s | FileCheck --check-prefix=FAST-I386 %s
+
+; The Greedy register allocator should use registers when there isn't register
+; pressure.
+
+define dso_local i32 @test1(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test1:
+; GREEDY-X86_64: #APP
+; GREEDY-X86_64: # 'rm' input no pressure -> %eax %ecx
+; GREEDY-X86_64: #NO_APP
+;
+; GREEDY-I386-LABEL: test1:
+; GREEDY-I386: #APP
+; GREEDY-I386: # 'rm' input no pressure -> %ecx %edx
+; GREEDY-I386: #NO_APP
+;
+; BASIC-X86_64-LABEL: test1:
+; BASIC-X86_64: #APP
+; BASIC-X86_64: # 'rm' input no pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; BASIC-X86_64: #NO_APP
+;
+; BASIC-I386-LABEL: test1:
+; BASIC-I386: #APP
+; BASIC-I386: # 'rm' input no pressure -> {{[0-9]+}}(%esp) (%esp)
+; BASIC-I386: #NO_APP
+;
+; FAST-X86_64-LABEL: test1:
+; FAST-X86_64: #APP
+; FAST-X86_64: # 'rm' input no pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; FAST-X86_64: #NO_APP
+;
+; FAST-I386-LABEL: test1:
+; FAST-I386: #APP
+; FAST-I386: # 'rm' input no pressure -> {{[0-9]+}}(%esp) (%esp)
+; FAST-I386: #NO_APP
+entry:
+ %b = getelementptr inbounds i8, ptr %ptr, i64 4
+ %0 = load i32, ptr %b, align 4
+ %d = getelementptr inbounds i8, ptr %ptr, i64 12
+ %1 = load i32, ptr %d, align 4
+ tail call void asm sideeffect "# 'rm' input no pressure -> $0 $1", "rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1) #1
+ %2 = load i32, ptr %ptr, align 4
+ ret i32 %2
+}
+
+define dso_local i32 @test2(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test2:
+; GREEDY-X86_64: #APP # 8-byte Folded Reload
+; GREEDY-X86_64: # 'rm' input pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; GREEDY-X86_64: #NO_APP
+;
+; GREEDY-I386-LABEL: test2:
+; GREEDY-I386: #APP # 8-byte Folded Reload
+; GREEDY-I386: # 'rm' input pressure -> {{[0-9]+}}(%esp) (%esp)
+; GREEDY-I386: #NO_APP
+;
+; BASIC-X86_64-LABEL: test2:
+; BASIC-X86_64: #APP
+; BASIC-X86_64: # 'rm' input pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; BASIC-X86_64: #NO_APP
+;
+; BASIC-I386-LABEL: test2:
+; BASIC-I386: #APP
+; BASIC-I386: # 'rm' input pressure -> {{[0-9]+}}(%esp) (%esp)
+; BASIC-I386: #NO_APP
+;
+; FAST-X86_64-LABEL: test2:
+; FAST-X86_64: #APP
+; FAST-X86_64: # 'rm' input pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; FAST-X86_64: #NO_APP
+;
+; FAST-I386-LABEL: test2:
+; FAST-I386: #APP
+; FAST-I386: # 'rm' input pressure -> {{[0-9]+}}(%esp) {{[0-9]+}}(%esp)
+; FAST-I386: #NO_APP
+entry:
+ %b = getelementptr inbounds i8, ptr %ptr, i64 4
+ %0 = load i32, ptr %b, align 4
+ %d = getelementptr inbounds i8, ptr %ptr, i64 12
+ %1 = load i32, ptr %d, align 4
+ tail call void asm sideeffect "# 'rm' input pressure -> $0 $1", "rm,rm,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1) #1
+ %2 = load i32, ptr %ptr, align 4
+ ret i32 %2
+}
+
+define dso_local i32 @test3(ptr noundef %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test3:
+; GREEDY-X86_64: #APP
+; GREEDY-X86_64: # 'rm' output no pressure -> %eax %ecx
+; GREEDY-X86_64: #NO_APP
+;
+; GREEDY-I386-LABEL: test3:
+; GREEDY-I386: #APP
+; GREEDY-I386: # 'rm' output no pressure -> %ecx %edx
+; GREEDY-I386: #NO_APP
+;
+; BASIC-X86_64-LABEL: test3:
+; BASIC-X86_64: #APP
+; BASIC-X86_64: # 'rm' output no pressure -> 4(%rdi) 12(%rdi)
+; BASIC-X86_64: #NO_APP
+;
+; BASIC-I386-LABEL: test3:
+; BASIC-I386: #APP
+; BASIC-I386: # 'rm' output no pressure -> 4(%eax) 12(%eax)
+; BASIC-I386: #NO_APP
+;
+; FAST-X86_64-LABEL: test3:
+; FAST-X86_64: #APP
+; FAST-X86_64: # 'rm' output no pressure -> 4(%rdi) 12(%rdi)
+; FAST-X86_64: #NO_APP
+;
+; FAST-I386-LABEL: test3:
+; FAST-I386: #APP
+; FAST-I386: # 'rm' output no pressure -> 4(%eax) 12(%eax)
+; FAST-I386: #NO_APP
+entry:
+ %b = getelementptr inbounds i8, ptr %ptr, i64 4
+ %d = getelementptr inbounds i8, ptr %ptr, i64 12
+ tail call void asm sideeffect "# 'rm' output no pressure -> $0 $1", "=*rm,=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d) #1
+ %0 = load i32, ptr %ptr, align 4
+ ret i32 %0
+}
+
+define dso_local i32 @test4(ptr noundef %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test4:
+; GREEDY-X86_64: #APP
+; GREEDY-X86_64: # tied 'rm' no pressure -> %eax %ecx %eax %ecx
+; GREEDY-X86_64: #NO_APP
+;
+; GREEDY-I386-LABEL: test4:
+; GREEDY-I386: #APP
+; GREEDY-I386: # tied 'rm' no pressure -> %ecx %edx %ecx %edx
+; GREEDY-I386: #NO_APP
+;
+; BASIC-X86_64-LABEL: test4:
+; BASIC-X86_64: #APP
+; BASIC-X86_64: # tied 'rm' no pressure -> %eax %ecx %eax %ecx
+; BASIC-X86_64: #NO_APP
+;
+; BASIC-I386-LABEL: test4:
+; BASIC-I386: #APP
+; BASIC-I386: # tied 'rm' no pressure -> %eax %ecx %eax %ecx
+; BASIC-I386: #NO_APP
+;
+; FAST-X86_64-LABEL: test4:
+; FAST-X86_64: #APP
+; FAST-X86_64: # tied 'rm' no pressure -> %ecx %eax %ecx %eax
+; FAST-X86_64: #NO_APP
+;
+; FAST-I386-LABEL: test4:
+; FAST-I386: #APP
+; FAST-I386: # tied 'rm' no pressure -> %edx %ecx %edx %ecx
+; FAST-I386: #NO_APP
+entry:
+ %b = getelementptr inbounds i8, ptr %ptr, i64 4
+ %0 = load i32, ptr %b, align 4
+ %d = getelementptr inbounds i8, ptr %ptr, i64 12
+ %1 = load i32, ptr %d, align 4
+ tail call void asm sideeffect "# tied 'rm' no pressure -> $0 $1 $2 $3", "=*rm,=*rm,0,1,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d, i32 %0, i32 %1) #1
+ %2 = load i32, ptr %ptr, align 4
+ ret i32 %2
+}
+
+define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test5:
+; GREEDY-X86_64: #APP
+; GREEDY-X86_64: # 'rm' input -> %eax
+; GREEDY-X86_64: #NO_APP
+;
+; GREEDY-I386-LABEL: test5:
+; GREEDY-I386: #APP
+; GREEDY-I386: # 'rm' input -> %ecx
+; GREEDY-I386: #NO_APP
+;
+; BASIC-X86_64-LABEL: test5:
+; BASIC-X86_64: #APP
+; BASIC-X86_64: # 'rm' input -> -{{[0-9]+}}(%rsp)
+; BASIC-X86_64: #NO_APP
+;
+; BASIC-I386-LABEL: test5:
+; BASIC-I386: #APP
+; BASIC-I386: # 'rm' input -> (%esp)
+; BASIC-I386: #NO_APP
+;
+; FAST-X86_64-LABEL: test5:
+; FAST-X86_64: #APP
+; FAST-X86_64: # 'rm' input -> -{{[0-9]+}}(%rsp)
+; FAST-X86_64: #NO_APP
+;
+; FAST-I386-LABEL: test5:
+; FAST-I386: #APP
+; FAST-I386: # 'rm' input -> (%esp)
+; FAST-I386: #NO_APP
+entry:
+ %b = getelementptr inbounds i8, ptr %ptr, i64 4
+ %0 = load i32, ptr %b, align 4
+ tail call void asm sideeffect "# 'rm' input -> $0", "rm,~{dirflag},~{fpsr},~{flags}"(i32 %0) #1
+ %1 = load i32, ptr %ptr, align 4
+ ret i32 %1
+}
+
+define dso_local i32 @test6(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test6:
+; GREEDY-X86_64: #APP
+; GREEDY-X86_64: # 'rm' and 'r' input -> %eax %ecx
+; GREEDY-X86_64: #NO_APP
+;
+; GREEDY-I386-LABEL: test6:
+; GREEDY-I386: #APP
+; GREEDY-I386: # 'rm' and 'r' input -> %ecx %edx
+; GREEDY-I386: #NO_APP
+;
+; BASIC-X86_64-LABEL: test6:
+; BASIC-X86_64: #APP
+; BASIC-X86_64: # 'rm' and 'r' input -> -{{[0-9]+}}(%rsp) %ecx
+; BASIC-X86_64: #NO_APP
+;
+; BASIC-I386-LABEL: test6:
+; BASIC-I386: #APP
+; BASIC-I386: # 'rm' and 'r' input -> (%esp) %ecx
+; BASIC-I386: #NO_APP
+;
+; FAST-X86_64-LABEL: test6:
+; FAST-X86_64: #APP
+; FAST-X86_64: # 'rm' and 'r' input -> -{{[0-9]+}}(%rsp) %eax
+; FAST-X86_64: #NO_APP
+;
+; FAST-I386-LABEL: test6:
+; FAST-I386: #APP
+; FAST-I386: # 'rm' and 'r' input -> (%esp) %ecx
+; FAST-I386: #NO_APP
+entry:
+ %b = getelementptr inbounds i8, ptr %ptr, i64 4
+ %0 = load i32, ptr %b, align 4
+ %d = getelementptr inbounds i8, ptr %ptr, i64 12
+ %1 = load i32, ptr %d, align 4
+ tail call void asm sideeffect "# 'rm' and 'r' input -> $0 $1", "rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1) #1
+ %2 = load i32, ptr %ptr, align 4
+ ret i32 %2
+}
+
+define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test7:
+; GREEDY-X86_64: #APP
+; GREEDY-X86_64: # 'rm' output -> %eax
+; GREEDY-X86_64: #NO_APP
+;
+; GREEDY-I386-LABEL: test7:
+; GREEDY-I386: #APP
+; GREEDY-I386: # 'rm' output -> %ecx
+; GREEDY-I386: #NO_APP
+;
+; BASIC-X86_64-LABEL: test7:
+; BASIC-X86_64: #APP
+; BASIC-X86_64: # 'rm' output -> 4(%rdi)
+; BASIC-X86_64: #NO_APP
+;
+; BASIC-I386-LABEL: test7:
+; BASIC-I386: #APP
+; BASIC-I386: # 'rm' output -> 4(%eax)
+; BASIC-I386: #NO_APP
+;
+; FAST-X86_64-LABEL: test7:
+; FAST-X86_64: #APP
+; FAST-X86_64: # 'rm' output -> 4(%rdi)
+; FAST-X86_64: #NO_APP
+;
+; FAST-I386-LABEL: test7:
+; FAST-I386: #APP
+; FAST-I386: # 'rm' output -> 4(%eax)
+; FAST-I386: #NO_APP
+entry:
+ %b = getelementptr inbounds i8, ptr %ptr, i64 4
+ tail call void asm sideeffect "# 'rm' output -> $0", "=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b) #1
+ %0 = load i32, ptr %ptr, align 4
+ ret i32 %0
+}
+
+define dso_local i32 @test8(ptr noundef %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test8:
+; GREEDY-X86_64: #APP
+; GREEDY-X86_64: # 'rm' tied -> %eax
+; GREEDY-X86_64: #NO_APP
+;
+; GREEDY-I386-LABEL: test8:
+; GREEDY-I386: #APP
+; GREEDY-I386: # 'rm' tied -> %ecx
+; GREEDY-I386: #NO_APP
+;
+; BASIC-X86_64-LABEL: test8:
+; BASIC-X86_64: #APP
+; BASIC-X86_64: # 'rm' tied -> %eax
+; BASIC-X86_64: #NO_APP
+;
+; BASIC-I386-LABEL: test8:
+; BASIC-I386: #APP
+; BASIC-I386: # 'rm' tied -> %eax
+; BASIC-I386: #NO_APP
+;
+; FAST-X86_64-LABEL: test8:
+; FAST-X86_64: #APP
+; FAST-X86_64: # 'rm' tied -> %eax
+; FAST-X86_64: #NO_APP
+;
+; FAST-I386-LABEL: test8:
+; FAST-I386: #APP
+; FAST-I386: # 'rm' tied -> %ecx
+; FAST-I386: #NO_APP
+entry:
+ %b = getelementptr inbounds i8, ptr %ptr, i64 4
+ %0 = load i32, ptr %b, align 4
+ tail call void asm sideeffect "# 'rm' tied -> $0", "=*rm,0,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, i32 %0) #1
+ %1 = load i32, ptr %ptr, align 4
+ ret i32 %1
+}
+
+define dso_local i32 @test9(ptr nocapture noundef %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test9:
+; GREEDY-X86_64: #APP
+; GREEDY-X86_64: # 'r' output == input location -> %eax
+; GREEDY-X86_64: #NO_APP
+;
+; GREEDY-I386-LABEL: test9:
+; GREEDY-I386: #APP
+; GREEDY-I386: # 'r' output == input location -> %ecx
+; GREEDY-I386: #NO_APP
+;
+; BASIC-X86_64-LABEL: test9:
+; BASIC-X86_64: #APP
+; BASIC-X86_64: # 'r' output == input location -> %eax
+; BASIC-X86_64: #NO_APP
+;
+; BASIC-I386-LABEL: test9:
+; BASIC-I386: #APP
+; BASIC-I386: # 'r' output == input location -> %eax
+; BASIC-I386: #NO_APP
+;
+; FAST-X86_64-LABEL: test9:
+; FAST-X86_64: #APP
+; FAST-X86_64: # 'r' output == input location -> %eax
+; FAST-X86_64: #NO_APP
+;
+; FAST-I386-LABEL: test9:
+; FAST-I386: #APP
+; FAST-I386: # 'r' output == input location -> %ecx
+; FAST-I386: #NO_APP
+entry:
+ %b = getelementptr inbounds i8, ptr %ptr, i64 4
+ %0 = load i32, ptr %b, align 4
+ %1 = tail call i32 asm sideeffect "# 'r' output == input location -> $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %0) #1
+ store i32 %1, ptr %b, align 4
+ %2 = load i32, ptr %ptr, align 4
+ ret i32 %2
+}
+
+attributes #0 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/inlineasm-sched-bug.ll b/llvm/test/CodeGen/X86/inlineasm-sched-bug.ll
index be4d1c29332f7..a322bd3003a58 100644
--- a/llvm/test/CodeGen/X86/inlineasm-sched-bug.ll
+++ b/llvm/test/CodeGen/X86/inlineasm-sched-bug.ll
@@ -6,16 +6,13 @@
define i32 @foo(i32 %treemap) nounwind {
; CHECK-LABEL: foo:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pushl %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: negl %ecx
; CHECK-NEXT: andl %eax, %ecx
-; CHECK-NEXT: movl %ecx, (%esp)
; CHECK-NEXT: #APP
-; CHECK-NEXT: bsfl (%esp), %eax
+; CHECK-NEXT: bsfl %ecx, %eax
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: popl %ecx
; CHECK-NEXT: retl
entry:
%sub = sub i32 0, %treemap
More information about the llvm-commits
mailing list