[clang] [llvm] [Clang][inlineasm] Add special support for "rm" output constraints (PR #92040)

Bill Wendling via cfe-commits cfe-commits at lists.llvm.org
Sat Feb 7 04:57:45 PST 2026


https://github.com/bwendling updated https://github.com/llvm/llvm-project/pull/92040

>From 94e01760b8363ad59a860c9c036918e670cc3783 Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Mon, 29 Apr 2024 14:40:54 -0700
Subject: [PATCH 01/29] [Clang][inlineasm] Add special support for "rm" output
 constraints

Clang isn't able to support multiple constraints on inputs and outputs.
Instead, it picks the "safest" one to use, i.e. the most conservative. In
the case of "rm" it picks the memory constraint. This leads to obviously
horrible code:

  asm __volatile__ ("pushf\n\t"
                    "popq %0"
                    : "=rm" (x));

is converted to:

	#APP
        pushf
	popq -8(%rsp)
	#NO_APP
	movq	-8(%rsp), %rax

Blech!

This hack^Wchange makes a special exception for "rm" to use "r" if at
all possible. The "RegMayBeFolded" flag is then used by the register
allocators to allow for the old behavior if register pressure is too
great.

Fixes: https://github.com/llvm/llvm-project/issues/20571

Cc: Nick Desaulniers <ndesaulniers at google.com>
Cc: Kees Cook <keescook at google.com>
Cc: llvm at lists.linux.dev
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |   5 +
 llvm/include/llvm/CodeGen/TargetPassConfig.h  |   2 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  25 +-
 .../SelectionDAG/SelectionDAGBuilder.h        |   5 +-
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  30 +-
 llvm/lib/CodeGen/TargetPassConfig.cpp         |   6 +
 llvm/test/CodeGen/X86/asm-constraints-rm.ll   | 363 ++++++++++++++++++
 llvm/test/CodeGen/X86/inlineasm-sched-bug.ll  |   5 +-
 8 files changed, 424 insertions(+), 17 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/asm-constraints-rm.ll

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 50a8c7eb75af5..ff321f6aa0f62 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4939,6 +4939,11 @@ class TargetLowering : public TargetLoweringBase {
     /// Memory, Other, Unknown.
     TargetLowering::ConstraintType ConstraintType = TargetLowering::C_Unknown;
 
+    /// The register may be folded. This is used if the constraint is "rm",
+    /// where we prefer using a register, but can fall back to a memory slot
+    /// under register pressure.
+    bool MayFoldRegister = false;
+
     /// If this is the result output operand or a clobber, this is null,
     /// otherwise it is the incoming operand to the CallInst.  This gets
     /// modified as the asm is processed.
diff --git a/llvm/include/llvm/CodeGen/TargetPassConfig.h b/llvm/include/llvm/CodeGen/TargetPassConfig.h
index d00e0bed91a45..c1f4199536409 100644
--- a/llvm/include/llvm/CodeGen/TargetPassConfig.h
+++ b/llvm/include/llvm/CodeGen/TargetPassConfig.h
@@ -496,6 +496,8 @@ class TargetPassConfig : public ImmutablePass {
 void registerCodeGenCallback(PassInstrumentationCallbacks &PIC,
                              LLVMTargetMachine &);
 
+bool usesGreedyOrDefaultRegisterAllocator();
+
 } // end namespace llvm
 
 #endif // LLVM_CODEGEN_TARGETPASSCONFIG_H
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ca352da5d36eb..7bc03becf1a5a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1008,7 +1008,8 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
 }
 
 void RegsForValue::AddInlineAsmOperands(InlineAsm::Kind Code, bool HasMatching,
-                                        unsigned MatchingIdx, const SDLoc &dl,
+                                        unsigned MatchingIdx,
+                                        bool MayFoldRegister, const SDLoc &dl,
                                         SelectionDAG &DAG,
                                         std::vector<SDValue> &Ops) const {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -1024,7 +1025,9 @@ void RegsForValue::AddInlineAsmOperands(InlineAsm::Kind Code, bool HasMatching,
     // from the def.
     const MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
     const TargetRegisterClass *RC = MRI.getRegClass(Regs.front());
+
     Flag.setRegClass(RC->getID());
+    Flag.setRegMayBeFolded(MayFoldRegister);
   }
 
   SDValue Res = DAG.getTargetConstant(Flag, dl, MVT::i32);
@@ -9775,8 +9778,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
         AsmNodeOperands.push_back(OpInfo.CallOperand);
       } else {
         // Otherwise, this outputs to a register (directly for C_Register /
-        // C_RegisterClass, and a target-defined fashion for
-        // C_Immediate/C_Other). Find a register that we can use.
+        // C_RegisterClass, and a target-defined fashion for C_Immediate /
+        // C_Other). Find a register that we can use.
         if (OpInfo.AssignedRegs.Regs.empty()) {
           emitInlineAsmError(
               Call, "couldn't allocate output register for constraint '" +
@@ -9792,7 +9795,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
         OpInfo.AssignedRegs.AddInlineAsmOperands(
             OpInfo.isEarlyClobber ? InlineAsm::Kind::RegDefEarlyClobber
                                   : InlineAsm::Kind::RegDef,
-            false, 0, getCurSDLoc(), DAG, AsmNodeOperands);
+            false, 0, OpInfo.MayFoldRegister, getCurSDLoc(), DAG,
+            AsmNodeOperands);
       }
       break;
 
@@ -9834,9 +9838,9 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
           SDLoc dl = getCurSDLoc();
           // Use the produced MatchedRegs object to
           MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Glue, &Call);
-          MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind::RegUse, true,
-                                           OpInfo.getMatchedOperand(), dl, DAG,
-                                           AsmNodeOperands);
+          MatchedRegs.AddInlineAsmOperands(
+              InlineAsm::Kind::RegUse, true, OpInfo.getMatchedOperand(),
+              OpInfo.MayFoldRegister, dl, DAG, AsmNodeOperands);
           break;
         }
 
@@ -9965,7 +9969,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
                                         &Call);
 
       OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind::RegUse, false,
-                                               0, dl, DAG, AsmNodeOperands);
+                                               0, OpInfo.MayFoldRegister, dl,
+                                               DAG, AsmNodeOperands);
       break;
     }
     case InlineAsm::isClobber:
@@ -9973,8 +9978,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
       // allocator is aware that the physreg got clobbered.
       if (!OpInfo.AssignedRegs.Regs.empty())
         OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind::Clobber,
-                                                 false, 0, getCurSDLoc(), DAG,
-                                                 AsmNodeOperands);
+                                                 false, 0, false, getCurSDLoc(),
+                                                 DAG, AsmNodeOperands);
       break;
     }
   }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index ae361f8c500a0..daf9cfbbe1279 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -783,8 +783,9 @@ struct RegsForValue {
   /// code marker, matching input operand index (if applicable), and includes
   /// the number of values added into it.
   void AddInlineAsmOperands(InlineAsm::Kind Code, bool HasMatching,
-                            unsigned MatchingIdx, const SDLoc &dl,
-                            SelectionDAG &DAG, std::vector<SDValue> &Ops) const;
+                            unsigned MatchingIdx, bool MayFoldRegister,
+                            const SDLoc &dl, SelectionDAG &DAG,
+                            std::vector<SDValue> &Ops) const;
 
   /// Check if the total RegCount is greater than one.
   bool occupiesMultipleRegs() const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 7beaeb9b7a171..cadb609ec72f5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -33,6 +34,7 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/TargetParser/Triple.h"
 #include <cctype>
 using namespace llvm;
 
@@ -5668,6 +5670,7 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
   unsigned ResNo = 0; // ResNo - The result number of the next output.
   unsigned LabelNo = 0; // LabelNo - CallBr indirect dest number.
 
+  const Triple &T = getTargetMachine().getTargetTriple();
   for (InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
     ConstraintOperands.emplace_back(std::move(CI));
     AsmOperandInfo &OpInfo = ConstraintOperands.back();
@@ -5678,6 +5681,16 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
 
     OpInfo.ConstraintVT = MVT::Other;
 
+    // Special treatment for all platforms (currently only x86) that can fold a
+    // register into a spill. This is used for the "rm" constraint, where we
+    // would vastly prefer to use 'r' over 'm', but can't because LLVM's
+    // architecture picks the most "conservative" constraint to ensure that (in
+    // the case of "rm") register pressure doesn't cause bad things to happen.
+    if (T.isX86() && !OpInfo.hasMatchingInput() && OpInfo.Codes.size() == 2 &&
+        llvm::is_contained(OpInfo.Codes, "r") &&
+        llvm::is_contained(OpInfo.Codes, "m"))
+      OpInfo.MayFoldRegister = true;
+
     // Compute the value type for each operand.
     switch (OpInfo.Type) {
     case InlineAsm::isOutput:
@@ -5954,7 +5967,12 @@ TargetLowering::ConstraintWeight
 ///  1) If there is an 'other' constraint, and if the operand is valid for
 ///     that constraint, use it.  This makes us take advantage of 'i'
 ///     constraints when available.
-///  2) Otherwise, pick the most general constraint present.  This prefers
+///  2) Special processing is done for the "rm" constraint. If specified, we
+///     opt for the 'r' constraint, but mark the operand as being "foldable."
+///     In the face of register exhaustion, the register allocator is free to
+///     choose to use a stack slot. This only applies to the greedy and default
+///     register allocators. FIXME: Support other allocators (fast?).
+///  3) Otherwise, pick the most general constraint present.  This prefers
 ///     'm' over 'r', for example.
 ///
 TargetLowering::ConstraintGroup TargetLowering::getConstraintPreferences(
@@ -5962,6 +5980,16 @@ TargetLowering::ConstraintGroup TargetLowering::getConstraintPreferences(
   ConstraintGroup Ret;
 
   Ret.reserve(OpInfo.Codes.size());
+
+  // If we can fold the register (i.e. it has an "rm" constraint), opt for the
+  // 'r' constraint, and allow the register allocator to spill if need be.
+  // Applies only to the greedy and default register allocators.
+  if (OpInfo.MayFoldRegister && usesGreedyOrDefaultRegisterAllocator()) {
+    Ret.emplace_back(ConstraintPair("r", getConstraintType("r")));
+    Ret.emplace_back(ConstraintPair("m", getConstraintType("m")));
+    return Ret;
+  }
+
   for (StringRef Code : OpInfo.Codes) {
     TargetLowering::ConstraintType CType = getConstraintType(Code);
 
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 8832b51333d91..b768cde55d79f 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1077,6 +1077,12 @@ static cl::opt<RegisterRegAlloc::FunctionPassCtor, false,
     RegAlloc("regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
              cl::desc("Register allocator to use"));
 
+bool llvm::usesGreedyOrDefaultRegisterAllocator() {
+  return RegAlloc == (RegisterRegAlloc::
+                          FunctionPassCtor)&createGreedyRegisterAllocator ||
+         RegAlloc == &useDefaultRegisterAllocator;
+}
+
 /// Add the complete set of target-independent postISel code generator passes.
 ///
 /// This can be read as the standard order of major LLVM CodeGen stages. Stages
diff --git a/llvm/test/CodeGen/X86/asm-constraints-rm.ll b/llvm/test/CodeGen/X86/asm-constraints-rm.ll
new file mode 100644
index 0000000000000..f718f6b26abb3
--- /dev/null
+++ b/llvm/test/CodeGen/X86/asm-constraints-rm.ll
@@ -0,0 +1,363 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter "^\t#" --version 4
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=greedy < %s | FileCheck --check-prefix=GREEDY-X86_64 %s
+; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=greedy   < %s | FileCheck --check-prefix=GREEDY-I386 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=basic  < %s | FileCheck --check-prefix=BASIC-X86_64 %s
+; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=basic    < %s | FileCheck --check-prefix=BASIC-I386 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=fast   < %s | FileCheck --check-prefix=FAST-X86_64 %s
+; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=fast     < %s | FileCheck --check-prefix=FAST-I386 %s
+
+; The Greedy register allocator should use registers when there isn't register
+; pressure.
+
+define dso_local i32 @test1(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test1:
+; GREEDY-X86_64:    #APP
+; GREEDY-X86_64:    # 'rm' input no pressure -> %eax %ecx
+; GREEDY-X86_64:    #NO_APP
+;
+; GREEDY-I386-LABEL: test1:
+; GREEDY-I386:    #APP
+; GREEDY-I386:    # 'rm' input no pressure -> %ecx %edx
+; GREEDY-I386:    #NO_APP
+;
+; BASIC-X86_64-LABEL: test1:
+; BASIC-X86_64:    #APP
+; BASIC-X86_64:    # 'rm' input no pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; BASIC-X86_64:    #NO_APP
+;
+; BASIC-I386-LABEL: test1:
+; BASIC-I386:    #APP
+; BASIC-I386:    # 'rm' input no pressure -> {{[0-9]+}}(%esp) (%esp)
+; BASIC-I386:    #NO_APP
+;
+; FAST-X86_64-LABEL: test1:
+; FAST-X86_64:    #APP
+; FAST-X86_64:    # 'rm' input no pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; FAST-X86_64:    #NO_APP
+;
+; FAST-I386-LABEL: test1:
+; FAST-I386:    #APP
+; FAST-I386:    # 'rm' input no pressure -> {{[0-9]+}}(%esp) (%esp)
+; FAST-I386:    #NO_APP
+entry:
+  %b = getelementptr inbounds i8, ptr %ptr, i64 4
+  %0 = load i32, ptr %b, align 4
+  %d = getelementptr inbounds i8, ptr %ptr, i64 12
+  %1 = load i32, ptr %d, align 4
+  tail call void asm sideeffect "# 'rm' input no pressure -> $0 $1", "rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1) #1
+  %2 = load i32, ptr %ptr, align 4
+  ret i32 %2
+}
+
+define dso_local i32 @test2(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test2:
+; GREEDY-X86_64:    #APP # 8-byte Folded Reload
+; GREEDY-X86_64:    # 'rm' input pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; GREEDY-X86_64:    #NO_APP
+;
+; GREEDY-I386-LABEL: test2:
+; GREEDY-I386:    #APP # 8-byte Folded Reload
+; GREEDY-I386:    # 'rm' input pressure -> {{[0-9]+}}(%esp) (%esp)
+; GREEDY-I386:    #NO_APP
+;
+; BASIC-X86_64-LABEL: test2:
+; BASIC-X86_64:    #APP
+; BASIC-X86_64:    # 'rm' input pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; BASIC-X86_64:    #NO_APP
+;
+; BASIC-I386-LABEL: test2:
+; BASIC-I386:    #APP
+; BASIC-I386:    # 'rm' input pressure -> {{[0-9]+}}(%esp) (%esp)
+; BASIC-I386:    #NO_APP
+;
+; FAST-X86_64-LABEL: test2:
+; FAST-X86_64:    #APP
+; FAST-X86_64:    # 'rm' input pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; FAST-X86_64:    #NO_APP
+;
+; FAST-I386-LABEL: test2:
+; FAST-I386:    #APP
+; FAST-I386:    # 'rm' input pressure -> {{[0-9]+}}(%esp) {{[0-9]+}}(%esp)
+; FAST-I386:    #NO_APP
+entry:
+  %b = getelementptr inbounds i8, ptr %ptr, i64 4
+  %0 = load i32, ptr %b, align 4
+  %d = getelementptr inbounds i8, ptr %ptr, i64 12
+  %1 = load i32, ptr %d, align 4
+  tail call void asm sideeffect "# 'rm' input pressure -> $0 $1", "rm,rm,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1) #1
+  %2 = load i32, ptr %ptr, align 4
+  ret i32 %2
+}
+
+define dso_local i32 @test3(ptr noundef %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test3:
+; GREEDY-X86_64:    #APP
+; GREEDY-X86_64:    # 'rm' output no pressure -> %eax %ecx
+; GREEDY-X86_64:    #NO_APP
+;
+; GREEDY-I386-LABEL: test3:
+; GREEDY-I386:    #APP
+; GREEDY-I386:    # 'rm' output no pressure -> %ecx %edx
+; GREEDY-I386:    #NO_APP
+;
+; BASIC-X86_64-LABEL: test3:
+; BASIC-X86_64:    #APP
+; BASIC-X86_64:    # 'rm' output no pressure -> 4(%rdi) 12(%rdi)
+; BASIC-X86_64:    #NO_APP
+;
+; BASIC-I386-LABEL: test3:
+; BASIC-I386:    #APP
+; BASIC-I386:    # 'rm' output no pressure -> 4(%eax) 12(%eax)
+; BASIC-I386:    #NO_APP
+;
+; FAST-X86_64-LABEL: test3:
+; FAST-X86_64:    #APP
+; FAST-X86_64:    # 'rm' output no pressure -> 4(%rdi) 12(%rdi)
+; FAST-X86_64:    #NO_APP
+;
+; FAST-I386-LABEL: test3:
+; FAST-I386:    #APP
+; FAST-I386:    # 'rm' output no pressure -> 4(%eax) 12(%eax)
+; FAST-I386:    #NO_APP
+entry:
+  %b = getelementptr inbounds i8, ptr %ptr, i64 4
+  %d = getelementptr inbounds i8, ptr %ptr, i64 12
+  tail call void asm sideeffect "# 'rm' output no pressure -> $0 $1", "=*rm,=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d) #1
+  %0 = load i32, ptr %ptr, align 4
+  ret i32 %0
+}
+
+define dso_local i32 @test4(ptr noundef %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test4:
+; GREEDY-X86_64:    #APP
+; GREEDY-X86_64:    # tied 'rm' no pressure -> %eax %ecx %eax %ecx
+; GREEDY-X86_64:    #NO_APP
+;
+; GREEDY-I386-LABEL: test4:
+; GREEDY-I386:    #APP
+; GREEDY-I386:    # tied 'rm' no pressure -> %ecx %edx %ecx %edx
+; GREEDY-I386:    #NO_APP
+;
+; BASIC-X86_64-LABEL: test4:
+; BASIC-X86_64:    #APP
+; BASIC-X86_64:    # tied 'rm' no pressure -> %eax %ecx %eax %ecx
+; BASIC-X86_64:    #NO_APP
+;
+; BASIC-I386-LABEL: test4:
+; BASIC-I386:    #APP
+; BASIC-I386:    # tied 'rm' no pressure -> %eax %ecx %eax %ecx
+; BASIC-I386:    #NO_APP
+;
+; FAST-X86_64-LABEL: test4:
+; FAST-X86_64:    #APP
+; FAST-X86_64:    # tied 'rm' no pressure -> %ecx %eax %ecx %eax
+; FAST-X86_64:    #NO_APP
+;
+; FAST-I386-LABEL: test4:
+; FAST-I386:    #APP
+; FAST-I386:    # tied 'rm' no pressure -> %edx %ecx %edx %ecx
+; FAST-I386:    #NO_APP
+entry:
+  %b = getelementptr inbounds i8, ptr %ptr, i64 4
+  %0 = load i32, ptr %b, align 4
+  %d = getelementptr inbounds i8, ptr %ptr, i64 12
+  %1 = load i32, ptr %d, align 4
+  tail call void asm sideeffect "# tied 'rm' no pressure -> $0 $1 $2 $3", "=*rm,=*rm,0,1,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d, i32 %0, i32 %1) #1
+  %2 = load i32, ptr %ptr, align 4
+  ret i32 %2
+}
+
+define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test5:
+; GREEDY-X86_64:    #APP
+; GREEDY-X86_64:    # 'rm' input -> %eax
+; GREEDY-X86_64:    #NO_APP
+;
+; GREEDY-I386-LABEL: test5:
+; GREEDY-I386:    #APP
+; GREEDY-I386:    # 'rm' input -> %ecx
+; GREEDY-I386:    #NO_APP
+;
+; BASIC-X86_64-LABEL: test5:
+; BASIC-X86_64:    #APP
+; BASIC-X86_64:    # 'rm' input -> -{{[0-9]+}}(%rsp)
+; BASIC-X86_64:    #NO_APP
+;
+; BASIC-I386-LABEL: test5:
+; BASIC-I386:    #APP
+; BASIC-I386:    # 'rm' input -> (%esp)
+; BASIC-I386:    #NO_APP
+;
+; FAST-X86_64-LABEL: test5:
+; FAST-X86_64:    #APP
+; FAST-X86_64:    # 'rm' input -> -{{[0-9]+}}(%rsp)
+; FAST-X86_64:    #NO_APP
+;
+; FAST-I386-LABEL: test5:
+; FAST-I386:    #APP
+; FAST-I386:    # 'rm' input -> (%esp)
+; FAST-I386:    #NO_APP
+entry:
+  %b = getelementptr inbounds i8, ptr %ptr, i64 4
+  %0 = load i32, ptr %b, align 4
+  tail call void asm sideeffect "# 'rm' input -> $0", "rm,~{dirflag},~{fpsr},~{flags}"(i32 %0) #1
+  %1 = load i32, ptr %ptr, align 4
+  ret i32 %1
+}
+
+define dso_local i32 @test6(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test6:
+; GREEDY-X86_64:    #APP
+; GREEDY-X86_64:    # 'rm' and 'r' input -> %eax %ecx
+; GREEDY-X86_64:    #NO_APP
+;
+; GREEDY-I386-LABEL: test6:
+; GREEDY-I386:    #APP
+; GREEDY-I386:    # 'rm' and 'r' input -> %ecx %edx
+; GREEDY-I386:    #NO_APP
+;
+; BASIC-X86_64-LABEL: test6:
+; BASIC-X86_64:    #APP
+; BASIC-X86_64:    # 'rm' and 'r' input -> -{{[0-9]+}}(%rsp) %ecx
+; BASIC-X86_64:    #NO_APP
+;
+; BASIC-I386-LABEL: test6:
+; BASIC-I386:    #APP
+; BASIC-I386:    # 'rm' and 'r' input -> (%esp) %ecx
+; BASIC-I386:    #NO_APP
+;
+; FAST-X86_64-LABEL: test6:
+; FAST-X86_64:    #APP
+; FAST-X86_64:    # 'rm' and 'r' input -> -{{[0-9]+}}(%rsp) %eax
+; FAST-X86_64:    #NO_APP
+;
+; FAST-I386-LABEL: test6:
+; FAST-I386:    #APP
+; FAST-I386:    # 'rm' and 'r' input -> (%esp) %ecx
+; FAST-I386:    #NO_APP
+entry:
+  %b = getelementptr inbounds i8, ptr %ptr, i64 4
+  %0 = load i32, ptr %b, align 4
+  %d = getelementptr inbounds i8, ptr %ptr, i64 12
+  %1 = load i32, ptr %d, align 4
+  tail call void asm sideeffect "# 'rm' and 'r' input -> $0 $1", "rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1) #1
+  %2 = load i32, ptr %ptr, align 4
+  ret i32 %2
+}
+
+define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test7:
+; GREEDY-X86_64:    #APP
+; GREEDY-X86_64:    # 'rm' output -> %eax
+; GREEDY-X86_64:    #NO_APP
+;
+; GREEDY-I386-LABEL: test7:
+; GREEDY-I386:    #APP
+; GREEDY-I386:    # 'rm' output -> %ecx
+; GREEDY-I386:    #NO_APP
+;
+; BASIC-X86_64-LABEL: test7:
+; BASIC-X86_64:    #APP
+; BASIC-X86_64:    # 'rm' output -> 4(%rdi)
+; BASIC-X86_64:    #NO_APP
+;
+; BASIC-I386-LABEL: test7:
+; BASIC-I386:    #APP
+; BASIC-I386:    # 'rm' output -> 4(%eax)
+; BASIC-I386:    #NO_APP
+;
+; FAST-X86_64-LABEL: test7:
+; FAST-X86_64:    #APP
+; FAST-X86_64:    # 'rm' output -> 4(%rdi)
+; FAST-X86_64:    #NO_APP
+;
+; FAST-I386-LABEL: test7:
+; FAST-I386:    #APP
+; FAST-I386:    # 'rm' output -> 4(%eax)
+; FAST-I386:    #NO_APP
+entry:
+  %b = getelementptr inbounds i8, ptr %ptr, i64 4
+  tail call void asm sideeffect "# 'rm' output -> $0", "=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b) #1
+  %0 = load i32, ptr %ptr, align 4
+  ret i32 %0
+}
+
+define dso_local i32 @test8(ptr noundef %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test8:
+; GREEDY-X86_64:    #APP
+; GREEDY-X86_64:    # 'rm' tied -> %eax
+; GREEDY-X86_64:    #NO_APP
+;
+; GREEDY-I386-LABEL: test8:
+; GREEDY-I386:    #APP
+; GREEDY-I386:    # 'rm' tied -> %ecx
+; GREEDY-I386:    #NO_APP
+;
+; BASIC-X86_64-LABEL: test8:
+; BASIC-X86_64:    #APP
+; BASIC-X86_64:    # 'rm' tied -> %eax
+; BASIC-X86_64:    #NO_APP
+;
+; BASIC-I386-LABEL: test8:
+; BASIC-I386:    #APP
+; BASIC-I386:    # 'rm' tied -> %eax
+; BASIC-I386:    #NO_APP
+;
+; FAST-X86_64-LABEL: test8:
+; FAST-X86_64:    #APP
+; FAST-X86_64:    # 'rm' tied -> %eax
+; FAST-X86_64:    #NO_APP
+;
+; FAST-I386-LABEL: test8:
+; FAST-I386:    #APP
+; FAST-I386:    # 'rm' tied -> %ecx
+; FAST-I386:    #NO_APP
+entry:
+  %b = getelementptr inbounds i8, ptr %ptr, i64 4
+  %0 = load i32, ptr %b, align 4
+  tail call void asm sideeffect "# 'rm' tied -> $0", "=*rm,0,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, i32 %0) #1
+  %1 = load i32, ptr %ptr, align 4
+  ret i32 %1
+}
+
+define dso_local i32 @test9(ptr nocapture noundef %ptr) local_unnamed_addr #0 {
+; GREEDY-X86_64-LABEL: test9:
+; GREEDY-X86_64:    #APP
+; GREEDY-X86_64:    # 'r' output == input location -> %eax
+; GREEDY-X86_64:    #NO_APP
+;
+; GREEDY-I386-LABEL: test9:
+; GREEDY-I386:    #APP
+; GREEDY-I386:    # 'r' output == input location -> %ecx
+; GREEDY-I386:    #NO_APP
+;
+; BASIC-X86_64-LABEL: test9:
+; BASIC-X86_64:    #APP
+; BASIC-X86_64:    # 'r' output == input location -> %eax
+; BASIC-X86_64:    #NO_APP
+;
+; BASIC-I386-LABEL: test9:
+; BASIC-I386:    #APP
+; BASIC-I386:    # 'r' output == input location -> %eax
+; BASIC-I386:    #NO_APP
+;
+; FAST-X86_64-LABEL: test9:
+; FAST-X86_64:    #APP
+; FAST-X86_64:    # 'r' output == input location -> %eax
+; FAST-X86_64:    #NO_APP
+;
+; FAST-I386-LABEL: test9:
+; FAST-I386:    #APP
+; FAST-I386:    # 'r' output == input location -> %ecx
+; FAST-I386:    #NO_APP
+entry:
+  %b = getelementptr inbounds i8, ptr %ptr, i64 4
+  %0 = load i32, ptr %b, align 4
+  %1 = tail call i32 asm sideeffect "# 'r' output == input location -> $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %0) #1
+  store i32 %1, ptr %b, align 4
+  %2 = load i32, ptr %ptr, align 4
+  ret i32 %2
+}
+
+attributes #0 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/inlineasm-sched-bug.ll b/llvm/test/CodeGen/X86/inlineasm-sched-bug.ll
index be4d1c29332f7..a322bd3003a58 100644
--- a/llvm/test/CodeGen/X86/inlineasm-sched-bug.ll
+++ b/llvm/test/CodeGen/X86/inlineasm-sched-bug.ll
@@ -6,16 +6,13 @@
 define i32 @foo(i32 %treemap) nounwind {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    negl %ecx
 ; CHECK-NEXT:    andl %eax, %ecx
-; CHECK-NEXT:    movl %ecx, (%esp)
 ; CHECK-NEXT:    #APP
-; CHECK-NEXT:    bsfl (%esp), %eax
+; CHECK-NEXT:    bsfl %ecx, %eax
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    popl %ecx
 ; CHECK-NEXT:    retl
 entry:
   %sub = sub i32 0, %treemap

>From 9378b7ae2fa44c977bd8e1ab500db3883ecdb3da Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Wed, 10 Jul 2024 11:00:13 -0700
Subject: [PATCH 02/29] Remove function identifying the register allocator
 used.

---
 llvm/include/llvm/CodeGen/TargetPassConfig.h     | 2 --
 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 2 +-
 llvm/lib/CodeGen/TargetPassConfig.cpp            | 6 ------
 3 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetPassConfig.h b/llvm/include/llvm/CodeGen/TargetPassConfig.h
index c1f4199536409..d00e0bed91a45 100644
--- a/llvm/include/llvm/CodeGen/TargetPassConfig.h
+++ b/llvm/include/llvm/CodeGen/TargetPassConfig.h
@@ -496,8 +496,6 @@ class TargetPassConfig : public ImmutablePass {
 void registerCodeGenCallback(PassInstrumentationCallbacks &PIC,
                              LLVMTargetMachine &);
 
-bool usesGreedyOrDefaultRegisterAllocator();
-
 } // end namespace llvm
 
 #endif // LLVM_CODEGEN_TARGETPASSCONFIG_H
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e10014a64b25e..fa74f2789bfdb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6001,7 +6001,7 @@ TargetLowering::ConstraintGroup TargetLowering::getConstraintPreferences(
   // If we can fold the register (i.e. it has an "rm" constraint), opt for the
   // 'r' constraint, and allow the register allocator to spill if need be.
   // Applies only to the greedy and default register allocators.
-  if (OpInfo.MayFoldRegister && usesGreedyOrDefaultRegisterAllocator()) {
+  if (OpInfo.MayFoldRegister) {
     Ret.emplace_back(ConstraintPair("r", getConstraintType("r")));
     Ret.emplace_back(ConstraintPair("m", getConstraintType("m")));
     return Ret;
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 558dd6523aeec..3658e8320a0cc 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1080,12 +1080,6 @@ static cl::opt<RegisterRegAlloc::FunctionPassCtor, false,
     RegAlloc("regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
              cl::desc("Register allocator to use"));
 
-bool llvm::usesGreedyOrDefaultRegisterAllocator() {
-  return RegAlloc == (RegisterRegAlloc::
-                          FunctionPassCtor)&createGreedyRegisterAllocator ||
-         RegAlloc == &useDefaultRegisterAllocator;
-}
-
 /// Add the complete set of target-independent postISel code generator passes.
 ///
 /// This can be read as the standard order of major LLVM CodeGen stages. Stages

>From c1cfcefd1901d57306d77ee0397c98670caf56af Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Thu, 25 Jul 2024 11:30:17 -0700
Subject: [PATCH 03/29] Run instnamer and remove unneeded '-O2'.

---
 llvm/test/CodeGen/X86/asm-constraints-rm.ll | 86 ++++++++++-----------
 1 file changed, 43 insertions(+), 43 deletions(-)

diff --git a/llvm/test/CodeGen/X86/asm-constraints-rm.ll b/llvm/test/CodeGen/X86/asm-constraints-rm.ll
index f718f6b26abb3..6031eb7b22e6d 100644
--- a/llvm/test/CodeGen/X86/asm-constraints-rm.ll
+++ b/llvm/test/CodeGen/X86/asm-constraints-rm.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter "^\t#" --version 4
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=greedy < %s | FileCheck --check-prefix=GREEDY-X86_64 %s
-; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=greedy   < %s | FileCheck --check-prefix=GREEDY-I386 %s
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=basic  < %s | FileCheck --check-prefix=BASIC-X86_64 %s
-; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=basic    < %s | FileCheck --check-prefix=BASIC-I386 %s
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=fast   < %s | FileCheck --check-prefix=FAST-X86_64 %s
-; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=fast     < %s | FileCheck --check-prefix=FAST-I386 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -regalloc=greedy < %s | FileCheck --check-prefix=GREEDY-X86_64 %s
+; RUN: llc -mtriple=i386-unknown-linux-gnu -regalloc=greedy   < %s | FileCheck --check-prefix=GREEDY-I386 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -regalloc=basic  < %s | FileCheck --check-prefix=BASIC-X86_64 %s
+; RUN: llc -mtriple=i386-unknown-linux-gnu -regalloc=basic    < %s | FileCheck --check-prefix=BASIC-I386 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -regalloc=fast   < %s | FileCheck --check-prefix=FAST-X86_64 %s
+; RUN: llc -mtriple=i386-unknown-linux-gnu -regalloc=fast     < %s | FileCheck --check-prefix=FAST-I386 %s
 
 ; The Greedy register allocator should use registers when there isn't register
 ; pressure.
@@ -41,12 +41,12 @@ define dso_local i32 @test1(ptr nocapture noundef readonly %ptr) local_unnamed_a
 ; FAST-I386:    #NO_APP
 entry:
   %b = getelementptr inbounds i8, ptr %ptr, i64 4
-  %0 = load i32, ptr %b, align 4
+  %i = load i32, ptr %b, align 4
   %d = getelementptr inbounds i8, ptr %ptr, i64 12
-  %1 = load i32, ptr %d, align 4
-  tail call void asm sideeffect "# 'rm' input no pressure -> $0 $1", "rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1) #1
-  %2 = load i32, ptr %ptr, align 4
-  ret i32 %2
+  %i1 = load i32, ptr %d, align 4
+  tail call void asm sideeffect "# 'rm' input no pressure -> $0 $1", "rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1) #1
+  %i2 = load i32, ptr %ptr, align 4
+  ret i32 %i2
 }
 
 define dso_local i32 @test2(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 {
@@ -81,12 +81,12 @@ define dso_local i32 @test2(ptr nocapture noundef readonly %ptr) local_unnamed_a
 ; FAST-I386:    #NO_APP
 entry:
   %b = getelementptr inbounds i8, ptr %ptr, i64 4
-  %0 = load i32, ptr %b, align 4
+  %i = load i32, ptr %b, align 4
   %d = getelementptr inbounds i8, ptr %ptr, i64 12
-  %1 = load i32, ptr %d, align 4
-  tail call void asm sideeffect "# 'rm' input pressure -> $0 $1", "rm,rm,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1) #1
-  %2 = load i32, ptr %ptr, align 4
-  ret i32 %2
+  %i1 = load i32, ptr %d, align 4
+  tail call void asm sideeffect "# 'rm' input pressure -> $0 $1", "rm,rm,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1) #1
+  %i2 = load i32, ptr %ptr, align 4
+  ret i32 %i2
 }
 
 define dso_local i32 @test3(ptr noundef %ptr) local_unnamed_addr #0 {
@@ -123,8 +123,8 @@ entry:
   %b = getelementptr inbounds i8, ptr %ptr, i64 4
   %d = getelementptr inbounds i8, ptr %ptr, i64 12
   tail call void asm sideeffect "# 'rm' output no pressure -> $0 $1", "=*rm,=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d) #1
-  %0 = load i32, ptr %ptr, align 4
-  ret i32 %0
+  %i = load i32, ptr %ptr, align 4
+  ret i32 %i
 }
 
 define dso_local i32 @test4(ptr noundef %ptr) local_unnamed_addr #0 {
@@ -159,12 +159,12 @@ define dso_local i32 @test4(ptr noundef %ptr) local_unnamed_addr #0 {
 ; FAST-I386:    #NO_APP
 entry:
   %b = getelementptr inbounds i8, ptr %ptr, i64 4
-  %0 = load i32, ptr %b, align 4
+  %i = load i32, ptr %b, align 4
   %d = getelementptr inbounds i8, ptr %ptr, i64 12
-  %1 = load i32, ptr %d, align 4
-  tail call void asm sideeffect "# tied 'rm' no pressure -> $0 $1 $2 $3", "=*rm,=*rm,0,1,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d, i32 %0, i32 %1) #1
-  %2 = load i32, ptr %ptr, align 4
-  ret i32 %2
+  %i1 = load i32, ptr %d, align 4
+  tail call void asm sideeffect "# tied 'rm' no pressure -> $0 $1 $2 $3", "=*rm,=*rm,0,1,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d, i32 %i, i32 %i1) #1
+  %i2 = load i32, ptr %ptr, align 4
+  ret i32 %i2
 }
 
 define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 {
@@ -199,10 +199,10 @@ define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_a
 ; FAST-I386:    #NO_APP
 entry:
   %b = getelementptr inbounds i8, ptr %ptr, i64 4
-  %0 = load i32, ptr %b, align 4
-  tail call void asm sideeffect "# 'rm' input -> $0", "rm,~{dirflag},~{fpsr},~{flags}"(i32 %0) #1
-  %1 = load i32, ptr %ptr, align 4
-  ret i32 %1
+  %i = load i32, ptr %b, align 4
+  tail call void asm sideeffect "# 'rm' input -> $0", "rm,~{dirflag},~{fpsr},~{flags}"(i32 %i) #1
+  %i1 = load i32, ptr %ptr, align 4
+  ret i32 %i1
 }
 
 define dso_local i32 @test6(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 {
@@ -237,12 +237,12 @@ define dso_local i32 @test6(ptr nocapture noundef readonly %ptr) local_unnamed_a
 ; FAST-I386:    #NO_APP
 entry:
   %b = getelementptr inbounds i8, ptr %ptr, i64 4
-  %0 = load i32, ptr %b, align 4
+  %i = load i32, ptr %b, align 4
   %d = getelementptr inbounds i8, ptr %ptr, i64 12
-  %1 = load i32, ptr %d, align 4
-  tail call void asm sideeffect "# 'rm' and 'r' input -> $0 $1", "rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1) #1
-  %2 = load i32, ptr %ptr, align 4
-  ret i32 %2
+  %i1 = load i32, ptr %d, align 4
+  tail call void asm sideeffect "# 'rm' and 'r' input -> $0 $1", "rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1) #1
+  %i2 = load i32, ptr %ptr, align 4
+  ret i32 %i2
 }
 
 define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr #0 {
@@ -278,8 +278,8 @@ define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr #0 {
 entry:
   %b = getelementptr inbounds i8, ptr %ptr, i64 4
   tail call void asm sideeffect "# 'rm' output -> $0", "=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b) #1
-  %0 = load i32, ptr %ptr, align 4
-  ret i32 %0
+  %i = load i32, ptr %ptr, align 4
+  ret i32 %i
 }
 
 define dso_local i32 @test8(ptr noundef %ptr) local_unnamed_addr #0 {
@@ -314,10 +314,10 @@ define dso_local i32 @test8(ptr noundef %ptr) local_unnamed_addr #0 {
 ; FAST-I386:    #NO_APP
 entry:
   %b = getelementptr inbounds i8, ptr %ptr, i64 4
-  %0 = load i32, ptr %b, align 4
-  tail call void asm sideeffect "# 'rm' tied -> $0", "=*rm,0,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, i32 %0) #1
-  %1 = load i32, ptr %ptr, align 4
-  ret i32 %1
+  %i = load i32, ptr %b, align 4
+  tail call void asm sideeffect "# 'rm' tied -> $0", "=*rm,0,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, i32 %i) #1
+  %i1 = load i32, ptr %ptr, align 4
+  ret i32 %i1
 }
 
 define dso_local i32 @test9(ptr nocapture noundef %ptr) local_unnamed_addr #0 {
@@ -352,11 +352,11 @@ define dso_local i32 @test9(ptr nocapture noundef %ptr) local_unnamed_addr #0 {
 ; FAST-I386:    #NO_APP
 entry:
   %b = getelementptr inbounds i8, ptr %ptr, i64 4
-  %0 = load i32, ptr %b, align 4
-  %1 = tail call i32 asm sideeffect "# 'r' output == input location -> $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %0) #1
-  store i32 %1, ptr %b, align 4
-  %2 = load i32, ptr %ptr, align 4
-  ret i32 %2
+  %i = load i32, ptr %b, align 4
+  %i1 = tail call i32 asm sideeffect "# 'r' output == input location -> $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %i) #1
+  store i32 %i1, ptr %b, align 4
+  %i2 = load i32, ptr %ptr, align 4
+  ret i32 %i2
 }
 
 attributes #0 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }

>From 03d62f1f9fb7b47ff55cbaae0f7619c78491a3f1 Mon Sep 17 00:00:00 2001
From: Bill Wendling <isanbard at gmail.com>
Date: Thu, 15 Jan 2026 01:21:53 -0800
Subject: [PATCH 04/29] [CodeGen] Add InlineAsmPrepare pass to convert "rm"
 constraints to "m" for fast regalloc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces a new IR-level pass, InlineAsmPrepare, that converts
inline assembly "rm" (register-or-memory) constraints to "m" (memory-only)
constraints when using the fast register allocator on x86 platforms.

Background:
The "rm" constraint allows the compiler to choose between a register or
memory operand. However, LLVM's architecture conservatively picks the most
restrictive interpretation to avoid issues with register pressure. This
causes the fast register allocator to always select registers for "rm"
constraints, even when memory would be more appropriate, leading to
unnecessary spills and suboptimal code.

Solution:
The InlineAsmPrepare pass runs at -O0 before instruction selection and
transforms inline asm calls by:
1. Converting "rm" constraints to "m" (memory-only)
2. Creating allocas for each converted operand
3. For inputs: storing values to allocas and passing pointers
4. For outputs: passing alloca pointers and loading results afterward
5. For tied constraints (e.g., "=rm,0"): storing input to output's alloca
   and passing the same pointer for both the output and tied input

Implementation details:
- InlineAsmPrepare.cpp: New pass that processes CallInst nodes with inline asm
  * Parses constraint strings and identifies "rm" patterns
  * Tracks tied input-output pairs via TiedOutput mapping
  * Reconstructs return values by loading from allocas for converted outputs
  * Properly handles struct returns with mixed converted/unconverted outputs

- SelectionDAG integration:
  * Added MayFoldRegister flag to mark "rm" constraints
  * Modified constraint preference to prefer 'r' over 'm' when MayFoldRegister
    is set, allowing the register allocator to spill to memory if needed
  * Updated AddInlineAsmOperands to propagate the MayFoldRegister flag

- Pass integration:
  * Registered in PassBuilder and PassRegistry
  * Added to TargetPassConfig for -O0 compilation

Test coverage:
Added inline-asm-prepare-memory.ll with three scenarios:
- Input-only: "rm" input → "m" with alloca + store + pointer arg
- Output-only: "=rm" output → "=*m" with alloca + pointer arg + load
- Tied/read-write: "=rm,0" → "=*m,0" with store + dual pointer args + load

The pass only runs on x86 platforms at -O0 to improve code generation for
the fast register allocator without impacting optimized builds.
---
 clang/lib/CodeGen/CGStmt.cpp                  |  13 +-
 clang/test/CodeGen/asm.c                      |   2 +-
 llvm/include/llvm/CodeGen/InlineAsmPrepare.h  |  23 ++
 llvm/include/llvm/CodeGen/Passes.h            |   5 +
 llvm/include/llvm/CodeGen/TargetLowering.h    |  10 +-
 llvm/include/llvm/IR/InlineAsm.h              |   8 +
 llvm/include/llvm/InitializePasses.h          |   1 +
 llvm/lib/CodeGen/CMakeLists.txt               |   1 +
 llvm/lib/CodeGen/CodeGen.cpp                  |   1 +
 llvm/lib/CodeGen/InlineAsmPrepare.cpp         | 334 ++++++++++++++++++
 llvm/lib/CodeGen/TargetPassConfig.cpp         |   3 +
 llvm/lib/Passes/PassBuilder.cpp               |   1 +
 llvm/lib/Passes/PassRegistry.def              |   1 +
 llvm/test/CodeGen/AArch64/O0-pipeline.ll      |   1 +
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |   1 +
 llvm/test/CodeGen/LoongArch/O0-pipeline.ll    |   1 +
 llvm/test/CodeGen/PowerPC/O0-pipeline.ll      |   1 +
 llvm/test/CodeGen/RISCV/O0-pipeline.ll        |   1 +
 llvm/test/CodeGen/SPIRV/llc-pipeline.ll       |   1 +
 llvm/test/CodeGen/X86/O0-pipeline.ll          |   1 +
 llvm/test/CodeGen/X86/asm-constraints-rm.ll   |  69 ++--
 .../CodeGen/X86/inline-asm-prepare-memory.ll  |  38 ++
 22 files changed, 472 insertions(+), 45 deletions(-)
 create mode 100644 llvm/include/llvm/CodeGen/InlineAsmPrepare.h
 create mode 100644 llvm/lib/CodeGen/InlineAsmPrepare.cpp
 create mode 100644 llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll

diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index cf5ddb78c3a1d..64eba8040f113 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -2921,13 +2921,20 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
     if (!Constraints.empty())
       Constraints += ',';
 
-    // If this is a register output, then make the inline asm return it
-    // by-value.  If this is a memory result, return the value by-reference.
+    // - If this is a register output, then make the inline asm return it
+    //   by-value.
+    // - If this is an "rm" constraint on x86, then treat it like a register
+    //   output. (We'll correct this before ISel if using the FastRA.)
+    // - If this is a memory result, return the value by-reference.
     QualType QTy = OutExpr->getType();
     const bool IsScalarOrAggregate = hasScalarEvaluationKind(QTy) ||
                                      hasAggregateEvaluationKind(QTy);
-    if (!Info.allowsMemory() && IsScalarOrAggregate) {
+    const bool X86RegisterMemoryConstraints =
+        getTarget().getTriple().isX86() &&
+        (OutputConstraint == "rm" || OutputConstraint == "mr");
 
+    if (IsScalarOrAggregate &&
+        (!Info.allowsMemory() || X86RegisterMemoryConstraints)) {
       Constraints += "=" + OutputConstraint;
       ResultRegQualTys.push_back(QTy);
       ResultRegDests.push_back(Dest);
diff --git a/clang/test/CodeGen/asm.c b/clang/test/CodeGen/asm.c
index 9687c993e6464..66a7142ee7fca 100644
--- a/clang/test/CodeGen/asm.c
+++ b/clang/test/CodeGen/asm.c
@@ -259,7 +259,7 @@ void t31(int len) {
   __asm__ volatile(""
                    : "+%%rm"(len), "+rm"(len));
   // CHECK: @t31
-  // CHECK: call void asm sideeffect "", "=*%rm,=*rm,0,1,~{dirflag},~{fpsr},~{flags}"
+  // CHECK: call i32 asm sideeffect "", "=*%rm,=rm,0,1,~{dirflag},~{fpsr},~{flags}"
 }
 
 // CHECK: @t32
diff --git a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
new file mode 100644
index 0000000000000..a400a78390dff
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
@@ -0,0 +1,23 @@
+//===-- InlineAsmPrepare - Prepare inline asm for code gen ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_INLINEASMPREPARE_H
+#define LLVM_CODEGEN_INLINEASMPREPARE_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class InlineAsmPreparePass : public PassInfoMixin<InlineAsmPreparePass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_INLINEASMPREPARE_H
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 303b9076131e3..9e1e34269baca 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -630,6 +630,11 @@ LLVM_ABI ModulePass *createWindowsSecureHotPatchingPass();
 
 /// Lowers KCFI operand bundles for indirect calls.
 LLVM_ABI FunctionPass *createKCFIPass();
+
+/// Modify inline asms with "rm" constraints to "m" for the fast register
+/// allocator.
+LLVM_ABI FunctionPass *createInlineAsmPass();
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e94668a5d7a76..76a790d057115 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5222,11 +5222,6 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
     /// Memory, Other, Unknown.
     TargetLowering::ConstraintType ConstraintType = TargetLowering::C_Unknown;
 
-    /// The register may be folded. This is used if the constraint is "rm",
-    /// where we prefer using a register, but can fall back to a memory slot
-    /// under register pressure.
-    bool MayFoldRegister = false;
-
     /// If this is the result output operand or a clobber, this is null,
     /// otherwise it is the incoming operand to the CallInst.  This gets
     /// modified as the asm is processed.
@@ -5235,6 +5230,11 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
     /// The ValueType for the operand value.
     MVT ConstraintVT = MVT::Other;
 
+    /// The register may be folded. This is used if the constraint is "rm",
+    /// where we prefer using a register, but can fall back to a memory slot
+    /// under register pressure.
+    bool MayFoldRegister = false;
+
     /// Copy constructor for copying from a ConstraintInfo.
     AsmOperandInfo(InlineAsm::ConstraintInfo Info)
         : InlineAsm::ConstraintInfo(std::move(Info)) {}
diff --git a/llvm/include/llvm/IR/InlineAsm.h b/llvm/include/llvm/IR/InlineAsm.h
index 96887d129a69f..6491b0ff5e82b 100644
--- a/llvm/include/llvm/IR/InlineAsm.h
+++ b/llvm/include/llvm/IR/InlineAsm.h
@@ -181,6 +181,14 @@ class InlineAsm final : public Value {
     bool hasArg() const {
       return Type == isInput || (Type == isOutput && isIndirect);
     }
+
+    /// hasRegMemConstraints - Returns true if and only if the constraint
+    /// codes are "rm". This is useful when converting from a register form
+    /// to a memory form.
+    bool hasRegMemConstraints() const {
+      return Codes.size() == 2 && is_contained(Codes, "r") &&
+             is_contained(Codes, "m");
+    }
   };
 
   /// ParseConstraints - Split up the constraint string into the specific
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index e9e3ca3cc93a0..c3b550beb1e7f 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -145,6 +145,7 @@ initializeImmutableModuleSummaryIndexWrapperPassPass(PassRegistry &);
 LLVM_ABI void initializeImplicitNullChecksPass(PassRegistry &);
 LLVM_ABI void initializeIndirectBrExpandLegacyPassPass(PassRegistry &);
 LLVM_ABI void initializeInferAddressSpacesPass(PassRegistry &);
+LLVM_ABI void initializeInlineAsmPreparePass(PassRegistry &);
 LLVM_ABI void initializeInstSimplifyLegacyPassPass(PassRegistry &);
 LLVM_ABI void initializeInstructionCombiningPassPass(PassRegistry &);
 LLVM_ABI void initializeInstructionSelectPass(PassRegistry &);
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index f26b2cb6fddf5..9a1561402adfd 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -77,6 +77,7 @@ add_llvm_component_library(LLVMCodeGen
   IfConversion.cpp
   ImplicitNullChecks.cpp
   IndirectBrExpandPass.cpp
+  InlineAsmPrepare.cpp
   InitUndef.cpp
   InlineSpiller.cpp
   InsertCodePrefetch.cpp
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index 3550eea13979a..a837f1c54f82e 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -53,6 +53,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeImplicitNullChecksPass(Registry);
   initializeIndirectBrExpandLegacyPassPass(Registry);
   initializeInitUndefLegacyPass(Registry);
+  initializeInlineAsmPreparePass(Registry);
   initializeInterleavedLoadCombinePass(Registry);
   initializeInterleavedAccessPass(Registry);
   initializeJMCInstrumenterPass(Registry);
diff --git a/llvm/lib/CodeGen/InlineAsmPrepare.cpp b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
new file mode 100644
index 0000000000000..9524bcb302f8f
--- /dev/null
+++ b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
@@ -0,0 +1,334 @@
+//===-- InlineAsmPrepare - Prepare inline asm for code generation ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This pass rewrites inline asm calls whose constraints are "rm"
+// (register-or-memory) into memory-only "m" form before instruction
+// selection, so the fast register allocator at -O0 avoids needless spills.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/InlineAsmPrepare.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/DerivedTypes.h"
+#include <sstream>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "inline-asm-prepare"
+
+namespace {
+
+class InlineAsmPrepare : public FunctionPass {
+  InlineAsmPrepare(InlineAsmPrepare &) = delete;
+
+public:
+  InlineAsmPrepare() : FunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {}
+  bool runOnFunction(Function &F) override;
+
+  static char ID;
+};
+
+char InlineAsmPrepare::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(InlineAsmPrepare, DEBUG_TYPE,
+                "Convert inline asm \"rm\" insts for fast register allocation",
+                false, false)
+FunctionPass *llvm::createInlineAsmPass() { return new InlineAsmPrepare(); }
+
+// For each inline asm, the "rm" constraint needs to default to "m" for the
+// fast register allocator.
+static SmallVector<CallBase *, 4> findInlineAsms(Function &F) {
+  SmallVector<CallBase *, 4> InlineAsms;
+
+  for_each(F, [&](BasicBlock &BB) {
+    for_each(BB, [&](Instruction &I) {
+      CallBase *CB = dyn_cast<CallBase>(&I);
+      if (!CB || !CB->isInlineAsm())
+        return;
+      InlineAsms.push_back(CB);
+    });
+  });
+
+  return InlineAsms;
+}
+
+static bool isRegMemConstraint(StringRef Constraint) {
+  return Constraint.size() == 2 && (Constraint == "rm" || Constraint == "mr");
+}
+
+// Convert instances of the "rm" constraints into "m".
+static std::string convertConstraintsToMemory(StringRef ConstraintStr) {
+  auto I = ConstraintStr.begin(), E = ConstraintStr.end();
+  std::ostringstream Out;
+
+  while (I != E) {
+    bool IsOutput = false;
+    bool HasIndirect = false;
+    if (*I == '=') {
+      Out << *I;
+      IsOutput = true;
+      ++I;
+    }
+    if (*I == '*') {
+      Out << '*';
+      HasIndirect = true;
+      ++I;
+    }
+    if (*I == '+') {
+      Out << '+';
+      IsOutput = true;
+      ++I;
+    }
+
+    auto Comma = std::find(I, E, ',');
+    std::string Sub(I, Comma);
+    if (isRegMemConstraint(Sub)) {
+      if (IsOutput && !HasIndirect)
+        Out << '*';
+      Out << 'm';
+    } else {
+      Out << Sub;
+    }
+
+    if (Comma == E)
+      break;
+
+    Out << ',';
+    I = Comma + 1;
+  }
+
+  return Out.str();
+}
+
+bool InlineAsmPrepare::runOnFunction(Function &F) {
+  // Only process "rm" on x86 platforms.
+  if (!F.getParent()->getTargetTriple().isX86())
+    return false;
+
+  SmallVector<CallBase *, 4> IAs = findInlineAsms(F);
+  if (IAs.empty())
+    return false;
+
+  bool Changed = false;
+  for (CallBase *CB : IAs) {
+    InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand());
+    const InlineAsm::ConstraintInfoVector &Constraints = IA->ParseConstraints();
+
+    std::string NewConstraintStr =
+        convertConstraintsToMemory(IA->getConstraintString());
+    if (NewConstraintStr == IA->getConstraintString())
+      continue;
+
+    IRBuilder<> Builder(CB);
+    // IRBuilder<> EntryBuilder(&F.getEntryBlock(), F.getEntryBlock().begin());
+
+    // Collect new arguments and return types.
+    SmallVector<Value *, 8> NewArgs;
+    SmallVector<Type *, 8> NewArgTypes;
+    SmallVector<Type *, 2> NewRetTypes;
+
+    SmallVector<std::pair<unsigned, Type *>, 8> ElementTypeAttrs;
+
+    // Track allocas created for converted outputs.
+    // Maps constraint index to the AllocaInst created for it (if any).
+    SmallVector<AllocaInst *, 8> OutputAllocas(Constraints.size(), nullptr);
+
+    // Track pairs of Input-Output tied constraints.
+    // TiedOutput[i] = j means Constraint i is an Input tied to Output Constraint j.
+    SmallVector<int, 8> TiedOutput(Constraints.size(), -1);
+    for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
+      const auto &C = Constraints[I];
+      if (C.Type == InlineAsm::isOutput && C.hasMatchingInput()) {
+        int InputIdx = C.MatchingInput;
+        if (InputIdx >= 0 && InputIdx < (int)Constraints.size())
+          TiedOutput[InputIdx] = I;
+      }
+      if (C.Type == InlineAsm::isInput && C.hasMatchingInput()) {
+        int OutputIdx = C.MatchingInput;
+        if (OutputIdx >= 0 && OutputIdx < (int)Constraints.size())
+          TiedOutput[I] = OutputIdx;
+      }
+    }
+
+    unsigned ArgNo = 0;
+    unsigned OutputIdx = 0;
+    for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
+      const auto &C = Constraints[I];
+
+      if (C.Type == InlineAsm::isOutput) {
+        // Output-only or Output with matching input (Read-Write)
+        Type *RetTy = CB->getType();
+        Type *SlotTy = RetTy;
+
+        if (StructType *ST = dyn_cast<StructType>(RetTy))
+          SlotTy = ST->getElementType(OutputIdx);
+
+        if (C.hasRegMemConstraints()) {
+          // Converted to memory constraint. Create alloca and pass pointer as
+          // argument.
+          AllocaInst *Slot = Builder.CreateAlloca(SlotTy, nullptr, "asm_mem");
+          NewArgs.push_back(Slot);
+          NewArgTypes.push_back(Slot->getType());
+          ElementTypeAttrs.push_back({NewArgs.size() - 1, SlotTy});
+          OutputAllocas[I] = Slot;
+          // No return value for this output since it's now an out-parameter.
+        } else {
+          // Unchanged, still an output return value.
+          NewRetTypes.push_back(SlotTy);
+        }
+
+        OutputIdx++;
+      } else if (C.Type == InlineAsm::isInput) {
+        // Input
+        Value *ArgVal = CB->getArgOperand(ArgNo);
+        Type *ArgTy = ArgVal->getType();
+        bool Handled = false;
+
+        if (TiedOutput[I] != -1) {
+          int MatchIdx = TiedOutput[I];
+          if (AllocaInst *Slot = OutputAllocas[MatchIdx]) {
+            // The matched output was converted to memory.
+            // Store this input into the alloca.
+            Builder.CreateStore(ArgVal, Slot);
+            // Pass the alloca pointer as the argument, instead of ArgVal.
+            // This ensures the tied "0" constraint matches the "*m" output.
+            NewArgs.push_back(Slot);
+            NewArgTypes.push_back(Slot->getType());
+            Handled = true;
+          }
+        }
+
+        if (!Handled) {
+            if (C.hasRegMemConstraints()) {
+              // Converted to memory constraint.
+              // Create alloca, store input, pass pointer as argument.
+              AllocaInst *Slot = Builder.CreateAlloca(ArgTy, nullptr, "asm_mem");
+              Builder.CreateStore(ArgVal, Slot);
+              NewArgs.push_back(Slot);
+              NewArgTypes.push_back(Slot->getType());
+            } else {
+              // Unchanged
+              NewArgs.push_back(ArgVal);
+              NewArgTypes.push_back(ArgTy);
+            }
+        }
+        ArgNo++;
+      }
+    }
+
+    Type *NewRetTy = nullptr;
+    if (NewRetTypes.empty()) {
+      NewRetTy = Type::getVoidTy(F.getContext());
+    } else if (NewRetTypes.size() == 1) {
+      NewRetTy = NewRetTypes[0];
+    } else {
+      NewRetTy = StructType::get(F.getContext(), NewRetTypes);
+    }
+
+    FunctionType *NewFTy = FunctionType::get(NewRetTy, NewArgTypes, false);
+    auto *NewIA = InlineAsm::get(
+        NewFTy, IA->getAsmString(), NewConstraintStr,
+        IA->hasSideEffects(), IA->isAlignStack(), IA->getDialect(),
+        IA->canThrow());
+
+    CallInst *NewCall = Builder.CreateCall(NewFTy, NewIA, NewArgs);
+    NewCall->setCallingConv(CB->getCallingConv());
+    NewCall->setAttributes(CB->getAttributes());
+    NewCall->setDebugLoc(CB->getDebugLoc());
+
+    for (const auto &Item : ElementTypeAttrs)
+      NewCall->addParamAttr(Item.first,
+                            Attribute::get(F.getContext(),
+                                           Attribute::ElementType,
+                                           Item.second));
+
+    // Reconstruct the return value and update users.
+    if (!CB->use_empty()) {
+        Value *Replacement = nullptr;
+        Type *RetTy = CB->getType();
+
+        if (RetTy->isVoidTy()) {
+            // No return value, nothing to replace.
+        } else if (isa<StructType>(RetTy)) {
+            // Multiple outputs. Reconstruct the struct.
+            Value *Res = UndefValue::get(RetTy);
+            unsigned NewRetIdx = 0;
+            unsigned OriginalOutIdx = 0;
+
+            for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
+                if (Constraints[I].Type != InlineAsm::isOutput)
+                    continue;
+
+                Value *Val = nullptr;
+                if (AllocaInst *Slot = OutputAllocas[I]) {
+                    // Converted to memory. Load from alloca.
+                    Val = Builder.CreateLoad(Slot->getAllocatedType(), Slot);
+                } else {
+                    // Not converted. Extract from NewCall return.
+                    if (NewRetTypes.size() == 1) {
+                         Val = NewCall;
+                    } else {
+                         Val = Builder.CreateExtractValue(NewCall, NewRetIdx);
+                    }
+                    NewRetIdx++;
+                }
+
+                Res = Builder.CreateInsertValue(Res, Val, OriginalOutIdx++);
+            }
+            Replacement = Res;
+        } else {
+            // Single output.
+            // Find the output constraint (should be the first one).
+            unsigned OutConstraintIdx = 0;
+            for (unsigned I = 0; I < Constraints.size(); ++I) {
+                 if (Constraints[I].Type == InlineAsm::isOutput) {
+                     OutConstraintIdx = I;
+                     break;
+                 }
+            }
+
+            if (AllocaInst *Slot = OutputAllocas[OutConstraintIdx]) {
+                Replacement = Builder.CreateLoad(Slot->getAllocatedType(), Slot);
+            } else {
+                Replacement = NewCall;
+            }
+        }
+
+        if (Replacement) {
+            CB->replaceAllUsesWith(Replacement);
+        }
+    }
+
+    CB->eraseFromParent();
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+PreservedAnalyses InlineAsmPreparePass::run(Function &F,
+                                            FunctionAnalysisManager &FAM) {
+  InlineAsmPrepare IAP;
+
+  bool Changed = IAP.runOnFunction(F);
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  return PreservedAnalyses::none();
+}
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 27e1afdcd7724..56ea3e12a528c 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -983,6 +983,9 @@ void TargetPassConfig::addISelPrepare() {
   if (getOptLevel() != CodeGenOptLevel::None)
     addPass(createObjCARCContractPass());
 
+  if (getOptLevel() == CodeGenOptLevel::None)
+    addPass(createInlineAsmPass());
+
   addPass(createCallBrPass());
 
   // Add both the safe stack and the stack protection passes: each of them will
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 8bb78c8c7df63..6e8237c571d6f 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -106,6 +106,7 @@
 #include "llvm/CodeGen/HardwareLoops.h"
 #include "llvm/CodeGen/IndirectBrExpand.h"
 #include "llvm/CodeGen/InitUndef.h"
+#include "llvm/CodeGen/InlineAsmPrepare.h"
 #include "llvm/CodeGen/InterleavedAccess.h"
 #include "llvm/CodeGen/InterleavedLoadCombine.h"
 #include "llvm/CodeGen/JMCInstrumenter.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 2cfb5b2592601..1b6774157e291 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -450,6 +450,7 @@ FUNCTION_PASS("helloworld", HelloWorldPass())
 FUNCTION_PASS("indirectbr-expand", IndirectBrExpandPass(*TM))
 FUNCTION_PASS("infer-address-spaces", InferAddressSpacesPass())
 FUNCTION_PASS("infer-alignment", InferAlignmentPass())
+FUNCTION_PASS("inline-asm-prepare", InlineAsmPreparePass())
 FUNCTION_PASS("inject-tli-mappings", InjectTLIMappings())
 FUNCTION_PASS("instcount", InstCountPass())
 FUNCTION_PASS("instnamer", InstructionNamerPass())
diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
index cc0655b31d892..5f09a0fb04247 100644
--- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
@@ -33,6 +33,7 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       AArch64 Stack Tagging
 ; CHECK-NEXT:       Exception handling preparation
+; CHECK-NEXT:       Convert inline asm "rm" insts for fast register allocation
 ; CHECK-NEXT:       Prepare callbr
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 6940c1b238e1d..a2273c1ae93c6 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -94,6 +94,7 @@
 ; GCN-O0-NEXT:    Call Graph SCC Pass Manager
 ; GCN-O0-NEXT:      DummyCGSCCPass
 ; GCN-O0-NEXT:      FunctionPass Manager
+; GCN-O0-NEXT:        Convert inline asm "rm" insts for fast register allocation
 ; GCN-O0-NEXT:        Prepare callbr
 ; GCN-O0-NEXT:        Safe Stack instrumentation pass
 ; GCN-O0-NEXT:        Insert stack protectors
diff --git a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
index ad7eee3f975f6..eeb1488dcf4f3 100644
--- a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
+++ b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
@@ -31,6 +31,7 @@
 ; CHECK-NEXT:       Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Exception handling preparation
+; CHECK-NEXT:       Convert inline asm "rm" insts for fast register allocation
 ; CHECK-NEXT:       Prepare callbr
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
diff --git a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
index d586328c5062e..fd2595a3c181b 100644
--- a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
@@ -30,6 +30,7 @@
 ; CHECK-NEXT:       Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Exception handling preparation
+; CHECK-NEXT:       Convert inline asm "rm" insts for fast register allocation
 ; CHECK-NEXT:       Prepare callbr
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
index c3e0ed9b85ec7..d8c899ddafb2a 100644
--- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
@@ -32,6 +32,7 @@
 ; CHECK-NEXT:       Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Exception handling preparation
+; CHECK-NEXT:       Convert inline asm "rm" insts for fast register allocation
 ; CHECK-NEXT:       Prepare callbr
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
diff --git a/llvm/test/CodeGen/SPIRV/llc-pipeline.ll b/llvm/test/CodeGen/SPIRV/llc-pipeline.ll
index cbd06ae1eec4e..f20c224b3e1d4 100644
--- a/llvm/test/CodeGen/SPIRV/llc-pipeline.ll
+++ b/llvm/test/CodeGen/SPIRV/llc-pipeline.ll
@@ -44,6 +44,7 @@
 ; SPIRV-O0-NEXT:    SPIRV emit intrinsics
 ; SPIRV-O0-NEXT:    FunctionPass Manager
 ; SPIRV-O0-NEXT:      SPIRV legalize bitcast pass
+; SPIRV-O0-NEXT:      Convert inline asm "rm" insts for fast register allocation
 ; SPIRV-O0-NEXT:      Prepare callbr
 ; SPIRV-O0-NEXT:      Safe Stack instrumentation pass
 ; SPIRV-O0-NEXT:      Insert stack protectors
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
index 673b36968bdeb..4310ca2c4403d 100644
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -32,6 +32,7 @@
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Expand indirectbr instructions
 ; CHECK-NEXT:       Exception handling preparation
+; CHECK-NEXT:       Convert inline asm "rm" insts for fast register allocation
 ; CHECK-NEXT:       Prepare callbr
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
diff --git a/llvm/test/CodeGen/X86/asm-constraints-rm.ll b/llvm/test/CodeGen/X86/asm-constraints-rm.ll
index 6031eb7b22e6d..66ca437317997 100644
--- a/llvm/test/CodeGen/X86/asm-constraints-rm.ll
+++ b/llvm/test/CodeGen/X86/asm-constraints-rm.ll
@@ -6,10 +6,10 @@
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -regalloc=fast   < %s | FileCheck --check-prefix=FAST-X86_64 %s
 ; RUN: llc -mtriple=i386-unknown-linux-gnu -regalloc=fast     < %s | FileCheck --check-prefix=FAST-I386 %s
 
-; The Greedy register allocator should use registers when there isn't register
-; pressure.
+; The non-fast register allocators should use registers when there isn't
+; register pressure.
 
-define dso_local i32 @test1(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 {
+define dso_local i32 @test1(ptr nocapture noundef readonly %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test1:
 ; GREEDY-X86_64:    #APP
 ; GREEDY-X86_64:    # 'rm' input no pressure -> %eax %ecx
@@ -22,12 +22,12 @@ define dso_local i32 @test1(ptr nocapture noundef readonly %ptr) local_unnamed_a
 ;
 ; BASIC-X86_64-LABEL: test1:
 ; BASIC-X86_64:    #APP
-; BASIC-X86_64:    # 'rm' input no pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; BASIC-X86_64:    # 'rm' input no pressure -> %ecx %eax
 ; BASIC-X86_64:    #NO_APP
 ;
 ; BASIC-I386-LABEL: test1:
 ; BASIC-I386:    #APP
-; BASIC-I386:    # 'rm' input no pressure -> {{[0-9]+}}(%esp) (%esp)
+; BASIC-I386:    # 'rm' input no pressure -> %ecx %eax
 ; BASIC-I386:    #NO_APP
 ;
 ; FAST-X86_64-LABEL: test1:
@@ -44,12 +44,12 @@ entry:
   %i = load i32, ptr %b, align 4
   %d = getelementptr inbounds i8, ptr %ptr, i64 12
   %i1 = load i32, ptr %d, align 4
-  tail call void asm sideeffect "# 'rm' input no pressure -> $0 $1", "rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1) #1
+  tail call void asm sideeffect "# 'rm' input no pressure -> $0 $1", "rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1)
   %i2 = load i32, ptr %ptr, align 4
   ret i32 %i2
 }
 
-define dso_local i32 @test2(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 {
+define dso_local i32 @test2(ptr nocapture noundef readonly %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test2:
 ; GREEDY-X86_64:    #APP # 8-byte Folded Reload
 ; GREEDY-X86_64:    # 'rm' input pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
@@ -61,13 +61,13 @@ define dso_local i32 @test2(ptr nocapture noundef readonly %ptr) local_unnamed_a
 ; GREEDY-I386:    #NO_APP
 ;
 ; BASIC-X86_64-LABEL: test2:
-; BASIC-X86_64:    #APP
+; BASIC-X86_64:    #APP # 8-byte Folded Reload
 ; BASIC-X86_64:    # 'rm' input pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
 ; BASIC-X86_64:    #NO_APP
 ;
 ; BASIC-I386-LABEL: test2:
-; BASIC-I386:    #APP
-; BASIC-I386:    # 'rm' input pressure -> {{[0-9]+}}(%esp) (%esp)
+; BASIC-I386:    #APP # 8-byte Folded Reload
+; BASIC-I386:    # 'rm' input pressure -> (%esp) {{[0-9]+}}(%esp)
 ; BASIC-I386:    #NO_APP
 ;
 ; FAST-X86_64-LABEL: test2:
@@ -84,12 +84,12 @@ entry:
   %i = load i32, ptr %b, align 4
   %d = getelementptr inbounds i8, ptr %ptr, i64 12
   %i1 = load i32, ptr %d, align 4
-  tail call void asm sideeffect "# 'rm' input pressure -> $0 $1", "rm,rm,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1) #1
+  tail call void asm sideeffect "# 'rm' input pressure -> $0 $1", "rm,rm,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1)
   %i2 = load i32, ptr %ptr, align 4
   ret i32 %i2
 }
 
-define dso_local i32 @test3(ptr noundef %ptr) local_unnamed_addr #0 {
+define dso_local i32 @test3(ptr noundef %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test3:
 ; GREEDY-X86_64:    #APP
 ; GREEDY-X86_64:    # 'rm' output no pressure -> %eax %ecx
@@ -102,12 +102,12 @@ define dso_local i32 @test3(ptr noundef %ptr) local_unnamed_addr #0 {
 ;
 ; BASIC-X86_64-LABEL: test3:
 ; BASIC-X86_64:    #APP
-; BASIC-X86_64:    # 'rm' output no pressure -> 4(%rdi) 12(%rdi)
+; BASIC-X86_64:    # 'rm' output no pressure -> %eax %ecx
 ; BASIC-X86_64:    #NO_APP
 ;
 ; BASIC-I386-LABEL: test3:
 ; BASIC-I386:    #APP
-; BASIC-I386:    # 'rm' output no pressure -> 4(%eax) 12(%eax)
+; BASIC-I386:    # 'rm' output no pressure -> %eax %ecx
 ; BASIC-I386:    #NO_APP
 ;
 ; FAST-X86_64-LABEL: test3:
@@ -122,12 +122,12 @@ define dso_local i32 @test3(ptr noundef %ptr) local_unnamed_addr #0 {
 entry:
   %b = getelementptr inbounds i8, ptr %ptr, i64 4
   %d = getelementptr inbounds i8, ptr %ptr, i64 12
-  tail call void asm sideeffect "# 'rm' output no pressure -> $0 $1", "=*rm,=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d) #1
+  tail call void asm sideeffect "# 'rm' output no pressure -> $0 $1", "=*rm,=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d)
   %i = load i32, ptr %ptr, align 4
   ret i32 %i
 }
 
-define dso_local i32 @test4(ptr noundef %ptr) local_unnamed_addr #0 {
+define dso_local i32 @test4(ptr noundef %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test4:
 ; GREEDY-X86_64:    #APP
 ; GREEDY-X86_64:    # tied 'rm' no pressure -> %eax %ecx %eax %ecx
@@ -162,12 +162,12 @@ entry:
   %i = load i32, ptr %b, align 4
   %d = getelementptr inbounds i8, ptr %ptr, i64 12
   %i1 = load i32, ptr %d, align 4
-  tail call void asm sideeffect "# tied 'rm' no pressure -> $0 $1 $2 $3", "=*rm,=*rm,0,1,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d, i32 %i, i32 %i1) #1
+  tail call void asm sideeffect "# tied 'rm' no pressure -> $0 $1 $2 $3", "=*rm,=*rm,0,1,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d, i32 %i, i32 %i1)
   %i2 = load i32, ptr %ptr, align 4
   ret i32 %i2
 }
 
-define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 {
+define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test5:
 ; GREEDY-X86_64:    #APP
 ; GREEDY-X86_64:    # 'rm' input -> %eax
@@ -180,12 +180,12 @@ define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_a
 ;
 ; BASIC-X86_64-LABEL: test5:
 ; BASIC-X86_64:    #APP
-; BASIC-X86_64:    # 'rm' input -> -{{[0-9]+}}(%rsp)
+; BASIC-X86_64:    # 'rm' input -> %eax
 ; BASIC-X86_64:    #NO_APP
 ;
 ; BASIC-I386-LABEL: test5:
 ; BASIC-I386:    #APP
-; BASIC-I386:    # 'rm' input -> (%esp)
+; BASIC-I386:    # 'rm' input -> %eax
 ; BASIC-I386:    #NO_APP
 ;
 ; FAST-X86_64-LABEL: test5:
@@ -200,12 +200,12 @@ define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_a
 entry:
   %b = getelementptr inbounds i8, ptr %ptr, i64 4
   %i = load i32, ptr %b, align 4
-  tail call void asm sideeffect "# 'rm' input -> $0", "rm,~{dirflag},~{fpsr},~{flags}"(i32 %i) #1
+  tail call void asm sideeffect "# 'rm' input -> $0", "rm,~{dirflag},~{fpsr},~{flags}"(i32 %i)
   %i1 = load i32, ptr %ptr, align 4
   ret i32 %i1
 }
 
-define dso_local i32 @test6(ptr nocapture noundef readonly %ptr) local_unnamed_addr #0 {
+define dso_local i32 @test6(ptr nocapture noundef readonly %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test6:
 ; GREEDY-X86_64:    #APP
 ; GREEDY-X86_64:    # 'rm' and 'r' input -> %eax %ecx
@@ -218,12 +218,12 @@ define dso_local i32 @test6(ptr nocapture noundef readonly %ptr) local_unnamed_a
 ;
 ; BASIC-X86_64-LABEL: test6:
 ; BASIC-X86_64:    #APP
-; BASIC-X86_64:    # 'rm' and 'r' input -> -{{[0-9]+}}(%rsp) %ecx
+; BASIC-X86_64:    # 'rm' and 'r' input -> %ecx %eax
 ; BASIC-X86_64:    #NO_APP
 ;
 ; BASIC-I386-LABEL: test6:
 ; BASIC-I386:    #APP
-; BASIC-I386:    # 'rm' and 'r' input -> (%esp) %ecx
+; BASIC-I386:    # 'rm' and 'r' input -> %ecx %eax
 ; BASIC-I386:    #NO_APP
 ;
 ; FAST-X86_64-LABEL: test6:
@@ -240,12 +240,12 @@ entry:
   %i = load i32, ptr %b, align 4
   %d = getelementptr inbounds i8, ptr %ptr, i64 12
   %i1 = load i32, ptr %d, align 4
-  tail call void asm sideeffect "# 'rm' and 'r' input -> $0 $1", "rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1) #1
+  tail call void asm sideeffect "# 'rm' and 'r' input -> $0 $1", "rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1)
   %i2 = load i32, ptr %ptr, align 4
   ret i32 %i2
 }
 
-define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr #0 {
+define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test7:
 ; GREEDY-X86_64:    #APP
 ; GREEDY-X86_64:    # 'rm' output -> %eax
@@ -258,12 +258,12 @@ define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr #0 {
 ;
 ; BASIC-X86_64-LABEL: test7:
 ; BASIC-X86_64:    #APP
-; BASIC-X86_64:    # 'rm' output -> 4(%rdi)
+; BASIC-X86_64:    # 'rm' output -> %eax
 ; BASIC-X86_64:    #NO_APP
 ;
 ; BASIC-I386-LABEL: test7:
 ; BASIC-I386:    #APP
-; BASIC-I386:    # 'rm' output -> 4(%eax)
+; BASIC-I386:    # 'rm' output -> %eax
 ; BASIC-I386:    #NO_APP
 ;
 ; FAST-X86_64-LABEL: test7:
@@ -277,12 +277,12 @@ define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr #0 {
 ; FAST-I386:    #NO_APP
 entry:
   %b = getelementptr inbounds i8, ptr %ptr, i64 4
-  tail call void asm sideeffect "# 'rm' output -> $0", "=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b) #1
+  tail call void asm sideeffect "# 'rm' output -> $0", "=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b)
   %i = load i32, ptr %ptr, align 4
   ret i32 %i
 }
 
-define dso_local i32 @test8(ptr noundef %ptr) local_unnamed_addr #0 {
+define dso_local i32 @test8(ptr noundef %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test8:
 ; GREEDY-X86_64:    #APP
 ; GREEDY-X86_64:    # 'rm' tied -> %eax
@@ -315,12 +315,12 @@ define dso_local i32 @test8(ptr noundef %ptr) local_unnamed_addr #0 {
 entry:
   %b = getelementptr inbounds i8, ptr %ptr, i64 4
   %i = load i32, ptr %b, align 4
-  tail call void asm sideeffect "# 'rm' tied -> $0", "=*rm,0,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, i32 %i) #1
+  tail call void asm sideeffect "# 'rm' tied -> $0", "=*rm,0,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, i32 %i)
   %i1 = load i32, ptr %ptr, align 4
   ret i32 %i1
 }
 
-define dso_local i32 @test9(ptr nocapture noundef %ptr) local_unnamed_addr #0 {
+define dso_local i32 @test9(ptr nocapture noundef %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test9:
 ; GREEDY-X86_64:    #APP
 ; GREEDY-X86_64:    # 'r' output == input location -> %eax
@@ -353,11 +353,8 @@ define dso_local i32 @test9(ptr nocapture noundef %ptr) local_unnamed_addr #0 {
 entry:
   %b = getelementptr inbounds i8, ptr %ptr, i64 4
   %i = load i32, ptr %b, align 4
-  %i1 = tail call i32 asm sideeffect "# 'r' output == input location -> $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %i) #1
+  %i1 = tail call i32 asm sideeffect "# 'r' output == input location -> $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %i)
   store i32 %i1, ptr %b, align 4
   %i2 = load i32, ptr %ptr, align 4
   ret i32 %i2
 }
-
-attributes #0 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
-attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll b/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll
new file mode 100644
index 0000000000000..ce1e16a6518e6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll
@@ -0,0 +1,38 @@
+; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -stop-after=inline-asm-prepare < %s | FileCheck %s
+
+define void @func_rm_input(i32 %x) {
+; CHECK-LABEL: @func_rm_input
+; CHECK: %asm_mem = alloca i32
+; CHECK: store i32 %x, ptr %asm_mem
+; CHECK: call i32 asm sideeffect "mov $1, $0", "=r,m,~{dirflag},~{fpsr},~{flags}"(ptr %asm_mem)
+entry:
+  %0 = call i32 asm sideeffect "mov $1, $0", "=r,rm,~{dirflag},~{fpsr},~{flags}"(i32 %x)
+  ret void
+}
+
+define void @func_rm_output(ptr %p) {
+; CHECK-LABEL: @func_rm_output
+; CHECK: %asm_mem = alloca i32
+; CHECK: call void asm sideeffect "mov $1, $0", "=*m,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %asm_mem)
+; CHECK: %[[VAL:.*]] = load i32, ptr %asm_mem
+; CHECK: store i32 %[[VAL]], ptr %p
+entry:
+  %0 = call i32 asm sideeffect "mov $1, $0", "=rm,~{dirflag},~{fpsr},~{flags}"()
+  store i32 %0, ptr %p
+  ret void
+}
+
+define void @func_rm_inout(ptr %x_ptr) {
+; CHECK-LABEL: @func_rm_inout
+; CHECK: %x = load i32, ptr %x_ptr
+; CHECK: %asm_mem = alloca i32
+; CHECK: store i32 %x, ptr %asm_mem
+; CHECK: call void asm sideeffect "inc $0", "=*m,0,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %asm_mem, ptr %asm_mem)
+; CHECK: %[[VAL2:.*]] = load i32, ptr %asm_mem
+; CHECK: store i32 %[[VAL2]], ptr %x_ptr
+entry:
+  %x = load i32, ptr %x_ptr
+  %0 = call i32 asm sideeffect "inc $0", "=rm,0,~{dirflag},~{fpsr},~{flags}"(i32 %x)
+  store i32 %0, ptr %x_ptr
+  ret void
+}

>From 00d53a67c0cab310cfd6924eee72027668ba1e8a Mon Sep 17 00:00:00 2001
From: Bill Wendling <isanbard at gmail.com>
Date: Thu, 15 Jan 2026 02:52:08 -0800
Subject: [PATCH 05/29] Reformat

---
 llvm/lib/CodeGen/InlineAsmPrepare.cpp | 141 +++++++++++++-------------
 1 file changed, 70 insertions(+), 71 deletions(-)

diff --git a/llvm/lib/CodeGen/InlineAsmPrepare.cpp b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
index 9524bcb302f8f..15a4b2827c9b0 100644
--- a/llvm/lib/CodeGen/InlineAsmPrepare.cpp
+++ b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
@@ -14,14 +14,14 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/DerivedTypes.h"
 #include <sstream>
 
 using namespace llvm;
@@ -150,7 +150,8 @@ bool InlineAsmPrepare::runOnFunction(Function &F) {
     SmallVector<AllocaInst *, 8> OutputAllocas(Constraints.size(), nullptr);
 
     // Track pairs of Input-Output tied constraints.
-    // TiedOutput[i] = j means Constraint i is an Input tied to Output Constraint j.
+    // TiedOutput[i] = j means Constraint i is an Input tied to Output
+    // Constraint j.
     SmallVector<int, 8> TiedOutput(Constraints.size(), -1);
     for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
       const auto &C = Constraints[I];
@@ -215,18 +216,18 @@ bool InlineAsmPrepare::runOnFunction(Function &F) {
         }
 
         if (!Handled) {
-            if (C.hasRegMemConstraints()) {
-              // Converted to memory constraint.
-              // Create alloca, store input, pass pointer as argument.
-              AllocaInst *Slot = Builder.CreateAlloca(ArgTy, nullptr, "asm_mem");
-              Builder.CreateStore(ArgVal, Slot);
-              NewArgs.push_back(Slot);
-              NewArgTypes.push_back(Slot->getType());
-            } else {
-              // Unchanged
-              NewArgs.push_back(ArgVal);
-              NewArgTypes.push_back(ArgTy);
-            }
+          if (C.hasRegMemConstraints()) {
+            // Converted to memory constraint.
+            // Create alloca, store input, pass pointer as argument.
+            AllocaInst *Slot = Builder.CreateAlloca(ArgTy, nullptr, "asm_mem");
+            Builder.CreateStore(ArgVal, Slot);
+            NewArgs.push_back(Slot);
+            NewArgTypes.push_back(Slot->getType());
+          } else {
+            // Unchanged
+            NewArgs.push_back(ArgVal);
+            NewArgTypes.push_back(ArgTy);
+          }
         }
         ArgNo++;
       }
@@ -242,10 +243,9 @@ bool InlineAsmPrepare::runOnFunction(Function &F) {
     }
 
     FunctionType *NewFTy = FunctionType::get(NewRetTy, NewArgTypes, false);
-    auto *NewIA = InlineAsm::get(
-        NewFTy, IA->getAsmString(), NewConstraintStr,
-        IA->hasSideEffects(), IA->isAlignStack(), IA->getDialect(),
-        IA->canThrow());
+    auto *NewIA = InlineAsm::get(NewFTy, IA->getAsmString(), NewConstraintStr,
+                                 IA->hasSideEffects(), IA->isAlignStack(),
+                                 IA->getDialect(), IA->canThrow());
 
     CallInst *NewCall = Builder.CreateCall(NewFTy, NewIA, NewArgs);
     NewCall->setCallingConv(CB->getCallingConv());
@@ -253,66 +253,65 @@ bool InlineAsmPrepare::runOnFunction(Function &F) {
     NewCall->setDebugLoc(CB->getDebugLoc());
 
     for (const auto &Item : ElementTypeAttrs)
-      NewCall->addParamAttr(Item.first,
-                            Attribute::get(F.getContext(),
-                                           Attribute::ElementType,
-                                           Item.second));
+      NewCall->addParamAttr(
+          Item.first,
+          Attribute::get(F.getContext(), Attribute::ElementType, Item.second));
 
     // Reconstruct the return value and update users.
     if (!CB->use_empty()) {
-        Value *Replacement = nullptr;
-        Type *RetTy = CB->getType();
-
-        if (RetTy->isVoidTy()) {
-            // No return value, nothing to replace.
-        } else if (isa<StructType>(RetTy)) {
-            // Multiple outputs. Reconstruct the struct.
-            Value *Res = UndefValue::get(RetTy);
-            unsigned NewRetIdx = 0;
-            unsigned OriginalOutIdx = 0;
-
-            for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
-                if (Constraints[I].Type != InlineAsm::isOutput)
-                    continue;
-
-                Value *Val = nullptr;
-                if (AllocaInst *Slot = OutputAllocas[I]) {
-                    // Converted to memory. Load from alloca.
-                    Val = Builder.CreateLoad(Slot->getAllocatedType(), Slot);
-                } else {
-                    // Not converted. Extract from NewCall return.
-                    if (NewRetTypes.size() == 1) {
-                         Val = NewCall;
-                    } else {
-                         Val = Builder.CreateExtractValue(NewCall, NewRetIdx);
-                    }
-                    NewRetIdx++;
-                }
-
-                Res = Builder.CreateInsertValue(Res, Val, OriginalOutIdx++);
-            }
-            Replacement = Res;
-        } else {
-            // Single output.
-            // Find the output constraint (should be the first one).
-            unsigned OutConstraintIdx = 0;
-            for (unsigned I = 0; I < Constraints.size(); ++I) {
-                 if (Constraints[I].Type == InlineAsm::isOutput) {
-                     OutConstraintIdx = I;
-                     break;
-                 }
-            }
-
-            if (AllocaInst *Slot = OutputAllocas[OutConstraintIdx]) {
-                Replacement = Builder.CreateLoad(Slot->getAllocatedType(), Slot);
+      Value *Replacement = nullptr;
+      Type *RetTy = CB->getType();
+
+      if (RetTy->isVoidTy()) {
+        // No return value, nothing to replace.
+      } else if (isa<StructType>(RetTy)) {
+        // Multiple outputs. Reconstruct the struct.
+        Value *Res = UndefValue::get(RetTy);
+        unsigned NewRetIdx = 0;
+        unsigned OriginalOutIdx = 0;
+
+        for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
+          if (Constraints[I].Type != InlineAsm::isOutput)
+            continue;
+
+          Value *Val = nullptr;
+          if (AllocaInst *Slot = OutputAllocas[I]) {
+            // Converted to memory. Load from alloca.
+            Val = Builder.CreateLoad(Slot->getAllocatedType(), Slot);
+          } else {
+            // Not converted. Extract from NewCall return.
+            if (NewRetTypes.size() == 1) {
+              Val = NewCall;
             } else {
-                Replacement = NewCall;
+              Val = Builder.CreateExtractValue(NewCall, NewRetIdx);
             }
+            NewRetIdx++;
+          }
+
+          Res = Builder.CreateInsertValue(Res, Val, OriginalOutIdx++);
+        }
+        Replacement = Res;
+      } else {
+        // Single output.
+        // Find the output constraint (should be the first one).
+        unsigned OutConstraintIdx = 0;
+        for (unsigned I = 0; I < Constraints.size(); ++I) {
+          if (Constraints[I].Type == InlineAsm::isOutput) {
+            OutConstraintIdx = I;
+            break;
+          }
         }
 
-        if (Replacement) {
-            CB->replaceAllUsesWith(Replacement);
+        if (AllocaInst *Slot = OutputAllocas[OutConstraintIdx]) {
+          Replacement = Builder.CreateLoad(Slot->getAllocatedType(), Slot);
+        } else {
+          Replacement = NewCall;
         }
+      }
+
+      if (Replacement) {
+        CB->replaceAllUsesWith(Replacement);
+      }
     }
 
     CB->eraseFromParent();

>From 9fb25404cda7fd098955e930847e77fa31ac97b5 Mon Sep 17 00:00:00 2001
From: Bill Wendling <isanbard at gmail.com>
Date: Thu, 15 Jan 2026 02:56:17 -0800
Subject: [PATCH 06/29] Use poison instead of undef

---
 llvm/lib/CodeGen/InlineAsmPrepare.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/InlineAsmPrepare.cpp b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
index 15a4b2827c9b0..c6848d1fe3539 100644
--- a/llvm/lib/CodeGen/InlineAsmPrepare.cpp
+++ b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
@@ -266,7 +266,7 @@ bool InlineAsmPrepare::runOnFunction(Function &F) {
         // No return value, nothing to replace.
       } else if (isa<StructType>(RetTy)) {
         // Multiple outputs. Reconstruct the struct.
-        Value *Res = UndefValue::get(RetTy);
+        Value *Res = PoisonValue::get(RetTy);
         unsigned NewRetIdx = 0;
         unsigned OriginalOutIdx = 0;
 

>From ca0ae4918c1c83a4c4ae098fe9d031303b3c42ce Mon Sep 17 00:00:00 2001
From: Bill Wendling <isanbard at gmail.com>
Date: Thu, 15 Jan 2026 03:02:21 -0800
Subject: [PATCH 07/29] fix LLVM ABI issues.

---
 llvm/include/llvm/CodeGen/InlineAsmPrepare.h | 3 ++-
 llvm/include/llvm/CodeGen/Passes.h           | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
index a400a78390dff..5ff22cde3dc67 100644
--- a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
+++ b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
@@ -9,13 +9,14 @@
 #ifndef LLVM_CODEGEN_INLINEASMPREPARE_H
 #define LLVM_CODEGEN_INLINEASMPREPARE_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/IR/PassManager.h"
 
 namespace llvm {
 
 class InlineAsmPreparePass : public PassInfoMixin<InlineAsmPreparePass> {
 public:
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+  LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
 };
 
 } // namespace llvm
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 9e1e34269baca..ae37c5b4ba272 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -567,7 +567,7 @@ LLVM_ABI FunctionPass *createCFIFixup();
 LLVM_ABI FunctionPass *createCFIInstrInserter();
 
 // Expands floating point instructions.
-FunctionPass *createExpandIRInstsPass(CodeGenOptLevel);
+LLVM_ABI FunctionPass *createExpandIRInstsPass(CodeGenOptLevel);
 
 /// Creates CFGuard longjmp target identification pass.
 /// \see CFGuardLongjmp.cpp

>From 189d0242323928b903e2d81f9d046152eccfa567 Mon Sep 17 00:00:00 2001
From: Bill Wendling <isanbard at gmail.com>
Date: Thu, 15 Jan 2026 03:04:48 -0800
Subject: [PATCH 08/29] Correct header order.

---
 llvm/include/llvm/CodeGen/InlineAsmPrepare.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
index 5ff22cde3dc67..e5ff4db562577 100644
--- a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
+++ b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_CODEGEN_INLINEASMPREPARE_H
 #define LLVM_CODEGEN_INLINEASMPREPARE_H
 
-#include "llvm/Support/Compiler.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm {
 

>From 431a09a7c8fbc4c1af7f0b51441e63a36959f30c Mon Sep 17 00:00:00 2001
From: Bill Wendling <isanbard at gmail.com>
Date: Fri, 16 Jan 2026 18:25:37 -0800
Subject: [PATCH 09/29] Follow the style guide re-for loops.

---
 llvm/lib/CodeGen/InlineAsmPrepare.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/InlineAsmPrepare.cpp b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
index c6848d1fe3539..a8b92a960e78b 100644
--- a/llvm/lib/CodeGen/InlineAsmPrepare.cpp
+++ b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
@@ -56,14 +56,14 @@ FunctionPass *llvm::createInlineAsmPass() { return new InlineAsmPrepare(); }
 static SmallVector<CallBase *, 4> findInlineAsms(Function &F) {
   SmallVector<CallBase *, 4> InlineAsms;
 
-  for_each(F, [&](BasicBlock &BB) {
-    for_each(BB, [&](Instruction &I) {
+  for (auto &BB : F) {
+    for (auto &I : BB) {
       CallBase *CB = dyn_cast<CallBase>(&I);
       if (!CB || !CB->isInlineAsm())
-        return;
+        continue;
       InlineAsms.push_back(CB);
-    });
-  });
+    }
+  }
 
   return InlineAsms;
 }

>From c10134578f6de55007df4dbeced63f81f8b51a4a Mon Sep 17 00:00:00 2001
From: Bill Wendling <isanbard at gmail.com>
Date: Sat, 17 Jan 2026 05:28:35 -0800
Subject: [PATCH 10/29] Add the appropriate "PreservedAnalyses" data.

Also perform some random code refactoring.
---
 llvm/lib/CodeGen/InlineAsmPrepare.cpp      | 388 ++++++++++++---------
 llvm/test/CodeGen/AArch64/O0-pipeline.ll   |   2 +-
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll   |   2 +-
 llvm/test/CodeGen/LoongArch/O0-pipeline.ll |   2 +-
 llvm/test/CodeGen/PowerPC/O0-pipeline.ll   |   2 +-
 llvm/test/CodeGen/RISCV/O0-pipeline.ll     |   2 +-
 llvm/test/CodeGen/SPIRV/llc-pipeline.ll    |   2 +-
 llvm/test/CodeGen/X86/O0-pipeline.ll       |   2 +-
 8 files changed, 227 insertions(+), 175 deletions(-)

diff --git a/llvm/lib/CodeGen/InlineAsmPrepare.cpp b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
index a8b92a960e78b..bd0b165713ebc 100644
--- a/llvm/lib/CodeGen/InlineAsmPrepare.cpp
+++ b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
@@ -6,14 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 //
-//
+// This pass prepares inline assembly for code generation with the fast register
+// allocator---e.g., by converting "rm" (register-or-memory) constraints to "m"
+// (memory-only) constraints on x86 platforms, simplifying register allocation
+// by forcing operands to memory locations, avoiding the complexity of handling
+// dual register/memory options.
 //
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/InlineAsmPrepare.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
@@ -36,7 +38,9 @@ class InlineAsmPrepare : public FunctionPass {
 public:
   InlineAsmPrepare() : FunctionPass(ID) {}
 
-  void getAnalysisUsage(AnalysisUsage &AU) const override {}
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+  }
   bool runOnFunction(Function &F) override;
 
   static char ID;
@@ -47,23 +51,18 @@ char InlineAsmPrepare::ID = 0;
 } // end anonymous namespace
 
 INITIALIZE_PASS(InlineAsmPrepare, DEBUG_TYPE,
-                "Convert inline asm \"rm\" insts for fast register allocation",
-                false, false)
+                "Prepare inline asm insts for fast register allocation", false,
+                false)
 FunctionPass *llvm::createInlineAsmPass() { return new InlineAsmPrepare(); }
 
-// For each inline asm, the "rm" constraint needs to default to "m" for the
-// fast register allocator.
+/// Find all inline assembly calls in the given function.
 static SmallVector<CallBase *, 4> findInlineAsms(Function &F) {
   SmallVector<CallBase *, 4> InlineAsms;
 
-  for (auto &BB : F) {
-    for (auto &I : BB) {
-      CallBase *CB = dyn_cast<CallBase>(&I);
-      if (!CB || !CB->isInlineAsm())
-        continue;
-      InlineAsms.push_back(CB);
-    }
-  }
+  for (BasicBlock &BB : F)
+    for (Instruction &I : BB)
+      if (CallBase *CB = dyn_cast<CallBase>(&I); CB && CB->isInlineAsm())
+        InlineAsms.push_back(CB);
 
   return InlineAsms;
 }
@@ -72,7 +71,7 @@ static bool isRegMemConstraint(StringRef Constraint) {
   return Constraint.size() == 2 && (Constraint == "rm" || Constraint == "mr");
 }
 
-// Convert instances of the "rm" constraints into "m".
+/// Convert instances of the "rm" constraints into "m".
 static std::string convertConstraintsToMemory(StringRef ConstraintStr) {
   auto I = ConstraintStr.begin(), E = ConstraintStr.end();
   std::ostringstream Out;
@@ -116,6 +115,187 @@ static std::string convertConstraintsToMemory(StringRef ConstraintStr) {
   return Out.str();
 }
 
+namespace {
+
+/// Build a map of tied constraints.
+/// TiedOutput[i] = j means Constraint i is an Input tied to Output Constraint
+/// j.
+static void
+buildTiedConstraintMap(const InlineAsm::ConstraintInfoVector &Constraints,
+                       SmallVectorImpl<int> &TiedOutput) {
+  for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
+    const InlineAsm::ConstraintInfo &C = Constraints[I];
+    if (C.Type == InlineAsm::isOutput && C.hasMatchingInput()) {
+      int InputIdx = C.MatchingInput;
+      if (InputIdx >= 0 && InputIdx < (int)Constraints.size())
+        TiedOutput[InputIdx] = I;
+    }
+    if (C.Type == InlineAsm::isInput && C.hasMatchingInput()) {
+      int OutputIdx = C.MatchingInput;
+      if (OutputIdx >= 0 && OutputIdx < (int)Constraints.size())
+        TiedOutput[I] = OutputIdx;
+    }
+  }
+}
+
+/// Process an output constraint, creating allocas for converted constraints.
+static void processOutputConstraint(
+    const InlineAsm::ConstraintInfo &C, Type *RetTy, unsigned OutputIdx,
+    IRBuilder<> &EntryBuilder, SmallVectorImpl<Value *> &NewArgs,
+    SmallVectorImpl<Type *> &NewArgTypes, SmallVectorImpl<Type *> &NewRetTypes,
+    SmallVectorImpl<std::pair<unsigned, Type *>> &ElementTypeAttrs,
+    SmallVectorImpl<AllocaInst *> &OutputAllocas, unsigned ConstraintIdx) {
+  Type *SlotTy = RetTy;
+  if (StructType *ST = dyn_cast<StructType>(RetTy))
+    SlotTy = ST->getElementType(OutputIdx);
+
+  if (C.hasRegMemConstraints()) {
+    // Converted to memory constraint. Create alloca and pass pointer as
+    // argument.
+    AllocaInst *Slot = EntryBuilder.CreateAlloca(SlotTy, nullptr, "asm_mem");
+    NewArgs.push_back(Slot);
+    NewArgTypes.push_back(Slot->getType());
+    ElementTypeAttrs.push_back({NewArgs.size() - 1, SlotTy});
+    OutputAllocas[ConstraintIdx] = Slot;
+    // No return value for this output since it's now an out-parameter.
+  } else {
+    // Unchanged, still an output return value.
+    NewRetTypes.push_back(SlotTy);
+  }
+}
+
+/// Process an input constraint, handling tied constraints and conversions.
+static void processInputConstraint(
+    const InlineAsm::ConstraintInfo &C, Value *ArgVal,
+    const SmallVectorImpl<int> &TiedOutput,
+    const SmallVectorImpl<AllocaInst *> &OutputAllocas, unsigned ConstraintIdx,
+    IRBuilder<> &Builder, IRBuilder<> &EntryBuilder,
+    SmallVectorImpl<Value *> &NewArgs, SmallVectorImpl<Type *> &NewArgTypes) {
+  Type *ArgTy = ArgVal->getType();
+  bool Handled = false;
+
+  if (TiedOutput[ConstraintIdx] != -1) {
+    int MatchIdx = TiedOutput[ConstraintIdx];
+    if (AllocaInst *Slot = OutputAllocas[MatchIdx]) {
+      // The matched output was converted to memory.
+      // Store this input into the alloca.
+      Builder.CreateStore(ArgVal, Slot);
+      // Pass the alloca pointer as the argument, instead of ArgVal.
+      // This ensures the tied "0" constraint matches the "*m" output.
+      NewArgs.push_back(Slot);
+      NewArgTypes.push_back(Slot->getType());
+      Handled = true;
+    }
+  }
+
+  if (!Handled) {
+    if (C.hasRegMemConstraints()) {
+      // Converted to memory constraint.
+      // Create alloca, store input, pass pointer as argument.
+      AllocaInst *Slot = EntryBuilder.CreateAlloca(ArgTy, nullptr, "asm_mem");
+      Builder.CreateStore(ArgVal, Slot);
+      NewArgs.push_back(Slot);
+      NewArgTypes.push_back(Slot->getType());
+    } else {
+      // Unchanged
+      NewArgs.push_back(ArgVal);
+      NewArgTypes.push_back(ArgTy);
+    }
+  }
+}
+
+/// Build the return type from the collected return types.
+static Type *buildReturnType(const SmallVectorImpl<Type *> &NewRetTypes,
+                             LLVMContext &Context) {
+  if (NewRetTypes.empty())
+    return Type::getVoidTy(Context);
+  if (NewRetTypes.size() == 1)
+    return NewRetTypes[0];
+  return StructType::get(Context, NewRetTypes);
+}
+
+/// Create the new inline assembly call with converted constraints.
+static CallInst *createNewInlineAsm(
+    InlineAsm *IA, const std::string &NewConstraintStr, Type *NewRetTy,
+    const SmallVectorImpl<Type *> &NewArgTypes,
+    const SmallVectorImpl<Value *> &NewArgs,
+    const SmallVectorImpl<std::pair<unsigned, Type *>> &ElementTypeAttrs,
+    CallBase *CB, IRBuilder<> &Builder, LLVMContext &Context) {
+  FunctionType *NewFTy = FunctionType::get(NewRetTy, NewArgTypes, false);
+  InlineAsm *NewIA = InlineAsm::get(
+      NewFTy, IA->getAsmString(), NewConstraintStr, IA->hasSideEffects(),
+      IA->isAlignStack(), IA->getDialect(), IA->canThrow());
+
+  CallInst *NewCall = Builder.CreateCall(NewFTy, NewIA, NewArgs);
+  NewCall->setCallingConv(CB->getCallingConv());
+  NewCall->setAttributes(CB->getAttributes());
+  NewCall->setDebugLoc(CB->getDebugLoc());
+
+  for (const std::pair<unsigned, Type *> &Item : ElementTypeAttrs)
+    NewCall->addParamAttr(
+        Item.first,
+        Attribute::get(Context, Attribute::ElementType, Item.second));
+
+  return NewCall;
+}
+
+/// Reconstruct the return value from the new call and allocas.
+static Value *
+reconstructReturnValue(Type *RetTy, CallInst *NewCall,
+                       const InlineAsm::ConstraintInfoVector &Constraints,
+                       const SmallVectorImpl<AllocaInst *> &OutputAllocas,
+                       const SmallVectorImpl<Type *> &NewRetTypes,
+                       IRBuilder<> &Builder) {
+  if (RetTy->isVoidTy())
+    return nullptr;
+
+  if (isa<StructType>(RetTy)) {
+    // Multiple outputs. Reconstruct the struct.
+    Value *Res = PoisonValue::get(RetTy);
+    unsigned NewRetIdx = 0;
+    unsigned OriginalOutIdx = 0;
+
+    for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
+      if (Constraints[I].Type != InlineAsm::isOutput)
+        continue;
+
+      Value *Val = nullptr;
+      if (AllocaInst *Slot = OutputAllocas[I]) {
+        // Converted to memory. Load from alloca.
+        Val = Builder.CreateLoad(Slot->getAllocatedType(), Slot);
+      } else {
+        // Not converted. Extract from NewCall return.
+        if (NewRetTypes.size() == 1) {
+          Val = NewCall;
+        } else {
+          Val = Builder.CreateExtractValue(NewCall, NewRetIdx);
+        }
+        NewRetIdx++;
+      }
+
+      Res = Builder.CreateInsertValue(Res, Val, OriginalOutIdx++);
+    }
+    return Res;
+  }
+
+  // Single output.
+  // Find the output constraint (should be the first one).
+  unsigned OutConstraintIdx = 0;
+  for (unsigned I = 0; I < Constraints.size(); ++I) {
+    if (Constraints[I].Type == InlineAsm::isOutput) {
+      OutConstraintIdx = I;
+      break;
+    }
+  }
+
+  if (AllocaInst *Slot = OutputAllocas[OutConstraintIdx])
+    return Builder.CreateLoad(Slot->getAllocatedType(), Slot);
+
+  return NewCall;
+}
+
+} // namespace
+
 bool InlineAsmPrepare::runOnFunction(Function &F) {
   // Only process "rm" on x86 platforms.
   if (!F.getParent()->getTargetTriple().isX86())
@@ -136,182 +316,54 @@ bool InlineAsmPrepare::runOnFunction(Function &F) {
       continue;
 
     IRBuilder<> Builder(CB);
-    // IRBuilder<> EntryBuilder(&F.getEntryBlock(), F.getEntryBlock().begin());
+    IRBuilder<> EntryBuilder(&F.getEntryBlock(), F.getEntryBlock().begin());
 
     // Collect new arguments and return types.
     SmallVector<Value *, 8> NewArgs;
     SmallVector<Type *, 8> NewArgTypes;
     SmallVector<Type *, 2> NewRetTypes;
-
     SmallVector<std::pair<unsigned, Type *>, 8> ElementTypeAttrs;
 
     // Track allocas created for converted outputs.
-    // Maps constraint index to the AllocaInst created for it (if any).
     SmallVector<AllocaInst *, 8> OutputAllocas(Constraints.size(), nullptr);
 
-    // Track pairs of Input-Output tied constraints.
-    // TiedOutput[i] = j means Constraint i is an Input tied to Output
-    // Constraint j.
+    // Build tied constraint map.
     SmallVector<int, 8> TiedOutput(Constraints.size(), -1);
-    for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
-      const auto &C = Constraints[I];
-      if (C.Type == InlineAsm::isOutput && C.hasMatchingInput()) {
-        int InputIdx = C.MatchingInput;
-        if (InputIdx >= 0 && InputIdx < (int)Constraints.size())
-          TiedOutput[InputIdx] = I;
-      }
-      if (C.Type == InlineAsm::isInput && C.hasMatchingInput()) {
-        int OutputIdx = C.MatchingInput;
-        if (OutputIdx >= 0 && OutputIdx < (int)Constraints.size())
-          TiedOutput[I] = OutputIdx;
-      }
-    }
+    buildTiedConstraintMap(Constraints, TiedOutput);
 
+    // Process constraints.
     unsigned ArgNo = 0;
     unsigned OutputIdx = 0;
     for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
-      const auto &C = Constraints[I];
+      const InlineAsm::ConstraintInfo &C = Constraints[I];
 
       if (C.Type == InlineAsm::isOutput) {
-        // Output-only or Output with matching input (Read-Write)
-        Type *RetTy = CB->getType();
-        Type *SlotTy = RetTy;
-
-        if (StructType *ST = dyn_cast<StructType>(RetTy))
-          SlotTy = ST->getElementType(OutputIdx);
-
-        if (C.hasRegMemConstraints()) {
-          // Converted to memory constraint. Create alloca and pass pointer as
-          // argument.
-          AllocaInst *Slot = Builder.CreateAlloca(SlotTy, nullptr, "asm_mem");
-          NewArgs.push_back(Slot);
-          NewArgTypes.push_back(Slot->getType());
-          ElementTypeAttrs.push_back({NewArgs.size() - 1, SlotTy});
-          OutputAllocas[I] = Slot;
-          // No return value for this output since it's now an out-parameter.
-        } else {
-          // Unchanged, still an output return value.
-          NewRetTypes.push_back(SlotTy);
-        }
-
+        processOutputConstraint(C, CB->getType(), OutputIdx, EntryBuilder,
+                                NewArgs, NewArgTypes, NewRetTypes,
+                                ElementTypeAttrs, OutputAllocas, I);
         OutputIdx++;
       } else if (C.Type == InlineAsm::isInput) {
-        // Input
         Value *ArgVal = CB->getArgOperand(ArgNo);
-        Type *ArgTy = ArgVal->getType();
-        bool Handled = false;
-
-        if (TiedOutput[I] != -1) {
-          int MatchIdx = TiedOutput[I];
-          if (AllocaInst *Slot = OutputAllocas[MatchIdx]) {
-            // The matched output was converted to memory.
-            // Store this input into the alloca.
-            Builder.CreateStore(ArgVal, Slot);
-            // Pass the alloca pointer as the argument, instead of ArgVal.
-            // This ensures the tied "0" constraint matches the "*m" output.
-            NewArgs.push_back(Slot);
-            NewArgTypes.push_back(Slot->getType());
-            Handled = true;
-          }
-        }
-
-        if (!Handled) {
-          if (C.hasRegMemConstraints()) {
-            // Converted to memory constraint.
-            // Create alloca, store input, pass pointer as argument.
-            AllocaInst *Slot = Builder.CreateAlloca(ArgTy, nullptr, "asm_mem");
-            Builder.CreateStore(ArgVal, Slot);
-            NewArgs.push_back(Slot);
-            NewArgTypes.push_back(Slot->getType());
-          } else {
-            // Unchanged
-            NewArgs.push_back(ArgVal);
-            NewArgTypes.push_back(ArgTy);
-          }
-        }
+        processInputConstraint(C, ArgVal, TiedOutput, OutputAllocas, I, Builder,
+                               EntryBuilder, NewArgs, NewArgTypes);
         ArgNo++;
       }
     }
 
-    Type *NewRetTy = nullptr;
-    if (NewRetTypes.empty()) {
-      NewRetTy = Type::getVoidTy(F.getContext());
-    } else if (NewRetTypes.size() == 1) {
-      NewRetTy = NewRetTypes[0];
-    } else {
-      NewRetTy = StructType::get(F.getContext(), NewRetTypes);
-    }
-
-    FunctionType *NewFTy = FunctionType::get(NewRetTy, NewArgTypes, false);
-    auto *NewIA = InlineAsm::get(NewFTy, IA->getAsmString(), NewConstraintStr,
-                                 IA->hasSideEffects(), IA->isAlignStack(),
-                                 IA->getDialect(), IA->canThrow());
+    // Build the new return type.
+    Type *NewRetTy = buildReturnType(NewRetTypes, F.getContext());
 
-    CallInst *NewCall = Builder.CreateCall(NewFTy, NewIA, NewArgs);
-    NewCall->setCallingConv(CB->getCallingConv());
-    NewCall->setAttributes(CB->getAttributes());
-    NewCall->setDebugLoc(CB->getDebugLoc());
-
-    for (const auto &Item : ElementTypeAttrs)
-      NewCall->addParamAttr(
-          Item.first,
-          Attribute::get(F.getContext(), Attribute::ElementType, Item.second));
+    // Create the new inline assembly call.
+    CallInst *NewCall =
+        createNewInlineAsm(IA, NewConstraintStr, NewRetTy, NewArgTypes, NewArgs,
+                           ElementTypeAttrs, CB, Builder, F.getContext());
 
     // Reconstruct the return value and update users.
     if (!CB->use_empty()) {
-      Value *Replacement = nullptr;
-      Type *RetTy = CB->getType();
-
-      if (RetTy->isVoidTy()) {
-        // No return value, nothing to replace.
-      } else if (isa<StructType>(RetTy)) {
-        // Multiple outputs. Reconstruct the struct.
-        Value *Res = PoisonValue::get(RetTy);
-        unsigned NewRetIdx = 0;
-        unsigned OriginalOutIdx = 0;
-
-        for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
-          if (Constraints[I].Type != InlineAsm::isOutput)
-            continue;
-
-          Value *Val = nullptr;
-          if (AllocaInst *Slot = OutputAllocas[I]) {
-            // Converted to memory. Load from alloca.
-            Val = Builder.CreateLoad(Slot->getAllocatedType(), Slot);
-          } else {
-            // Not converted. Extract from NewCall return.
-            if (NewRetTypes.size() == 1) {
-              Val = NewCall;
-            } else {
-              Val = Builder.CreateExtractValue(NewCall, NewRetIdx);
-            }
-            NewRetIdx++;
-          }
-
-          Res = Builder.CreateInsertValue(Res, Val, OriginalOutIdx++);
-        }
-        Replacement = Res;
-      } else {
-        // Single output.
-        // Find the output constraint (should be the first one).
-        unsigned OutConstraintIdx = 0;
-        for (unsigned I = 0; I < Constraints.size(); ++I) {
-          if (Constraints[I].Type == InlineAsm::isOutput) {
-            OutConstraintIdx = I;
-            break;
-          }
-        }
-
-        if (AllocaInst *Slot = OutputAllocas[OutConstraintIdx]) {
-          Replacement = Builder.CreateLoad(Slot->getAllocatedType(), Slot);
-        } else {
-          Replacement = NewCall;
-        }
-      }
-
-      if (Replacement) {
+      if (Value *Replacement =
+              reconstructReturnValue(CB->getType(), NewCall, Constraints,
+                                     OutputAllocas, NewRetTypes, Builder))
         CB->replaceAllUsesWith(Replacement);
-      }
     }
 
     CB->eraseFromParent();
@@ -323,11 +375,11 @@ bool InlineAsmPrepare::runOnFunction(Function &F) {
 
 PreservedAnalyses InlineAsmPreparePass::run(Function &F,
                                             FunctionAnalysisManager &FAM) {
-  InlineAsmPrepare IAP;
-
-  bool Changed = IAP.runOnFunction(F);
+  bool Changed = InlineAsmPrepare().runOnFunction(F);
   if (!Changed)
     return PreservedAnalyses::all();
 
-  return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
 }
diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
index 5f09a0fb04247..bb81786d97958 100644
--- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
@@ -33,7 +33,7 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       AArch64 Stack Tagging
 ; CHECK-NEXT:       Exception handling preparation
-; CHECK-NEXT:       Convert inline asm "rm" insts for fast register allocation
+; CHECK-NEXT:       Prepare inline asm insts for fast register allocation
 ; CHECK-NEXT:       Prepare callbr
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 3974a1478aca3..e518ad88b4176 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -94,7 +94,7 @@
 ; GCN-O0-NEXT:    Call Graph SCC Pass Manager
 ; GCN-O0-NEXT:      DummyCGSCCPass
 ; GCN-O0-NEXT:      FunctionPass Manager
-; GCN-O0-NEXT:        Convert inline asm "rm" insts for fast register allocation
+; GCN-O0-NEXT:        Prepare inline asm insts for fast register allocation
 ; GCN-O0-NEXT:        Prepare callbr
 ; GCN-O0-NEXT:        Safe Stack instrumentation pass
 ; GCN-O0-NEXT:        Insert stack protectors
diff --git a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
index eeb1488dcf4f3..40d54aa4d4cf3 100644
--- a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
+++ b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
@@ -31,7 +31,7 @@
 ; CHECK-NEXT:       Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Exception handling preparation
-; CHECK-NEXT:       Convert inline asm "rm" insts for fast register allocation
+; CHECK-NEXT:       Prepare inline asm insts for fast register allocation
 ; CHECK-NEXT:       Prepare callbr
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
diff --git a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
index fd2595a3c181b..a1633aa141698 100644
--- a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
@@ -30,7 +30,7 @@
 ; CHECK-NEXT:       Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Exception handling preparation
-; CHECK-NEXT:       Convert inline asm "rm" insts for fast register allocation
+; CHECK-NEXT:       Prepare inline asm insts for fast register allocation
 ; CHECK-NEXT:       Prepare callbr
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
index d8c899ddafb2a..9b8cd17f1a5f0 100644
--- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
@@ -32,7 +32,7 @@
 ; CHECK-NEXT:       Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Exception handling preparation
-; CHECK-NEXT:       Convert inline asm "rm" insts for fast register allocation
+; CHECK-NEXT:       Prepare inline asm insts for fast register allocation
 ; CHECK-NEXT:       Prepare callbr
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
diff --git a/llvm/test/CodeGen/SPIRV/llc-pipeline.ll b/llvm/test/CodeGen/SPIRV/llc-pipeline.ll
index f20c224b3e1d4..6b9b84cabc070 100644
--- a/llvm/test/CodeGen/SPIRV/llc-pipeline.ll
+++ b/llvm/test/CodeGen/SPIRV/llc-pipeline.ll
@@ -44,7 +44,7 @@
 ; SPIRV-O0-NEXT:    SPIRV emit intrinsics
 ; SPIRV-O0-NEXT:    FunctionPass Manager
 ; SPIRV-O0-NEXT:      SPIRV legalize bitcast pass
-; SPIRV-O0-NEXT:      Convert inline asm "rm" insts for fast register allocation
+; SPIRV-O0-NEXT:      Prepare inline asm insts for fast register allocation
 ; SPIRV-O0-NEXT:      Prepare callbr
 ; SPIRV-O0-NEXT:      Safe Stack instrumentation pass
 ; SPIRV-O0-NEXT:      Insert stack protectors
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
index 4310ca2c4403d..ec1d7bcee1f6d 100644
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -32,7 +32,7 @@
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Expand indirectbr instructions
 ; CHECK-NEXT:       Exception handling preparation
-; CHECK-NEXT:       Convert inline asm "rm" insts for fast register allocation
+; CHECK-NEXT:       Prepare inline asm insts for fast register allocation
 ; CHECK-NEXT:       Prepare callbr
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors

>From e12d0b34acec23b57389ed3afb8c1f1d1e9c732b Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Tue, 27 Jan 2026 22:56:23 -0800
Subject: [PATCH 11/29] Don't make this x86-specific. Also don't completely
 rewrite the constraint string; just add an 'indirect' marker.

---
 clang/lib/CodeGen/CGStmt.cpp                  |   7 +-
 llvm/lib/CodeGen/InlineAsmPrepare.cpp         |  26 +-
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |   7 +-
 llvm/test/CodeGen/X86/asm-constraints-rm.ll   | 340 ++++++------------
 .../CodeGen/X86/inline-asm-prepare-memory.ll  |  38 +-
 5 files changed, 156 insertions(+), 262 deletions(-)

diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 64eba8040f113..123e5d889e957 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -2929,12 +2929,11 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
     QualType QTy = OutExpr->getType();
     const bool IsScalarOrAggregate = hasScalarEvaluationKind(QTy) ||
                                      hasAggregateEvaluationKind(QTy);
-    const bool X86RegisterMemoryConstraints =
-        getTarget().getTriple().isX86() &&
-        (OutputConstraint == "rm" || OutputConstraint == "mr");
+    const bool RegisterMemoryConstraints =
+        OutputConstraint == "rm" || OutputConstraint == "mr";
 
     if (IsScalarOrAggregate &&
-        (!Info.allowsMemory() || X86RegisterMemoryConstraints)) {
+        (!Info.allowsMemory() || RegisterMemoryConstraints)) {
       Constraints += "=" + OutputConstraint;
       ResultRegQualTys.push_back(QTy);
       ResultRegDests.push_back(Dest);
diff --git a/llvm/lib/CodeGen/InlineAsmPrepare.cpp b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
index bd0b165713ebc..f65cc84e3fbfd 100644
--- a/llvm/lib/CodeGen/InlineAsmPrepare.cpp
+++ b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
@@ -8,9 +8,9 @@
 //
 // This pass prepares inline assembly for code generation with the fast register
 // allocator---e.g., by converting "rm" (register-or-memory) constraints to "m"
-// (memory-only) constraints on x86 platforms, simplifying register allocation
-// by forcing operands to memory locations, avoiding the complexity of handling
-// dual register/memory options.
+// (memory-only) constraints, simplifying register allocation by forcing
+// operands to memory locations, avoiding the complexity of handling dual
+// register/memory options.
 //
 //===----------------------------------------------------------------------===//
 
@@ -72,9 +72,11 @@ static bool isRegMemConstraint(StringRef Constraint) {
 }
 
 /// Convert instances of the "rm" constraints into "m".
-static std::string convertConstraintsToMemory(StringRef ConstraintStr) {
+static std::pair<std::string, bool>
+convertConstraintsToMemory(StringRef ConstraintStr) {
   auto I = ConstraintStr.begin(), E = ConstraintStr.end();
   std::ostringstream Out;
+  bool HasRegMem = false;
 
   while (I != E) {
     bool IsOutput = false;
@@ -98,13 +100,13 @@ static std::string convertConstraintsToMemory(StringRef ConstraintStr) {
     auto Comma = std::find(I, E, ',');
     std::string Sub(I, Comma);
     if (isRegMemConstraint(Sub)) {
+      HasRegMem = true;
       if (IsOutput && !HasIndirect)
         Out << '*';
-      Out << 'm';
-    } else {
-      Out << Sub;
     }
 
+    Out << Sub;
+
     if (Comma == E)
       break;
 
@@ -112,7 +114,7 @@ static std::string convertConstraintsToMemory(StringRef ConstraintStr) {
     I = Comma + 1;
   }
 
-  return Out.str();
+  return std::make_pair(Out.str(), HasRegMem);
 }
 
 namespace {
@@ -297,10 +299,6 @@ reconstructReturnValue(Type *RetTy, CallInst *NewCall,
 } // namespace
 
 bool InlineAsmPrepare::runOnFunction(Function &F) {
-  // Only process "rm" on x86 platforms.
-  if (!F.getParent()->getTargetTriple().isX86())
-    return false;
-
   SmallVector<CallBase *, 4> IAs = findInlineAsms(F);
   if (IAs.empty())
     return false;
@@ -310,9 +308,9 @@ bool InlineAsmPrepare::runOnFunction(Function &F) {
     InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand());
     const InlineAsm::ConstraintInfoVector &Constraints = IA->ParseConstraints();
 
-    std::string NewConstraintStr =
+    auto [NewConstraintStr, HasRegMem] =
         convertConstraintsToMemory(IA->getConstraintString());
-    if (NewConstraintStr == IA->getConstraintString())
+    if (!HasRegMem)
       continue;
 
     IRBuilder<> Builder(CB);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 158925992ce10..00502d997686e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -36,7 +36,6 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/TargetParser/Triple.h"
 #include <cctype>
 #include <deque>
 using namespace llvm;
@@ -5959,7 +5958,6 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
   unsigned ResNo = 0; // ResNo - The result number of the next output.
   unsigned LabelNo = 0; // LabelNo - CallBr indirect dest number.
 
-  const Triple &T = getTargetMachine().getTargetTriple();
   for (InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
     ConstraintOperands.emplace_back(std::move(CI));
     AsmOperandInfo &OpInfo = ConstraintOperands.back();
@@ -5975,7 +5973,7 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
     // would vastly prefer to use 'r' over 'm', but can't because of LLVM's
     // architecture picks the most "conservative" constraint to ensure that (in
     // the case of "rm") register pressure cause bad things to happen.
-    if (T.isX86() && !OpInfo.hasMatchingInput() && OpInfo.Codes.size() == 2 &&
+    if (!OpInfo.hasMatchingInput() && OpInfo.Codes.size() == 2 &&
         llvm::is_contained(OpInfo.Codes, "r") &&
         llvm::is_contained(OpInfo.Codes, "m"))
       OpInfo.MayFoldRegister = true;
@@ -6277,7 +6275,8 @@ TargetLowering::ConstraintGroup TargetLowering::getConstraintPreferences(
   // If we can fold the register (i.e. it has an "rm" constraint), opt for the
   // 'r' constraint, and allow the register allocator to spill if need be.
   // Applies only to the greedy and default register allocators.
-  if (OpInfo.MayFoldRegister) {
+  const TargetMachine &TM = getTargetMachine();
+  if (TM.getOptLevel() != CodeGenOptLevel::None && OpInfo.MayFoldRegister) {
     Ret.emplace_back(ConstraintPair("r", getConstraintType("r")));
     Ret.emplace_back(ConstraintPair("m", getConstraintType("m")));
     return Ret;
diff --git a/llvm/test/CodeGen/X86/asm-constraints-rm.ll b/llvm/test/CodeGen/X86/asm-constraints-rm.ll
index 66ca437317997..59c4672a97407 100644
--- a/llvm/test/CodeGen/X86/asm-constraints-rm.ll
+++ b/llvm/test/CodeGen/X86/asm-constraints-rm.ll
@@ -1,360 +1,258 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter "^\t#" --version 4
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -regalloc=greedy < %s | FileCheck --check-prefix=GREEDY-X86_64 %s
-; RUN: llc -mtriple=i386-unknown-linux-gnu -regalloc=greedy   < %s | FileCheck --check-prefix=GREEDY-I386 %s
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -regalloc=basic  < %s | FileCheck --check-prefix=BASIC-X86_64 %s
-; RUN: llc -mtriple=i386-unknown-linux-gnu -regalloc=basic    < %s | FileCheck --check-prefix=BASIC-I386 %s
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -regalloc=fast   < %s | FileCheck --check-prefix=FAST-X86_64 %s
-; RUN: llc -mtriple=i386-unknown-linux-gnu -regalloc=fast     < %s | FileCheck --check-prefix=FAST-I386 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=greedy < %s | FileCheck --check-prefix=GREEDY-X86_64 %s
+; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=greedy   < %s | FileCheck --check-prefix=GREEDY-I386 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=basic  < %s | FileCheck --check-prefix=BASIC-X86_64 %s
+; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=basic    < %s | FileCheck --check-prefix=BASIC-I386 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O0 -regalloc=fast   < %s | FileCheck --check-prefix=FAST-X86_64 %s
+; RUN: llc -mtriple=i386-unknown-linux-gnu -O0 -regalloc=fast     < %s | FileCheck --check-prefix=FAST-I386 %s
 
 ; The non-fast register allocators should use registers when there isn't
 ; register pressure.
 
 define dso_local i32 @test1(ptr nocapture noundef readonly %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test1:
-; GREEDY-X86_64:    #APP
-; GREEDY-X86_64:    # 'rm' input no pressure -> %eax %ecx
-; GREEDY-X86_64:    #NO_APP
 ;
 ; GREEDY-I386-LABEL: test1:
-; GREEDY-I386:    #APP
-; GREEDY-I386:    # 'rm' input no pressure -> %ecx %edx
-; GREEDY-I386:    #NO_APP
 ;
 ; BASIC-X86_64-LABEL: test1:
-; BASIC-X86_64:    #APP
-; BASIC-X86_64:    # 'rm' input no pressure -> %ecx %eax
-; BASIC-X86_64:    #NO_APP
 ;
 ; BASIC-I386-LABEL: test1:
-; BASIC-I386:    #APP
-; BASIC-I386:    # 'rm' input no pressure -> %ecx %eax
-; BASIC-I386:    #NO_APP
 ;
 ; FAST-X86_64-LABEL: test1:
-; FAST-X86_64:    #APP
-; FAST-X86_64:    # 'rm' input no pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
-; FAST-X86_64:    #NO_APP
 ;
 ; FAST-I386-LABEL: test1:
-; FAST-I386:    #APP
-; FAST-I386:    # 'rm' input no pressure -> {{[0-9]+}}(%esp) (%esp)
-; FAST-I386:    #NO_APP
 entry:
-  %b = getelementptr inbounds i8, ptr %ptr, i64 4
-  %i = load i32, ptr %b, align 4
-  %d = getelementptr inbounds i8, ptr %ptr, i64 12
-  %i1 = load i32, ptr %d, align 4
-  tail call void asm sideeffect "# 'rm' input no pressure -> $0 $1", "rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1)
-  %i2 = load i32, ptr %ptr, align 4
-  ret i32 %i2
+  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
+  %0 = load i32, ptr %b, align 4
+  %d = getelementptr inbounds nuw i8, ptr %ptr, i64 12
+  %1 = load i32, ptr %d, align 4
+  tail call void asm sideeffect "# dual 'rm' input no pressure -> $0 $1", "rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1)
+  %2 = load i32, ptr %ptr, align 4
+  ret i32 %2
 }
 
 define dso_local i32 @test2(ptr nocapture noundef readonly %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test2:
-; GREEDY-X86_64:    #APP # 8-byte Folded Reload
-; GREEDY-X86_64:    # 'rm' input pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
-; GREEDY-X86_64:    #NO_APP
 ;
 ; GREEDY-I386-LABEL: test2:
-; GREEDY-I386:    #APP # 8-byte Folded Reload
-; GREEDY-I386:    # 'rm' input pressure -> {{[0-9]+}}(%esp) (%esp)
-; GREEDY-I386:    #NO_APP
 ;
 ; BASIC-X86_64-LABEL: test2:
-; BASIC-X86_64:    #APP # 8-byte Folded Reload
-; BASIC-X86_64:    # 'rm' input pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
-; BASIC-X86_64:    #NO_APP
 ;
 ; BASIC-I386-LABEL: test2:
-; BASIC-I386:    #APP # 8-byte Folded Reload
-; BASIC-I386:    # 'rm' input pressure -> (%esp) {{[0-9]+}}(%esp)
-; BASIC-I386:    #NO_APP
 ;
 ; FAST-X86_64-LABEL: test2:
-; FAST-X86_64:    #APP
-; FAST-X86_64:    # 'rm' input pressure -> -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
-; FAST-X86_64:    #NO_APP
 ;
 ; FAST-I386-LABEL: test2:
-; FAST-I386:    #APP
-; FAST-I386:    # 'rm' input pressure -> {{[0-9]+}}(%esp) {{[0-9]+}}(%esp)
-; FAST-I386:    #NO_APP
 entry:
-  %b = getelementptr inbounds i8, ptr %ptr, i64 4
-  %i = load i32, ptr %b, align 4
-  %d = getelementptr inbounds i8, ptr %ptr, i64 12
-  %i1 = load i32, ptr %d, align 4
-  tail call void asm sideeffect "# 'rm' input pressure -> $0 $1", "rm,rm,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1)
-  %i2 = load i32, ptr %ptr, align 4
-  ret i32 %i2
+  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
+  %0 = load i32, ptr %b, align 4
+  %d = getelementptr inbounds nuw i8, ptr %ptr, i64 12
+  %1 = load i32, ptr %d, align 4
+  tail call void asm sideeffect "# dual 'rm' input pressure -> $0 $1", "rm,rm,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1)
+  %2 = load i32, ptr %ptr, align 4
+  ret i32 %2
 }
 
 define dso_local i32 @test3(ptr noundef %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test3:
-; GREEDY-X86_64:    #APP
-; GREEDY-X86_64:    # 'rm' output no pressure -> %eax %ecx
-; GREEDY-X86_64:    #NO_APP
 ;
 ; GREEDY-I386-LABEL: test3:
-; GREEDY-I386:    #APP
-; GREEDY-I386:    # 'rm' output no pressure -> %ecx %edx
-; GREEDY-I386:    #NO_APP
 ;
 ; BASIC-X86_64-LABEL: test3:
-; BASIC-X86_64:    #APP
-; BASIC-X86_64:    # 'rm' output no pressure -> %eax %ecx
-; BASIC-X86_64:    #NO_APP
 ;
 ; BASIC-I386-LABEL: test3:
-; BASIC-I386:    #APP
-; BASIC-I386:    # 'rm' output no pressure -> %eax %ecx
-; BASIC-I386:    #NO_APP
 ;
 ; FAST-X86_64-LABEL: test3:
-; FAST-X86_64:    #APP
-; FAST-X86_64:    # 'rm' output no pressure -> 4(%rdi) 12(%rdi)
-; FAST-X86_64:    #NO_APP
 ;
 ; FAST-I386-LABEL: test3:
-; FAST-I386:    #APP
-; FAST-I386:    # 'rm' output no pressure -> 4(%eax) 12(%eax)
-; FAST-I386:    #NO_APP
 entry:
-  %b = getelementptr inbounds i8, ptr %ptr, i64 4
-  %d = getelementptr inbounds i8, ptr %ptr, i64 12
-  tail call void asm sideeffect "# 'rm' output no pressure -> $0 $1", "=*rm,=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d)
-  %i = load i32, ptr %ptr, align 4
-  ret i32 %i
+  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
+  %d = getelementptr inbounds nuw i8, ptr %ptr, i64 12
+  %0 = tail call { i32, i32 } asm sideeffect "# dual 'rm' output no pressure -> $0 $1", "=rm,=rm,~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i32, i32 } %0, 0
+  %asmresult1 = extractvalue { i32, i32 } %0, 1
+  store i32 %asmresult, ptr %b, align 4
+  store i32 %asmresult1, ptr %d, align 4
+  %1 = load i32, ptr %ptr, align 4
+  ret i32 %1
 }
 
 define dso_local i32 @test4(ptr noundef %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test4:
-; GREEDY-X86_64:    #APP
-; GREEDY-X86_64:    # tied 'rm' no pressure -> %eax %ecx %eax %ecx
-; GREEDY-X86_64:    #NO_APP
 ;
 ; GREEDY-I386-LABEL: test4:
-; GREEDY-I386:    #APP
-; GREEDY-I386:    # tied 'rm' no pressure -> %ecx %edx %ecx %edx
-; GREEDY-I386:    #NO_APP
 ;
 ; BASIC-X86_64-LABEL: test4:
-; BASIC-X86_64:    #APP
-; BASIC-X86_64:    # tied 'rm' no pressure -> %eax %ecx %eax %ecx
-; BASIC-X86_64:    #NO_APP
 ;
 ; BASIC-I386-LABEL: test4:
-; BASIC-I386:    #APP
-; BASIC-I386:    # tied 'rm' no pressure -> %eax %ecx %eax %ecx
-; BASIC-I386:    #NO_APP
 ;
 ; FAST-X86_64-LABEL: test4:
-; FAST-X86_64:    #APP
-; FAST-X86_64:    # tied 'rm' no pressure -> %ecx %eax %ecx %eax
-; FAST-X86_64:    #NO_APP
 ;
 ; FAST-I386-LABEL: test4:
-; FAST-I386:    #APP
-; FAST-I386:    # tied 'rm' no pressure -> %edx %ecx %edx %ecx
-; FAST-I386:    #NO_APP
 entry:
-  %b = getelementptr inbounds i8, ptr %ptr, i64 4
-  %i = load i32, ptr %b, align 4
-  %d = getelementptr inbounds i8, ptr %ptr, i64 12
-  %i1 = load i32, ptr %d, align 4
-  tail call void asm sideeffect "# tied 'rm' no pressure -> $0 $1 $2 $3", "=*rm,=*rm,0,1,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %d, i32 %i, i32 %i1)
-  %i2 = load i32, ptr %ptr, align 4
-  ret i32 %i2
+  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
+  %d = getelementptr inbounds nuw i8, ptr %ptr, i64 12
+  %0 = tail call { i32, i32 } asm sideeffect "# dual 'rm' output pressure -> $0 $1", "=rm,=rm,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i32, i32 } %0, 0
+  %asmresult1 = extractvalue { i32, i32 } %0, 1
+  store i32 %asmresult, ptr %b, align 4
+  store i32 %asmresult1, ptr %d, align 4
+  %1 = load i32, ptr %ptr, align 4
+  ret i32 %1
 }
 
 define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test5:
-; GREEDY-X86_64:    #APP
-; GREEDY-X86_64:    # 'rm' input -> %eax
-; GREEDY-X86_64:    #NO_APP
 ;
 ; GREEDY-I386-LABEL: test5:
-; GREEDY-I386:    #APP
-; GREEDY-I386:    # 'rm' input -> %ecx
-; GREEDY-I386:    #NO_APP
 ;
 ; BASIC-X86_64-LABEL: test5:
-; BASIC-X86_64:    #APP
-; BASIC-X86_64:    # 'rm' input -> %eax
-; BASIC-X86_64:    #NO_APP
 ;
 ; BASIC-I386-LABEL: test5:
-; BASIC-I386:    #APP
-; BASIC-I386:    # 'rm' input -> %eax
-; BASIC-I386:    #NO_APP
 ;
 ; FAST-X86_64-LABEL: test5:
-; FAST-X86_64:    #APP
-; FAST-X86_64:    # 'rm' input -> -{{[0-9]+}}(%rsp)
-; FAST-X86_64:    #NO_APP
 ;
 ; FAST-I386-LABEL: test5:
-; FAST-I386:    #APP
-; FAST-I386:    # 'rm' input -> (%esp)
-; FAST-I386:    #NO_APP
 entry:
-  %b = getelementptr inbounds i8, ptr %ptr, i64 4
-  %i = load i32, ptr %b, align 4
-  tail call void asm sideeffect "# 'rm' input -> $0", "rm,~{dirflag},~{fpsr},~{flags}"(i32 %i)
-  %i1 = load i32, ptr %ptr, align 4
-  ret i32 %i1
+  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
+  %0 = load i32, ptr %b, align 4
+  %d = getelementptr inbounds nuw i8, ptr %ptr, i64 12
+  %1 = load i32, ptr %d, align 4
+  %2 = tail call { i32, i32 } asm sideeffect "# dual tied 'rm' no pressure -> $0 $1 $2 $3", "=rm,=rm,0,1,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1)
+  %asmresult = extractvalue { i32, i32 } %2, 0
+  %asmresult1 = extractvalue { i32, i32 } %2, 1
+  store i32 %asmresult, ptr %b, align 4
+  store i32 %asmresult1, ptr %d, align 4
+  %3 = load i32, ptr %ptr, align 4
+  ret i32 %3
 }
 
 define dso_local i32 @test6(ptr nocapture noundef readonly %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test6:
-; GREEDY-X86_64:    #APP
-; GREEDY-X86_64:    # 'rm' and 'r' input -> %eax %ecx
-; GREEDY-X86_64:    #NO_APP
 ;
 ; GREEDY-I386-LABEL: test6:
-; GREEDY-I386:    #APP
-; GREEDY-I386:    # 'rm' and 'r' input -> %ecx %edx
-; GREEDY-I386:    #NO_APP
 ;
 ; BASIC-X86_64-LABEL: test6:
-; BASIC-X86_64:    #APP
-; BASIC-X86_64:    # 'rm' and 'r' input -> %ecx %eax
-; BASIC-X86_64:    #NO_APP
 ;
 ; BASIC-I386-LABEL: test6:
-; BASIC-I386:    #APP
-; BASIC-I386:    # 'rm' and 'r' input -> %ecx %eax
-; BASIC-I386:    #NO_APP
 ;
 ; FAST-X86_64-LABEL: test6:
-; FAST-X86_64:    #APP
-; FAST-X86_64:    # 'rm' and 'r' input -> -{{[0-9]+}}(%rsp) %eax
-; FAST-X86_64:    #NO_APP
 ;
 ; FAST-I386-LABEL: test6:
-; FAST-I386:    #APP
-; FAST-I386:    # 'rm' and 'r' input -> (%esp) %ecx
-; FAST-I386:    #NO_APP
 entry:
-  %b = getelementptr inbounds i8, ptr %ptr, i64 4
-  %i = load i32, ptr %b, align 4
-  %d = getelementptr inbounds i8, ptr %ptr, i64 12
-  %i1 = load i32, ptr %d, align 4
-  tail call void asm sideeffect "# 'rm' and 'r' input -> $0 $1", "rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %i, i32 %i1)
-  %i2 = load i32, ptr %ptr, align 4
-  ret i32 %i2
+  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
+  %0 = load i32, ptr %b, align 4
+  %d = getelementptr inbounds nuw i8, ptr %ptr, i64 12
+  %1 = load i32, ptr %d, align 4
+  %2 = tail call { i32, i32 } asm sideeffect "# dual tied 'rm' pressure -> $0 $1 $2 $3", "=rm,=rm,0,1,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1)
+  %asmresult = extractvalue { i32, i32 } %2, 0
+  %asmresult1 = extractvalue { i32, i32 } %2, 1
+  store i32 %asmresult, ptr %b, align 4
+  store i32 %asmresult1, ptr %d, align 4
+  %3 = load i32, ptr %ptr, align 4
+  ret i32 %3
 }
 
 define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test7:
-; GREEDY-X86_64:    #APP
-; GREEDY-X86_64:    # 'rm' output -> %eax
-; GREEDY-X86_64:    #NO_APP
 ;
 ; GREEDY-I386-LABEL: test7:
-; GREEDY-I386:    #APP
-; GREEDY-I386:    # 'rm' output -> %ecx
-; GREEDY-I386:    #NO_APP
 ;
 ; BASIC-X86_64-LABEL: test7:
-; BASIC-X86_64:    #APP
-; BASIC-X86_64:    # 'rm' output -> %eax
-; BASIC-X86_64:    #NO_APP
 ;
 ; BASIC-I386-LABEL: test7:
-; BASIC-I386:    #APP
-; BASIC-I386:    # 'rm' output -> %eax
-; BASIC-I386:    #NO_APP
 ;
 ; FAST-X86_64-LABEL: test7:
-; FAST-X86_64:    #APP
-; FAST-X86_64:    # 'rm' output -> 4(%rdi)
-; FAST-X86_64:    #NO_APP
 ;
 ; FAST-I386-LABEL: test7:
-; FAST-I386:    #APP
-; FAST-I386:    # 'rm' output -> 4(%eax)
-; FAST-I386:    #NO_APP
 entry:
-  %b = getelementptr inbounds i8, ptr %ptr, i64 4
-  tail call void asm sideeffect "# 'rm' output -> $0", "=*rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b)
-  %i = load i32, ptr %ptr, align 4
-  ret i32 %i
+  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
+  %0 = load i32, ptr %b, align 4
+  tail call void asm sideeffect "# single 'rm' input -> $0", "rm,~{dirflag},~{fpsr},~{flags}"(i32 %0)
+  %1 = load i32, ptr %ptr, align 4
+  ret i32 %1
 }
 
 define dso_local i32 @test8(ptr noundef %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test8:
-; GREEDY-X86_64:    #APP
-; GREEDY-X86_64:    # 'rm' tied -> %eax
-; GREEDY-X86_64:    #NO_APP
 ;
 ; GREEDY-I386-LABEL: test8:
-; GREEDY-I386:    #APP
-; GREEDY-I386:    # 'rm' tied -> %ecx
-; GREEDY-I386:    #NO_APP
 ;
 ; BASIC-X86_64-LABEL: test8:
-; BASIC-X86_64:    #APP
-; BASIC-X86_64:    # 'rm' tied -> %eax
-; BASIC-X86_64:    #NO_APP
 ;
 ; BASIC-I386-LABEL: test8:
-; BASIC-I386:    #APP
-; BASIC-I386:    # 'rm' tied -> %eax
-; BASIC-I386:    #NO_APP
 ;
 ; FAST-X86_64-LABEL: test8:
-; FAST-X86_64:    #APP
-; FAST-X86_64:    # 'rm' tied -> %eax
-; FAST-X86_64:    #NO_APP
 ;
 ; FAST-I386-LABEL: test8:
-; FAST-I386:    #APP
-; FAST-I386:    # 'rm' tied -> %ecx
-; FAST-I386:    #NO_APP
 entry:
-  %b = getelementptr inbounds i8, ptr %ptr, i64 4
-  %i = load i32, ptr %b, align 4
-  tail call void asm sideeffect "# 'rm' tied -> $0", "=*rm,0,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %b, i32 %i)
-  %i1 = load i32, ptr %ptr, align 4
-  ret i32 %i1
+  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
+  %0 = load i32, ptr %b, align 4
+  %d = getelementptr inbounds nuw i8, ptr %ptr, i64 12
+  %1 = load i32, ptr %d, align 4
+  tail call void asm sideeffect "# dual 'rm' and 'r' input -> $0 $1", "rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1)
+  %2 = load i32, ptr %ptr, align 4
+  ret i32 %2
 }
 
 define dso_local i32 @test9(ptr nocapture noundef %ptr) local_unnamed_addr {
 ; GREEDY-X86_64-LABEL: test9:
-; GREEDY-X86_64:    #APP
-; GREEDY-X86_64:    # 'r' output == input location -> %eax
-; GREEDY-X86_64:    #NO_APP
 ;
 ; GREEDY-I386-LABEL: test9:
-; GREEDY-I386:    #APP
-; GREEDY-I386:    # 'r' output == input location -> %ecx
-; GREEDY-I386:    #NO_APP
 ;
 ; BASIC-X86_64-LABEL: test9:
-; BASIC-X86_64:    #APP
-; BASIC-X86_64:    # 'r' output == input location -> %eax
-; BASIC-X86_64:    #NO_APP
 ;
 ; BASIC-I386-LABEL: test9:
-; BASIC-I386:    #APP
-; BASIC-I386:    # 'r' output == input location -> %eax
-; BASIC-I386:    #NO_APP
 ;
 ; FAST-X86_64-LABEL: test9:
-; FAST-X86_64:    #APP
-; FAST-X86_64:    # 'r' output == input location -> %eax
-; FAST-X86_64:    #NO_APP
 ;
 ; FAST-I386-LABEL: test9:
-; FAST-I386:    #APP
-; FAST-I386:    # 'r' output == input location -> %ecx
-; FAST-I386:    #NO_APP
 entry:
-  %b = getelementptr inbounds i8, ptr %ptr, i64 4
-  %i = load i32, ptr %b, align 4
-  %i1 = tail call i32 asm sideeffect "# 'r' output == input location -> $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %i)
-  store i32 %i1, ptr %b, align 4
-  %i2 = load i32, ptr %ptr, align 4
-  ret i32 %i2
+  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
+  %0 = tail call i32 asm sideeffect "# single 'rm' output -> $0", "=rm,~{dirflag},~{fpsr},~{flags}"()
+  store i32 %0, ptr %b, align 4
+  %1 = load i32, ptr %ptr, align 4
+  ret i32 %1
+}
+
+define dso_local i32 @test10(ptr noundef captures(none) %ptr) local_unnamed_addr {
+; GREEDY-X86_64-LABEL: test10:
+;
+; GREEDY-I386-LABEL: test10:
+;
+; BASIC-X86_64-LABEL: test10:
+;
+; BASIC-I386-LABEL: test10:
+;
+; FAST-X86_64-LABEL: test10:
+;
+; FAST-I386-LABEL: test10:
+entry:
+  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
+  %0 = load i32, ptr %b, align 4
+  %1 = tail call i32 asm sideeffect "# single tied 'rm' input -> $0 $1", "=rm,0,~{dirflag},~{fpsr},~{flags}"(i32 %0)
+  store i32 %1, ptr %b, align 4
+  %2 = load i32, ptr %ptr, align 4
+  ret i32 %2
+}
+
+define dso_local i32 @test11(ptr noundef captures(none) %ptr) local_unnamed_addr {
+; GREEDY-X86_64-LABEL: test11:
+;
+; GREEDY-I386-LABEL: test11:
+;
+; BASIC-X86_64-LABEL: test11:
+;
+; BASIC-I386-LABEL: test11:
+;
+; FAST-X86_64-LABEL: test11:
+;
+; FAST-I386-LABEL: test11:
+entry:
+  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
+  %0 = load i32, ptr %b, align 4
+  %1 = tail call i32 asm sideeffect "# dual 'r' output == input location -> $0 $1", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %0)
+  store i32 %1, ptr %b, align 4
+  %2 = load i32, ptr %ptr, align 4
+  ret i32 %2
 }
diff --git a/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll b/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll
index ce1e16a6518e6..355bc030aea36 100644
--- a/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll
+++ b/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll
@@ -1,35 +1,35 @@
 ; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -stop-after=inline-asm-prepare < %s | FileCheck %s
 
-define void @func_rm_input(i32 %x) {
-; CHECK-LABEL: @func_rm_input
-; CHECK: %asm_mem = alloca i32
-; CHECK: store i32 %x, ptr %asm_mem
-; CHECK: call i32 asm sideeffect "mov $1, $0", "=r,m,~{dirflag},~{fpsr},~{flags}"(ptr %asm_mem)
+define void @test1(i32 %x) {
+; CHECK-LABEL: @test1
+; CHECK:         %asm_mem = alloca i32
+; CHECK-NEXT:    store i32 %x, ptr %asm_mem
+; CHECK-NEXT:    call i32 asm sideeffect "mov $1, $0", "=r,*rm,~{dirflag},~{fpsr},~{flags}"(ptr %asm_mem)
 entry:
   %0 = call i32 asm sideeffect "mov $1, $0", "=r,rm,~{dirflag},~{fpsr},~{flags}"(i32 %x)
   ret void
 }
 
-define void @func_rm_output(ptr %p) {
-; CHECK-LABEL: @func_rm_output
-; CHECK: %asm_mem = alloca i32
-; CHECK: call void asm sideeffect "mov $1, $0", "=*m,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %asm_mem)
-; CHECK: %[[VAL:.*]] = load i32, ptr %asm_mem
-; CHECK: store i32 %[[VAL]], ptr %p
+define void @test2(ptr %p) {
+; CHECK-LABEL: @test2
+; CHECK:         %asm_mem = alloca i32
+; CHECK-NEXT:    call void asm sideeffect "mov $1, $0", "=*rm,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %asm_mem)
+; CHECK-NEXT:    %[[VAL1:.*]] = load i32, ptr %asm_mem
+; CHECK-NEXT:    store i32 %[[VAL1]], ptr %p
 entry:
   %0 = call i32 asm sideeffect "mov $1, $0", "=rm,~{dirflag},~{fpsr},~{flags}"()
   store i32 %0, ptr %p
   ret void
 }
 
-define void @func_rm_inout(ptr %x_ptr) {
-; CHECK-LABEL: @func_rm_inout
-; CHECK: %x = load i32, ptr %x_ptr
-; CHECK: %asm_mem = alloca i32
-; CHECK: store i32 %x, ptr %asm_mem
-; CHECK: call void asm sideeffect "inc $0", "=*m,0,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %asm_mem, ptr %asm_mem)
-; CHECK: %[[VAL2:.*]] = load i32, ptr %asm_mem
-; CHECK: store i32 %[[VAL2]], ptr %x_ptr
+define void @test3(ptr %x_ptr) {
+; CHECK-LABEL: @test3
+; CHECK:         %asm_mem = alloca i32
+; CHECK-NEXT:    %x = load i32, ptr %x_ptr
+; CHECK-NEXT:    store i32 %x, ptr %asm_mem
+; CHECK-NEXT:    call void asm sideeffect "inc $0", "=*rm,0,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %asm_mem, ptr %asm_mem)
+; CHECK-NEXT:    %[[VAL2:.*]] = load i32, ptr %asm_mem
+; CHECK-NEXT:    store i32 %[[VAL2]], ptr %x_ptr
 entry:
   %x = load i32, ptr %x_ptr
   %0 = call i32 asm sideeffect "inc $0", "=rm,0,~{dirflag},~{fpsr},~{flags}"(i32 %x)

>From 8d313c1b272ca9433d7e02fa54f4b2a6e75b1b8d Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Mon, 2 Feb 2026 04:16:28 -0800
Subject: [PATCH 12/29] Update BPF testcases. These were somehow affected by
 this new pass, but the pass isn't called during their tests...

---
 llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll | 14 ++++++++------
 llvm/test/CodeGen/BPF/BTF/func-typedef.ll  |  8 +++++---
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll b/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
index f8c3de5576cb9..eb42d2dd8d9c3 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
@@ -28,7 +28,7 @@ entry:
 ; CHECK-NEXT:        .long   0
 ; CHECK-NEXT:        .long   104
 ; CHECK-NEXT:        .long   104
-; CHECK-NEXT:        .long   32
+; CHECK-NEXT:        .long   49
 ; CHECK-NEXT:        .long   1                       # BTF_KIND_INT(id = 1)
 ; CHECK-NEXT:        .long   16777216                # 0x1000000
 ; CHECK-NEXT:        .long   4
@@ -49,10 +49,10 @@ entry:
 ; CHECK-NEXT:        .long   0
 ; CHECK-NEXT:        .long   0
 ; CHECK-NEXT:        .long   1
-; CHECK-NEXT:        .long   26                      # BTF_KIND_STRUCT(id = 6)
+; CHECK-NEXT:        .long   43                      # BTF_KIND_STRUCT(id = 6)
 ; CHECK-NEXT:        .long   67108865                # 0x4000001
 ; CHECK-NEXT:        .long   8
-; CHECK-NEXT:        .long   29
+; CHECK-NEXT:        .long   46
 ; CHECK-NEXT:        .long   4
 ; CHECK-NEXT:        .long   0                       # 0x0
 ; CHECK-NEXT:        .byte   0                       # string offset=0
@@ -66,9 +66,11 @@ entry:
 ; CHECK-NEXT:        .byte   0
 ; CHECK-NEXT:        .ascii  "/tmp/t.c"              # string offset=17
 ; CHECK-NEXT:        .byte   0
-; CHECK-NEXT:        .ascii  "t1"                    # string offset=26
+; CHECK-NEXT:        .ascii  "int main(void) {"      # string offset=26
+; CHECK-NEXT:        .byte   0
+; CHECK-NEXT:        .ascii  "t1"                    # string offset=43
 ; CHECK-NEXT:        .byte   0
-; CHECK-NEXT:        .ascii  "a1"                    # string offset=29
+; CHECK-NEXT:        .ascii  "a1"                    # string offset=46
 ; CHECK-NEXT:        .byte   0
 ; CHECK-NEXT:        .section        .BTF.ext,"", at progbits
 ; CHECK-NEXT:        .short  60319                   # 0xeb9f
@@ -91,7 +93,7 @@ entry:
 ; CHECK-NEXT:        .long   1
 ; CHECK-NEXT:        .long   .Ltmp{{[0-9]+}}
 ; CHECK-NEXT:        .long   17
-; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   26
 ; CHECK-NEXT:        .long   3091                    # Line 3 Col 19
 
 ; Function Attrs: nounwind readnone speculatable
diff --git a/llvm/test/CodeGen/BPF/BTF/func-typedef.ll b/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
index 388deeb845bf9..5318e50957e4a 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
@@ -23,7 +23,7 @@ entry:
 ; CHECK-NEXT:        .long   0
 ; CHECK-NEXT:        .long   72
 ; CHECK-NEXT:        .long   72
-; CHECK-NEXT:        .long   35
+; CHECK-NEXT:        .long   52
 ; CHECK-NEXT:        .long   1                       # BTF_KIND_TYPEDEF(id = 1)
 ; CHECK-NEXT:        .long   134217728               # 0x8000000
 ; CHECK-NEXT:        .long   2
@@ -57,6 +57,8 @@ entry:
 ; CHECK-NEXT:        .byte   0
 ; CHECK-NEXT:        .ascii  "/tmp/t.c"              # string offset=26
 ; CHECK-NEXT:        .byte   0
+; CHECK-NEXT:        .ascii  "int main(void) {"      # string offset=35
+; CHECK-NEXT:        .byte   0
 ; CHECK-NEXT:        .section        .BTF.ext,"", at progbits
 ; CHECK-NEXT:        .short  60319                   # 0xeb9f
 ; CHECK-NEXT:        .byte   1
@@ -78,11 +80,11 @@ entry:
 ; CHECK-NEXT:        .long   2
 ; CHECK-NEXT:        .long   .Lfunc_begin0
 ; CHECK-NEXT:        .long   26
-; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   35
 ; CHECK-NEXT:        .long   3072                    # Line 3 Col 0
 ; CHECK-NEXT:        .long   .Ltmp{{[0-9]+}}
 ; CHECK-NEXT:        .long   26
-; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   35
 ; CHECK-NEXT:        .long   3092                    # Line 3 Col 20
 
 ; Function Attrs: nounwind readnone speculatable

>From 8069f372f15ea46862ede0ea8bc5e89f18181d0b Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Mon, 2 Feb 2026 04:26:40 -0800
Subject: [PATCH 13/29] Update asm call.

---
 llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll b/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll
index 355bc030aea36..ea4c90aaecafe 100644
--- a/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll
+++ b/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll
@@ -4,7 +4,7 @@ define void @test1(i32 %x) {
 ; CHECK-LABEL: @test1
 ; CHECK:         %asm_mem = alloca i32
 ; CHECK-NEXT:    store i32 %x, ptr %asm_mem
-; CHECK-NEXT:    call i32 asm sideeffect "mov $1, $0", "=r,*rm,~{dirflag},~{fpsr},~{flags}"(ptr %asm_mem)
+; CHECK-NEXT:    %0 = call i32 asm sideeffect "mov $1, $0", "=r,rm,~{dirflag},~{fpsr},~{flags}"(ptr %asm_mem)
 entry:
   %0 = call i32 asm sideeffect "mov $1, $0", "=r,rm,~{dirflag},~{fpsr},~{flags}"(i32 %x)
   ret void

>From 7c53f5e460310c5c9d334904c96f5aa2c3193de5 Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Mon, 2 Feb 2026 11:31:14 -0800
Subject: [PATCH 14/29] Revert "Update BPF testcases. These were somehow
 affected by this new pass, but the pass isn't called during their tests..."

This reverts commit 8d313c1b272ca9433d7e02fa54f4b2a6e75b1b8d.
---
 llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll | 14 ++++++--------
 llvm/test/CodeGen/BPF/BTF/func-typedef.ll  |  8 +++-----
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll b/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
index eb42d2dd8d9c3..f8c3de5576cb9 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-func-ptr.ll
@@ -28,7 +28,7 @@ entry:
 ; CHECK-NEXT:        .long   0
 ; CHECK-NEXT:        .long   104
 ; CHECK-NEXT:        .long   104
-; CHECK-NEXT:        .long   49
+; CHECK-NEXT:        .long   32
 ; CHECK-NEXT:        .long   1                       # BTF_KIND_INT(id = 1)
 ; CHECK-NEXT:        .long   16777216                # 0x1000000
 ; CHECK-NEXT:        .long   4
@@ -49,10 +49,10 @@ entry:
 ; CHECK-NEXT:        .long   0
 ; CHECK-NEXT:        .long   0
 ; CHECK-NEXT:        .long   1
-; CHECK-NEXT:        .long   43                      # BTF_KIND_STRUCT(id = 6)
+; CHECK-NEXT:        .long   26                      # BTF_KIND_STRUCT(id = 6)
 ; CHECK-NEXT:        .long   67108865                # 0x4000001
 ; CHECK-NEXT:        .long   8
-; CHECK-NEXT:        .long   46
+; CHECK-NEXT:        .long   29
 ; CHECK-NEXT:        .long   4
 ; CHECK-NEXT:        .long   0                       # 0x0
 ; CHECK-NEXT:        .byte   0                       # string offset=0
@@ -66,11 +66,9 @@ entry:
 ; CHECK-NEXT:        .byte   0
 ; CHECK-NEXT:        .ascii  "/tmp/t.c"              # string offset=17
 ; CHECK-NEXT:        .byte   0
-; CHECK-NEXT:        .ascii  "int main(void) {"      # string offset=26
-; CHECK-NEXT:        .byte   0
-; CHECK-NEXT:        .ascii  "t1"                    # string offset=43
+; CHECK-NEXT:        .ascii  "t1"                    # string offset=26
 ; CHECK-NEXT:        .byte   0
-; CHECK-NEXT:        .ascii  "a1"                    # string offset=46
+; CHECK-NEXT:        .ascii  "a1"                    # string offset=29
 ; CHECK-NEXT:        .byte   0
 ; CHECK-NEXT:        .section        .BTF.ext,"", at progbits
 ; CHECK-NEXT:        .short  60319                   # 0xeb9f
@@ -93,7 +91,7 @@ entry:
 ; CHECK-NEXT:        .long   1
 ; CHECK-NEXT:        .long   .Ltmp{{[0-9]+}}
 ; CHECK-NEXT:        .long   17
-; CHECK-NEXT:        .long   26
+; CHECK-NEXT:        .long   0
 ; CHECK-NEXT:        .long   3091                    # Line 3 Col 19
 
 ; Function Attrs: nounwind readnone speculatable
diff --git a/llvm/test/CodeGen/BPF/BTF/func-typedef.ll b/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
index 5318e50957e4a..388deeb845bf9 100644
--- a/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
+++ b/llvm/test/CodeGen/BPF/BTF/func-typedef.ll
@@ -23,7 +23,7 @@ entry:
 ; CHECK-NEXT:        .long   0
 ; CHECK-NEXT:        .long   72
 ; CHECK-NEXT:        .long   72
-; CHECK-NEXT:        .long   52
+; CHECK-NEXT:        .long   35
 ; CHECK-NEXT:        .long   1                       # BTF_KIND_TYPEDEF(id = 1)
 ; CHECK-NEXT:        .long   134217728               # 0x8000000
 ; CHECK-NEXT:        .long   2
@@ -57,8 +57,6 @@ entry:
 ; CHECK-NEXT:        .byte   0
 ; CHECK-NEXT:        .ascii  "/tmp/t.c"              # string offset=26
 ; CHECK-NEXT:        .byte   0
-; CHECK-NEXT:        .ascii  "int main(void) {"      # string offset=35
-; CHECK-NEXT:        .byte   0
 ; CHECK-NEXT:        .section        .BTF.ext,"", at progbits
 ; CHECK-NEXT:        .short  60319                   # 0xeb9f
 ; CHECK-NEXT:        .byte   1
@@ -80,11 +78,11 @@ entry:
 ; CHECK-NEXT:        .long   2
 ; CHECK-NEXT:        .long   .Lfunc_begin0
 ; CHECK-NEXT:        .long   26
-; CHECK-NEXT:        .long   35
+; CHECK-NEXT:        .long   0
 ; CHECK-NEXT:        .long   3072                    # Line 3 Col 0
 ; CHECK-NEXT:        .long   .Ltmp{{[0-9]+}}
 ; CHECK-NEXT:        .long   26
-; CHECK-NEXT:        .long   35
+; CHECK-NEXT:        .long   0
 ; CHECK-NEXT:        .long   3092                    # Line 3 Col 20
 
 ; Function Attrs: nounwind readnone speculatable

>From c0dc0d739072e837e8029093f7f766c151c98f36 Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Tue, 3 Feb 2026 15:58:26 -0800
Subject: [PATCH 15/29] Update comments to remove incorrect info.

---
 clang/lib/CodeGen/CGStmt.cpp                  |  4 ++--
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 19 ++++++++++++-------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 6d6d6fae2d5a9..5d7c293638b74 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -2894,8 +2894,8 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
 
     // - If this is a register output, then make the inline asm return it
     //   by-value.
-    // - If this is an "rm" constraint on x86, then treat it like a register
-    //   output. (We'll correct this before ISel if using the FastRA.)
+    // - If this is an "rm" constraint, then treat it like a register output.
+    //   (We'll correct this before ISel if using the fast register allocator.)
     // - If this is a memory result, return the value by-reference.
     QualType QTy = OutExpr->getType();
     const bool IsScalarOrAggregate = hasScalarEvaluationKind(QTy) ||
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 958bb41d532a8..3c60cb35871dd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5980,11 +5980,11 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
 
     OpInfo.ConstraintVT = MVT::Other;
 
-    // Special treatment for all platforms (currently only x86) that can fold a
-    // register into a spill. This is used for the "rm" constraint, where we
-    // would vastly prefer to use 'r' over 'm', but can't because of LLVM's
-    // architecture picks the most "conservative" constraint to ensure that (in
-    // the case of "rm") register pressure cause bad things to happen.
+    // Special treatment for all platforms that can fold a register into a
+    // spill. This is used for the "rm" constraint, where we would vastly
+    // prefer to use 'r' over 'm'. The non-fast register allocators are able to
+    // handle the 'r' default by folding. The fast register allocator needs
+    // special handling to convert the instruction to use 'm' instead.
     if (!OpInfo.hasMatchingInput() && OpInfo.Codes.size() == 2 &&
         llvm::is_contained(OpInfo.Codes, "r") &&
         llvm::is_contained(OpInfo.Codes, "m"))
@@ -6286,9 +6286,14 @@ TargetLowering::ConstraintGroup TargetLowering::getConstraintPreferences(
 
   // If we can fold the register (i.e. it has an "rm" constraint), opt for the
   // 'r' constraint, and allow the register allocator to spill if need be.
-  // Applies only to the greedy and default register allocators.
+  //
+  // Note: This code is a holdover from when the Clang front-end defaulted to
+// using the memory constraint. This should be reviewed at some point to
+  // remove that assumption from the back-end.
   const TargetMachine &TM = getTargetMachine();
-  if (TM.getOptLevel() != CodeGenOptLevel::None && OpInfo.MayFoldRegister) {
+  if (TM.getOptLevel() != CodeGenOptLevel::None && OpInfo.MayFoldRegister &&
+      llvm::is_contained(OpInfo.Codes, "r") &&
+      llvm::is_contained(OpInfo.Codes, "m")) {
     Ret.emplace_back(ConstraintPair("r", getConstraintType("r")));
     Ret.emplace_back(ConstraintPair("m", getConstraintType("m")));
     return Ret;

>From 9fff5bd3a3b96cf40b0bf313a5bca7e595b7a57d Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Tue, 3 Feb 2026 16:00:50 -0800
Subject: [PATCH 16/29] Add that it's not used for other RAs.

---
 llvm/lib/CodeGen/InlineAsmPrepare.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/InlineAsmPrepare.cpp b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
index f65cc84e3fbfd..624067a66b704 100644
--- a/llvm/lib/CodeGen/InlineAsmPrepare.cpp
+++ b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
@@ -10,7 +10,8 @@
 // allocator---e.g., by converting "rm" (register-or-memory) constraints to "m"
 // (memory-only) constraints, simplifying register allocation by forcing
 // operands to memory locations, avoiding the complexity of handling dual
-// register/memory options.
+// register/memory options. The other register allocators are equipped to
+// handle folding registers already.
 //
 //===----------------------------------------------------------------------===//
 

>From 7298eb93e08b544ea8091d7eb475169a2a0f2313 Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Wed, 4 Feb 2026 18:57:30 +0000
Subject: [PATCH 17/29] Add 'isRequired' to the pass class.

---
 llvm/include/llvm/CodeGen/InlineAsmPrepare.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
index e5ff4db562577..76f62c9df3579 100644
--- a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
+++ b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
@@ -17,6 +17,8 @@ namespace llvm {
 class InlineAsmPreparePass : public PassInfoMixin<InlineAsmPreparePass> {
 public:
   LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+
+  static bool isRequired() { return true; }
 };
 
 } // namespace llvm

>From ba9538ba7bca9223526b0b0b08ba72485ece2f4a Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Wed, 4 Feb 2026 19:06:22 +0000
Subject: [PATCH 18/29] Spelling.

---
 llvm/include/llvm/IR/InlineAsm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/IR/InlineAsm.h b/llvm/include/llvm/IR/InlineAsm.h
index 1f7bacee49721..5f9e77b321708 100644
--- a/llvm/include/llvm/IR/InlineAsm.h
+++ b/llvm/include/llvm/IR/InlineAsm.h
@@ -182,7 +182,7 @@ class InlineAsm final : public Value {
       return Type == isInput || (Type == isOutput && isIndirect);
     }
 
-    /// hassRegMemConstraints - Returns true if and only if the constraint
+    /// hasRegMemConstraints - Returns true if and only if the constraint
     /// codes are "rm". This is useful when converting between a register form
     /// to a memory form.
     bool hasRegMemConstraints() const {

>From f98974a4f145269e1ba93b9e22cc4e56fbe28e3a Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Wed, 4 Feb 2026 19:14:46 +0000
Subject: [PATCH 19/29] Use better data structures

---
 llvm/lib/CodeGen/InlineAsmPrepare.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/CodeGen/InlineAsmPrepare.cpp b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
index 624067a66b704..bd377f6ebaa31 100644
--- a/llvm/lib/CodeGen/InlineAsmPrepare.cpp
+++ b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
@@ -25,7 +25,6 @@
 #include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
-#include <sstream>
 
 using namespace llvm;
 
@@ -76,24 +75,25 @@ static bool isRegMemConstraint(StringRef Constraint) {
 static std::pair<std::string, bool>
 convertConstraintsToMemory(StringRef ConstraintStr) {
   auto I = ConstraintStr.begin(), E = ConstraintStr.end();
-  std::ostringstream Out;
+  std::string Out;
+  raw_string_ostream O(Out);
   bool HasRegMem = false;
 
   while (I != E) {
     bool IsOutput = false;
     bool HasIndirect = false;
     if (*I == '=') {
-      Out << *I;
+      O << *I;
       IsOutput = true;
       ++I;
     }
     if (*I == '*') {
-      Out << '*';
+      O << '*';
       HasIndirect = true;
       ++I;
     }
     if (*I == '+') {
-      Out << '+';
+      O << '+';
       IsOutput = true;
       ++I;
     }
@@ -103,19 +103,19 @@ convertConstraintsToMemory(StringRef ConstraintStr) {
     if (isRegMemConstraint(Sub)) {
       HasRegMem = true;
       if (IsOutput && !HasIndirect)
-        Out << '*';
+        O << '*';
     }
 
-    Out << Sub;
+    O << Sub;
 
     if (Comma == E)
       break;
 
-    Out << ',';
+    O << ',';
     I = Comma + 1;
   }
 
-  return std::make_pair(Out.str(), HasRegMem);
+  return std::make_pair(Out, HasRegMem);
 }
 
 namespace {

>From 89dbe77c938a2023c7f4977931ab032e84d31669 Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Thu, 5 Feb 2026 00:11:07 -0800
Subject: [PATCH 20/29] Merge CallBrPrepare into InlineAsmPrepare, which is the
 better name for what the pass does.

---
 llvm/include/llvm/CodeGen/CallBrPrepare.h     |  23 -
 llvm/include/llvm/CodeGen/InlineAsmPrepare.h  |   5 +
 llvm/include/llvm/InitializePasses.h          |   1 -
 llvm/include/llvm/Passes/CodeGenPassBuilder.h |   5 +-
 llvm/lib/CodeGen/CMakeLists.txt               |   1 -
 llvm/lib/CodeGen/CallBrPrepare.cpp            | 252 ----------
 llvm/lib/CodeGen/CodeGen.cpp                  |   1 -
 llvm/lib/CodeGen/InlineAsmPrepare.cpp         | 455 +++++++++++++-----
 llvm/lib/CodeGen/TargetPassConfig.cpp         |   5 +-
 llvm/lib/Passes/PassBuilder.cpp               |   1 -
 llvm/lib/Passes/PassRegistry.def              |   3 +-
 llvm/test/CodeGen/AArch64/O0-pipeline.ll      |   3 +-
 llvm/test/CodeGen/AArch64/O3-pipeline.ll      |   2 +-
 llvm/test/CodeGen/AArch64/callbr-prepare.ll   |   6 +-
 llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll  |   6 +-
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |  11 +-
 llvm/test/CodeGen/ARM/O3-pipeline.ll          |   2 +-
 llvm/test/CodeGen/LoongArch/O0-pipeline.ll    |   3 +-
 llvm/test/CodeGen/LoongArch/opt-pipeline.ll   |   2 +-
 llvm/test/CodeGen/PowerPC/O0-pipeline.ll      |   3 +-
 llvm/test/CodeGen/PowerPC/O3-pipeline.ll      |   2 +-
 llvm/test/CodeGen/RISCV/O0-pipeline.ll        |   3 +-
 llvm/test/CodeGen/RISCV/O3-pipeline.ll        |   2 +-
 llvm/test/CodeGen/SPIRV/llc-pipeline.ll       |   5 +-
 llvm/test/CodeGen/X86/O0-pipeline.ll          |   3 +-
 llvm/test/CodeGen/X86/asm-constraints-rm.ll   |  46 --
 llvm/test/CodeGen/X86/llc-pipeline-npm.ll     |   8 +-
 llvm/test/CodeGen/X86/opt-pipeline.ll         |   2 +-
 llvm/tools/opt/optdriver.cpp                  |   4 +-
 .../gn/secondary/llvm/lib/CodeGen/BUILD.gn    |   2 +-
 30 files changed, 375 insertions(+), 492 deletions(-)
 delete mode 100644 llvm/include/llvm/CodeGen/CallBrPrepare.h
 delete mode 100644 llvm/lib/CodeGen/CallBrPrepare.cpp

diff --git a/llvm/include/llvm/CodeGen/CallBrPrepare.h b/llvm/include/llvm/CodeGen/CallBrPrepare.h
deleted file mode 100644
index d44d30b0adc17..0000000000000
--- a/llvm/include/llvm/CodeGen/CallBrPrepare.h
+++ /dev/null
@@ -1,23 +0,0 @@
-//===-- CallBrPrepare - Prepare callbr for code generation ------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CODEGEN_CALLBRPREPARE_H
-#define LLVM_CODEGEN_CALLBRPREPARE_H
-
-#include "llvm/IR/PassManager.h"
-
-namespace llvm {
-
-class CallBrPreparePass : public PassInfoMixin<CallBrPreparePass> {
-public:
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
-};
-
-} // namespace llvm
-
-#endif // LLVM_CODEGEN_CALLBRPREPARE_H
diff --git a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
index 76f62c9df3579..130346084b428 100644
--- a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
+++ b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
@@ -14,8 +14,13 @@
 
 namespace llvm {
 
+class TargetMachine;
+
 class InlineAsmPreparePass : public PassInfoMixin<InlineAsmPreparePass> {
+  const TargetMachine *TM;
+
 public:
+  explicit InlineAsmPreparePass(const TargetMachine &TM) : TM(&TM) {}
   LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
 
   static bool isRequired() { return true; }
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 24a8bf5ce5e47..b46fabb14a04d 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -76,7 +76,6 @@ LLVM_ABI void initializeCFGuardPass(PassRegistry &);
 LLVM_ABI void initializeCFGuardLongjmpPass(PassRegistry &);
 LLVM_ABI void initializeCFIFixupPass(PassRegistry &);
 LLVM_ABI void initializeCFIInstrInserterPass(PassRegistry &);
-LLVM_ABI void initializeCallBrPreparePass(PassRegistry &);
 LLVM_ABI void initializeCallGraphDOTPrinterPass(PassRegistry &);
 LLVM_ABI void initializeCallGraphViewerPass(PassRegistry &);
 LLVM_ABI void initializeCallGraphWrapperPassPass(PassRegistry &);
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 6942fc42ca721..ff506994b65e9 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -24,7 +24,6 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
 #include "llvm/CodeGen/BranchFoldingPass.h"
-#include "llvm/CodeGen/CallBrPrepare.h"
 #include "llvm/CodeGen/CodeGenPrepare.h"
 #include "llvm/CodeGen/DeadMachineInstructionElim.h"
 #include "llvm/CodeGen/DetectDeadLanes.h"
@@ -41,6 +40,7 @@
 #include "llvm/CodeGen/GlobalMergeFunctions.h"
 #include "llvm/CodeGen/IndirectBrExpand.h"
 #include "llvm/CodeGen/InitUndef.h"
+#include "llvm/CodeGen/InlineAsmPrepare.h"
 #include "llvm/CodeGen/InterleavedAccess.h"
 #include "llvm/CodeGen/InterleavedLoadCombine.h"
 #include "llvm/CodeGen/LiveDebugValuesPass.h"
@@ -842,7 +842,8 @@ void CodeGenPassBuilder<Derived, TargetMachineT>::addISelPrepare(
   if (getOptLevel() != CodeGenOptLevel::None)
     addFunctionPass(ObjCARCContractPass(), PMW);
 
-  addFunctionPass(CallBrPreparePass(), PMW);
+  addFunctionPass(InlineAsmPreparePass(TM), PMW);
+
   // Add both the safe stack and the stack protection passes: each of them will
   // only protect functions that have corresponding attributes.
   addFunctionPass(SafeStackPass(TM), PMW);
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index 9a1561402adfd..bb1357214bc71 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -37,7 +37,6 @@ add_llvm_component_library(LLVMCodeGen
   BasicBlockSectionsProfileReader.cpp
   BasicBlockMatchingAndInference.cpp
   CalcSpillWeights.cpp
-  CallBrPrepare.cpp
   CallingConvLower.cpp
   CFGuardLongjmp.cpp
   CFIFixup.cpp
diff --git a/llvm/lib/CodeGen/CallBrPrepare.cpp b/llvm/lib/CodeGen/CallBrPrepare.cpp
deleted file mode 100644
index 77a0d0b653871..0000000000000
--- a/llvm/lib/CodeGen/CallBrPrepare.cpp
+++ /dev/null
@@ -1,252 +0,0 @@
-//===-- CallBrPrepare - Prepare callbr for code generation ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers callbrs in LLVM IR in order to to assist SelectionDAG's
-// codegen.
-//
-// In particular, this pass assists in inserting register copies for the output
-// values of a callbr along the edges leading to the indirect target blocks.
-// Though the output SSA value is defined by the callbr instruction itself in
-// the IR representation, the value cannot be copied to the appropriate virtual
-// registers prior to jumping to an indirect label, since the jump occurs
-// within the user-provided assembly blob.
-//
-// Instead, those copies must occur separately at the beginning of each
-// indirect target. That requires that we create a separate SSA definition in
-// each of them (via llvm.callbr.landingpad), and may require splitting
-// critical edges so we have a location to place the intrinsic. Finally, we
-// remap users of the original callbr output SSA value to instead point to the
-// appropriate llvm.callbr.landingpad value.
-//
-// Ideally, this could be done inside SelectionDAG, or in the
-// MachineInstruction representation, without the use of an IR-level intrinsic.
-// But, within the current framework, it’s simpler to implement as an IR pass.
-// (If support for callbr in GlobalISel is implemented, it’s worth considering
-// whether this is still required.)
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CodeGen/CallBrPrepare.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/iterator.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "callbr-prepare"
-
-static bool SplitCriticalEdges(ArrayRef<CallBrInst *> CBRs, DominatorTree &DT);
-static bool InsertIntrinsicCalls(ArrayRef<CallBrInst *> CBRs,
-                                 DominatorTree &DT);
-static void UpdateSSA(DominatorTree &DT, CallBrInst *CBR, CallInst *Intrinsic,
-                      SSAUpdater &SSAUpdate);
-static SmallVector<CallBrInst *, 2> FindCallBrs(Function &F);
-
-namespace {
-
-class CallBrPrepare : public FunctionPass {
-public:
-  CallBrPrepare() : FunctionPass(ID) {}
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
-  bool runOnFunction(Function &F) override;
-  static char ID;
-};
-
-} // end anonymous namespace
-
-PreservedAnalyses CallBrPreparePass::run(Function &F,
-                                         FunctionAnalysisManager &FAM) {
-  bool Changed = false;
-  SmallVector<CallBrInst *, 2> CBRs = FindCallBrs(F);
-
-  if (CBRs.empty())
-    return PreservedAnalyses::all();
-
-  auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
-
-  Changed |= SplitCriticalEdges(CBRs, DT);
-  Changed |= InsertIntrinsicCalls(CBRs, DT);
-
-  if (!Changed)
-    return PreservedAnalyses::all();
-  PreservedAnalyses PA;
-  PA.preserve<DominatorTreeAnalysis>();
-  return PA;
-}
-
-char CallBrPrepare::ID = 0;
-INITIALIZE_PASS_BEGIN(CallBrPrepare, "callbrprepare", "Prepare callbr", false,
-                      false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(CallBrPrepare, "callbrprepare", "Prepare callbr", false,
-                    false)
-
-FunctionPass *llvm::createCallBrPass() { return new CallBrPrepare(); }
-
-void CallBrPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addPreserved<DominatorTreeWrapperPass>();
-}
-
-SmallVector<CallBrInst *, 2> FindCallBrs(Function &F) {
-  SmallVector<CallBrInst *, 2> CBRs;
-  for (BasicBlock &BB : F)
-    if (auto *CBR = dyn_cast<CallBrInst>(BB.getTerminator()))
-      if (!CBR->getType()->isVoidTy() && !CBR->use_empty())
-        CBRs.push_back(CBR);
-  return CBRs;
-}
-
-bool SplitCriticalEdges(ArrayRef<CallBrInst *> CBRs, DominatorTree &DT) {
-  bool Changed = false;
-  CriticalEdgeSplittingOptions Options(&DT);
-  Options.setMergeIdenticalEdges();
-
-  // The indirect destination might be duplicated between another parameter...
-  //   %0 = callbr ... [label %x, label %x]
-  // ...hence MergeIdenticalEdges and AllowIndentical edges, but we don't need
-  // to split the default destination if it's duplicated between an indirect
-  // destination...
-  //   %1 = callbr ... to label %x [label %x]
-  // ...hence starting at 1 and checking against successor 0 (aka the default
-  // destination).
-  for (CallBrInst *CBR : CBRs)
-    for (unsigned i = 1, e = CBR->getNumSuccessors(); i != e; ++i)
-      if (CBR->getSuccessor(i) == CBR->getSuccessor(0) ||
-          isCriticalEdge(CBR, i, /*AllowIdenticalEdges*/ true))
-        if (SplitKnownCriticalEdge(CBR, i, Options))
-          Changed = true;
-  return Changed;
-}
-
-bool InsertIntrinsicCalls(ArrayRef<CallBrInst *> CBRs, DominatorTree &DT) {
-  bool Changed = false;
-  SmallPtrSet<const BasicBlock *, 4> Visited;
-  IRBuilder<> Builder(CBRs[0]->getContext());
-  for (CallBrInst *CBR : CBRs) {
-    if (!CBR->getNumIndirectDests())
-      continue;
-
-    SSAUpdater SSAUpdate;
-    SSAUpdate.Initialize(CBR->getType(), CBR->getName());
-    SSAUpdate.AddAvailableValue(CBR->getParent(), CBR);
-    SSAUpdate.AddAvailableValue(CBR->getDefaultDest(), CBR);
-
-    for (BasicBlock *IndDest : CBR->getIndirectDests()) {
-      if (!Visited.insert(IndDest).second)
-        continue;
-      Builder.SetInsertPoint(&*IndDest->begin());
-      CallInst *Intrinsic = Builder.CreateIntrinsic(
-          CBR->getType(), Intrinsic::callbr_landingpad, {CBR});
-      SSAUpdate.AddAvailableValue(IndDest, Intrinsic);
-      UpdateSSA(DT, CBR, Intrinsic, SSAUpdate);
-      Changed = true;
-    }
-  }
-  return Changed;
-}
-
-static bool IsInSameBasicBlock(const Use &U, const BasicBlock *BB) {
-  const auto *I = dyn_cast<Instruction>(U.getUser());
-  return I && I->getParent() == BB;
-}
-
-#ifndef NDEBUG
-static void PrintDebugDomInfo(const DominatorTree &DT, const Use &U,
-                              const BasicBlock *BB, bool IsDefaultDest) {
-  if (!isa<Instruction>(U.getUser()))
-    return;
-  LLVM_DEBUG(dbgs() << "Use: " << *U.getUser() << ", in block "
-                    << cast<Instruction>(U.getUser())->getParent()->getName()
-                    << ", is " << (DT.dominates(BB, U) ? "" : "NOT ")
-                    << "dominated by " << BB->getName() << " ("
-                    << (IsDefaultDest ? "in" : "") << "direct)\n");
-}
-#endif
-
-void UpdateSSA(DominatorTree &DT, CallBrInst *CBR, CallInst *Intrinsic,
-               SSAUpdater &SSAUpdate) {
-
-  SmallPtrSet<Use *, 4> Visited;
-  BasicBlock *DefaultDest = CBR->getDefaultDest();
-  BasicBlock *LandingPad = Intrinsic->getParent();
-
-  SmallVector<Use *, 4> Uses(make_pointer_range(CBR->uses()));
-  for (Use *U : Uses) {
-    if (!Visited.insert(U).second)
-      continue;
-
-#ifndef NDEBUG
-    PrintDebugDomInfo(DT, *U, LandingPad, /*IsDefaultDest*/ false);
-    PrintDebugDomInfo(DT, *U, DefaultDest, /*IsDefaultDest*/ true);
-#endif
-
-    // Don't rewrite the use in the newly inserted intrinsic.
-    if (const auto *II = dyn_cast<IntrinsicInst>(U->getUser()))
-      if (II->getIntrinsicID() == Intrinsic::callbr_landingpad)
-        continue;
-
-    // If the Use is in the same BasicBlock as the Intrinsic call, replace
-    // the Use with the value of the Intrinsic call.
-    if (IsInSameBasicBlock(*U, LandingPad)) {
-      U->set(Intrinsic);
-      continue;
-    }
-
-    // If the Use is dominated by the default dest, do not touch it.
-    if (DT.dominates(DefaultDest, *U))
-      continue;
-
-    SSAUpdate.RewriteUse(*U);
-  }
-}
-
-bool CallBrPrepare::runOnFunction(Function &F) {
-  bool Changed = false;
-  SmallVector<CallBrInst *, 2> CBRs = FindCallBrs(F);
-
-  if (CBRs.empty())
-    return Changed;
-
-  // It's highly likely that most programs do not contain CallBrInsts. Follow a
-  // similar pattern from SafeStackLegacyPass::runOnFunction to reuse previous
-  // domtree analysis if available, otherwise compute it lazily. This avoids
-  // forcing Dominator Tree Construction at -O0 for programs that likely do not
-  // contain CallBrInsts. It does pessimize programs with callbr at higher
-  // optimization levels, as the DominatorTree created here is not reused by
-  // subsequent passes.
-  DominatorTree *DT;
-  std::optional<DominatorTree> LazilyComputedDomTree;
-  if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
-    DT = &DTWP->getDomTree();
-  else {
-    LazilyComputedDomTree.emplace(F);
-    DT = &*LazilyComputedDomTree;
-  }
-
-  if (SplitCriticalEdges(CBRs, *DT))
-    Changed = true;
-
-  if (InsertIntrinsicCalls(CBRs, *DT))
-    Changed = true;
-
-  return Changed;
-}
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index c157418665f4b..8b57a09b12f07 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -27,7 +27,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeBranchFolderLegacyPass(Registry);
   initializeBranchRelaxationLegacyPass(Registry);
   initializeBreakFalseDepsPass(Registry);
-  initializeCallBrPreparePass(Registry);
   initializeCFGuardLongjmpPass(Registry);
   initializeCFIFixupPass(Registry);
   initializeCFIInstrInserterPass(Registry);
diff --git a/llvm/lib/CodeGen/InlineAsmPrepare.cpp b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
index bd377f6ebaa31..0442e99d999d2 100644
--- a/llvm/lib/CodeGen/InlineAsmPrepare.cpp
+++ b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
@@ -6,25 +6,60 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This pass prepares inline assembly for code generation with the fast register
-// allocator---e.g., by converting "rm" (register-or-memory) constraints to "m"
-// (memory-only) constraints, simplifying register allocation by forcing
-// operands to memory locations, avoiding the complexity of handling dual
-// register/memory options. The other register allocators are equipped to
-// handle folding registers all ready.
+// This pass lowers callbrs and inline asm in LLVM IR in order to assist
+// SelectionDAG's codegen.
+//
+// CallBrInst:
+//
+//   - Assists in inserting register copies for the output values of a callbr
+//     along the edges leading to the indirect target blocks. Though the output
+//     SSA value is defined by the callbr instruction itself in the IR
+//     representation, the value cannot be copied to the appropriate virtual
+//     registers prior to jumping to an indirect label, since the jump occurs
+//     within the user-provided assembly blob.
+//
+//     Instead, those copies must occur separately at the beginning of each
+//     indirect target. That requires that we create a separate SSA definition
+//     in each of them (via llvm.callbr.landingpad), and may require splitting
+//     critical edges so we have a location to place the intrinsic. Finally, we
+//     remap users of the original callbr output SSA value to instead point to
+//     the appropriate llvm.callbr.landingpad value.
+//
+//     Ideally, this could be done inside SelectionDAG, or in the
+//     MachineInstruction representation, without the use of an IR-level
+//     intrinsic.  But, within the current framework, it’s simpler to implement
+//     as an IR pass.  (If support for callbr in GlobalISel is implemented,
+//     it’s worth considering whether this is still required.)
+//
+// InlineAsm:
+//
+//   - Prepares inline assembly for code generation with the fast register
+//     allocator. In particular, it defaults "rm" (register-or-memory) to
+//     prefer the "m" constraints (the front-end opts for the "r" constraint),
+//     simplifying register allocation by forcing operands to memory locations.
+//     The other register allocators are equipped to handle folding registers
+//     already, so they don't need to change the default.
 //
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/InlineAsmPrepare.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
 
 using namespace llvm;
 
@@ -33,12 +68,12 @@ using namespace llvm;
 namespace {
 
 class InlineAsmPrepare : public FunctionPass {
-  InlineAsmPrepare(InlineAsmPrepare &) = delete;
-
 public:
   InlineAsmPrepare() : FunctionPass(ID) {}
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
     AU.setPreservesCFG();
   }
   bool runOnFunction(Function &F) override;
@@ -50,30 +85,28 @@ char InlineAsmPrepare::ID = 0;
 
 } // end anonymous namespace
 
-INITIALIZE_PASS(InlineAsmPrepare, DEBUG_TYPE,
-                "Prepare inline asm insts for fast register allocation", false,
-                false)
-FunctionPass *llvm::createInlineAsmPass() { return new InlineAsmPrepare(); }
+INITIALIZE_PASS_BEGIN(InlineAsmPrepare, DEBUG_TYPE, "Prepare inline asm insts",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(InlineAsmPrepare, DEBUG_TYPE, "Prepare inline asm insts",
+                    false, false)
 
-/// Find all inline assembly calls in the given function.
-static SmallVector<CallBase *, 4> findInlineAsms(Function &F) {
-  SmallVector<CallBase *, 4> InlineAsms;
-
-  for (BasicBlock &BB : F)
-    for (Instruction &I : BB)
-      if (CallBase *CB = dyn_cast<CallBase>(&I); CB && CB->isInlineAsm())
-        InlineAsms.push_back(CB);
+FunctionPass *llvm::createInlineAsmPass() { return new InlineAsmPrepare(); }
 
-  return InlineAsms;
-}
+//===----------------------------------------------------------------------===//
+//                     Process InlineAsm instructions
+//===----------------------------------------------------------------------===//
 
-static bool isRegMemConstraint(StringRef Constraint) {
+/// The inline asm constraint allows both register and memory.
+static bool IsRegMemConstraint(StringRef Constraint) {
   return Constraint.size() == 2 && (Constraint == "rm" || Constraint == "mr");
 }
 
-/// Convert instances of the "rm" constraints into "m".
+/// Tag "rm" output constraints with '*' to signify that they default to a
+/// memory location.
 static std::pair<std::string, bool>
-convertConstraintsToMemory(StringRef ConstraintStr) {
+ConvertConstraintsToMemory(StringRef ConstraintStr) {
   auto I = ConstraintStr.begin(), E = ConstraintStr.end();
   std::string Out;
   raw_string_ostream O(Out);
@@ -100,7 +133,7 @@ convertConstraintsToMemory(StringRef ConstraintStr) {
 
     auto Comma = std::find(I, E, ',');
     std::string Sub(I, Comma);
-    if (isRegMemConstraint(Sub)) {
+    if (IsRegMemConstraint(Sub)) {
       HasRegMem = true;
       if (IsOutput && !HasIndirect)
         O << '*';
@@ -118,13 +151,10 @@ convertConstraintsToMemory(StringRef ConstraintStr) {
   return std::make_pair(Out, HasRegMem);
 }
 
-namespace {
-
-/// Build a map of tied constraints.
-/// TiedOutput[i] = j means Constraint i is an Input tied to Output Constraint
-/// j.
+/// Build a map of tied constraints. TiedOutput[i] = j means Constraint i is an
+/// input tied to output constraint j.
 static void
-buildTiedConstraintMap(const InlineAsm::ConstraintInfoVector &Constraints,
+BuildTiedConstraintMap(const InlineAsm::ConstraintInfoVector &Constraints,
                        SmallVectorImpl<int> &TiedOutput) {
   for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
     const InlineAsm::ConstraintInfo &C = Constraints[I];
@@ -133,6 +163,7 @@ buildTiedConstraintMap(const InlineAsm::ConstraintInfoVector &Constraints,
       if (InputIdx >= 0 && InputIdx < (int)Constraints.size())
         TiedOutput[InputIdx] = I;
     }
+
     if (C.Type == InlineAsm::isInput && C.hasMatchingInput()) {
       int OutputIdx = C.MatchingInput;
       if (OutputIdx >= 0 && OutputIdx < (int)Constraints.size())
@@ -142,7 +173,7 @@ buildTiedConstraintMap(const InlineAsm::ConstraintInfoVector &Constraints,
 }
 
 /// Process an output constraint, creating allocas for converted constraints.
-static void processOutputConstraint(
+static void ProcessOutputConstraint(
     const InlineAsm::ConstraintInfo &C, Type *RetTy, unsigned OutputIdx,
     IRBuilder<> &EntryBuilder, SmallVectorImpl<Value *> &NewArgs,
     SmallVectorImpl<Type *> &NewArgTypes, SmallVectorImpl<Type *> &NewRetTypes,
@@ -168,57 +199,58 @@ static void processOutputConstraint(
 }
 
 /// Process an input constraint, handling tied constraints and conversions.
-static void processInputConstraint(
-    const InlineAsm::ConstraintInfo &C, Value *ArgVal,
-    const SmallVectorImpl<int> &TiedOutput,
-    const SmallVectorImpl<AllocaInst *> &OutputAllocas, unsigned ConstraintIdx,
-    IRBuilder<> &Builder, IRBuilder<> &EntryBuilder,
-    SmallVectorImpl<Value *> &NewArgs, SmallVectorImpl<Type *> &NewArgTypes) {
+static void ProcessInputConstraint(const InlineAsm::ConstraintInfo &C,
+                                   Value *ArgVal, ArrayRef<int> TiedOutput,
+                                   ArrayRef<AllocaInst *> OutputAllocas,
+                                   unsigned ConstraintIdx, IRBuilder<> &Builder,
+                                   IRBuilder<> &EntryBuilder,
+                                   SmallVectorImpl<Value *> &NewArgs,
+                                   SmallVectorImpl<Type *> &NewArgTypes) {
   Type *ArgTy = ArgVal->getType();
-  bool Handled = false;
 
   if (TiedOutput[ConstraintIdx] != -1) {
     int MatchIdx = TiedOutput[ConstraintIdx];
     if (AllocaInst *Slot = OutputAllocas[MatchIdx]) {
-      // The matched output was converted to memory.
-      // Store this input into the alloca.
+      // The matched output was converted to memory. Store this input into the
+      // alloca.
       Builder.CreateStore(ArgVal, Slot);
-      // Pass the alloca pointer as the argument, instead of ArgVal.
-      // This ensures the tied "0" constraint matches the "*m" output.
+
+      // Pass the alloca pointer as the argument, instead of ArgVal. This
+      // ensures the tied "0" constraint matches the "*m" output.
       NewArgs.push_back(Slot);
       NewArgTypes.push_back(Slot->getType());
-      Handled = true;
+      return;
     }
   }
 
-  if (!Handled) {
-    if (C.hasRegMemConstraints()) {
-      // Converted to memory constraint.
-      // Create alloca, store input, pass pointer as argument.
-      AllocaInst *Slot = EntryBuilder.CreateAlloca(ArgTy, nullptr, "asm_mem");
-      Builder.CreateStore(ArgVal, Slot);
-      NewArgs.push_back(Slot);
-      NewArgTypes.push_back(Slot->getType());
-    } else {
-      // Unchanged
-      NewArgs.push_back(ArgVal);
-      NewArgTypes.push_back(ArgTy);
-    }
+  if (C.hasRegMemConstraints()) {
+    // Converted to memory constraint. Create alloca, store input, pass pointer
+    // as argument.
+    AllocaInst *Slot = EntryBuilder.CreateAlloca(ArgTy, nullptr, "asm_mem");
+    Builder.CreateStore(ArgVal, Slot);
+    NewArgs.push_back(Slot);
+    NewArgTypes.push_back(Slot->getType());
+  } else {
+    // Unchanged
+    NewArgs.push_back(ArgVal);
+    NewArgTypes.push_back(ArgTy);
   }
 }
 
 /// Build the return type from the collected return types.
-static Type *buildReturnType(const SmallVectorImpl<Type *> &NewRetTypes,
+static Type *BuildReturnType(ArrayRef<Type *> NewRetTypes,
                              LLVMContext &Context) {
   if (NewRetTypes.empty())
     return Type::getVoidTy(Context);
+
   if (NewRetTypes.size() == 1)
     return NewRetTypes[0];
+
   return StructType::get(Context, NewRetTypes);
 }
 
 /// Create the new inline assembly call with converted constraints.
-static CallInst *createNewInlineAsm(
+static CallInst *CreateNewInlineAsm(
     InlineAsm *IA, const std::string &NewConstraintStr, Type *NewRetTy,
     const SmallVectorImpl<Type *> &NewArgTypes,
     const SmallVectorImpl<Value *> &NewArgs,
@@ -244,7 +276,7 @@ static CallInst *createNewInlineAsm(
 
 /// Reconstruct the return value from the new call and allocas.
 static Value *
-reconstructReturnValue(Type *RetTy, CallInst *NewCall,
+ReconstructReturnValue(Type *RetTy, CallInst *NewCall,
                        const InlineAsm::ConstraintInfoVector &Constraints,
                        const SmallVectorImpl<AllocaInst *> &OutputAllocas,
                        const SmallVectorImpl<Type *> &NewRetTypes,
@@ -278,6 +310,7 @@ reconstructReturnValue(Type *RetTy, CallInst *NewCall,
 
       Res = Builder.CreateInsertValue(Res, Val, OriginalOutIdx++);
     }
+
     return Res;
   }
 
@@ -297,88 +330,268 @@ reconstructReturnValue(Type *RetTy, CallInst *NewCall,
   return NewCall;
 }
 
-} // namespace
+static bool ProcessInlineAsm(Function &F, CallBase *CB) {
+  InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand());
+  const InlineAsm::ConstraintInfoVector &Constraints = IA->ParseConstraints();
 
-bool InlineAsmPrepare::runOnFunction(Function &F) {
-  SmallVector<CallBase *, 4> IAs = findInlineAsms(F);
-  if (IAs.empty())
+  auto [NewConstraintStr, HasRegMem] =
+      ConvertConstraintsToMemory(IA->getConstraintString());
+  if (!HasRegMem)
     return false;
 
-  bool Changed = false;
-  for (CallBase *CB : IAs) {
-    InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand());
-    const InlineAsm::ConstraintInfoVector &Constraints = IA->ParseConstraints();
-
-    auto [NewConstraintStr, HasRegMem] =
-        convertConstraintsToMemory(IA->getConstraintString());
-    if (!HasRegMem)
-      continue;
+  IRBuilder<> Builder(CB);
+  IRBuilder<> EntryBuilder(&F.getEntryBlock(), F.getEntryBlock().begin());
 
-    IRBuilder<> Builder(CB);
-    IRBuilder<> EntryBuilder(&F.getEntryBlock(), F.getEntryBlock().begin());
+  // Collect new arguments and return types.
+  SmallVector<Value *, 8> NewArgs;
+  SmallVector<Type *, 8> NewArgTypes;
+  SmallVector<Type *, 2> NewRetTypes;
+  SmallVector<std::pair<unsigned, Type *>, 8> ElementTypeAttrs;
 
-    // Collect new arguments and return types.
-    SmallVector<Value *, 8> NewArgs;
-    SmallVector<Type *, 8> NewArgTypes;
-    SmallVector<Type *, 2> NewRetTypes;
-    SmallVector<std::pair<unsigned, Type *>, 8> ElementTypeAttrs;
+  // Track allocas created for converted outputs.
+  SmallVector<AllocaInst *, 8> OutputAllocas(Constraints.size(), nullptr);
 
-    // Track allocas created for converted outputs.
-    SmallVector<AllocaInst *, 8> OutputAllocas(Constraints.size(), nullptr);
+  // Build tied constraint map.
+  SmallVector<int, 8> TiedOutput(Constraints.size(), -1);
+  BuildTiedConstraintMap(Constraints, TiedOutput);
 
-    // Build tied constraint map.
-    SmallVector<int, 8> TiedOutput(Constraints.size(), -1);
-    buildTiedConstraintMap(Constraints, TiedOutput);
+  // Process constraints.
+  unsigned ArgNo = 0;
+  unsigned OutputIdx = 0;
+  for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
+    const InlineAsm::ConstraintInfo &C = Constraints[I];
 
-    // Process constraints.
-    unsigned ArgNo = 0;
-    unsigned OutputIdx = 0;
-    for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
-      const InlineAsm::ConstraintInfo &C = Constraints[I];
-
-      if (C.Type == InlineAsm::isOutput) {
-        processOutputConstraint(C, CB->getType(), OutputIdx, EntryBuilder,
-                                NewArgs, NewArgTypes, NewRetTypes,
-                                ElementTypeAttrs, OutputAllocas, I);
-        OutputIdx++;
-      } else if (C.Type == InlineAsm::isInput) {
-        Value *ArgVal = CB->getArgOperand(ArgNo);
-        processInputConstraint(C, ArgVal, TiedOutput, OutputAllocas, I, Builder,
-                               EntryBuilder, NewArgs, NewArgTypes);
-        ArgNo++;
-      }
+    if (C.Type == InlineAsm::isOutput) {
+      ProcessOutputConstraint(C, CB->getType(), OutputIdx, EntryBuilder,
+                              NewArgs, NewArgTypes, NewRetTypes,
+                              ElementTypeAttrs, OutputAllocas, I);
+      OutputIdx++;
+    } else if (C.Type == InlineAsm::isInput) {
+      Value *ArgVal = CB->getArgOperand(ArgNo);
+      ProcessInputConstraint(C, ArgVal, TiedOutput, OutputAllocas, I, Builder,
+                             EntryBuilder, NewArgs, NewArgTypes);
+      ArgNo++;
     }
+  }
+
+  // Build the new return type.
+  Type *NewRetTy = BuildReturnType(NewRetTypes, F.getContext());
+
+  // Create the new inline assembly call.
+  CallInst *NewCall =
+      CreateNewInlineAsm(IA, NewConstraintStr, NewRetTy, NewArgTypes, NewArgs,
+                         ElementTypeAttrs, CB, Builder, F.getContext());
 
-    // Build the new return type.
-    Type *NewRetTy = buildReturnType(NewRetTypes, F.getContext());
+  // Reconstruct the return value and update users.
+  if (!CB->use_empty()) {
+    if (Value *Replacement =
+            ReconstructReturnValue(CB->getType(), NewCall, Constraints,
+                                   OutputAllocas, NewRetTypes, Builder))
+      CB->replaceAllUsesWith(Replacement);
+  }
+
+  CB->eraseFromParent();
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//                           Process CallBrInsts
+//===----------------------------------------------------------------------===//
+
+/// The Use is in the same BasicBlock as the intrinsic call.
+static bool IsInSameBasicBlock(const Use &U, const BasicBlock *BB) {
+  const auto *I = dyn_cast<Instruction>(U.getUser());
+  return I && I->getParent() == BB;
+}
+
+#ifndef NDEBUG
+static void PrintDebugDomInfo(const DominatorTree &DT, const Use &U,
+                              const BasicBlock *BB, bool IsDefaultDest) {
+  if (isa<Instruction>(U.getUser()))
+    LLVM_DEBUG(dbgs() << "Use: " << *U.getUser() << ", in block "
+                      << cast<Instruction>(U.getUser())->getParent()->getName()
+                      << ", is " << (DT.dominates(BB, U) ? "" : "NOT ")
+                      << "dominated by " << BB->getName() << " ("
+                      << (IsDefaultDest ? "in" : "") << "direct)\n");
+}
+#endif
+
+static void UpdateSSA(DominatorTree &DT, CallBrInst *CBR, CallInst *Intrinsic,
+                      SSAUpdater &SSAUpdate) {
+  SmallPtrSet<Use *, 4> Visited;
+
+  BasicBlock *DefaultDest = CBR->getDefaultDest();
+  BasicBlock *LandingPad = Intrinsic->getParent();
+  SmallVector<Use *, 4> Uses(make_pointer_range(CBR->uses()));
+
+  for (Use *U : Uses) {
+    if (!Visited.insert(U).second)
+      continue;
 
-    // Create the new inline assembly call.
-    CallInst *NewCall =
-        createNewInlineAsm(IA, NewConstraintStr, NewRetTy, NewArgTypes, NewArgs,
-                           ElementTypeAttrs, CB, Builder, F.getContext());
+#ifndef NDEBUG
+    PrintDebugDomInfo(DT, *U, LandingPad, /*IsDefaultDest*/ false);
+    PrintDebugDomInfo(DT, *U, DefaultDest, /*IsDefaultDest*/ true);
+#endif
 
-    // Reconstruct the return value and update users.
-    if (!CB->use_empty()) {
-      if (Value *Replacement =
-              reconstructReturnValue(CB->getType(), NewCall, Constraints,
-                                     OutputAllocas, NewRetTypes, Builder))
-        CB->replaceAllUsesWith(Replacement);
+    // Don't rewrite the use in the newly inserted intrinsic.
+    if (const auto *II = dyn_cast<IntrinsicInst>(U->getUser()))
+      if (II->getIntrinsicID() == Intrinsic::callbr_landingpad)
+        continue;
+
+    // If the Use is in the same BasicBlock as the Intrinsic call, replace
+    // the Use with the value of the Intrinsic call.
+    if (IsInSameBasicBlock(*U, LandingPad)) {
+      U->set(Intrinsic);
+      continue;
     }
 
-    CB->eraseFromParent();
+    // If the Use is dominated by the default dest, do not touch it.
+    if (DT.dominates(DefaultDest, *U))
+      continue;
+
+    SSAUpdate.RewriteUse(*U);
+  }
+}
+
+static bool SplitCriticalEdges(CallBrInst *CBR, DominatorTree *DT) {
+  bool Changed = false;
+
+  CriticalEdgeSplittingOptions Options(DT);
+  Options.setMergeIdenticalEdges();
+
+  // The indirect destination might be duplicated among the label parameters...
+  //
+  //   %0 = callbr ... [label %x, label %x]
+  //
+  // ...hence MergeIdenticalEdges and AllowIdenticalEdges, but we don't need
+  // to split the default destination if it's duplicated between an indirect
+  // destination...
+  //
+  //   %1 = callbr ... to label %x [label %x]
+  //
+  // ...hence starting at 1 and checking against successor 0 (aka the default
+  // destination).
+  for (unsigned i = 1, e = CBR->getNumSuccessors(); i != e; ++i)
+    if (CBR->getSuccessor(i) == CBR->getSuccessor(0) ||
+        isCriticalEdge(CBR, i, /*AllowIdenticalEdges*/ true))
+      if (SplitKnownCriticalEdge(CBR, i, Options))
+        Changed = true;
+
+  return Changed;
+}
+
+static bool InsertIntrinsicCalls(CallBrInst *CBR, DominatorTree &DT) {
+  bool Changed = false;
+  SmallPtrSet<const BasicBlock *, 4> Visited;
+  IRBuilder<> Builder(CBR->getContext());
+
+  if (!CBR->getNumIndirectDests())
+    return false;
+
+  SSAUpdater SSAUpdate;
+  SSAUpdate.Initialize(CBR->getType(), CBR->getName());
+  SSAUpdate.AddAvailableValue(CBR->getParent(), CBR);
+  SSAUpdate.AddAvailableValue(CBR->getDefaultDest(), CBR);
+
+  for (BasicBlock *IndDest : CBR->getIndirectDests()) {
+    if (!Visited.insert(IndDest).second)
+      continue;
+
+    Builder.SetInsertPoint(&*IndDest->begin());
+    CallInst *Intrinsic = Builder.CreateIntrinsic(
+        CBR->getType(), Intrinsic::callbr_landingpad, {CBR});
+    SSAUpdate.AddAvailableValue(IndDest, Intrinsic);
+    UpdateSSA(DT, CBR, Intrinsic, SSAUpdate);
     Changed = true;
   }
 
   return Changed;
 }
 
+static bool ProcessCallBrInst(Function &F, CallBrInst *CBR, DominatorTree *DT) {
+  bool Changed = false;
+
+  Changed |= SplitCriticalEdges(CBR, DT);
+  Changed |= InsertIntrinsicCalls(CBR, *DT);
+
+  return Changed;
+}
+
+static bool runImpl(Function &F, ArrayRef<CallBase *> IAs, DominatorTree *DT) {
+  bool Changed = false;
+
+  for (CallBase *CB : IAs)
+    if (auto *CBR = dyn_cast<CallBrInst>(CB))
+      Changed |= ProcessCallBrInst(F, CBR, DT);
+    else
+      Changed |= ProcessInlineAsm(F, CB);
+
+  return Changed;
+}
+
+/// Find all inline assembly calls in the given function.
+static SmallVector<CallBase *, 4> FindInlineAsms(Function &F,
+                                                 const TargetMachine *TM) {
+  bool isOptLevelNone = TM->getOptLevel() == CodeGenOptLevel::None;
+  SmallVector<CallBase *, 4> InlineAsms;
+
+  for (BasicBlock &BB : F) {
+    if (auto *CBR = dyn_cast<CallBrInst>(BB.getTerminator())) {
+      if (!CBR->getType()->isVoidTy() && !CBR->use_empty())
+        InlineAsms.push_back(CBR);
+      continue;
+    }
+
+    if (isOptLevelNone)
+      // Only inline assembly compiled at '-O0' (i.e. uses the fast register
+      // allocator) needs to be processed.
+      for (Instruction &I : BB)
+        if (CallBase *CB = dyn_cast<CallBase>(&I); CB && CB->isInlineAsm())
+          InlineAsms.push_back(CB);
+  }
+
+  return InlineAsms;
+}
+
+bool InlineAsmPrepare::runOnFunction(Function &F) {
+  const auto *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+  SmallVector<CallBase *, 4> IAs = FindInlineAsms(F, TM);
+  if (IAs.empty())
+    return false;
+
+  // It's highly likely that most programs do not contain CallBrInsts. Follow a
+  // similar pattern from SafeStackLegacyPass::runOnFunction to reuse previous
+  // domtree analysis if available, otherwise compute it lazily. This avoids
+  // forcing Dominator Tree Construction at -O0 for programs that likely do not
+  // contain CallBrInsts. It does pessimize programs with callbr at higher
+  // optimization levels, as the DominatorTree created here is not reused by
+  // subsequent passes.
+  DominatorTree *DT;
+  std::optional<DominatorTree> LazilyComputedDomTree;
+  if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
+    DT = &DTWP->getDomTree();
+  else {
+    LazilyComputedDomTree.emplace(F);
+    DT = &*LazilyComputedDomTree;
+  }
+
+  return runImpl(F, IAs, DT);
+}
+
 PreservedAnalyses InlineAsmPreparePass::run(Function &F,
                                             FunctionAnalysisManager &FAM) {
-  bool Changed = InlineAsmPrepare().runOnFunction(F);
-  if (!Changed)
+  SmallVector<CallBase *, 4> IAs = FindInlineAsms(F, TM);
+  if (IAs.empty())
     return PreservedAnalyses::all();
 
-  PreservedAnalyses PA;
-  PA.preserveSet<CFGAnalyses>();
-  return PA;
+  DominatorTree *DT = &FAM.getResult<DominatorTreeAnalysis>(F);
+
+  if (runImpl(F, IAs, DT)) {
+    PreservedAnalyses PA;
+    PA.preserve<DominatorTreeAnalysis>();
+    PA.preserveSet<CFGAnalyses>();
+    return PA;
+  }
+
+  return PreservedAnalyses::all();
 }
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 35914eb8b911f..7e49693dd7cc0 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -985,10 +985,7 @@ void TargetPassConfig::addISelPrepare() {
   if (getOptLevel() != CodeGenOptLevel::None)
     addPass(createObjCARCContractPass());
 
-  if (getOptLevel() == CodeGenOptLevel::None)
-    addPass(createInlineAsmPass());
-
-  addPass(createCallBrPass());
+  addPass(createInlineAsmPass());
 
   // Add both the safe stack and the stack protection passes: each of them will
   // only protect functions that have corresponding attributes.
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 0491e1fb8de46..61f653fe30b20 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -83,7 +83,6 @@
 #include "llvm/CodeGen/BasicBlockSectionsProfileReader.h"
 #include "llvm/CodeGen/BranchFoldingPass.h"
 #include "llvm/CodeGen/BranchRelaxation.h"
-#include "llvm/CodeGen/CallBrPrepare.h"
 #include "llvm/CodeGen/CodeGenPrepare.h"
 #include "llvm/CodeGen/ComplexDeinterleavingPass.h"
 #include "llvm/CodeGen/DeadMachineInstructionElim.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 1b6774157e291..a837d3aa5d354 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -411,7 +411,6 @@ FUNCTION_PASS("assume-simplify", AssumeSimplifyPass())
 FUNCTION_PASS("atomic-expand", AtomicExpandPass(*TM))
 FUNCTION_PASS("bdce", BDCEPass())
 FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass())
-FUNCTION_PASS("callbr-prepare", CallBrPreparePass())
 FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass())
 FUNCTION_PASS("chr", ControlHeightReductionPass())
 FUNCTION_PASS("codegenprepare", CodeGenPreparePass(*TM))
@@ -450,7 +449,7 @@ FUNCTION_PASS("helloworld", HelloWorldPass())
 FUNCTION_PASS("indirectbr-expand", IndirectBrExpandPass(*TM))
 FUNCTION_PASS("infer-address-spaces", InferAddressSpacesPass())
 FUNCTION_PASS("infer-alignment", InferAlignmentPass())
-FUNCTION_PASS("inline-asm-prepare", InlineAsmPreparePass())
+FUNCTION_PASS("inline-asm-prepare", InlineAsmPreparePass(*TM))
 FUNCTION_PASS("inject-tli-mappings", InjectTLIMappings())
 FUNCTION_PASS("instcount", InstCountPass())
 FUNCTION_PASS("instnamer", InstructionNamerPass())
diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
index bb81786d97958..9f9e47865c1b8 100644
--- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
@@ -33,8 +33,7 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       AArch64 Stack Tagging
 ; CHECK-NEXT:       Exception handling preparation
-; CHECK-NEXT:       Prepare inline asm insts for fast register allocation
-; CHECK-NEXT:       Prepare callbr
+; CHECK-NEXT:       Prepare inline asm insts
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
 ; CHECK-NEXT:       Module Verifier
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index 472f1f616c600..620041253ecfc 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -110,7 +110,7 @@
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       ObjC ARC contraction
-; CHECK-NEXT:       Prepare callbr
+; CHECK-NEXT:       Prepare inline asm insts
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
 ; CHECK-NEXT:       Module Verifier
diff --git a/llvm/test/CodeGen/AArch64/callbr-prepare.ll b/llvm/test/CodeGen/AArch64/callbr-prepare.ll
index 826e27d92720f..c7c976f373efd 100644
--- a/llvm/test/CodeGen/AArch64/callbr-prepare.ll
+++ b/llvm/test/CodeGen/AArch64/callbr-prepare.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt %s -callbrprepare -S -o - | FileCheck %s
-; RUN: opt %s -passes=callbr-prepare -S -o - | FileCheck %s
+; RUN: opt %s -mtriple=aarch64-linux-gnu -inline-asm-prepare -S -o - | FileCheck %s
+; RUN: opt %s -mtriple=aarch64-linux-gnu -passes=inline-asm-prepare -S -o - | FileCheck %s
 
 define i32 @test0() {
 ; CHECK-LABEL: @test0(
@@ -407,7 +407,7 @@ foo:
 }
 
 ; Test the result of the callbr having multiple uses to avoid iterator
-; invalidation bugs in CallBrPrepare::UpdateSSA.
+; invalidation bugs in InlineAsmPrepare::UpdateSSA.
 define i32 @multiple_split() {
 ; CHECK-LABEL: @multiple_split(
 ; CHECK-NEXT:  entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index 525ab3757e6e0..b1d9d618302a8 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -47,7 +47,7 @@
 ; GCN-O0-NEXT: amdgpu-rewrite-undef-for-phi
 ; GCN-O0-NEXT: lcssa
 ; GCN-O0-NEXT: require<uniformity>
-; GCN-O0-NEXT: callbr-prepare
+; GCN-O0-NEXT: inline-asm-prepare
 ; GCN-O0-NEXT: safe-stack
 ; GCN-O0-NEXT: stack-protector
 ; GCN-O0-NEXT: verify))
@@ -161,7 +161,7 @@
 ; GCN-O2-NEXT: amdgpu-perf-hint
 ; GCN-O2-NEXT: cgscc(function(require<uniformity>
 ; GCN-O2-NEXT: objc-arc-contract
-; GCN-O2-NEXT: callbr-prepare
+; GCN-O2-NEXT: inline-asm-prepare
 ; GCN-O2-NEXT: safe-stack
 ; GCN-O2-NEXT: stack-protector
 ; GCN-O2-NEXT: verify))
@@ -333,7 +333,7 @@
 ; GCN-O3-NEXT: amdgpu-perf-hint
 ; GCN-O3-NEXT: cgscc(function(require<uniformity>
 ; GCN-O3-NEXT: objc-arc-contract
-; GCN-O3-NEXT: callbr-prepare
+; GCN-O3-NEXT: inline-asm-prepare
 ; GCN-O3-NEXT: safe-stack
 ; GCN-O3-NEXT: stack-protector
 ; GCN-O3-NEXT: verify))
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 9a04dcda80978..2904ba604fb1b 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -95,8 +95,7 @@
 ; GCN-O0-NEXT:    Call Graph SCC Pass Manager
 ; GCN-O0-NEXT:      DummyCGSCCPass
 ; GCN-O0-NEXT:      FunctionPass Manager
-; GCN-O0-NEXT:        Prepare inline asm insts for fast register allocation
-; GCN-O0-NEXT:        Prepare callbr
+; GCN-O0-NEXT:        Prepare inline asm insts
 ; GCN-O0-NEXT:        Safe Stack instrumentation pass
 ; GCN-O0-NEXT:        Insert stack protectors
 ; GCN-O0-NEXT:        Dominator Tree Construction
@@ -303,7 +302,7 @@
 ; GCN-O1-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-NEXT:        Function Alias Analysis Results
 ; GCN-O1-NEXT:        ObjC ARC contraction
-; GCN-O1-NEXT:        Prepare callbr
+; GCN-O1-NEXT:        Prepare inline asm insts
 ; GCN-O1-NEXT:        Safe Stack instrumentation pass
 ; GCN-O1-NEXT:        Insert stack protectors
 ; GCN-O1-NEXT:        Cycle Info Analysis
@@ -616,7 +615,7 @@
 ; GCN-O1-OPTS-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-OPTS-NEXT:        Function Alias Analysis Results
 ; GCN-O1-OPTS-NEXT:        ObjC ARC contraction
-; GCN-O1-OPTS-NEXT:        Prepare callbr
+; GCN-O1-OPTS-NEXT:        Prepare inline asm insts
 ; GCN-O1-OPTS-NEXT:        Safe Stack instrumentation pass
 ; GCN-O1-OPTS-NEXT:        Insert stack protectors
 ; GCN-O1-OPTS-NEXT:        Cycle Info Analysis
@@ -940,7 +939,7 @@
 ; GCN-O2-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O2-NEXT:        Function Alias Analysis Results
 ; GCN-O2-NEXT:        ObjC ARC contraction
-; GCN-O2-NEXT:        Prepare callbr
+; GCN-O2-NEXT:        Prepare inline asm insts
 ; GCN-O2-NEXT:        Safe Stack instrumentation pass
 ; GCN-O2-NEXT:        Insert stack protectors
 ; GCN-O2-NEXT:        Cycle Info Analysis
@@ -1278,7 +1277,7 @@
 ; GCN-O3-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O3-NEXT:        Function Alias Analysis Results
 ; GCN-O3-NEXT:        ObjC ARC contraction
-; GCN-O3-NEXT:        Prepare callbr
+; GCN-O3-NEXT:        Prepare inline asm insts
 ; GCN-O3-NEXT:        Safe Stack instrumentation pass
 ; GCN-O3-NEXT:        Insert stack protectors
 ; GCN-O3-NEXT:        Cycle Info Analysis
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index 5801d3fe55c59..98bb87524db44 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -69,7 +69,7 @@
 ; CHECK-NEXT:      Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:      Function Alias Analysis Results
 ; CHECK-NEXT:      ObjC ARC contraction
-; CHECK-NEXT:      Prepare callbr
+; CHECK-NEXT:      Prepare inline asm insts
 ; CHECK-NEXT:      Safe Stack instrumentation pass
 ; CHECK-NEXT:      Insert stack protectors
 ; CHECK-NEXT:      Module Verifier
diff --git a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
index 40d54aa4d4cf3..bf519342fa4cc 100644
--- a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
+++ b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
@@ -31,8 +31,7 @@
 ; CHECK-NEXT:       Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Exception handling preparation
-; CHECK-NEXT:       Prepare inline asm insts for fast register allocation
-; CHECK-NEXT:       Prepare callbr
+; CHECK-NEXT:       Prepare inline asm insts
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
 ; CHECK-NEXT:       Module Verifier
diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
index dd7bb2b6d19ba..262ee06c6f732 100644
--- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
+++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
@@ -76,7 +76,7 @@
 ; LAXX-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; LAXX-NEXT:       Function Alias Analysis Results
 ; LAXX-NEXT:       ObjC ARC contraction
-; LAXX-NEXT:       Prepare callbr
+; LAXX-NEXT:       Prepare inline asm insts
 ; LAXX-NEXT:       Safe Stack instrumentation pass
 ; LAXX-NEXT:       Insert stack protectors
 ; LAXX-NEXT:       Module Verifier
diff --git a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
index a1633aa141698..b0ba623edfb0a 100644
--- a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
@@ -30,8 +30,7 @@
 ; CHECK-NEXT:       Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Exception handling preparation
-; CHECK-NEXT:       Prepare inline asm insts for fast register allocation
-; CHECK-NEXT:       Prepare callbr
+; CHECK-NEXT:       Prepare inline asm insts
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
 ; CHECK-NEXT:       Module Verifier
diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
index 31c0d8558f96a..f771b5728e5b5 100644
--- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
@@ -85,7 +85,7 @@
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       ObjC ARC contraction
-; CHECK-NEXT:       Prepare callbr
+; CHECK-NEXT:       Prepare inline asm insts
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
 ; CHECK-NEXT:       Module Verifier
diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
index 9b8cd17f1a5f0..847a8bd96c6d6 100644
--- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
@@ -32,8 +32,7 @@
 ; CHECK-NEXT:       Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Exception handling preparation
-; CHECK-NEXT:       Prepare inline asm insts for fast register allocation
-; CHECK-NEXT:       Prepare callbr
+; CHECK-NEXT:       Prepare inline asm insts
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
 ; CHECK-NEXT:       Module Verifier
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 3b63c1d86d3b1..e3f277ab4e889 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -84,7 +84,7 @@
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       ObjC ARC contraction
-; CHECK-NEXT:       Prepare callbr
+; CHECK-NEXT:       Prepare inline asm insts
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
 ; CHECK-NEXT:       Module Verifier
diff --git a/llvm/test/CodeGen/SPIRV/llc-pipeline.ll b/llvm/test/CodeGen/SPIRV/llc-pipeline.ll
index 203fc7e53ae96..4a13eac25a089 100644
--- a/llvm/test/CodeGen/SPIRV/llc-pipeline.ll
+++ b/llvm/test/CodeGen/SPIRV/llc-pipeline.ll
@@ -45,8 +45,7 @@
 ; SPIRV-O0-NEXT:    SPIRV emit intrinsics
 ; SPIRV-O0-NEXT:    FunctionPass Manager
 ; SPIRV-O0-NEXT:      SPIRV legalize bitcast pass
-; SPIRV-O0-NEXT:      Prepare inline asm insts for fast register allocation
-; SPIRV-O0-NEXT:      Prepare callbr
+; SPIRV-O0-NEXT:      Prepare inline asm insts
 ; SPIRV-O0-NEXT:      Safe Stack instrumentation pass
 ; SPIRV-O0-NEXT:      Insert stack protectors
 ; SPIRV-O0-NEXT:      Analysis containing CSE Info
@@ -157,7 +156,7 @@
 ; SPIRV-Opt-NEXT:      Basic Alias Analysis (stateless AA impl)
 ; SPIRV-Opt-NEXT:      Function Alias Analysis Results
 ; SPIRV-Opt-NEXT:      ObjC ARC contraction
-; SPIRV-Opt-NEXT:      Prepare callbr
+; SPIRV-Opt-NEXT:      Prepare inline asm insts
 ; SPIRV-Opt-NEXT:      Safe Stack instrumentation pass
 ; SPIRV-Opt-NEXT:      Insert stack protectors
 ; SPIRV-Opt-NEXT:      Analysis containing CSE Info
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
index ec1d7bcee1f6d..e8a3084563573 100644
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -32,8 +32,7 @@
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Expand indirectbr instructions
 ; CHECK-NEXT:       Exception handling preparation
-; CHECK-NEXT:       Prepare inline asm insts for fast register allocation
-; CHECK-NEXT:       Prepare callbr
+; CHECK-NEXT:       Prepare inline asm insts
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
 ; CHECK-NEXT:       Module Verifier
diff --git a/llvm/test/CodeGen/X86/asm-constraints-rm.ll b/llvm/test/CodeGen/X86/asm-constraints-rm.ll
index 59c4672a97407..a694bdb26390d 100644
--- a/llvm/test/CodeGen/X86/asm-constraints-rm.ll
+++ b/llvm/test/CodeGen/X86/asm-constraints-rm.ll
@@ -3,8 +3,6 @@
 ; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=greedy   < %s | FileCheck --check-prefix=GREEDY-I386 %s
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=basic  < %s | FileCheck --check-prefix=BASIC-X86_64 %s
 ; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=basic    < %s | FileCheck --check-prefix=BASIC-I386 %s
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O0 -regalloc=fast   < %s | FileCheck --check-prefix=FAST-X86_64 %s
-; RUN: llc -mtriple=i386-unknown-linux-gnu -O0 -regalloc=fast     < %s | FileCheck --check-prefix=FAST-I386 %s
 
 ; The non-fast register allocators should use registers when there isn't
 ; register pressure.
@@ -17,10 +15,6 @@ define dso_local i32 @test1(ptr nocapture noundef readonly %ptr) local_unnamed_a
 ; BASIC-X86_64-LABEL: test1:
 ;
 ; BASIC-I386-LABEL: test1:
-;
-; FAST-X86_64-LABEL: test1:
-;
-; FAST-I386-LABEL: test1:
 entry:
   %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
   %0 = load i32, ptr %b, align 4
@@ -39,10 +33,6 @@ define dso_local i32 @test2(ptr nocapture noundef readonly %ptr) local_unnamed_a
 ; BASIC-X86_64-LABEL: test2:
 ;
 ; BASIC-I386-LABEL: test2:
-;
-; FAST-X86_64-LABEL: test2:
-;
-; FAST-I386-LABEL: test2:
 entry:
   %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
   %0 = load i32, ptr %b, align 4
@@ -61,10 +51,6 @@ define dso_local i32 @test3(ptr noundef %ptr) local_unnamed_addr {
 ; BASIC-X86_64-LABEL: test3:
 ;
 ; BASIC-I386-LABEL: test3:
-;
-; FAST-X86_64-LABEL: test3:
-;
-; FAST-I386-LABEL: test3:
 entry:
   %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
   %d = getelementptr inbounds nuw i8, ptr %ptr, i64 12
@@ -85,10 +71,6 @@ define dso_local i32 @test4(ptr noundef %ptr) local_unnamed_addr {
 ; BASIC-X86_64-LABEL: test4:
 ;
 ; BASIC-I386-LABEL: test4:
-;
-; FAST-X86_64-LABEL: test4:
-;
-; FAST-I386-LABEL: test4:
 entry:
   %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
   %d = getelementptr inbounds nuw i8, ptr %ptr, i64 12
@@ -109,10 +91,6 @@ define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_a
 ; BASIC-X86_64-LABEL: test5:
 ;
 ; BASIC-I386-LABEL: test5:
-;
-; FAST-X86_64-LABEL: test5:
-;
-; FAST-I386-LABEL: test5:
 entry:
   %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
   %0 = load i32, ptr %b, align 4
@@ -135,10 +113,6 @@ define dso_local i32 @test6(ptr nocapture noundef readonly %ptr) local_unnamed_a
 ; BASIC-X86_64-LABEL: test6:
 ;
 ; BASIC-I386-LABEL: test6:
-;
-; FAST-X86_64-LABEL: test6:
-;
-; FAST-I386-LABEL: test6:
 entry:
   %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
   %0 = load i32, ptr %b, align 4
@@ -161,10 +135,6 @@ define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr {
 ; BASIC-X86_64-LABEL: test7:
 ;
 ; BASIC-I386-LABEL: test7:
-;
-; FAST-X86_64-LABEL: test7:
-;
-; FAST-I386-LABEL: test7:
 entry:
   %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
   %0 = load i32, ptr %b, align 4
@@ -181,10 +151,6 @@ define dso_local i32 @test8(ptr noundef %ptr) local_unnamed_addr {
 ; BASIC-X86_64-LABEL: test8:
 ;
 ; BASIC-I386-LABEL: test8:
-;
-; FAST-X86_64-LABEL: test8:
-;
-; FAST-I386-LABEL: test8:
 entry:
   %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
   %0 = load i32, ptr %b, align 4
@@ -203,10 +169,6 @@ define dso_local i32 @test9(ptr nocapture noundef %ptr) local_unnamed_addr {
 ; BASIC-X86_64-LABEL: test9:
 ;
 ; BASIC-I386-LABEL: test9:
-;
-; FAST-X86_64-LABEL: test9:
-;
-; FAST-I386-LABEL: test9:
 entry:
   %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
   %0 = tail call i32 asm sideeffect "# single 'rm' output -> $0", "=rm,~{dirflag},~{fpsr},~{flags}"()
@@ -223,10 +185,6 @@ define dso_local i32 @test10(ptr noundef captures(none) %ptr) local_unnamed_addr
 ; BASIC-X86_64-LABEL: test10:
 ;
 ; BASIC-I386-LABEL: test10:
-;
-; FAST-X86_64-LABEL: test10:
-;
-; FAST-I386-LABEL: test10:
 entry:
   %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
   %0 = load i32, ptr %b, align 4
@@ -244,10 +202,6 @@ define dso_local i32 @test11(ptr noundef captures(none) %ptr) local_unnamed_addr
 ; BASIC-X86_64-LABEL: test11:
 ;
 ; BASIC-I386-LABEL: test11:
-;
-; FAST-X86_64-LABEL: test11:
-;
-; FAST-I386-LABEL: test11:
 entry:
   %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
   %0 = load i32, ptr %b, align 4
diff --git a/llvm/test/CodeGen/X86/llc-pipeline-npm.ll b/llvm/test/CodeGen/X86/llc-pipeline-npm.ll
index 3f3a992b1b23f..9a35399f58655 100644
--- a/llvm/test/CodeGen/X86/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/X86/llc-pipeline-npm.ll
@@ -29,7 +29,7 @@
 ; O0-NEXT: expand-reductions
 ; O0-NEXT: indirectbr-expand
 ; O0-NEXT: dwarf-eh-prepare
-; O0-NEXT: callbr-prepare
+; O0-NEXT: inline-asm-prepare
 ; O0-NEXT: safe-stack
 ; O0-NEXT: stack-protector
 ; O0-NEXT: verify)
@@ -97,7 +97,7 @@
 ; O2-NEXT: codegenprepare
 ; O2-NEXT: dwarf-eh-prepare
 ; O2-NEXT: objc-arc-contract
-; O2-NEXT: callbr-prepare
+; O2-NEXT: inline-asm-prepare
 ; O2-NEXT: safe-stack
 ; O2-NEXT: stack-protector
 ; O2-NEXT: verify)
@@ -199,7 +199,7 @@
 ; O0-WINDOWS-NEXT: cfguard
 ; O0-WINDOWS-NEXT: win-eh-prepare
 ; O0-WINDOWS-NEXT: dwarf-eh-prepare
-; O0-WINDOWS-NEXT: callbr-prepare
+; O0-WINDOWS-NEXT: inline-asm-prepare
 ; O0-WINDOWS-NEXT: safe-stack
 ; O0-WINDOWS-NEXT: stack-protector
 ; O0-WINDOWS-NEXT: verify)
@@ -271,7 +271,7 @@
 ; O3-WINDOWS-NEXT: win-eh-prepare
 ; O3-WINDOWS-NEXT: dwarf-eh-prepare
 ; O3-WINDOWS-NEXT: objc-arc-contract
-; O3-WINDOWS-NEXT: callbr-prepare
+; O3-WINDOWS-NEXT: inline-asm-prepare
 ; O3-WINDOWS-NEXT: safe-stack
 ; O3-WINDOWS-NEXT: stack-protector
 ; O3-WINDOWS-NEXT: verify)
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 617cfd4c01145..55d386d0f0952 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -76,7 +76,7 @@
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       ObjC ARC contraction
-; CHECK-NEXT:       Prepare callbr
+; CHECK-NEXT:       Prepare inline asm insts
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
 ; CHECK-NEXT:       Module Verifier
diff --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp
index b0c2733758948..adfbf2e368e58 100644
--- a/llvm/tools/opt/optdriver.cpp
+++ b/llvm/tools/opt/optdriver.cpp
@@ -378,7 +378,7 @@ static bool shouldPinPassToLegacyPM(StringRef Pass) {
       "structurizecfg",
       "fix-irreducible",
       "expand-ir-insts",
-      "callbrprepare",
+      "inline-asm-prepare",
       "scalarizer",
   };
   for (StringLiteral P : PassNamePrefix)
@@ -432,7 +432,7 @@ optMain(int argc, char **argv,
   initializeExpandMemCmpLegacyPassPass(Registry);
   initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry);
   initializeSelectOptimizePass(Registry);
-  initializeCallBrPreparePass(Registry);
+  initializeInlineAsmPreparePass(Registry);
   initializeCodeGenPrepareLegacyPassPass(Registry);
   initializeAtomicExpandLegacyPass(Registry);
   initializeWinEHPreparePass(Registry);
diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
index 58b78d39533ca..57371a8e08c3c 100644
--- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
@@ -39,7 +39,6 @@ static_library("CodeGen") {
     "CFIFixup.cpp",
     "CFIInstrInserter.cpp",
     "CalcSpillWeights.cpp",
-    "CallBrPrepare.cpp",
     "CallingConvLower.cpp",
     "CodeGen.cpp",
     "CodeGenCommonISel.cpp",
@@ -78,6 +77,7 @@ static_library("CodeGen") {
     "ImplicitNullChecks.cpp",
     "IndirectBrExpandPass.cpp",
     "InitUndef.cpp",
+    "InlineAsmPrepare.cpp",
     "InlineSpiller.cpp",
     "InsertCodePrefetch.cpp",
     "InterferenceCache.cpp",

>From b67ba055ec7dc6a9c43902a9c1d9e970ccc414fa Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Thu, 5 Feb 2026 00:45:01 -0800
Subject: [PATCH 21/29] fixup! Merge CallBrPrepare into InlineAsmPrepare, which
 is the better name for what the pass does.

Remove dead declaration.
---
 llvm/include/llvm/CodeGen/Passes.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 7b51d15427ce7..f3c9fe627f40d 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -622,8 +622,6 @@ LLVM_ABI ModulePass *createJMCInstrumenterPass();
 /// This pass converts conditional moves to conditional jumps when profitable.
 LLVM_ABI FunctionPass *createSelectOptimizePass();
 
-LLVM_ABI FunctionPass *createCallBrPass();
-
 /// Creates Windows Secure Hot Patch pass. \see WindowsSecureHotPatching.cpp
 LLVM_ABI ModulePass *createWindowsSecureHotPatchingPass();
 

>From 6418815761a543242fb1da2b8bfb338b6afb8435 Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Thu, 5 Feb 2026 00:55:48 -0800
Subject: [PATCH 22/29] Update comments.

---
 llvm/include/llvm/CodeGen/Passes.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index f3c9fe627f40d..0fa652fb7e3e5 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -628,8 +628,7 @@ LLVM_ABI ModulePass *createWindowsSecureHotPatchingPass();
 /// Lowers KCFI operand bundles for indirect calls.
 LLVM_ABI FunctionPass *createKCFIPass();
 
-/// Modify inline asms with "rm" constraints to "m" for the fast register
-/// allocator.
+/// Process inline assembly calls to prepare for code generation.
 LLVM_ABI FunctionPass *createInlineAsmPass();
 
 } // namespace llvm

>From 90ef857dbe9cff6b3fb16c47d30c9eee686c452f Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Thu, 5 Feb 2026 01:26:12 -0800
Subject: [PATCH 23/29] Restore deleted files

---
 llvm/include/llvm/CodeGen/CallBrPrepare.h |  23 ++
 llvm/lib/CodeGen/CallBrPrepare.cpp        | 252 ++++++++++++++++++++++
 2 files changed, 275 insertions(+)
 create mode 100644 llvm/include/llvm/CodeGen/CallBrPrepare.h
 create mode 100644 llvm/lib/CodeGen/CallBrPrepare.cpp

diff --git a/llvm/include/llvm/CodeGen/CallBrPrepare.h b/llvm/include/llvm/CodeGen/CallBrPrepare.h
new file mode 100644
index 0000000000000..d44d30b0adc17
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/CallBrPrepare.h
@@ -0,0 +1,23 @@
+//===-- CallBrPrepare - Prepare callbr for code generation ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_CALLBRPREPARE_H
+#define LLVM_CODEGEN_CALLBRPREPARE_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class CallBrPreparePass : public PassInfoMixin<CallBrPreparePass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_CALLBRPREPARE_H
diff --git a/llvm/lib/CodeGen/CallBrPrepare.cpp b/llvm/lib/CodeGen/CallBrPrepare.cpp
new file mode 100644
index 0000000000000..77a0d0b653871
--- /dev/null
+++ b/llvm/lib/CodeGen/CallBrPrepare.cpp
@@ -0,0 +1,252 @@
+//===-- CallBrPrepare - Prepare callbr for code generation ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers callbrs in LLVM IR in order to assist SelectionDAG's
+// codegen.
+//
+// In particular, this pass assists in inserting register copies for the output
+// values of a callbr along the edges leading to the indirect target blocks.
+// Though the output SSA value is defined by the callbr instruction itself in
+// the IR representation, the value cannot be copied to the appropriate virtual
+// registers prior to jumping to an indirect label, since the jump occurs
+// within the user-provided assembly blob.
+//
+// Instead, those copies must occur separately at the beginning of each
+// indirect target. That requires that we create a separate SSA definition in
+// each of them (via llvm.callbr.landingpad), and may require splitting
+// critical edges so we have a location to place the intrinsic. Finally, we
+// remap users of the original callbr output SSA value to instead point to the
+// appropriate llvm.callbr.landingpad value.
+//
+// Ideally, this could be done inside SelectionDAG, or in the
+// MachineInstruction representation, without the use of an IR-level intrinsic.
+// But, within the current framework, it’s simpler to implement as an IR pass.
+// (If support for callbr in GlobalISel is implemented, it’s worth considering
+// whether this is still required.)
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/CallBrPrepare.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "callbr-prepare"
+
+static bool SplitCriticalEdges(ArrayRef<CallBrInst *> CBRs, DominatorTree &DT);
+static bool InsertIntrinsicCalls(ArrayRef<CallBrInst *> CBRs,
+                                 DominatorTree &DT);
+static void UpdateSSA(DominatorTree &DT, CallBrInst *CBR, CallInst *Intrinsic,
+                      SSAUpdater &SSAUpdate);
+static SmallVector<CallBrInst *, 2> FindCallBrs(Function &F);
+
+namespace {
+
+class CallBrPrepare : public FunctionPass {
+public:
+  CallBrPrepare() : FunctionPass(ID) {}
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnFunction(Function &F) override;
+  static char ID;
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses CallBrPreparePass::run(Function &F,
+                                         FunctionAnalysisManager &FAM) {
+  bool Changed = false;
+  SmallVector<CallBrInst *, 2> CBRs = FindCallBrs(F);
+
+  if (CBRs.empty())
+    return PreservedAnalyses::all();
+
+  auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+
+  Changed |= SplitCriticalEdges(CBRs, DT);
+  Changed |= InsertIntrinsicCalls(CBRs, DT);
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  return PA;
+}
+
+char CallBrPrepare::ID = 0;
+INITIALIZE_PASS_BEGIN(CallBrPrepare, "callbrprepare", "Prepare callbr", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(CallBrPrepare, "callbrprepare", "Prepare callbr", false,
+                    false)
+
+FunctionPass *llvm::createCallBrPass() { return new CallBrPrepare(); }
+
+void CallBrPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addPreserved<DominatorTreeWrapperPass>();
+}
+
+SmallVector<CallBrInst *, 2> FindCallBrs(Function &F) {
+  SmallVector<CallBrInst *, 2> CBRs;
+  for (BasicBlock &BB : F)
+    if (auto *CBR = dyn_cast<CallBrInst>(BB.getTerminator()))
+      if (!CBR->getType()->isVoidTy() && !CBR->use_empty())
+        CBRs.push_back(CBR);
+  return CBRs;
+}
+
+bool SplitCriticalEdges(ArrayRef<CallBrInst *> CBRs, DominatorTree &DT) {
+  bool Changed = false;
+  CriticalEdgeSplittingOptions Options(&DT);
+  Options.setMergeIdenticalEdges();
+
+  // The indirect destination might be duplicated between another parameter...
+  //   %0 = callbr ... [label %x, label %x]
+  // ...hence MergeIdenticalEdges and AllowIdenticalEdges, but we don't need
+  // to split the default destination if it's duplicated between an indirect
+  // destination...
+  //   %1 = callbr ... to label %x [label %x]
+  // ...hence starting at 1 and checking against successor 0 (aka the default
+  // destination).
+  for (CallBrInst *CBR : CBRs)
+    for (unsigned i = 1, e = CBR->getNumSuccessors(); i != e; ++i)
+      if (CBR->getSuccessor(i) == CBR->getSuccessor(0) ||
+          isCriticalEdge(CBR, i, /*AllowIdenticalEdges*/ true))
+        if (SplitKnownCriticalEdge(CBR, i, Options))
+          Changed = true;
+  return Changed;
+}
+
+bool InsertIntrinsicCalls(ArrayRef<CallBrInst *> CBRs, DominatorTree &DT) {
+  bool Changed = false;
+  SmallPtrSet<const BasicBlock *, 4> Visited;
+  IRBuilder<> Builder(CBRs[0]->getContext());
+  for (CallBrInst *CBR : CBRs) {
+    if (!CBR->getNumIndirectDests())
+      continue;
+
+    SSAUpdater SSAUpdate;
+    SSAUpdate.Initialize(CBR->getType(), CBR->getName());
+    SSAUpdate.AddAvailableValue(CBR->getParent(), CBR);
+    SSAUpdate.AddAvailableValue(CBR->getDefaultDest(), CBR);
+
+    for (BasicBlock *IndDest : CBR->getIndirectDests()) {
+      if (!Visited.insert(IndDest).second)
+        continue;
+      Builder.SetInsertPoint(&*IndDest->begin());
+      CallInst *Intrinsic = Builder.CreateIntrinsic(
+          CBR->getType(), Intrinsic::callbr_landingpad, {CBR});
+      SSAUpdate.AddAvailableValue(IndDest, Intrinsic);
+      UpdateSSA(DT, CBR, Intrinsic, SSAUpdate);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+static bool IsInSameBasicBlock(const Use &U, const BasicBlock *BB) {
+  const auto *I = dyn_cast<Instruction>(U.getUser());
+  return I && I->getParent() == BB;
+}
+
+#ifndef NDEBUG
+static void PrintDebugDomInfo(const DominatorTree &DT, const Use &U,
+                              const BasicBlock *BB, bool IsDefaultDest) {
+  if (!isa<Instruction>(U.getUser()))
+    return;
+  LLVM_DEBUG(dbgs() << "Use: " << *U.getUser() << ", in block "
+                    << cast<Instruction>(U.getUser())->getParent()->getName()
+                    << ", is " << (DT.dominates(BB, U) ? "" : "NOT ")
+                    << "dominated by " << BB->getName() << " ("
+                    << (IsDefaultDest ? "in" : "") << "direct)\n");
+}
+#endif
+
+void UpdateSSA(DominatorTree &DT, CallBrInst *CBR, CallInst *Intrinsic,
+               SSAUpdater &SSAUpdate) {
+
+  SmallPtrSet<Use *, 4> Visited;
+  BasicBlock *DefaultDest = CBR->getDefaultDest();
+  BasicBlock *LandingPad = Intrinsic->getParent();
+
+  SmallVector<Use *, 4> Uses(make_pointer_range(CBR->uses()));
+  for (Use *U : Uses) {
+    if (!Visited.insert(U).second)
+      continue;
+
+#ifndef NDEBUG
+    PrintDebugDomInfo(DT, *U, LandingPad, /*IsDefaultDest*/ false);
+    PrintDebugDomInfo(DT, *U, DefaultDest, /*IsDefaultDest*/ true);
+#endif
+
+    // Don't rewrite the use in the newly inserted intrinsic.
+    if (const auto *II = dyn_cast<IntrinsicInst>(U->getUser()))
+      if (II->getIntrinsicID() == Intrinsic::callbr_landingpad)
+        continue;
+
+    // If the Use is in the same BasicBlock as the Intrinsic call, replace
+    // the Use with the value of the Intrinsic call.
+    if (IsInSameBasicBlock(*U, LandingPad)) {
+      U->set(Intrinsic);
+      continue;
+    }
+
+    // If the Use is dominated by the default dest, do not touch it.
+    if (DT.dominates(DefaultDest, *U))
+      continue;
+
+    SSAUpdate.RewriteUse(*U);
+  }
+}
+
+bool CallBrPrepare::runOnFunction(Function &F) {
+  bool Changed = false;
+  SmallVector<CallBrInst *, 2> CBRs = FindCallBrs(F);
+
+  if (CBRs.empty())
+    return Changed;
+
+  // It's highly likely that most programs do not contain CallBrInsts. Follow a
+  // similar pattern from SafeStackLegacyPass::runOnFunction to reuse previous
+  // domtree analysis if available, otherwise compute it lazily. This avoids
+  // forcing Dominator Tree Construction at -O0 for programs that likely do not
+  // contain CallBrInsts. It does pessimize programs with callbr at higher
+  // optimization levels, as the DominatorTree created here is not reused by
+  // subsequent passes.
+  DominatorTree *DT;
+  std::optional<DominatorTree> LazilyComputedDomTree;
+  if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
+    DT = &DTWP->getDomTree();
+  else {
+    LazilyComputedDomTree.emplace(F);
+    DT = &*LazilyComputedDomTree;
+  }
+
+  if (SplitCriticalEdges(CBRs, *DT))
+    Changed = true;
+
+  if (InsertIntrinsicCalls(CBRs, *DT))
+    Changed = true;
+
+  return Changed;
+}

>From e1df87052c03f43a51fcb696eff5c994c433f385 Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Thu, 5 Feb 2026 01:27:07 -0800
Subject: [PATCH 24/29] Remove temporarily.

---
 llvm/include/llvm/CodeGen/InlineAsmPrepare.h |  31 -
 llvm/lib/CodeGen/InlineAsmPrepare.cpp        | 597 -------------------
 2 files changed, 628 deletions(-)
 delete mode 100644 llvm/include/llvm/CodeGen/InlineAsmPrepare.h
 delete mode 100644 llvm/lib/CodeGen/InlineAsmPrepare.cpp

diff --git a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
deleted file mode 100644
index 130346084b428..0000000000000
--- a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
+++ /dev/null
@@ -1,31 +0,0 @@
-//===-- InlineAsmPrepare - Prepare inline asm for code gen ------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CODEGEN_INLINEASMPREPARE_H
-#define LLVM_CODEGEN_INLINEASMPREPARE_H
-
-#include "llvm/IR/PassManager.h"
-#include "llvm/Support/Compiler.h"
-
-namespace llvm {
-
-class TargetMachine;
-
-class InlineAsmPreparePass : public PassInfoMixin<InlineAsmPreparePass> {
-  const TargetMachine *TM;
-
-public:
-  explicit InlineAsmPreparePass(const TargetMachine &TM) : TM(&TM) {}
-  LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
-
-  static bool isRequired() { return true; }
-};
-
-} // namespace llvm
-
-#endif // LLVM_CODEGEN_INLINEASMPREPARE_H
diff --git a/llvm/lib/CodeGen/InlineAsmPrepare.cpp b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
deleted file mode 100644
index 0442e99d999d2..0000000000000
--- a/llvm/lib/CodeGen/InlineAsmPrepare.cpp
+++ /dev/null
@@ -1,597 +0,0 @@
-//===-- InlineAsmPrepare - Prepare inline asm for code generation ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers callbrs and inline asm in LLVM IR in order to to assist
-// SelectionDAG's codegen.
-//
-// CallBrInst:
-//
-//   - Assists in inserting register copies for the output values of a callbr
-//     along the edges leading to the indirect target blocks. Though the output
-//     SSA value is defined by the callbr instruction itself in the IR
-//     representation, the value cannot be copied to the appropriate virtual
-//     registers prior to jumping to an indirect label, since the jump occurs
-//     within the user-provided assembly blob.
-//
-//     Instead, those copies must occur separately at the beginning of each
-//     indirect target. That requires that we create a separate SSA definition
-//     in each of them (via llvm.callbr.landingpad), and may require splitting
-//     critical edges so we have a location to place the intrinsic. Finally, we
-//     remap users of the original callbr output SSA value to instead point to
-//     the appropriate llvm.callbr.landingpad value.
-//
-//     Ideally, this could be done inside SelectionDAG, or in the
-//     MachineInstruction representation, without the use of an IR-level
-//     intrinsic.  But, within the current framework, it’s simpler to implement
-//     as an IR pass.  (If support for callbr in GlobalISel is implemented,
-//     it’s worth considering whether this is still required.)
-//
-// InlineAsm:
-//
-//   - Prepares inline assembly for code generation with the fast register
-//     allocator. In particular, it defaults "rm" (register-or-memory) to
-//     prefer the "m" constraints (the front-end opts for the "r" constraint),
-//     simplifying register allocation by forcing operands to memory locations.
-//     The other register allocators are equipped to handle folding registers
-//     already, so don't need to change the default.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CodeGen/InlineAsmPrepare.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "inline-asm-prepare"
-
-namespace {
-
-class InlineAsmPrepare : public FunctionPass {
-public:
-  InlineAsmPrepare() : FunctionPass(ID) {}
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<TargetPassConfig>();
-    AU.addPreserved<DominatorTreeWrapperPass>();
-    AU.setPreservesCFG();
-  }
-  bool runOnFunction(Function &F) override;
-
-  static char ID;
-};
-
-char InlineAsmPrepare::ID = 0;
-
-} // end anonymous namespace
-
-INITIALIZE_PASS_BEGIN(InlineAsmPrepare, DEBUG_TYPE, "Prepare inline asm insts",
-                      false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(InlineAsmPrepare, DEBUG_TYPE, "Prepare inline asm insts",
-                    false, false)
-
-FunctionPass *llvm::createInlineAsmPass() { return new InlineAsmPrepare(); }
-
-//===----------------------------------------------------------------------===//
-//                     Process InlineAsm instructions
-//===----------------------------------------------------------------------===//
-
-/// The inline asm constraint allows both register and memory.
-static bool IsRegMemConstraint(StringRef Constraint) {
-  return Constraint.size() == 2 && (Constraint == "rm" || Constraint == "mr");
-}
-
-/// Tag "rm" output constraints with '*' to signify that they default to a
-/// memory location.
-static std::pair<std::string, bool>
-ConvertConstraintsToMemory(StringRef ConstraintStr) {
-  auto I = ConstraintStr.begin(), E = ConstraintStr.end();
-  std::string Out;
-  raw_string_ostream O(Out);
-  bool HasRegMem = false;
-
-  while (I != E) {
-    bool IsOutput = false;
-    bool HasIndirect = false;
-    if (*I == '=') {
-      O << *I;
-      IsOutput = true;
-      ++I;
-    }
-    if (*I == '*') {
-      O << '*';
-      HasIndirect = true;
-      ++I;
-    }
-    if (*I == '+') {
-      O << '+';
-      IsOutput = true;
-      ++I;
-    }
-
-    auto Comma = std::find(I, E, ',');
-    std::string Sub(I, Comma);
-    if (IsRegMemConstraint(Sub)) {
-      HasRegMem = true;
-      if (IsOutput && !HasIndirect)
-        O << '*';
-    }
-
-    O << Sub;
-
-    if (Comma == E)
-      break;
-
-    O << ',';
-    I = Comma + 1;
-  }
-
-  return std::make_pair(Out, HasRegMem);
-}
-
-/// Build a map of tied constraints. TiedOutput[i] = j means Constraint i is an
-/// input tied to output constraint j.
-static void
-BuildTiedConstraintMap(const InlineAsm::ConstraintInfoVector &Constraints,
-                       SmallVectorImpl<int> &TiedOutput) {
-  for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
-    const InlineAsm::ConstraintInfo &C = Constraints[I];
-    if (C.Type == InlineAsm::isOutput && C.hasMatchingInput()) {
-      int InputIdx = C.MatchingInput;
-      if (InputIdx >= 0 && InputIdx < (int)Constraints.size())
-        TiedOutput[InputIdx] = I;
-    }
-
-    if (C.Type == InlineAsm::isInput && C.hasMatchingInput()) {
-      int OutputIdx = C.MatchingInput;
-      if (OutputIdx >= 0 && OutputIdx < (int)Constraints.size())
-        TiedOutput[I] = OutputIdx;
-    }
-  }
-}
-
-/// Process an output constraint, creating allocas for converted constraints.
-static void ProcessOutputConstraint(
-    const InlineAsm::ConstraintInfo &C, Type *RetTy, unsigned OutputIdx,
-    IRBuilder<> &EntryBuilder, SmallVectorImpl<Value *> &NewArgs,
-    SmallVectorImpl<Type *> &NewArgTypes, SmallVectorImpl<Type *> &NewRetTypes,
-    SmallVectorImpl<std::pair<unsigned, Type *>> &ElementTypeAttrs,
-    SmallVectorImpl<AllocaInst *> &OutputAllocas, unsigned ConstraintIdx) {
-  Type *SlotTy = RetTy;
-  if (StructType *ST = dyn_cast<StructType>(RetTy))
-    SlotTy = ST->getElementType(OutputIdx);
-
-  if (C.hasRegMemConstraints()) {
-    // Converted to memory constraint. Create alloca and pass pointer as
-    // argument.
-    AllocaInst *Slot = EntryBuilder.CreateAlloca(SlotTy, nullptr, "asm_mem");
-    NewArgs.push_back(Slot);
-    NewArgTypes.push_back(Slot->getType());
-    ElementTypeAttrs.push_back({NewArgs.size() - 1, SlotTy});
-    OutputAllocas[ConstraintIdx] = Slot;
-    // No return value for this output since it's now an out-parameter.
-  } else {
-    // Unchanged, still an output return value.
-    NewRetTypes.push_back(SlotTy);
-  }
-}
-
-/// Process an input constraint, handling tied constraints and conversions.
-static void ProcessInputConstraint(const InlineAsm::ConstraintInfo &C,
-                                   Value *ArgVal, ArrayRef<int> TiedOutput,
-                                   ArrayRef<AllocaInst *> OutputAllocas,
-                                   unsigned ConstraintIdx, IRBuilder<> &Builder,
-                                   IRBuilder<> &EntryBuilder,
-                                   SmallVectorImpl<Value *> &NewArgs,
-                                   SmallVectorImpl<Type *> &NewArgTypes) {
-  Type *ArgTy = ArgVal->getType();
-
-  if (TiedOutput[ConstraintIdx] != -1) {
-    int MatchIdx = TiedOutput[ConstraintIdx];
-    if (AllocaInst *Slot = OutputAllocas[MatchIdx]) {
-      // The matched output was converted to memory. Store this input into the
-      // alloca.
-      Builder.CreateStore(ArgVal, Slot);
-
-      // Pass the alloca pointer as the argument, instead of ArgVal. This
-      // ensures the tied "0" constraint matches the "*m" output.
-      NewArgs.push_back(Slot);
-      NewArgTypes.push_back(Slot->getType());
-      return;
-    }
-  }
-
-  if (C.hasRegMemConstraints()) {
-    // Converted to memory constraint. Create alloca, store input, pass pointer
-    // as argument.
-    AllocaInst *Slot = EntryBuilder.CreateAlloca(ArgTy, nullptr, "asm_mem");
-    Builder.CreateStore(ArgVal, Slot);
-    NewArgs.push_back(Slot);
-    NewArgTypes.push_back(Slot->getType());
-  } else {
-    // Unchanged
-    NewArgs.push_back(ArgVal);
-    NewArgTypes.push_back(ArgTy);
-  }
-}
-
-/// Build the return type from the collected return types.
-static Type *BuildReturnType(ArrayRef<Type *> NewRetTypes,
-                             LLVMContext &Context) {
-  if (NewRetTypes.empty())
-    return Type::getVoidTy(Context);
-
-  if (NewRetTypes.size() == 1)
-    return NewRetTypes[0];
-
-  return StructType::get(Context, NewRetTypes);
-}
-
-/// Create the new inline assembly call with converted constraints.
-static CallInst *CreateNewInlineAsm(
-    InlineAsm *IA, const std::string &NewConstraintStr, Type *NewRetTy,
-    const SmallVectorImpl<Type *> &NewArgTypes,
-    const SmallVectorImpl<Value *> &NewArgs,
-    const SmallVectorImpl<std::pair<unsigned, Type *>> &ElementTypeAttrs,
-    CallBase *CB, IRBuilder<> &Builder, LLVMContext &Context) {
-  FunctionType *NewFTy = FunctionType::get(NewRetTy, NewArgTypes, false);
-  InlineAsm *NewIA = InlineAsm::get(
-      NewFTy, IA->getAsmString(), NewConstraintStr, IA->hasSideEffects(),
-      IA->isAlignStack(), IA->getDialect(), IA->canThrow());
-
-  CallInst *NewCall = Builder.CreateCall(NewFTy, NewIA, NewArgs);
-  NewCall->setCallingConv(CB->getCallingConv());
-  NewCall->setAttributes(CB->getAttributes());
-  NewCall->setDebugLoc(CB->getDebugLoc());
-
-  for (const std::pair<unsigned, Type *> &Item : ElementTypeAttrs)
-    NewCall->addParamAttr(
-        Item.first,
-        Attribute::get(Context, Attribute::ElementType, Item.second));
-
-  return NewCall;
-}
-
-/// Reconstruct the return value from the new call and allocas.
-static Value *
-ReconstructReturnValue(Type *RetTy, CallInst *NewCall,
-                       const InlineAsm::ConstraintInfoVector &Constraints,
-                       const SmallVectorImpl<AllocaInst *> &OutputAllocas,
-                       const SmallVectorImpl<Type *> &NewRetTypes,
-                       IRBuilder<> &Builder) {
-  if (RetTy->isVoidTy())
-    return nullptr;
-
-  if (isa<StructType>(RetTy)) {
-    // Multiple outputs. Reconstruct the struct.
-    Value *Res = PoisonValue::get(RetTy);
-    unsigned NewRetIdx = 0;
-    unsigned OriginalOutIdx = 0;
-
-    for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
-      if (Constraints[I].Type != InlineAsm::isOutput)
-        continue;
-
-      Value *Val = nullptr;
-      if (AllocaInst *Slot = OutputAllocas[I]) {
-        // Converted to memory. Load from alloca.
-        Val = Builder.CreateLoad(Slot->getAllocatedType(), Slot);
-      } else {
-        // Not converted. Extract from NewCall return.
-        if (NewRetTypes.size() == 1) {
-          Val = NewCall;
-        } else {
-          Val = Builder.CreateExtractValue(NewCall, NewRetIdx);
-        }
-        NewRetIdx++;
-      }
-
-      Res = Builder.CreateInsertValue(Res, Val, OriginalOutIdx++);
-    }
-
-    return Res;
-  }
-
-  // Single output.
-  // Find the output constraint (should be the first one).
-  unsigned OutConstraintIdx = 0;
-  for (unsigned I = 0; I < Constraints.size(); ++I) {
-    if (Constraints[I].Type == InlineAsm::isOutput) {
-      OutConstraintIdx = I;
-      break;
-    }
-  }
-
-  if (AllocaInst *Slot = OutputAllocas[OutConstraintIdx])
-    return Builder.CreateLoad(Slot->getAllocatedType(), Slot);
-
-  return NewCall;
-}
-
-static bool ProcessInlineAsm(Function &F, CallBase *CB) {
-  InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand());
-  const InlineAsm::ConstraintInfoVector &Constraints = IA->ParseConstraints();
-
-  auto [NewConstraintStr, HasRegMem] =
-      ConvertConstraintsToMemory(IA->getConstraintString());
-  if (!HasRegMem)
-    return false;
-
-  IRBuilder<> Builder(CB);
-  IRBuilder<> EntryBuilder(&F.getEntryBlock(), F.getEntryBlock().begin());
-
-  // Collect new arguments and return types.
-  SmallVector<Value *, 8> NewArgs;
-  SmallVector<Type *, 8> NewArgTypes;
-  SmallVector<Type *, 2> NewRetTypes;
-  SmallVector<std::pair<unsigned, Type *>, 8> ElementTypeAttrs;
-
-  // Track allocas created for converted outputs.
-  SmallVector<AllocaInst *, 8> OutputAllocas(Constraints.size(), nullptr);
-
-  // Build tied constraint map.
-  SmallVector<int, 8> TiedOutput(Constraints.size(), -1);
-  BuildTiedConstraintMap(Constraints, TiedOutput);
-
-  // Process constraints.
-  unsigned ArgNo = 0;
-  unsigned OutputIdx = 0;
-  for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
-    const InlineAsm::ConstraintInfo &C = Constraints[I];
-
-    if (C.Type == InlineAsm::isOutput) {
-      ProcessOutputConstraint(C, CB->getType(), OutputIdx, EntryBuilder,
-                              NewArgs, NewArgTypes, NewRetTypes,
-                              ElementTypeAttrs, OutputAllocas, I);
-      OutputIdx++;
-    } else if (C.Type == InlineAsm::isInput) {
-      Value *ArgVal = CB->getArgOperand(ArgNo);
-      ProcessInputConstraint(C, ArgVal, TiedOutput, OutputAllocas, I, Builder,
-                             EntryBuilder, NewArgs, NewArgTypes);
-      ArgNo++;
-    }
-  }
-
-  // Build the new return type.
-  Type *NewRetTy = BuildReturnType(NewRetTypes, F.getContext());
-
-  // Create the new inline assembly call.
-  CallInst *NewCall =
-      CreateNewInlineAsm(IA, NewConstraintStr, NewRetTy, NewArgTypes, NewArgs,
-                         ElementTypeAttrs, CB, Builder, F.getContext());
-
-  // Reconstruct the return value and update users.
-  if (!CB->use_empty()) {
-    if (Value *Replacement =
-            ReconstructReturnValue(CB->getType(), NewCall, Constraints,
-                                   OutputAllocas, NewRetTypes, Builder))
-      CB->replaceAllUsesWith(Replacement);
-  }
-
-  CB->eraseFromParent();
-  return true;
-}
-
-//===----------------------------------------------------------------------===//
-//                           Process CallBrInsts
-//===----------------------------------------------------------------------===//
-
-/// The Use is in the same BasicBlock as the intrinsic call.
-static bool IsInSameBasicBlock(const Use &U, const BasicBlock *BB) {
-  const auto *I = dyn_cast<Instruction>(U.getUser());
-  return I && I->getParent() == BB;
-}
-
-#ifndef NDEBUG
-static void PrintDebugDomInfo(const DominatorTree &DT, const Use &U,
-                              const BasicBlock *BB, bool IsDefaultDest) {
-  if (isa<Instruction>(U.getUser()))
-    LLVM_DEBUG(dbgs() << "Use: " << *U.getUser() << ", in block "
-                      << cast<Instruction>(U.getUser())->getParent()->getName()
-                      << ", is " << (DT.dominates(BB, U) ? "" : "NOT ")
-                      << "dominated by " << BB->getName() << " ("
-                      << (IsDefaultDest ? "in" : "") << "direct)\n");
-}
-#endif
-
-static void UpdateSSA(DominatorTree &DT, CallBrInst *CBR, CallInst *Intrinsic,
-                      SSAUpdater &SSAUpdate) {
-  SmallPtrSet<Use *, 4> Visited;
-
-  BasicBlock *DefaultDest = CBR->getDefaultDest();
-  BasicBlock *LandingPad = Intrinsic->getParent();
-  SmallVector<Use *, 4> Uses(make_pointer_range(CBR->uses()));
-
-  for (Use *U : Uses) {
-    if (!Visited.insert(U).second)
-      continue;
-
-#ifndef NDEBUG
-    PrintDebugDomInfo(DT, *U, LandingPad, /*IsDefaultDest*/ false);
-    PrintDebugDomInfo(DT, *U, DefaultDest, /*IsDefaultDest*/ true);
-#endif
-
-    // Don't rewrite the use in the newly inserted intrinsic.
-    if (const auto *II = dyn_cast<IntrinsicInst>(U->getUser()))
-      if (II->getIntrinsicID() == Intrinsic::callbr_landingpad)
-        continue;
-
-    // If the Use is in the same BasicBlock as the Intrinsic call, replace
-    // the Use with the value of the Intrinsic call.
-    if (IsInSameBasicBlock(*U, LandingPad)) {
-      U->set(Intrinsic);
-      continue;
-    }
-
-    // If the Use is dominated by the default dest, do not touch it.
-    if (DT.dominates(DefaultDest, *U))
-      continue;
-
-    SSAUpdate.RewriteUse(*U);
-  }
-}
-
-static bool SplitCriticalEdges(CallBrInst *CBR, DominatorTree *DT) {
-  bool Changed = false;
-
-  CriticalEdgeSplittingOptions Options(DT);
-  Options.setMergeIdenticalEdges();
-
-  // The indirect destination might be duplicated between another parameter...
-  //
-  //   %0 = callbr ... [label %x, label %x]
-  //
-  // ...hence MergeIdenticalEdges and AllowIndentical edges, but we don't need
-  // to split the default destination if it's duplicated between an indirect
-  // destination...
-  //
-  //   %1 = callbr ... to label %x [label %x]
-  //
-  // ...hence starting at 1 and checking against successor 0 (aka the default
-  // destination).
-  for (unsigned i = 1, e = CBR->getNumSuccessors(); i != e; ++i)
-    if (CBR->getSuccessor(i) == CBR->getSuccessor(0) ||
-        isCriticalEdge(CBR, i, /*AllowIdenticalEdges*/ true))
-      if (SplitKnownCriticalEdge(CBR, i, Options))
-        Changed = true;
-
-  return Changed;
-}
-
-static bool InsertIntrinsicCalls(CallBrInst *CBR, DominatorTree &DT) {
-  bool Changed = false;
-  SmallPtrSet<const BasicBlock *, 4> Visited;
-  IRBuilder<> Builder(CBR->getContext());
-
-  if (!CBR->getNumIndirectDests())
-    return false;
-
-  SSAUpdater SSAUpdate;
-  SSAUpdate.Initialize(CBR->getType(), CBR->getName());
-  SSAUpdate.AddAvailableValue(CBR->getParent(), CBR);
-  SSAUpdate.AddAvailableValue(CBR->getDefaultDest(), CBR);
-
-  for (BasicBlock *IndDest : CBR->getIndirectDests()) {
-    if (!Visited.insert(IndDest).second)
-      continue;
-
-    Builder.SetInsertPoint(&*IndDest->begin());
-    CallInst *Intrinsic = Builder.CreateIntrinsic(
-        CBR->getType(), Intrinsic::callbr_landingpad, {CBR});
-    SSAUpdate.AddAvailableValue(IndDest, Intrinsic);
-    UpdateSSA(DT, CBR, Intrinsic, SSAUpdate);
-    Changed = true;
-  }
-
-  return Changed;
-}
-
-static bool ProcessCallBrInst(Function &F, CallBrInst *CBR, DominatorTree *DT) {
-  bool Changed = false;
-
-  Changed |= SplitCriticalEdges(CBR, DT);
-  Changed |= InsertIntrinsicCalls(CBR, *DT);
-
-  return Changed;
-}
-
-static bool runImpl(Function &F, ArrayRef<CallBase *> IAs, DominatorTree *DT) {
-  bool Changed = false;
-
-  for (CallBase *CB : IAs)
-    if (auto *CBR = dyn_cast<CallBrInst>(CB))
-      Changed |= ProcessCallBrInst(F, CBR, DT);
-    else
-      Changed |= ProcessInlineAsm(F, CB);
-
-  return Changed;
-}
-
-/// Find all inline assembly calls in the given function.
-static SmallVector<CallBase *, 4> FindInlineAsms(Function &F,
-                                                 const TargetMachine *TM) {
-  bool isOptLevelNone = TM->getOptLevel() == CodeGenOptLevel::None;
-  SmallVector<CallBase *, 4> InlineAsms;
-
-  for (BasicBlock &BB : F) {
-    if (auto *CBR = dyn_cast<CallBrInst>(BB.getTerminator())) {
-      if (!CBR->getType()->isVoidTy() && !CBR->use_empty())
-        InlineAsms.push_back(CBR);
-      continue;
-    }
-
-    if (isOptLevelNone)
-      // Only inline assembly compiled at '-O0' (i.e. uses the fast register
-      // allocator) needs to be processed.
-      for (Instruction &I : BB)
-        if (CallBase *CB = dyn_cast<CallBase>(&I); CB && CB->isInlineAsm())
-          InlineAsms.push_back(CB);
-  }
-
-  return InlineAsms;
-}
-
-bool InlineAsmPrepare::runOnFunction(Function &F) {
-  const auto *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
-  SmallVector<CallBase *, 4> IAs = FindInlineAsms(F, TM);
-  if (IAs.empty())
-    return false;
-
-  // It's highly likely that most programs do not contain CallBrInsts. Follow a
-  // similar pattern from SafeStackLegacyPass::runOnFunction to reuse previous
-  // domtree analysis if available, otherwise compute it lazily. This avoids
-  // forcing Dominator Tree Construction at -O0 for programs that likely do not
-  // contain CallBrInsts. It does pessimize programs with callbr at higher
-  // optimization levels, as the DominatorTree created here is not reused by
-  // subsequent passes.
-  DominatorTree *DT;
-  std::optional<DominatorTree> LazilyComputedDomTree;
-  if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
-    DT = &DTWP->getDomTree();
-  else {
-    LazilyComputedDomTree.emplace(F);
-    DT = &*LazilyComputedDomTree;
-  }
-
-  return runImpl(F, IAs, DT);
-}
-
-PreservedAnalyses InlineAsmPreparePass::run(Function &F,
-                                            FunctionAnalysisManager &FAM) {
-  SmallVector<CallBase *, 4> IAs = FindInlineAsms(F, TM);
-  if (IAs.empty())
-    return PreservedAnalyses::all();
-
-  DominatorTree *DT = &FAM.getResult<DominatorTreeAnalysis>(F);
-
-  if (runImpl(F, IAs, DT)) {
-    PreservedAnalyses PA;
-    PA.preserve<DominatorTreeAnalysis>();
-    PA.preserveSet<CFGAnalyses>();
-    return PA;
-  }
-
-  return PreservedAnalyses::all();
-}

>From f39f7af73a5460ba859e62402cfc40cefb8c339a Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Thu, 5 Feb 2026 01:27:54 -0800
Subject: [PATCH 25/29] Rename

---
 llvm/include/llvm/CodeGen/{CallBrPrepare.h => InlineAsmPrepare.h} | 0
 llvm/lib/CodeGen/{CallBrPrepare.cpp => InlineAsmPrepare.cpp}      | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename llvm/include/llvm/CodeGen/{CallBrPrepare.h => InlineAsmPrepare.h} (100%)
 rename llvm/lib/CodeGen/{CallBrPrepare.cpp => InlineAsmPrepare.cpp} (100%)

diff --git a/llvm/include/llvm/CodeGen/CallBrPrepare.h b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
similarity index 100%
rename from llvm/include/llvm/CodeGen/CallBrPrepare.h
rename to llvm/include/llvm/CodeGen/InlineAsmPrepare.h
diff --git a/llvm/lib/CodeGen/CallBrPrepare.cpp b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
similarity index 100%
rename from llvm/lib/CodeGen/CallBrPrepare.cpp
rename to llvm/lib/CodeGen/InlineAsmPrepare.cpp

>From b6495490856b3cdfc3547db2b925ca4d332d152e Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Thu, 5 Feb 2026 01:28:32 -0800
Subject: [PATCH 26/29] Merge.

---
 llvm/include/llvm/CodeGen/InlineAsmPrepare.h |  20 +-
 llvm/lib/CodeGen/InlineAsmPrepare.cpp        | 599 +++++++++++++++----
 2 files changed, 486 insertions(+), 133 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
index d44d30b0adc17..130346084b428 100644
--- a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
+++ b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
@@ -1,4 +1,4 @@
-//===-- CallBrPrepare - Prepare callbr for code generation ------*- C++ -*-===//
+//===-- InlineAsmPrepare - Prepare inline asm for code gen ------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,18 +6,26 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_CALLBRPREPARE_H
-#define LLVM_CODEGEN_CALLBRPREPARE_H
+#ifndef LLVM_CODEGEN_INLINEASMPREPARE_H
+#define LLVM_CODEGEN_INLINEASMPREPARE_H
 
 #include "llvm/IR/PassManager.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm {
 
-class CallBrPreparePass : public PassInfoMixin<CallBrPreparePass> {
+class TargetMachine;
+
+class InlineAsmPreparePass : public PassInfoMixin<InlineAsmPreparePass> {
+  const TargetMachine *TM;
+
 public:
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+  explicit InlineAsmPreparePass(const TargetMachine &TM) : TM(&TM) {}
+  LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+
+  static bool isRequired() { return true; }
 };
 
 } // namespace llvm
 
-#endif // LLVM_CODEGEN_CALLBRPREPARE_H
+#endif // LLVM_CODEGEN_INLINEASMPREPARE_H
diff --git a/llvm/lib/CodeGen/InlineAsmPrepare.cpp b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
index 77a0d0b653871..0442e99d999d2 100644
--- a/llvm/lib/CodeGen/InlineAsmPrepare.cpp
+++ b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
@@ -1,4 +1,4 @@
-//===-- CallBrPrepare - Prepare callbr for code generation ----------------===//
+//===-- InlineAsmPrepare - Prepare inline asm for code generation ---------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,164 +6,399 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This pass lowers callbrs in LLVM IR in order to to assist SelectionDAG's
-// codegen.
+// This pass lowers callbrs and inline asm in LLVM IR in order to assist
+// SelectionDAG's codegen.
 //
-// In particular, this pass assists in inserting register copies for the output
-// values of a callbr along the edges leading to the indirect target blocks.
-// Though the output SSA value is defined by the callbr instruction itself in
-// the IR representation, the value cannot be copied to the appropriate virtual
-// registers prior to jumping to an indirect label, since the jump occurs
-// within the user-provided assembly blob.
+// CallBrInst:
 //
-// Instead, those copies must occur separately at the beginning of each
-// indirect target. That requires that we create a separate SSA definition in
-// each of them (via llvm.callbr.landingpad), and may require splitting
-// critical edges so we have a location to place the intrinsic. Finally, we
-// remap users of the original callbr output SSA value to instead point to the
-// appropriate llvm.callbr.landingpad value.
+//   - Assists in inserting register copies for the output values of a callbr
+//     along the edges leading to the indirect target blocks. Though the output
+//     SSA value is defined by the callbr instruction itself in the IR
+//     representation, the value cannot be copied to the appropriate virtual
+//     registers prior to jumping to an indirect label, since the jump occurs
+//     within the user-provided assembly blob.
 //
-// Ideally, this could be done inside SelectionDAG, or in the
-// MachineInstruction representation, without the use of an IR-level intrinsic.
-// But, within the current framework, it’s simpler to implement as an IR pass.
-// (If support for callbr in GlobalISel is implemented, it’s worth considering
-// whether this is still required.)
+//     Instead, those copies must occur separately at the beginning of each
+//     indirect target. That requires that we create a separate SSA definition
+//     in each of them (via llvm.callbr.landingpad), and may require splitting
+//     critical edges so we have a location to place the intrinsic. Finally, we
+//     remap users of the original callbr output SSA value to instead point to
+//     the appropriate llvm.callbr.landingpad value.
+//
+//     Ideally, this could be done inside SelectionDAG, or in the
+//     MachineInstruction representation, without the use of an IR-level
+//     intrinsic.  But, within the current framework, it’s simpler to implement
+//     as an IR pass.  (If support for callbr in GlobalISel is implemented,
+//     it’s worth considering whether this is still required.)
+//
+// InlineAsm:
+//
+//   - Prepares inline assembly for code generation with the fast register
+//     allocator. In particular, it defaults "rm" (register-or-memory) to
+//     prefer the "m" constraints (the front-end opts for the "r" constraint),
+//     simplifying register allocation by forcing operands to memory locations.
+//     The other register allocators are equipped to handle folding registers
+//     already, so they don't need to change the default.
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/CallBrPrepare.h"
+#include "llvm/CodeGen/InlineAsmPrepare.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/iterator.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/BasicBlock.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 
 using namespace llvm;
 
-#define DEBUG_TYPE "callbr-prepare"
-
-static bool SplitCriticalEdges(ArrayRef<CallBrInst *> CBRs, DominatorTree &DT);
-static bool InsertIntrinsicCalls(ArrayRef<CallBrInst *> CBRs,
-                                 DominatorTree &DT);
-static void UpdateSSA(DominatorTree &DT, CallBrInst *CBR, CallInst *Intrinsic,
-                      SSAUpdater &SSAUpdate);
-static SmallVector<CallBrInst *, 2> FindCallBrs(Function &F);
+#define DEBUG_TYPE "inline-asm-prepare"
 
 namespace {
 
-class CallBrPrepare : public FunctionPass {
+class InlineAsmPrepare : public FunctionPass {
 public:
-  CallBrPrepare() : FunctionPass(ID) {}
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  InlineAsmPrepare() : FunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.setPreservesCFG();
+  }
   bool runOnFunction(Function &F) override;
+
   static char ID;
 };
 
-} // end anonymous namespace
+char InlineAsmPrepare::ID = 0;
 
-PreservedAnalyses CallBrPreparePass::run(Function &F,
-                                         FunctionAnalysisManager &FAM) {
-  bool Changed = false;
-  SmallVector<CallBrInst *, 2> CBRs = FindCallBrs(F);
+} // end anonymous namespace
 
-  if (CBRs.empty())
-    return PreservedAnalyses::all();
+INITIALIZE_PASS_BEGIN(InlineAsmPrepare, DEBUG_TYPE, "Prepare inline asm insts",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(InlineAsmPrepare, DEBUG_TYPE, "Prepare inline asm insts",
+                    false, false)
 
-  auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+FunctionPass *llvm::createInlineAsmPass() { return new InlineAsmPrepare(); }
 
-  Changed |= SplitCriticalEdges(CBRs, DT);
-  Changed |= InsertIntrinsicCalls(CBRs, DT);
+//===----------------------------------------------------------------------===//
+//                     Process InlineAsm instructions
+//===----------------------------------------------------------------------===//
 
-  if (!Changed)
-    return PreservedAnalyses::all();
-  PreservedAnalyses PA;
-  PA.preserve<DominatorTreeAnalysis>();
-  return PA;
+/// The inline asm constraint allows both register and memory.
+static bool IsRegMemConstraint(StringRef Constraint) {
+  return Constraint.size() == 2 && (Constraint == "rm" || Constraint == "mr");
 }
 
-char CallBrPrepare::ID = 0;
-INITIALIZE_PASS_BEGIN(CallBrPrepare, "callbrprepare", "Prepare callbr", false,
-                      false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(CallBrPrepare, "callbrprepare", "Prepare callbr", false,
-                    false)
+/// Tag "rm" output constraints with '*' to signify that they default to a
+/// memory location.
+static std::pair<std::string, bool>
+ConvertConstraintsToMemory(StringRef ConstraintStr) {
+  auto I = ConstraintStr.begin(), E = ConstraintStr.end();
+  std::string Out;
+  raw_string_ostream O(Out);
+  bool HasRegMem = false;
+
+  while (I != E) {
+    bool IsOutput = false;
+    bool HasIndirect = false;
+    if (*I == '=') {
+      O << *I;
+      IsOutput = true;
+      ++I;
+    }
+    if (*I == '*') {
+      O << '*';
+      HasIndirect = true;
+      ++I;
+    }
+    if (*I == '+') {
+      O << '+';
+      IsOutput = true;
+      ++I;
+    }
+
+    auto Comma = std::find(I, E, ',');
+    std::string Sub(I, Comma);
+    if (IsRegMemConstraint(Sub)) {
+      HasRegMem = true;
+      if (IsOutput && !HasIndirect)
+        O << '*';
+    }
 
-FunctionPass *llvm::createCallBrPass() { return new CallBrPrepare(); }
+    O << Sub;
 
-void CallBrPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addPreserved<DominatorTreeWrapperPass>();
+    if (Comma == E)
+      break;
+
+    O << ',';
+    I = Comma + 1;
+  }
+
+  return std::make_pair(Out, HasRegMem);
 }
 
-SmallVector<CallBrInst *, 2> FindCallBrs(Function &F) {
-  SmallVector<CallBrInst *, 2> CBRs;
-  for (BasicBlock &BB : F)
-    if (auto *CBR = dyn_cast<CallBrInst>(BB.getTerminator()))
-      if (!CBR->getType()->isVoidTy() && !CBR->use_empty())
-        CBRs.push_back(CBR);
-  return CBRs;
+/// Build a map of tied constraints. TiedOutput[i] = j means Constraint i is an
+/// input tied to output constraint j.
+static void
+BuildTiedConstraintMap(const InlineAsm::ConstraintInfoVector &Constraints,
+                       SmallVectorImpl<int> &TiedOutput) {
+  for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
+    const InlineAsm::ConstraintInfo &C = Constraints[I];
+    if (C.Type == InlineAsm::isOutput && C.hasMatchingInput()) {
+      int InputIdx = C.MatchingInput;
+      if (InputIdx >= 0 && InputIdx < (int)Constraints.size())
+        TiedOutput[InputIdx] = I;
+    }
+
+    if (C.Type == InlineAsm::isInput && C.hasMatchingInput()) {
+      int OutputIdx = C.MatchingInput;
+      if (OutputIdx >= 0 && OutputIdx < (int)Constraints.size())
+        TiedOutput[I] = OutputIdx;
+    }
+  }
 }
 
-bool SplitCriticalEdges(ArrayRef<CallBrInst *> CBRs, DominatorTree &DT) {
-  bool Changed = false;
-  CriticalEdgeSplittingOptions Options(&DT);
-  Options.setMergeIdenticalEdges();
+/// Process an output constraint, creating allocas for converted constraints.
+static void ProcessOutputConstraint(
+    const InlineAsm::ConstraintInfo &C, Type *RetTy, unsigned OutputIdx,
+    IRBuilder<> &EntryBuilder, SmallVectorImpl<Value *> &NewArgs,
+    SmallVectorImpl<Type *> &NewArgTypes, SmallVectorImpl<Type *> &NewRetTypes,
+    SmallVectorImpl<std::pair<unsigned, Type *>> &ElementTypeAttrs,
+    SmallVectorImpl<AllocaInst *> &OutputAllocas, unsigned ConstraintIdx) {
+  Type *SlotTy = RetTy;
+  if (StructType *ST = dyn_cast<StructType>(RetTy))
+    SlotTy = ST->getElementType(OutputIdx);
+
+  if (C.hasRegMemConstraints()) {
+    // Converted to memory constraint. Create alloca and pass pointer as
+    // argument.
+    AllocaInst *Slot = EntryBuilder.CreateAlloca(SlotTy, nullptr, "asm_mem");
+    NewArgs.push_back(Slot);
+    NewArgTypes.push_back(Slot->getType());
+    ElementTypeAttrs.push_back({NewArgs.size() - 1, SlotTy});
+    OutputAllocas[ConstraintIdx] = Slot;
+    // No return value for this output since it's now an out-parameter.
+  } else {
+    // Unchanged, still an output return value.
+    NewRetTypes.push_back(SlotTy);
+  }
+}
 
-  // The indirect destination might be duplicated between another parameter...
-  //   %0 = callbr ... [label %x, label %x]
-  // ...hence MergeIdenticalEdges and AllowIndentical edges, but we don't need
-  // to split the default destination if it's duplicated between an indirect
-  // destination...
-  //   %1 = callbr ... to label %x [label %x]
-  // ...hence starting at 1 and checking against successor 0 (aka the default
-  // destination).
-  for (CallBrInst *CBR : CBRs)
-    for (unsigned i = 1, e = CBR->getNumSuccessors(); i != e; ++i)
-      if (CBR->getSuccessor(i) == CBR->getSuccessor(0) ||
-          isCriticalEdge(CBR, i, /*AllowIdenticalEdges*/ true))
-        if (SplitKnownCriticalEdge(CBR, i, Options))
-          Changed = true;
-  return Changed;
+/// Process an input constraint, handling tied constraints and conversions.
+static void ProcessInputConstraint(const InlineAsm::ConstraintInfo &C,
+                                   Value *ArgVal, ArrayRef<int> TiedOutput,
+                                   ArrayRef<AllocaInst *> OutputAllocas,
+                                   unsigned ConstraintIdx, IRBuilder<> &Builder,
+                                   IRBuilder<> &EntryBuilder,
+                                   SmallVectorImpl<Value *> &NewArgs,
+                                   SmallVectorImpl<Type *> &NewArgTypes) {
+  Type *ArgTy = ArgVal->getType();
+
+  if (TiedOutput[ConstraintIdx] != -1) {
+    int MatchIdx = TiedOutput[ConstraintIdx];
+    if (AllocaInst *Slot = OutputAllocas[MatchIdx]) {
+      // The matched output was converted to memory. Store this input into the
+      // alloca.
+      Builder.CreateStore(ArgVal, Slot);
+
+      // Pass the alloca pointer as the argument, instead of ArgVal. This
+      // ensures the tied "0" constraint matches the "*m" output.
+      NewArgs.push_back(Slot);
+      NewArgTypes.push_back(Slot->getType());
+      return;
+    }
+  }
+
+  if (C.hasRegMemConstraints()) {
+    // Converted to memory constraint. Create alloca, store input, pass pointer
+    // as argument.
+    AllocaInst *Slot = EntryBuilder.CreateAlloca(ArgTy, nullptr, "asm_mem");
+    Builder.CreateStore(ArgVal, Slot);
+    NewArgs.push_back(Slot);
+    NewArgTypes.push_back(Slot->getType());
+  } else {
+    // Unchanged
+    NewArgs.push_back(ArgVal);
+    NewArgTypes.push_back(ArgTy);
+  }
 }
 
-bool InsertIntrinsicCalls(ArrayRef<CallBrInst *> CBRs, DominatorTree &DT) {
-  bool Changed = false;
-  SmallPtrSet<const BasicBlock *, 4> Visited;
-  IRBuilder<> Builder(CBRs[0]->getContext());
-  for (CallBrInst *CBR : CBRs) {
-    if (!CBR->getNumIndirectDests())
-      continue;
+/// Build the return type from the collected return types.
+static Type *BuildReturnType(ArrayRef<Type *> NewRetTypes,
+                             LLVMContext &Context) {
+  if (NewRetTypes.empty())
+    return Type::getVoidTy(Context);
+
+  if (NewRetTypes.size() == 1)
+    return NewRetTypes[0];
 
-    SSAUpdater SSAUpdate;
-    SSAUpdate.Initialize(CBR->getType(), CBR->getName());
-    SSAUpdate.AddAvailableValue(CBR->getParent(), CBR);
-    SSAUpdate.AddAvailableValue(CBR->getDefaultDest(), CBR);
+  return StructType::get(Context, NewRetTypes);
+}
+
+/// Create the new inline assembly call with converted constraints.
+static CallInst *CreateNewInlineAsm(
+    InlineAsm *IA, const std::string &NewConstraintStr, Type *NewRetTy,
+    const SmallVectorImpl<Type *> &NewArgTypes,
+    const SmallVectorImpl<Value *> &NewArgs,
+    const SmallVectorImpl<std::pair<unsigned, Type *>> &ElementTypeAttrs,
+    CallBase *CB, IRBuilder<> &Builder, LLVMContext &Context) {
+  FunctionType *NewFTy = FunctionType::get(NewRetTy, NewArgTypes, false);
+  InlineAsm *NewIA = InlineAsm::get(
+      NewFTy, IA->getAsmString(), NewConstraintStr, IA->hasSideEffects(),
+      IA->isAlignStack(), IA->getDialect(), IA->canThrow());
+
+  CallInst *NewCall = Builder.CreateCall(NewFTy, NewIA, NewArgs);
+  NewCall->setCallingConv(CB->getCallingConv());
+  NewCall->setAttributes(CB->getAttributes());
+  NewCall->setDebugLoc(CB->getDebugLoc());
+
+  for (const std::pair<unsigned, Type *> &Item : ElementTypeAttrs)
+    NewCall->addParamAttr(
+        Item.first,
+        Attribute::get(Context, Attribute::ElementType, Item.second));
+
+  return NewCall;
+}
 
-    for (BasicBlock *IndDest : CBR->getIndirectDests()) {
-      if (!Visited.insert(IndDest).second)
+/// Reconstruct the return value from the new call and allocas.
+static Value *
+ReconstructReturnValue(Type *RetTy, CallInst *NewCall,
+                       const InlineAsm::ConstraintInfoVector &Constraints,
+                       const SmallVectorImpl<AllocaInst *> &OutputAllocas,
+                       const SmallVectorImpl<Type *> &NewRetTypes,
+                       IRBuilder<> &Builder) {
+  if (RetTy->isVoidTy())
+    return nullptr;
+
+  if (isa<StructType>(RetTy)) {
+    // Multiple outputs. Reconstruct the struct.
+    Value *Res = PoisonValue::get(RetTy);
+    unsigned NewRetIdx = 0;
+    unsigned OriginalOutIdx = 0;
+
+    for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
+      if (Constraints[I].Type != InlineAsm::isOutput)
         continue;
-      Builder.SetInsertPoint(&*IndDest->begin());
-      CallInst *Intrinsic = Builder.CreateIntrinsic(
-          CBR->getType(), Intrinsic::callbr_landingpad, {CBR});
-      SSAUpdate.AddAvailableValue(IndDest, Intrinsic);
-      UpdateSSA(DT, CBR, Intrinsic, SSAUpdate);
-      Changed = true;
+
+      Value *Val = nullptr;
+      if (AllocaInst *Slot = OutputAllocas[I]) {
+        // Converted to memory. Load from alloca.
+        Val = Builder.CreateLoad(Slot->getAllocatedType(), Slot);
+      } else {
+        // Not converted. Extract from NewCall return.
+        if (NewRetTypes.size() == 1) {
+          Val = NewCall;
+        } else {
+          Val = Builder.CreateExtractValue(NewCall, NewRetIdx);
+        }
+        NewRetIdx++;
+      }
+
+      Res = Builder.CreateInsertValue(Res, Val, OriginalOutIdx++);
     }
+
+    return Res;
   }
-  return Changed;
+
+  // Single output.
+  // Find the output constraint (should be the first one).
+  unsigned OutConstraintIdx = 0;
+  for (unsigned I = 0; I < Constraints.size(); ++I) {
+    if (Constraints[I].Type == InlineAsm::isOutput) {
+      OutConstraintIdx = I;
+      break;
+    }
+  }
+
+  if (AllocaInst *Slot = OutputAllocas[OutConstraintIdx])
+    return Builder.CreateLoad(Slot->getAllocatedType(), Slot);
+
+  return NewCall;
 }
 
+static bool ProcessInlineAsm(Function &F, CallBase *CB) {
+  InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand());
+  const InlineAsm::ConstraintInfoVector &Constraints = IA->ParseConstraints();
+
+  auto [NewConstraintStr, HasRegMem] =
+      ConvertConstraintsToMemory(IA->getConstraintString());
+  if (!HasRegMem)
+    return false;
+
+  IRBuilder<> Builder(CB);
+  IRBuilder<> EntryBuilder(&F.getEntryBlock(), F.getEntryBlock().begin());
+
+  // Collect new arguments and return types.
+  SmallVector<Value *, 8> NewArgs;
+  SmallVector<Type *, 8> NewArgTypes;
+  SmallVector<Type *, 2> NewRetTypes;
+  SmallVector<std::pair<unsigned, Type *>, 8> ElementTypeAttrs;
+
+  // Track allocas created for converted outputs.
+  SmallVector<AllocaInst *, 8> OutputAllocas(Constraints.size(), nullptr);
+
+  // Build tied constraint map.
+  SmallVector<int, 8> TiedOutput(Constraints.size(), -1);
+  BuildTiedConstraintMap(Constraints, TiedOutput);
+
+  // Process constraints.
+  unsigned ArgNo = 0;
+  unsigned OutputIdx = 0;
+  for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
+    const InlineAsm::ConstraintInfo &C = Constraints[I];
+
+    if (C.Type == InlineAsm::isOutput) {
+      ProcessOutputConstraint(C, CB->getType(), OutputIdx, EntryBuilder,
+                              NewArgs, NewArgTypes, NewRetTypes,
+                              ElementTypeAttrs, OutputAllocas, I);
+      OutputIdx++;
+    } else if (C.Type == InlineAsm::isInput) {
+      Value *ArgVal = CB->getArgOperand(ArgNo);
+      ProcessInputConstraint(C, ArgVal, TiedOutput, OutputAllocas, I, Builder,
+                             EntryBuilder, NewArgs, NewArgTypes);
+      ArgNo++;
+    }
+  }
+
+  // Build the new return type.
+  Type *NewRetTy = BuildReturnType(NewRetTypes, F.getContext());
+
+  // Create the new inline assembly call.
+  CallInst *NewCall =
+      CreateNewInlineAsm(IA, NewConstraintStr, NewRetTy, NewArgTypes, NewArgs,
+                         ElementTypeAttrs, CB, Builder, F.getContext());
+
+  // Reconstruct the return value and update users.
+  if (!CB->use_empty()) {
+    if (Value *Replacement =
+            ReconstructReturnValue(CB->getType(), NewCall, Constraints,
+                                   OutputAllocas, NewRetTypes, Builder))
+      CB->replaceAllUsesWith(Replacement);
+  }
+
+  CB->eraseFromParent();
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//                           Process CallBrInsts
+//===----------------------------------------------------------------------===//
+
+/// The Use is in the same BasicBlock as the intrinsic call.
 static bool IsInSameBasicBlock(const Use &U, const BasicBlock *BB) {
   const auto *I = dyn_cast<Instruction>(U.getUser());
   return I && I->getParent() == BB;
@@ -172,24 +407,23 @@ static bool IsInSameBasicBlock(const Use &U, const BasicBlock *BB) {
 #ifndef NDEBUG
 static void PrintDebugDomInfo(const DominatorTree &DT, const Use &U,
                               const BasicBlock *BB, bool IsDefaultDest) {
-  if (!isa<Instruction>(U.getUser()))
-    return;
-  LLVM_DEBUG(dbgs() << "Use: " << *U.getUser() << ", in block "
-                    << cast<Instruction>(U.getUser())->getParent()->getName()
-                    << ", is " << (DT.dominates(BB, U) ? "" : "NOT ")
-                    << "dominated by " << BB->getName() << " ("
-                    << (IsDefaultDest ? "in" : "") << "direct)\n");
+  if (isa<Instruction>(U.getUser()))
+    LLVM_DEBUG(dbgs() << "Use: " << *U.getUser() << ", in block "
+                      << cast<Instruction>(U.getUser())->getParent()->getName()
+                      << ", is " << (DT.dominates(BB, U) ? "" : "NOT ")
+                      << "dominated by " << BB->getName() << " ("
+                      << (IsDefaultDest ? "in" : "") << "direct)\n");
 }
 #endif
 
-void UpdateSSA(DominatorTree &DT, CallBrInst *CBR, CallInst *Intrinsic,
-               SSAUpdater &SSAUpdate) {
-
+static void UpdateSSA(DominatorTree &DT, CallBrInst *CBR, CallInst *Intrinsic,
+                      SSAUpdater &SSAUpdate) {
   SmallPtrSet<Use *, 4> Visited;
+
   BasicBlock *DefaultDest = CBR->getDefaultDest();
   BasicBlock *LandingPad = Intrinsic->getParent();
-
   SmallVector<Use *, 4> Uses(make_pointer_range(CBR->uses()));
+
   for (Use *U : Uses) {
     if (!Visited.insert(U).second)
       continue;
@@ -219,12 +453,111 @@ void UpdateSSA(DominatorTree &DT, CallBrInst *CBR, CallInst *Intrinsic,
   }
 }
 
-bool CallBrPrepare::runOnFunction(Function &F) {
+static bool SplitCriticalEdges(CallBrInst *CBR, DominatorTree *DT) {
+  bool Changed = false;
+
+  CriticalEdgeSplittingOptions Options(DT);
+  Options.setMergeIdenticalEdges();
+
+  // The indirect destination might be duplicated between another parameter...
+  //
+  //   %0 = callbr ... [label %x, label %x]
+  //
+// ...hence MergeIdenticalEdges and AllowIdenticalEdges, but we don't need
+  // to split the default destination if it's duplicated between an indirect
+  // destination...
+  //
+  //   %1 = callbr ... to label %x [label %x]
+  //
+  // ...hence starting at 1 and checking against successor 0 (aka the default
+  // destination).
+  for (unsigned i = 1, e = CBR->getNumSuccessors(); i != e; ++i)
+    if (CBR->getSuccessor(i) == CBR->getSuccessor(0) ||
+        isCriticalEdge(CBR, i, /*AllowIdenticalEdges*/ true))
+      if (SplitKnownCriticalEdge(CBR, i, Options))
+        Changed = true;
+
+  return Changed;
+}
+
+static bool InsertIntrinsicCalls(CallBrInst *CBR, DominatorTree &DT) {
   bool Changed = false;
-  SmallVector<CallBrInst *, 2> CBRs = FindCallBrs(F);
+  SmallPtrSet<const BasicBlock *, 4> Visited;
+  IRBuilder<> Builder(CBR->getContext());
+
+  if (!CBR->getNumIndirectDests())
+    return false;
+
+  SSAUpdater SSAUpdate;
+  SSAUpdate.Initialize(CBR->getType(), CBR->getName());
+  SSAUpdate.AddAvailableValue(CBR->getParent(), CBR);
+  SSAUpdate.AddAvailableValue(CBR->getDefaultDest(), CBR);
 
-  if (CBRs.empty())
-    return Changed;
+  for (BasicBlock *IndDest : CBR->getIndirectDests()) {
+    if (!Visited.insert(IndDest).second)
+      continue;
+
+    Builder.SetInsertPoint(&*IndDest->begin());
+    CallInst *Intrinsic = Builder.CreateIntrinsic(
+        CBR->getType(), Intrinsic::callbr_landingpad, {CBR});
+    SSAUpdate.AddAvailableValue(IndDest, Intrinsic);
+    UpdateSSA(DT, CBR, Intrinsic, SSAUpdate);
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+static bool ProcessCallBrInst(Function &F, CallBrInst *CBR, DominatorTree *DT) {
+  bool Changed = false;
+
+  Changed |= SplitCriticalEdges(CBR, DT);
+  Changed |= InsertIntrinsicCalls(CBR, *DT);
+
+  return Changed;
+}
+
+static bool runImpl(Function &F, ArrayRef<CallBase *> IAs, DominatorTree *DT) {
+  bool Changed = false;
+
+  for (CallBase *CB : IAs)
+    if (auto *CBR = dyn_cast<CallBrInst>(CB))
+      Changed |= ProcessCallBrInst(F, CBR, DT);
+    else
+      Changed |= ProcessInlineAsm(F, CB);
+
+  return Changed;
+}
+
+/// Find all inline assembly calls in the given function.
+static SmallVector<CallBase *, 4> FindInlineAsms(Function &F,
+                                                 const TargetMachine *TM) {
+  bool isOptLevelNone = TM->getOptLevel() == CodeGenOptLevel::None;
+  SmallVector<CallBase *, 4> InlineAsms;
+
+  for (BasicBlock &BB : F) {
+    if (auto *CBR = dyn_cast<CallBrInst>(BB.getTerminator())) {
+      if (!CBR->getType()->isVoidTy() && !CBR->use_empty())
+        InlineAsms.push_back(CBR);
+      continue;
+    }
+
+    if (isOptLevelNone)
+      // Only inline assembly compiled at '-O0' (i.e. uses the fast register
+      // allocator) needs to be processed.
+      for (Instruction &I : BB)
+        if (CallBase *CB = dyn_cast<CallBase>(&I); CB && CB->isInlineAsm())
+          InlineAsms.push_back(CB);
+  }
+
+  return InlineAsms;
+}
+
+bool InlineAsmPrepare::runOnFunction(Function &F) {
+  const auto *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+  SmallVector<CallBase *, 4> IAs = FindInlineAsms(F, TM);
+  if (IAs.empty())
+    return false;
 
   // It's highly likely that most programs do not contain CallBrInsts. Follow a
   // similar pattern from SafeStackLegacyPass::runOnFunction to reuse previous
@@ -242,11 +575,23 @@ bool CallBrPrepare::runOnFunction(Function &F) {
     DT = &*LazilyComputedDomTree;
   }
 
-  if (SplitCriticalEdges(CBRs, *DT))
-    Changed = true;
+  return runImpl(F, IAs, DT);
+}
 
-  if (InsertIntrinsicCalls(CBRs, *DT))
-    Changed = true;
+PreservedAnalyses InlineAsmPreparePass::run(Function &F,
+                                            FunctionAnalysisManager &FAM) {
+  SmallVector<CallBase *, 4> IAs = FindInlineAsms(F, TM);
+  if (IAs.empty())
+    return PreservedAnalyses::all();
 
-  return Changed;
+  DominatorTree *DT = &FAM.getResult<DominatorTreeAnalysis>(F);
+
+  if (runImpl(F, IAs, DT)) {
+    PreservedAnalyses PA;
+    PA.preserve<DominatorTreeAnalysis>();
+    PA.preserveSet<CFGAnalyses>();
+    return PA;
+  }
+
+  return PreservedAnalyses::all();
 }

>From 0e03f247821983cb88397a0f2d19845e23abe3af Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Fri, 6 Feb 2026 20:50:00 -0800
Subject: [PATCH 27/29] Handle indirect output constraints (like =*m)
 correctly.

---
 llvm/lib/CodeGen/InlineAsmPrepare.cpp | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/InlineAsmPrepare.cpp b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
index 0442e99d999d2..c9ff17c7b4e43 100644
--- a/llvm/lib/CodeGen/InlineAsmPrepare.cpp
+++ b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
@@ -362,10 +362,22 @@ static bool ProcessInlineAsm(Function &F, CallBase *CB) {
     const InlineAsm::ConstraintInfo &C = Constraints[I];
 
     if (C.Type == InlineAsm::isOutput) {
-      ProcessOutputConstraint(C, CB->getType(), OutputIdx, EntryBuilder,
-                              NewArgs, NewArgTypes, NewRetTypes,
-                              ElementTypeAttrs, OutputAllocas, I);
-      OutputIdx++;
+      if (C.isIndirect) {
+        // Indirect output takes a pointer argument from the original call.
+        // Pass it through to the new call.
+        Value *ArgVal = CB->getArgOperand(ArgNo);
+        NewArgs.push_back(ArgVal);
+        NewArgTypes.push_back(ArgVal->getType());
+        // Preserve element type attribute if present.
+        if (auto *Ty = CB->getParamElementType(ArgNo))
+          ElementTypeAttrs.push_back({NewArgs.size() - 1, Ty});
+        ArgNo++;
+      } else {
+        ProcessOutputConstraint(C, CB->getType(), OutputIdx, EntryBuilder,
+                                NewArgs, NewArgTypes, NewRetTypes,
+                                ElementTypeAttrs, OutputAllocas, I);
+        OutputIdx++;
+      }
     } else if (C.Type == InlineAsm::isInput) {
       Value *ArgVal = CB->getArgOperand(ArgNo);
       ProcessInputConstraint(C, ArgVal, TiedOutput, OutputAllocas, I, Builder,

>From 766ac37f39c70ce50595e9a2590b6415fc8fa30f Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Fri, 6 Feb 2026 20:50:32 -0800
Subject: [PATCH 28/29] Update testcase. It's comprehensive now.

---
 llvm/test/CodeGen/X86/asm-constraints-rm.ll | 1405 +++++++++++++++++--
 1 file changed, 1250 insertions(+), 155 deletions(-)

diff --git a/llvm/test/CodeGen/X86/asm-constraints-rm.ll b/llvm/test/CodeGen/X86/asm-constraints-rm.ll
index a694bdb26390d..086b430bccac8 100644
--- a/llvm/test/CodeGen/X86/asm-constraints-rm.ll
+++ b/llvm/test/CodeGen/X86/asm-constraints-rm.ll
@@ -1,212 +1,1307 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter "^\t#" --version 4
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=greedy < %s | FileCheck --check-prefix=GREEDY-X86_64 %s
-; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=greedy   < %s | FileCheck --check-prefix=GREEDY-I386 %s
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 -regalloc=basic  < %s | FileCheck --check-prefix=BASIC-X86_64 %s
-; RUN: llc -mtriple=i386-unknown-linux-gnu -O2 -regalloc=basic    < %s | FileCheck --check-prefix=BASIC-I386 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter "^\t(mov|call|#)" --version 4
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 < %s | FileCheck --check-prefix=O2 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O0 < %s | FileCheck --check-prefix=O0 %s
 
 ; The non-fast register allocators should use registers when there isn't
 ; register pressure.
 
-define dso_local i32 @test1(ptr nocapture noundef readonly %ptr) local_unnamed_addr {
-; GREEDY-X86_64-LABEL: test1:
+define dso_local i32 @test1(ptr noundef readonly captures(none) %foo) local_unnamed_addr {
+; O2-LABEL: test1:
+; O2:    movl (%rdi), %eax
+; O2:    movl 4(%rdi), %ecx
+; O2:    movl 8(%rdi), %edx
+; O2:    movl 12(%rdi), %esi
+; O2:    movl 16(%rdi), %r8d
+; O2:    #APP
+; O2:    # rm input: no pressure
+; O2:    # %eax %ecx %edx %esi %r8d
+; O2:    #NO_APP
+; O2:    movl (%rdi), %eax
 ;
-; GREEDY-I386-LABEL: test1:
-;
-; BASIC-X86_64-LABEL: test1:
-;
-; BASIC-I386-LABEL: test1:
+; O0-LABEL: test1:
+; O0:    movl (%rdi), %r8d
+; O0:    movl 4(%rdi), %esi
+; O0:    movl 8(%rdi), %edx
+; O0:    movl 12(%rdi), %ecx
+; O0:    movl 16(%rdi), %eax
+; O0:    movl %r8d, -{{[0-9]+}}(%rsp)
+; O0:    movl %esi, -{{[0-9]+}}(%rsp)
+; O0:    movl %edx, -{{[0-9]+}}(%rsp)
+; O0:    movl %ecx, -{{[0-9]+}}(%rsp)
+; O0:    movl %eax, -{{[0-9]+}}(%rsp)
+; O0:    movq %rax, -{{[0-9]+}}(%rsp)
+; O0:    movq %rax, -{{[0-9]+}}(%rsp)
+; O0:    movq %rax, -{{[0-9]+}}(%rsp)
+; O0:    movq %rax, -{{[0-9]+}}(%rsp)
+; O0:    movq %rax, -{{[0-9]+}}(%rsp)
+; O0:    #APP
+; O0:    # rm input: no pressure
+; O0:    # -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movl (%rdi), %eax
 entry:
-  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
-  %0 = load i32, ptr %b, align 4
-  %d = getelementptr inbounds nuw i8, ptr %ptr, i64 12
-  %1 = load i32, ptr %d, align 4
-  tail call void asm sideeffect "# dual 'rm' input no pressure -> $0 $1", "rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1)
-  %2 = load i32, ptr %ptr, align 4
-  ret i32 %2
+  %0 = load i32, ptr %foo, align 4
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %1 = load i32, ptr %b, align 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %2 = load i32, ptr %c, align 4
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %3 = load i32, ptr %d, align 4
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %4 = load i32, ptr %e, align 4
+  tail call void asm sideeffect "# rm input: no pressure\0A\09# $0 $1 $2 $3 $4", "rm,rm,rm,rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4)
+  %5 = load i32, ptr %foo, align 4
+  ret i32 %5
 }
 
-define dso_local i32 @test2(ptr nocapture noundef readonly %ptr) local_unnamed_addr {
-; GREEDY-X86_64-LABEL: test2:
+define dso_local i32 @test2(ptr noundef readonly captures(none) %foo) local_unnamed_addr {
+; O2-LABEL: test2:
+; O2:    movq %rdi, (%rsp) # 8-byte Spill
+; O2:    #APP
+; O2:    movq $0, %rax
+; O2:    movq $1, %rcx
+; O2:    movq $2, %rdx
+; O2:    movq $3, %rsi
+; O2:    movq $4, %rdi
+; O2:    movq $5, %rbx
+; O2:    movq $6, %rbp
+; O2:    movq $7, %r8
+; O2:    movq $8, %r9
+; O2:    movq $9, %r10
+; O2:    movq $10, %r11
+; O2:    movq $11, %r12
+; O2:    movq $12, %r13
+; O2:    movq $13, %r14
+; O2:    movq $14, %r15
+; O2:    #NO_APP
+; O2:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r9, %rbp
+; O2:    movq (%rsp), %rbx # 8-byte Reload
+; O2:    movl (%rbx), %esi
+; O2:    movl 4(%rbx), %edi
+; O2:    movl 8(%rbx), %r8d
+; O2:    movl 12(%rbx), %r9d
+; O2:    movl 16(%rbx), %eax
+; O2:    #APP
+; O2:    # rm input: pressure
+; O2:    # %esi %edi %r8d %r9d %eax
+; O2:    #NO_APP
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O2:    movq %rcx, %rsi
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O2:    callq g at PLT
+; O2:    movl (%rbx), %eax
 ;
-; GREEDY-I386-LABEL: test2:
+; O0-LABEL: test2:
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    #APP
+; O0:    movq $0, %rax
+; O0:    movq $1, %rcx
+; O0:    movq $2, %rdx
+; O0:    movq $3, %rsi
+; O0:    movq $4, %rdi
+; O0:    movq $5, %rbx
+; O0:    movq $6, %rbp
+; O0:    movq $7, %r8
+; O0:    movq $8, %r9
+; O0:    movq $9, %r10
+; O0:    movq $10, %r11
+; O0:    movq $11, %r12
+; O0:    movq $12, %r13
+; O0:    movq $13, %r14
+; O0:    movq $14, %r15
+; O0:    #NO_APP
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rsi, %rcx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0:    movq %rdi, %rax
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %r8, %rbx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O0:    movq %r9, %rax
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movl (%rax), %eax
+; O0:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movl 4(%rax), %eax
+; O0:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movl 8(%rax), %eax
+; O0:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movl 12(%rax), %eax
+; O0:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movl 16(%rax), %eax
+; O0:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    #APP
+; O0:    # rm input: pressure
+; O0:    # {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movq %rbp, (%rsp)
+; O0:    movq %rbx, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %r10, {{[0-9]+}}(%rsp)
+; O0:    movq %r11, {{[0-9]+}}(%rsp)
+; O0:    movq %r12, {{[0-9]+}}(%rsp)
+; O0:    movq %r13, {{[0-9]+}}(%rsp)
+; O0:    movq %r14, {{[0-9]+}}(%rsp)
+; O0:    movq %r15, {{[0-9]+}}(%rsp)
+; O0:    callq g at PLT
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl (%rdi), %eax
+entry:
+  %0 = tail call { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } asm sideeffect "mov $$0, $0\0A\09mov $$1, $1\0A\09mov $$2, $2\0A\09mov $$3, $3\0A\09mov $$4, $4\0A\09mov $$5, $5\0A\09mov $$6, $6\0A\09mov $$7, $7\0A\09mov $$8, $8\0A\09mov $$9, $9\0A\09mov $$10, $10\0A\09mov $$11, $11\0A\09mov $$12, $12\0A\09mov $$13, $13\0A\09mov $$14, $14", "={rax},={rcx},={rdx},={rsi},={rdi},={rbx},={rbp},={r8},={r9},={r10},={r11},={r12},={r13},={r14},={r15},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 0
+  %asmresult1 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 1
+  %asmresult2 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 2
+  %asmresult3 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 3
+  %asmresult4 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 4
+  %asmresult5 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 5
+  %asmresult6 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 6
+  %asmresult7 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 7
+  %asmresult8 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 8
+  %asmresult9 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 9
+  %asmresult10 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 10
+  %asmresult11 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 11
+  %asmresult12 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 12
+  %asmresult13 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 13
+  %asmresult14 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 14
+  %1 = load i32, ptr %foo, align 4
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %2 = load i32, ptr %b, align 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %3 = load i32, ptr %c, align 4
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %4 = load i32, ptr %d, align 4
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %5 = load i32, ptr %e, align 4
+  tail call void asm sideeffect "# rm input: pressure\0A\09# $0 $1 $2 $3 $4", "rm,rm,rm,rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5)
+  tail call void @g(i64 noundef %asmresult, i64 noundef %asmresult1, i64 noundef %asmresult2, i64 noundef %asmresult3, i64 noundef %asmresult4, i64 noundef %asmresult5, i64 noundef %asmresult6, i64 noundef %asmresult7, i64 noundef %asmresult8, i64 noundef %asmresult9, i64 noundef %asmresult10, i64 noundef %asmresult11, i64 noundef %asmresult12, i64 noundef %asmresult13, i64 noundef %asmresult14)
+  %6 = load i32, ptr %foo, align 4
+  ret i32 %6
+}
+
+define dso_local i32 @test3(ptr noundef writeonly captures(none) initializes((0, 20)) %foo) local_unnamed_addr {
+; O2-LABEL: test3:
+; O2:    #APP
+; O2:    # rm output: no pressure
+; O2:    # %eax %ecx %edx %esi %r8d
+; O2:    #NO_APP
+; O2:    movl %eax, (%rdi)
+; O2:    movl %ecx, 4(%rdi)
+; O2:    movl %edx, 8(%rdi)
+; O2:    movl %esi, 12(%rdi)
+; O2:    movl %r8d, 16(%rdi)
 ;
-; BASIC-X86_64-LABEL: test2:
+; O0-LABEL: test3:
+; O0:    #APP
+; O0:    # rm output: no pressure
+; O0:    # -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movl -{{[0-9]+}}(%rsp), %eax
+; O0:    movl -{{[0-9]+}}(%rsp), %r8d
+; O0:    movl -{{[0-9]+}}(%rsp), %esi
+; O0:    movl -{{[0-9]+}}(%rsp), %edx
+; O0:    movl -{{[0-9]+}}(%rsp), %ecx
+; O0:    movl %eax, (%rdi)
+; O0:    movl %r8d, 4(%rdi)
+; O0:    movl %esi, 8(%rdi)
+; O0:    movl %edx, 12(%rdi)
+; O0:    movl %ecx, 16(%rdi)
+entry:
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %0 = tail call { i32, i32, i32, i32, i32 } asm sideeffect "# rm output: no pressure\0A\09# $0 $1 $2 $3 $4", "=rm,=rm,=rm,=rm,=rm,~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i32, i32, i32, i32, i32 } %0, 0
+  %asmresult1 = extractvalue { i32, i32, i32, i32, i32 } %0, 1
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32 } %0, 2
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32 } %0, 3
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32 } %0, 4
+  store i32 %asmresult, ptr %foo, align 4
+  store i32 %asmresult1, ptr %b, align 4
+  store i32 %asmresult2, ptr %c, align 4
+  store i32 %asmresult3, ptr %d, align 4
+  store i32 %asmresult4, ptr %e, align 4
+  ret i32 %asmresult
+}
+
+define dso_local i32 @test4(ptr noundef writeonly captures(none) initializes((0, 20)) %foo) local_unnamed_addr {
+; O2-LABEL: test4:
+; O2:    movq %rdi, (%rsp) # 8-byte Spill
+; O2:    #APP
+; O2:    movq $0, %rax
+; O2:    movq $1, %rcx
+; O2:    movq $2, %rdx
+; O2:    movq $3, %rsi
+; O2:    movq $4, %rdi
+; O2:    movq $5, %rbx
+; O2:    movq $6, %rbp
+; O2:    movq $7, %r8
+; O2:    movq $8, %r9
+; O2:    movq $9, %r10
+; O2:    movq $10, %r11
+; O2:    movq $11, %r12
+; O2:    movq $12, %r13
+; O2:    movq $13, %r14
+; O2:    movq $14, %r15
+; O2:    #NO_APP
+; O2:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r9, %rbp
+; O2:    #APP
+; O2:    # rm output: pressure
+; O2:    # %esi %edi %r8d %r9d %eax
+; O2:    #NO_APP
+; O2:    movq (%rsp), %rbx # 8-byte Reload
+; O2:    movl %esi, (%rbx)
+; O2:    movl %edi, 4(%rbx)
+; O2:    movl %r8d, 8(%rbx)
+; O2:    movl %r9d, 12(%rbx)
+; O2:    movl %eax, 16(%rbx)
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O2:    movq %rcx, %rsi
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O2:    callq g at PLT
+; O2:    movl (%rbx), %eax
 ;
-; BASIC-I386-LABEL: test2:
+; O0-LABEL: test4:
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    #APP
+; O0:    movq $0, %rax
+; O0:    movq $1, %rcx
+; O0:    movq $2, %rdx
+; O0:    movq $3, %rsi
+; O0:    movq $4, %rdi
+; O0:    movq $5, %rbx
+; O0:    movq $6, %rbp
+; O0:    movq $7, %r8
+; O0:    movq $8, %r9
+; O0:    movq $9, %r10
+; O0:    movq $10, %r11
+; O0:    movq $11, %r12
+; O0:    movq $12, %r13
+; O0:    movq $13, %r14
+; O0:    movq $14, %r15
+; O0:    #NO_APP
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rsi, %rcx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0:    movq %rdi, %rax
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %r8, %rbx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O0:    movq %r9, %rax
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    #APP
+; O0:    # rm output: pressure
+; O0:    # {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movl {{[0-9]+}}(%rsp), %eax
+; O0:    movl {{[0-9]+}}(%rsp), %edi
+; O0:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movl {{[0-9]+}}(%rsp), %edi
+; O0:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movl {{[0-9]+}}(%rsp), %edi
+; O0:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movl {{[0-9]+}}(%rsp), %edi
+; O0:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl %eax, (%rdi)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, 4(%rdi)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, 8(%rdi)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, 12(%rdi)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, 16(%rdi)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rbp, (%rsp)
+; O0:    movq %rbx, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %r10, {{[0-9]+}}(%rsp)
+; O0:    movq %r11, {{[0-9]+}}(%rsp)
+; O0:    movq %r12, {{[0-9]+}}(%rsp)
+; O0:    movq %r13, {{[0-9]+}}(%rsp)
+; O0:    movq %r14, {{[0-9]+}}(%rsp)
+; O0:    movq %r15, {{[0-9]+}}(%rsp)
+; O0:    callq g at PLT
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl (%rdi), %eax
 entry:
-  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
-  %0 = load i32, ptr %b, align 4
-  %d = getelementptr inbounds nuw i8, ptr %ptr, i64 12
-  %1 = load i32, ptr %d, align 4
-  tail call void asm sideeffect "# dual 'rm' input pressure -> $0 $1", "rm,rm,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1)
-  %2 = load i32, ptr %ptr, align 4
+  %0 = tail call { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } asm sideeffect "mov $$0, $0\0A\09mov $$1, $1\0A\09mov $$2, $2\0A\09mov $$3, $3\0A\09mov $$4, $4\0A\09mov $$5, $5\0A\09mov $$6, $6\0A\09mov $$7, $7\0A\09mov $$8, $8\0A\09mov $$9, $9\0A\09mov $$10, $10\0A\09mov $$11, $11\0A\09mov $$12, $12\0A\09mov $$13, $13\0A\09mov $$14, $14", "={rax},={rcx},={rdx},={rsi},={rdi},={rbx},={rbp},={r8},={r9},={r10},={r11},={r12},={r13},={r14},={r15},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 0
+  %asmresult1 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 1
+  %asmresult2 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 2
+  %asmresult3 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 3
+  %asmresult4 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 4
+  %asmresult5 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 5
+  %asmresult6 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 6
+  %asmresult7 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 7
+  %asmresult8 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 8
+  %asmresult9 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 9
+  %asmresult10 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 10
+  %asmresult11 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 11
+  %asmresult12 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 12
+  %asmresult13 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 13
+  %asmresult14 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 14
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %1 = tail call { i32, i32, i32, i32, i32 } asm sideeffect "# rm output: pressure\0A\09# $0 $1 $2 $3 $4", "=rm,=rm,=rm,=rm,=rm,~{dirflag},~{fpsr},~{flags}"()
+  %asmresult15 = extractvalue { i32, i32, i32, i32, i32 } %1, 0
+  %asmresult16 = extractvalue { i32, i32, i32, i32, i32 } %1, 1
+  %asmresult17 = extractvalue { i32, i32, i32, i32, i32 } %1, 2
+  %asmresult18 = extractvalue { i32, i32, i32, i32, i32 } %1, 3
+  %asmresult19 = extractvalue { i32, i32, i32, i32, i32 } %1, 4
+  store i32 %asmresult15, ptr %foo, align 4
+  store i32 %asmresult16, ptr %b, align 4
+  store i32 %asmresult17, ptr %c, align 4
+  store i32 %asmresult18, ptr %d, align 4
+  store i32 %asmresult19, ptr %e, align 4
+  tail call void @g(i64 noundef %asmresult, i64 noundef %asmresult1, i64 noundef %asmresult2, i64 noundef %asmresult3, i64 noundef %asmresult4, i64 noundef %asmresult5, i64 noundef %asmresult6, i64 noundef %asmresult7, i64 noundef %asmresult8, i64 noundef %asmresult9, i64 noundef %asmresult10, i64 noundef %asmresult11, i64 noundef %asmresult12, i64 noundef %asmresult13, i64 noundef %asmresult14)
+  %2 = load i32, ptr %foo, align 4
   ret i32 %2
 }
 
-define dso_local i32 @test3(ptr noundef %ptr) local_unnamed_addr {
-; GREEDY-X86_64-LABEL: test3:
-;
-; GREEDY-I386-LABEL: test3:
+define dso_local i32 @test5(ptr noundef captures(none) %foo) local_unnamed_addr {
+; O2-LABEL: test5:
+; O2:    movl (%rdi), %eax
+; O2:    movl 4(%rdi), %ecx
+; O2:    movl 8(%rdi), %edx
+; O2:    movl 12(%rdi), %esi
+; O2:    movl 16(%rdi), %r8d
+; O2:    #APP
+; O2:    # rm tied output: no pressure
+; O2:    # %eax %ecx %edx %esi %r8d
+; O2:    #NO_APP
+; O2:    movl %eax, (%rdi)
+; O2:    movl %ecx, 4(%rdi)
+; O2:    movl %edx, 8(%rdi)
+; O2:    movl %esi, 12(%rdi)
+; O2:    movl %r8d, 16(%rdi)
 ;
-; BASIC-X86_64-LABEL: test3:
-;
-; BASIC-I386-LABEL: test3:
+; O0-LABEL: test5:
+; O0:    movl (%rdi), %r8d
+; O0:    movl 4(%rdi), %esi
+; O0:    movl 8(%rdi), %edx
+; O0:    movl 12(%rdi), %ecx
+; O0:    movl 16(%rdi), %eax
+; O0:    movl %r8d, -{{[0-9]+}}(%rsp)
+; O0:    movl %esi, -{{[0-9]+}}(%rsp)
+; O0:    movl %edx, -{{[0-9]+}}(%rsp)
+; O0:    movl %ecx, -{{[0-9]+}}(%rsp)
+; O0:    movl %eax, -{{[0-9]+}}(%rsp)
+; O0:    #APP
+; O0:    # rm tied output: no pressure
+; O0:    # %eax %ecx %edx %esi %r8d
+; O0:    #NO_APP
+; O0:    movl %r8d, -{{[0-9]+}}(%rsp)
+; O0:    movl %esi, -{{[0-9]+}}(%rsp)
+; O0:    movl %edx, -{{[0-9]+}}(%rsp)
+; O0:    movl %ecx, -{{[0-9]+}}(%rsp)
+; O0:    movl %eax, -{{[0-9]+}}(%rsp)
+; O0:    movl -{{[0-9]+}}(%rsp), %eax
+; O0:    movl -{{[0-9]+}}(%rsp), %r8d
+; O0:    movl -{{[0-9]+}}(%rsp), %esi
+; O0:    movl -{{[0-9]+}}(%rsp), %edx
+; O0:    movl -{{[0-9]+}}(%rsp), %ecx
+; O0:    movl %eax, (%rdi)
+; O0:    movl %r8d, 4(%rdi)
+; O0:    movl %esi, 8(%rdi)
+; O0:    movl %edx, 12(%rdi)
+; O0:    movl %ecx, 16(%rdi)
 entry:
-  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
-  %d = getelementptr inbounds nuw i8, ptr %ptr, i64 12
-  %0 = tail call { i32, i32 } asm sideeffect "# dual 'rm' output no pressure -> $0 $1", "=rm,=rm,~{dirflag},~{fpsr},~{flags}"()
-  %asmresult = extractvalue { i32, i32 } %0, 0
-  %asmresult1 = extractvalue { i32, i32 } %0, 1
-  store i32 %asmresult, ptr %b, align 4
-  store i32 %asmresult1, ptr %d, align 4
-  %1 = load i32, ptr %ptr, align 4
-  ret i32 %1
+  %0 = load i32, ptr %foo, align 4
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %1 = load i32, ptr %b, align 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %2 = load i32, ptr %c, align 4
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %3 = load i32, ptr %d, align 4
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %4 = load i32, ptr %e, align 4
+  %5 = tail call { i32, i32, i32, i32, i32 } asm sideeffect "# rm tied output: no pressure\0A\09# $0 $1 $2 $3 $4", "=rm,=rm,=rm,=rm,=rm,0,1,2,3,4,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4)
+  %asmresult = extractvalue { i32, i32, i32, i32, i32 } %5, 0
+  %asmresult1 = extractvalue { i32, i32, i32, i32, i32 } %5, 1
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32 } %5, 2
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32 } %5, 3
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32 } %5, 4
+  store i32 %asmresult, ptr %foo, align 4
+  store i32 %asmresult1, ptr %b, align 4
+  store i32 %asmresult2, ptr %c, align 4
+  store i32 %asmresult3, ptr %d, align 4
+  store i32 %asmresult4, ptr %e, align 4
+  ret i32 %asmresult
 }
 
-define dso_local i32 @test4(ptr noundef %ptr) local_unnamed_addr {
-; GREEDY-X86_64-LABEL: test4:
-;
-; GREEDY-I386-LABEL: test4:
+define dso_local i32 @test6(ptr noundef captures(none) %foo) local_unnamed_addr {
+; O2-LABEL: test6:
+; O2:    movq %rdi, (%rsp) # 8-byte Spill
+; O2:    #APP
+; O2:    movq $0, %rax
+; O2:    movq $1, %rcx
+; O2:    movq $2, %rdx
+; O2:    movq $3, %rsi
+; O2:    movq $4, %rdi
+; O2:    movq $5, %rbx
+; O2:    movq $6, %rbp
+; O2:    movq $7, %r8
+; O2:    movq $8, %r9
+; O2:    movq $9, %r10
+; O2:    movq $10, %r11
+; O2:    movq $11, %r12
+; O2:    movq $12, %r13
+; O2:    movq $13, %r14
+; O2:    movq $14, %r15
+; O2:    #NO_APP
+; O2:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r9, %rbp
+; O2:    movq (%rsp), %rbx # 8-byte Reload
+; O2:    movl (%rbx), %esi
+; O2:    movl 4(%rbx), %edi
+; O2:    movl 8(%rbx), %r8d
+; O2:    movl 12(%rbx), %r9d
+; O2:    movl 16(%rbx), %eax
+; O2:    #APP
+; O2:    # rm tied output: pressure
+; O2:    # %esi %edi %r8d %r9d %eax
+; O2:    #NO_APP
+; O2:    movl %esi, (%rbx)
+; O2:    movl %edi, 4(%rbx)
+; O2:    movl %r8d, 8(%rbx)
+; O2:    movl %r9d, 12(%rbx)
+; O2:    movl %eax, 16(%rbx)
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O2:    movq %rcx, %rsi
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O2:    callq g at PLT
+; O2:    movl (%rbx), %eax
 ;
-; BASIC-X86_64-LABEL: test4:
-;
-; BASIC-I386-LABEL: test4:
+; O0-LABEL: test6:
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    #APP
+; O0:    movq $0, %rax
+; O0:    movq $1, %rcx
+; O0:    movq $2, %rdx
+; O0:    movq $3, %rsi
+; O0:    movq $4, %rdi
+; O0:    movq $5, %rbx
+; O0:    movq $6, %rbp
+; O0:    movq $7, %r8
+; O0:    movq $8, %r9
+; O0:    movq $9, %r10
+; O0:    movq $10, %r11
+; O0:    movq $11, %r12
+; O0:    movq $12, %r13
+; O0:    movq $13, %r14
+; O0:    movq $14, %r15
+; O0:    #NO_APP
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rdi, %rcx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %r8, %rbx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O0:    movq %r9, %rcx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movl (%rax), %edi
+; O0:    movl 4(%rax), %esi
+; O0:    movl 8(%rax), %edx
+; O0:    movl 12(%rax), %ecx
+; O0:    movl 16(%rax), %eax
+; O0:    movl %edi, {{[0-9]+}}(%rsp)
+; O0:    movl %esi, {{[0-9]+}}(%rsp)
+; O0:    movl %edx, {{[0-9]+}}(%rsp)
+; O0:    movl %ecx, {{[0-9]+}}(%rsp)
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl %ecx, %edi
+; O0:    #APP
+; O0:    # rm tied output: pressure
+; O0:    # %eax %edi %ecx %edx %esi
+; O0:    #NO_APP
+; O0:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %edi, {{[0-9]+}}(%rsp)
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl {{[0-9]+}}(%rsp), %eax
+; O0:    movl {{[0-9]+}}(%rsp), %edi
+; O0:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movl {{[0-9]+}}(%rsp), %edi
+; O0:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movl {{[0-9]+}}(%rsp), %edi
+; O0:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movl {{[0-9]+}}(%rsp), %edi
+; O0:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl %eax, (%rdi)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, 4(%rdi)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, 8(%rdi)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, 12(%rdi)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, 16(%rdi)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rbp, (%rsp)
+; O0:    movq %rbx, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %r10, {{[0-9]+}}(%rsp)
+; O0:    movq %r11, {{[0-9]+}}(%rsp)
+; O0:    movq %r12, {{[0-9]+}}(%rsp)
+; O0:    movq %r13, {{[0-9]+}}(%rsp)
+; O0:    movq %r14, {{[0-9]+}}(%rsp)
+; O0:    movq %r15, {{[0-9]+}}(%rsp)
+; O0:    callq g at PLT
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl (%rdi), %eax
 entry:
-  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
-  %d = getelementptr inbounds nuw i8, ptr %ptr, i64 12
-  %0 = tail call { i32, i32 } asm sideeffect "# dual 'rm' output pressure -> $0 $1", "=rm,=rm,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"()
-  %asmresult = extractvalue { i32, i32 } %0, 0
-  %asmresult1 = extractvalue { i32, i32 } %0, 1
-  store i32 %asmresult, ptr %b, align 4
-  store i32 %asmresult1, ptr %d, align 4
-  %1 = load i32, ptr %ptr, align 4
-  ret i32 %1
+  %0 = tail call { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } asm sideeffect "mov $$0, $0\0A\09mov $$1, $1\0A\09mov $$2, $2\0A\09mov $$3, $3\0A\09mov $$4, $4\0A\09mov $$5, $5\0A\09mov $$6, $6\0A\09mov $$7, $7\0A\09mov $$8, $8\0A\09mov $$9, $9\0A\09mov $$10, $10\0A\09mov $$11, $11\0A\09mov $$12, $12\0A\09mov $$13, $13\0A\09mov $$14, $14", "={rax},={rcx},={rdx},={rsi},={rdi},={rbx},={rbp},={r8},={r9},={r10},={r11},={r12},={r13},={r14},={r15},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 0
+  %asmresult1 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 1
+  %asmresult2 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 2
+  %asmresult3 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 3
+  %asmresult4 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 4
+  %asmresult5 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 5
+  %asmresult6 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 6
+  %asmresult7 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 7
+  %asmresult8 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 8
+  %asmresult9 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 9
+  %asmresult10 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 10
+  %asmresult11 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 11
+  %asmresult12 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 12
+  %asmresult13 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 13
+  %asmresult14 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 14
+  %1 = load i32, ptr %foo, align 4
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %2 = load i32, ptr %b, align 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %3 = load i32, ptr %c, align 4
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %4 = load i32, ptr %d, align 4
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %5 = load i32, ptr %e, align 4
+  %6 = tail call { i32, i32, i32, i32, i32 } asm sideeffect "# rm tied output: pressure\0A\09# $0 $1 $2 $3 $4", "=rm,=rm,=rm,=rm,=rm,0,1,2,3,4,~{dirflag},~{fpsr},~{flags}"(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5)
+  %asmresult15 = extractvalue { i32, i32, i32, i32, i32 } %6, 0
+  %asmresult16 = extractvalue { i32, i32, i32, i32, i32 } %6, 1
+  %asmresult17 = extractvalue { i32, i32, i32, i32, i32 } %6, 2
+  %asmresult18 = extractvalue { i32, i32, i32, i32, i32 } %6, 3
+  %asmresult19 = extractvalue { i32, i32, i32, i32, i32 } %6, 4
+  store i32 %asmresult15, ptr %foo, align 4
+  store i32 %asmresult16, ptr %b, align 4
+  store i32 %asmresult17, ptr %c, align 4
+  store i32 %asmresult18, ptr %d, align 4
+  store i32 %asmresult19, ptr %e, align 4
+  tail call void @g(i64 noundef %asmresult, i64 noundef %asmresult1, i64 noundef %asmresult2, i64 noundef %asmresult3, i64 noundef %asmresult4, i64 noundef %asmresult5, i64 noundef %asmresult6, i64 noundef %asmresult7, i64 noundef %asmresult8, i64 noundef %asmresult9, i64 noundef %asmresult10, i64 noundef %asmresult11, i64 noundef %asmresult12, i64 noundef %asmresult13, i64 noundef %asmresult14)
+  %7 = load i32, ptr %foo, align 4
+  ret i32 %7
 }
 
-define dso_local i32 @test5(ptr nocapture noundef readonly %ptr) local_unnamed_addr {
-; GREEDY-X86_64-LABEL: test5:
-;
-; GREEDY-I386-LABEL: test5:
+define dso_local i32 @test7(ptr noundef captures(none) initializes((0, 4)) %foo) local_unnamed_addr {
+; O2-LABEL: test7:
+; O2:    movl 4(%rdi), %eax
+; O2:    #APP
+; O2:    # rm output, r input: no pressure
+; O2:    # %eax %eax
+; O2:    #NO_APP
+; O2:    movl %eax, (%rdi)
 ;
-; BASIC-X86_64-LABEL: test5:
-;
-; BASIC-I386-LABEL: test5:
+; O0-LABEL: test7:
+; O0:    movl 4(%rdi), %eax
+; O0:    #APP
+; O0:    # rm output, r input: no pressure
+; O0:    # -{{[0-9]+}}(%rsp) %eax
+; O0:    #NO_APP
+; O0:    movl -{{[0-9]+}}(%rsp), %eax
+; O0:    movl %eax, (%rdi)
 entry:
-  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
   %0 = load i32, ptr %b, align 4
-  %d = getelementptr inbounds nuw i8, ptr %ptr, i64 12
-  %1 = load i32, ptr %d, align 4
-  %2 = tail call { i32, i32 } asm sideeffect "# dual tied 'rm' no pressure -> $0 $1 $2 $3", "=rm,=rm,0,1,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1)
-  %asmresult = extractvalue { i32, i32 } %2, 0
-  %asmresult1 = extractvalue { i32, i32 } %2, 1
-  store i32 %asmresult, ptr %b, align 4
-  store i32 %asmresult1, ptr %d, align 4
-  %3 = load i32, ptr %ptr, align 4
-  ret i32 %3
+  %1 = tail call i32 asm sideeffect "# rm output, r input: no pressure\0A\09# $0 $1", "=rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %0)
+  store i32 %1, ptr %foo, align 4
+  ret i32 %1
 }
 
-define dso_local i32 @test6(ptr nocapture noundef readonly %ptr) local_unnamed_addr {
-; GREEDY-X86_64-LABEL: test6:
-;
-; GREEDY-I386-LABEL: test6:
-;
-; BASIC-X86_64-LABEL: test6:
+define dso_local i32 @test8(ptr noundef captures(none) initializes((0, 4)) %foo) local_unnamed_addr {
+; O2-LABEL: test8:
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    #APP
+; O2:    movq $0, %rax
+; O2:    movq $1, %rcx
+; O2:    movq $2, %rdx
+; O2:    movq $3, %rsi
+; O2:    movq $4, %rdi
+; O2:    movq $5, %rbx
+; O2:    movq $6, %rbp
+; O2:    movq $7, %r8
+; O2:    movq $8, %r9
+; O2:    movq $9, %r10
+; O2:    movq $10, %r11
+; O2:    movq $11, %r12
+; O2:    movq $12, %r13
+; O2:    movq $13, %r14
+; O2:    movq $14, %r15
+; O2:    #NO_APP
+; O2:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; O2:    movl 4(%rbp), %esi
+; O2:    #APP
+; O2:    # rm output, r input: pressure
+; O2:    # %esi %esi
+; O2:    #NO_APP
+; O2:    movl %esi, (%rbp)
+; O2:    movq %rax, %rdi
+; O2:    movq %rcx, %rsi
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O2:    movq %rbx, %r9
+; O2:    callq g at PLT
+; O2:    movl (%rbp), %eax
 ;
-; BASIC-I386-LABEL: test6:
+; O0-LABEL: test8:
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    #APP
+; O0:    movq $0, %rax
+; O0:    movq $1, %rcx
+; O0:    movq $2, %rdx
+; O0:    movq $3, %rsi
+; O0:    movq $4, %rdi
+; O0:    movq $5, %rbx
+; O0:    movq $6, %rbp
+; O0:    movq $7, %r8
+; O0:    movq $8, %r9
+; O0:    movq $9, %r10
+; O0:    movq $10, %r11
+; O0:    movq $11, %r12
+; O0:    movq $12, %r13
+; O0:    movq $13, %r14
+; O0:    movq $14, %r15
+; O0:    #NO_APP
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rsi, %rcx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %r8, %rbx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O0:    movq %r9, %rdi
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movl 4(%rdi), %eax
+; O0:    #APP
+; O0:    # rm output, r input: pressure
+; O0:    # {{[0-9]+}}(%rsp) %eax
+; O0:    #NO_APP
+; O0:    movl {{[0-9]+}}(%rsp), %eax
+; O0:    movl %eax, (%rdi)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rbp, (%rsp)
+; O0:    movq %rbx, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %r10, {{[0-9]+}}(%rsp)
+; O0:    movq %r11, {{[0-9]+}}(%rsp)
+; O0:    movq %r12, {{[0-9]+}}(%rsp)
+; O0:    movq %r13, {{[0-9]+}}(%rsp)
+; O0:    movq %r14, {{[0-9]+}}(%rsp)
+; O0:    movq %r15, {{[0-9]+}}(%rsp)
+; O0:    callq g at PLT
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl (%rdi), %eax
 entry:
-  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
-  %0 = load i32, ptr %b, align 4
-  %d = getelementptr inbounds nuw i8, ptr %ptr, i64 12
-  %1 = load i32, ptr %d, align 4
-  %2 = tail call { i32, i32 } asm sideeffect "# dual tied 'rm' pressure -> $0 $1 $2 $3", "=rm,=rm,0,1,~{ax},~{cx},~{dx},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{bx},~{bp},~{r14},~{r15},~{r12},~{r13},~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1)
-  %asmresult = extractvalue { i32, i32 } %2, 0
-  %asmresult1 = extractvalue { i32, i32 } %2, 1
-  store i32 %asmresult, ptr %b, align 4
-  store i32 %asmresult1, ptr %d, align 4
-  %3 = load i32, ptr %ptr, align 4
+  %0 = tail call { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } asm sideeffect "mov $$0, $0\0A\09mov $$1, $1\0A\09mov $$2, $2\0A\09mov $$3, $3\0A\09mov $$4, $4\0A\09mov $$5, $5\0A\09mov $$6, $6\0A\09mov $$7, $7\0A\09mov $$8, $8\0A\09mov $$9, $9\0A\09mov $$10, $10\0A\09mov $$11, $11\0A\09mov $$12, $12\0A\09mov $$13, $13\0A\09mov $$14, $14", "={rax},={rcx},={rdx},={rsi},={rdi},={rbx},={rbp},={r8},={r9},={r10},={r11},={r12},={r13},={r14},={r15},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 0
+  %asmresult1 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 1
+  %asmresult2 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 2
+  %asmresult3 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 3
+  %asmresult4 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 4
+  %asmresult5 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 5
+  %asmresult6 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 6
+  %asmresult7 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 7
+  %asmresult8 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 8
+  %asmresult9 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 9
+  %asmresult10 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 10
+  %asmresult11 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 11
+  %asmresult12 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 12
+  %asmresult13 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 13
+  %asmresult14 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 14
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %1 = load i32, ptr %b, align 4
+  %2 = tail call i32 asm sideeffect "# rm output, r input: pressure\0A\09# $0 $1", "=rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %1)
+  store i32 %2, ptr %foo, align 4
+  tail call void @g(i64 noundef %asmresult, i64 noundef %asmresult1, i64 noundef %asmresult2, i64 noundef %asmresult3, i64 noundef %asmresult4, i64 noundef %asmresult5, i64 noundef %asmresult6, i64 noundef %asmresult7, i64 noundef %asmresult8, i64 noundef %asmresult9, i64 noundef %asmresult10, i64 noundef %asmresult11, i64 noundef %asmresult12, i64 noundef %asmresult13, i64 noundef %asmresult14)
+  %3 = load i32, ptr %foo, align 4
   ret i32 %3
 }
 
-define dso_local i32 @test7(ptr noundef %ptr) local_unnamed_addr {
-; GREEDY-X86_64-LABEL: test7:
-;
-; GREEDY-I386-LABEL: test7:
-;
-; BASIC-X86_64-LABEL: test7:
+define dso_local i32 @test9(ptr noundef %foo) local_unnamed_addr {
+; O2-LABEL: test9:
+; O2:    movl 4(%rdi), %eax
+; O2:    #APP
+; O2:    # m output, rm input: no pressure
+; O2:    # (%rdi) %eax
+; O2:    #NO_APP
+; O2:    movl (%rdi), %eax
 ;
-; BASIC-I386-LABEL: test7:
+; O0-LABEL: test9:
+; O0:    movl 4(%rdi), %eax
+; O0:    movl %eax, -{{[0-9]+}}(%rsp)
+; O0:    movq %rax, -{{[0-9]+}}(%rsp)
+; O0:    #APP
+; O0:    # m output, rm input: no pressure
+; O0:    # (%rdi) -{{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movl (%rdi), %eax
 entry:
-  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
   %0 = load i32, ptr %b, align 4
-  tail call void asm sideeffect "# single 'rm' input -> $0", "rm,~{dirflag},~{fpsr},~{flags}"(i32 %0)
-  %1 = load i32, ptr %ptr, align 4
+  tail call void asm sideeffect "# m output, rm input: no pressure\0A\09# $0 $1", "=*m,rm,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %foo, i32 %0)
+  %1 = load i32, ptr %foo, align 4
   ret i32 %1
 }
 
-define dso_local i32 @test8(ptr noundef %ptr) local_unnamed_addr {
-; GREEDY-X86_64-LABEL: test8:
+define dso_local i32 @test10(ptr noundef %foo) local_unnamed_addr {
+; O2-LABEL: test10:
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    #APP
+; O2:    movq $0, %rax
+; O2:    movq $1, %rcx
+; O2:    movq $2, %rdx
+; O2:    movq $3, %rsi
+; O2:    movq $4, %rdi
+; O2:    movq $5, %rbx
+; O2:    movq $6, %rbp
+; O2:    movq $7, %r8
+; O2:    movq $8, %r9
+; O2:    movq $9, %r10
+; O2:    movq $10, %r11
+; O2:    movq $11, %r12
+; O2:    movq $12, %r13
+; O2:    movq $13, %r14
+; O2:    movq $14, %r15
+; O2:    #NO_APP
+; O2:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; O2:    movl 4(%rbp), %esi
+; O2:    #APP
+; O2:    # m output, rm input: pressure
+; O2:    # (%rbp) %esi
+; O2:    #NO_APP
+; O2:    movq %rax, %rdi
+; O2:    movq %rcx, %rsi
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O2:    movq %rbx, %r9
+; O2:    callq g at PLT
+; O2:    movl (%rbp), %eax
 ;
-; GREEDY-I386-LABEL: test8:
-;
-; BASIC-X86_64-LABEL: test8:
-;
-; BASIC-I386-LABEL: test8:
+; O0-LABEL: test10:
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    #APP
+; O0:    movq $0, %rax
+; O0:    movq $1, %rcx
+; O0:    movq $2, %rdx
+; O0:    movq $3, %rsi
+; O0:    movq $4, %rdi
+; O0:    movq $5, %rbx
+; O0:    movq $6, %rbp
+; O0:    movq $7, %r8
+; O0:    movq $8, %r9
+; O0:    movq $9, %r10
+; O0:    movq $10, %r11
+; O0:    movq $11, %r12
+; O0:    movq $12, %r13
+; O0:    movq $13, %r14
+; O0:    movq $14, %r15
+; O0:    #NO_APP
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rsi, %rcx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0:    movq %rdi, %rax
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %r8, %rbx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O0:    movq %r9, %rax
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl 4(%rdi), %edi
+; O0:    movl %edi, {{[0-9]+}}(%rsp)
+; O0:    movq %rdi, {{[0-9]+}}(%rsp)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    #APP
+; O0:    # m output, rm input: pressure
+; O0:    # (%rdi) {{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rbp, (%rsp)
+; O0:    movq %rbx, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %r10, {{[0-9]+}}(%rsp)
+; O0:    movq %r11, {{[0-9]+}}(%rsp)
+; O0:    movq %r12, {{[0-9]+}}(%rsp)
+; O0:    movq %r13, {{[0-9]+}}(%rsp)
+; O0:    movq %r14, {{[0-9]+}}(%rsp)
+; O0:    movq %r15, {{[0-9]+}}(%rsp)
+; O0:    callq g at PLT
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl (%rdi), %eax
 entry:
-  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
-  %0 = load i32, ptr %b, align 4
-  %d = getelementptr inbounds nuw i8, ptr %ptr, i64 12
-  %1 = load i32, ptr %d, align 4
-  tail call void asm sideeffect "# dual 'rm' and 'r' input -> $0 $1", "rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1)
-  %2 = load i32, ptr %ptr, align 4
+  %0 = tail call { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } asm sideeffect "mov $$0, $0\0A\09mov $$1, $1\0A\09mov $$2, $2\0A\09mov $$3, $3\0A\09mov $$4, $4\0A\09mov $$5, $5\0A\09mov $$6, $6\0A\09mov $$7, $7\0A\09mov $$8, $8\0A\09mov $$9, $9\0A\09mov $$10, $10\0A\09mov $$11, $11\0A\09mov $$12, $12\0A\09mov $$13, $13\0A\09mov $$14, $14", "={rax},={rcx},={rdx},={rsi},={rdi},={rbx},={rbp},={r8},={r9},={r10},={r11},={r12},={r13},={r14},={r15},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 0
+  %asmresult1 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 1
+  %asmresult2 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 2
+  %asmresult3 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 3
+  %asmresult4 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 4
+  %asmresult5 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 5
+  %asmresult6 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 6
+  %asmresult7 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 7
+  %asmresult8 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 8
+  %asmresult9 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 9
+  %asmresult10 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 10
+  %asmresult11 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 11
+  %asmresult12 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 12
+  %asmresult13 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 13
+  %asmresult14 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 14
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %1 = load i32, ptr %b, align 4
+  tail call void asm sideeffect "# m output, rm input: pressure\0A\09# $0 $1", "=*m,rm,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %foo, i32 %1)
+  tail call void @g(i64 noundef %asmresult, i64 noundef %asmresult1, i64 noundef %asmresult2, i64 noundef %asmresult3, i64 noundef %asmresult4, i64 noundef %asmresult5, i64 noundef %asmresult6, i64 noundef %asmresult7, i64 noundef %asmresult8, i64 noundef %asmresult9, i64 noundef %asmresult10, i64 noundef %asmresult11, i64 noundef %asmresult12, i64 noundef %asmresult13, i64 noundef %asmresult14)
+  %2 = load i32, ptr %foo, align 4
   ret i32 %2
 }
 
-define dso_local i32 @test9(ptr nocapture noundef %ptr) local_unnamed_addr {
-; GREEDY-X86_64-LABEL: test9:
+define dso_local i32 @test11(ptr noundef %foo) local_unnamed_addr {
+; O2-LABEL: test11:
+; O2:    movl (%rdi), %eax
+; O2:    movl 4(%rdi), %ecx
+; O2:    #APP
+; O2:    # multiple m output, rm input: no pressure
+; O2:    # (%rdi) 4(%rdi) 8(%rdi) 12(%rdi) 16(%rdi) %eax %ecx
+; O2:    #NO_APP
+; O2:    movl (%rdi), %eax
 ;
-; GREEDY-I386-LABEL: test9:
-;
-; BASIC-X86_64-LABEL: test9:
-;
-; BASIC-I386-LABEL: test9:
+; O0-LABEL: test11:
+; O0:    movq %rdi, %rax
+; O0:    movq %rdi, %rcx
+; O0:    movq %rdi, %rdx
+; O0:    movq %rdi, %rsi
+; O0:    movl (%rdi), %r9d
+; O0:    movl 4(%rdi), %r8d
+; O0:    movl %r9d, -{{[0-9]+}}(%rsp)
+; O0:    movl %r8d, -{{[0-9]+}}(%rsp)
+; O0:    movq %r8, -{{[0-9]+}}(%rsp)
+; O0:    movq %r8, -{{[0-9]+}}(%rsp)
+; O0:    #APP
+; O0:    # multiple m output, rm input: no pressure
+; O0:    # (%rdi) (%rax) (%rcx) (%rdx) (%rsi) -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movl (%rdi), %eax
 entry:
-  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
-  %0 = tail call i32 asm sideeffect "# single 'rm' output -> $0", "=rm,~{dirflag},~{fpsr},~{flags}"()
-  store i32 %0, ptr %b, align 4
-  %1 = load i32, ptr %ptr, align 4
-  ret i32 %1
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %0 = load i32, ptr %foo, align 4
+  %1 = load i32, ptr %b, align 4
+  tail call void asm sideeffect "# multiple m output, rm input: no pressure\0A\09# $0 $1 $2 $3 $4 $5 $6", "=*m,=*m,=*m,=*m,=*m,rm,rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %foo, ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %c, ptr nonnull elementtype(i32) %d, ptr nonnull elementtype(i32) %e, i32 %0, i32 %1)
+  %2 = load i32, ptr %foo, align 4
+  ret i32 %2
 }
 
-define dso_local i32 @test10(ptr noundef captures(none) %ptr) local_unnamed_addr {
-; GREEDY-X86_64-LABEL: test10:
-;
-; GREEDY-I386-LABEL: test10:
+define dso_local i32 @test12(ptr noundef %foo) local_unnamed_addr {
+; O2-LABEL: test12:
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    #APP
+; O2:    movq $0, %rax
+; O2:    movq $1, %rcx
+; O2:    movq $2, %rdx
+; O2:    movq $3, %rsi
+; O2:    movq $4, %rdi
+; O2:    movq $5, %rbx
+; O2:    movq $6, %rbp
+; O2:    movq $7, %r8
+; O2:    movq $8, %r9
+; O2:    movq $9, %r10
+; O2:    movq $10, %r11
+; O2:    movq $11, %r12
+; O2:    movq $12, %r13
+; O2:    movq $13, %r14
+; O2:    movq $14, %r15
+; O2:    #NO_APP
+; O2:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; O2:    movl (%rbp), %esi
+; O2:    movl 4(%rbp), %edi
+; O2:    #APP
+; O2:    # multiple m output, rm input: pressure
+; O2:    # (%rbp) 4(%rbp) 8(%rbp) 12(%rbp) 16(%rbp) %esi %edi
+; O2:    #NO_APP
+; O2:    movq %rax, %rdi
+; O2:    movq %rcx, %rsi
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O2:    movq %rbx, %r9
+; O2:    callq g at PLT
+; O2:    movl (%rbp), %eax
 ;
-; BASIC-X86_64-LABEL: test10:
+; O0-LABEL: test12:
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    #APP
+; O0:    movq $0, %rax
+; O0:    movq $1, %rcx
+; O0:    movq $2, %rdx
+; O0:    movq $3, %rsi
+; O0:    movq $4, %rdi
+; O0:    movq $5, %rbx
+; O0:    movq $6, %rbp
+; O0:    movq $7, %r8
+; O0:    movq $8, %r9
+; O0:    movq $9, %r10
+; O0:    movq $10, %r11
+; O0:    movq $11, %r12
+; O0:    movq $12, %r13
+; O0:    movq $13, %r14
+; O0:    movq $14, %r15
+; O0:    #NO_APP
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %r8, %rbx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O0:    movq %r9, %rdi
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rdi, %rax
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rdi, %rcx
+; O0:    movq %rdi, %rdx
+; O0:    movq %rdi, %rsi
+; O0:    movl (%rdi), %eax
+; O0:    movl 4(%rdi), %edi
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movl %edi, {{[0-9]+}}(%rsp)
+; O0:    movq %rdi, {{[0-9]+}}(%rsp)
+; O0:    movq %rdi, {{[0-9]+}}(%rsp)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    #APP
+; O0:    # multiple m output, rm input: pressure
+; O0:    # (%rdi) (%rax) (%rcx) (%rdx) (%rsi) {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rbp, (%rsp)
+; O0:    movq %rbx, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %r10, {{[0-9]+}}(%rsp)
+; O0:    movq %r11, {{[0-9]+}}(%rsp)
+; O0:    movq %r12, {{[0-9]+}}(%rsp)
+; O0:    movq %r13, {{[0-9]+}}(%rsp)
+; O0:    movq %r14, {{[0-9]+}}(%rsp)
+; O0:    movq %r15, {{[0-9]+}}(%rsp)
+; O0:    callq g at PLT
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl (%rdi), %eax
+entry:
+  %0 = tail call { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } asm sideeffect "mov $$0, $0\0A\09mov $$1, $1\0A\09mov $$2, $2\0A\09mov $$3, $3\0A\09mov $$4, $4\0A\09mov $$5, $5\0A\09mov $$6, $6\0A\09mov $$7, $7\0A\09mov $$8, $8\0A\09mov $$9, $9\0A\09mov $$10, $10\0A\09mov $$11, $11\0A\09mov $$12, $12\0A\09mov $$13, $13\0A\09mov $$14, $14", "={rax},={rcx},={rdx},={rsi},={rdi},={rbx},={rbp},={r8},={r9},={r10},={r11},={r12},={r13},={r14},={r15},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 0
+  %asmresult1 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 1
+  %asmresult2 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 2
+  %asmresult3 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 3
+  %asmresult4 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 4
+  %asmresult5 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 5
+  %asmresult6 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 6
+  %asmresult7 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 7
+  %asmresult8 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 8
+  %asmresult9 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 9
+  %asmresult10 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 10
+  %asmresult11 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 11
+  %asmresult12 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 12
+  %asmresult13 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 13
+  %asmresult14 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 14
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %1 = load i32, ptr %foo, align 4
+  %2 = load i32, ptr %b, align 4
+  tail call void asm sideeffect "# multiple m output, rm input: pressure\0A\09# $0 $1 $2 $3 $4 $5 $6", "=*m,=*m,=*m,=*m,=*m,rm,rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %foo, ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %c, ptr nonnull elementtype(i32) %d, ptr nonnull elementtype(i32) %e, i32 %1, i32 %2)
+  tail call void @g(i64 noundef %asmresult, i64 noundef %asmresult1, i64 noundef %asmresult2, i64 noundef %asmresult3, i64 noundef %asmresult4, i64 noundef %asmresult5, i64 noundef %asmresult6, i64 noundef %asmresult7, i64 noundef %asmresult8, i64 noundef %asmresult9, i64 noundef %asmresult10, i64 noundef %asmresult11, i64 noundef %asmresult12, i64 noundef %asmresult13, i64 noundef %asmresult14)
+  %3 = load i32, ptr %foo, align 4
+  ret i32 %3
+}
+
+define dso_local i32 @test13(ptr noundef %foo) local_unnamed_addr {
+; O2-LABEL: test13:
+; O2:    movl (%rdi), %ecx
+; O2:    movl 4(%rdi), %edx
+; O2:    #APP
+; O2:    # multiple m output, rm input: no pressure
+; O2:    # %eax %esi %r8d %r9d %r10d %ecx %edx
+; O2:    #NO_APP
+; O2:    movl %eax, (%rdi)
+; O2:    movl %esi, 4(%rdi)
+; O2:    movl %r8d, 8(%rdi)
+; O2:    movl %r9d, 12(%rdi)
+; O2:    movl %r10d, 16(%rdi)
 ;
-; BASIC-I386-LABEL: test10:
+; O0-LABEL: test13:
+; O0:    movq %rdi, %rax
+; O0:    movq %rdi, %rcx
+; O0:    movq %rdi, %rdx
+; O0:    movq %rdi, %rsi
+; O0:    movl (%rdi), %r9d
+; O0:    movl 4(%rdi), %r8d
+; O0:    movl %r9d, -{{[0-9]+}}(%rsp)
+; O0:    movl %r8d, -{{[0-9]+}}(%rsp)
+; O0:    movq %r8, -{{[0-9]+}}(%rsp)
+; O0:    movq %r8, -{{[0-9]+}}(%rsp)
+; O0:    #APP
+; O0:    # multiple m output, rm input: no pressure
+; O0:    # (%rdi) (%rax) (%rcx) (%rdx) (%rsi) -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movl (%rdi), %eax
 entry:
-  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
-  %0 = load i32, ptr %b, align 4
-  %1 = tail call i32 asm sideeffect "# simgle tied 'rm' input -> $0 $1", "=rm,0,~{dirflag},~{fpsr},~{flags}"(i32 %0)
-  store i32 %1, ptr %b, align 4
-  %2 = load i32, ptr %ptr, align 4
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %0 = load i32, ptr %foo, align 4
+  %1 = load i32, ptr %b, align 4
+  tail call void asm sideeffect "# multiple m output, rm input: no pressure\0A\09# $0 $1 $2 $3 $4 $5 $6", "=*&rm,=*&rm,=*&rm,=*&rm,=*&rm,rm,rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %foo, ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %c, ptr nonnull elementtype(i32) %d, ptr nonnull elementtype(i32) %e, i32 %0, i32 %1)
+  %2 = load i32, ptr %foo, align 4
   ret i32 %2
 }
 
-define dso_local i32 @test11(ptr noundef captures(none) %ptr) local_unnamed_addr {
-; GREEDY-X86_64-LABEL: test11:
-;
-; GREEDY-I386-LABEL: test11:
+define dso_local i32 @test14(ptr noundef %foo) local_unnamed_addr {
+; O2-LABEL: test14:
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    #APP
+; O2:    movq $0, %rax
+; O2:    movq $1, %rcx
+; O2:    movq $2, %rdx
+; O2:    movq $3, %rsi
+; O2:    movq $4, %rdi
+; O2:    movq $5, %rbx
+; O2:    movq $6, %rbp
+; O2:    movq $7, %r8
+; O2:    movq $8, %r9
+; O2:    movq $9, %r10
+; O2:    movq $10, %r11
+; O2:    movq $11, %r12
+; O2:    movq $12, %r13
+; O2:    movq $13, %r14
+; O2:    movq $14, %r15
+; O2:    #NO_APP
+; O2:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; O2:    movl (%rbp), %esi
+; O2:    movl 4(%rbp), %edi
+; O2:    #APP
+; O2:    # multiple m output, rm input: pressure
+; O2:    # (%rbp) 4(%rbp) 8(%rbp) 12(%rbp) 16(%rbp) %esi %edi
+; O2:    #NO_APP
+; O2:    movq %rax, %rdi
+; O2:    movq %rcx, %rsi
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O2:    movq %rbx, %r9
+; O2:    callq g at PLT
+; O2:    movl (%rbp), %eax
 ;
-; BASIC-X86_64-LABEL: test11:
-;
-; BASIC-I386-LABEL: test11:
+; O0-LABEL: test14:
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    #APP
+; O0:    movq $0, %rax
+; O0:    movq $1, %rcx
+; O0:    movq $2, %rdx
+; O0:    movq $3, %rsi
+; O0:    movq $4, %rdi
+; O0:    movq $5, %rbx
+; O0:    movq $6, %rbp
+; O0:    movq $7, %r8
+; O0:    movq $8, %r9
+; O0:    movq $9, %r10
+; O0:    movq $10, %r11
+; O0:    movq $11, %r12
+; O0:    movq $12, %r13
+; O0:    movq $13, %r14
+; O0:    movq $14, %r15
+; O0:    #NO_APP
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %r8, %rbx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O0:    movq %r9, %rdi
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rdi, %rax
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rdi, %rcx
+; O0:    movq %rdi, %rdx
+; O0:    movq %rdi, %rsi
+; O0:    movl (%rdi), %eax
+; O0:    movl 4(%rdi), %edi
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movl %edi, {{[0-9]+}}(%rsp)
+; O0:    movq %rdi, {{[0-9]+}}(%rsp)
+; O0:    movq %rdi, {{[0-9]+}}(%rsp)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    #APP
+; O0:    # multiple m output, rm input: pressure
+; O0:    # (%rdi) (%rax) (%rcx) (%rdx) (%rsi) {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rbp, (%rsp)
+; O0:    movq %rbx, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %r10, {{[0-9]+}}(%rsp)
+; O0:    movq %r11, {{[0-9]+}}(%rsp)
+; O0:    movq %r12, {{[0-9]+}}(%rsp)
+; O0:    movq %r13, {{[0-9]+}}(%rsp)
+; O0:    movq %r14, {{[0-9]+}}(%rsp)
+; O0:    movq %r15, {{[0-9]+}}(%rsp)
+; O0:    callq g at PLT
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl (%rdi), %eax
 entry:
-  %b = getelementptr inbounds nuw i8, ptr %ptr, i64 4
-  %0 = load i32, ptr %b, align 4
-  %1 = tail call i32 asm sideeffect "# dual 'r' output == input location -> $0 $1", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %0)
-  store i32 %1, ptr %b, align 4
-  %2 = load i32, ptr %ptr, align 4
-  ret i32 %2
+  %0 = tail call { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } asm sideeffect "mov $$0, $0\0A\09mov $$1, $1\0A\09mov $$2, $2\0A\09mov $$3, $3\0A\09mov $$4, $4\0A\09mov $$5, $5\0A\09mov $$6, $6\0A\09mov $$7, $7\0A\09mov $$8, $8\0A\09mov $$9, $9\0A\09mov $$10, $10\0A\09mov $$11, $11\0A\09mov $$12, $12\0A\09mov $$13, $13\0A\09mov $$14, $14", "={rax},={rcx},={rdx},={rsi},={rdi},={rbx},={rbp},={r8},={r9},={r10},={r11},={r12},={r13},={r14},={r15},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 0
+  %asmresult1 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 1
+  %asmresult2 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 2
+  %asmresult3 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 3
+  %asmresult4 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 4
+  %asmresult5 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 5
+  %asmresult6 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 6
+  %asmresult7 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 7
+  %asmresult8 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 8
+  %asmresult9 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 9
+  %asmresult10 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 10
+  %asmresult11 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 11
+  %asmresult12 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 12
+  %asmresult13 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 13
+  %asmresult14 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 14
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %1 = load i32, ptr %foo, align 4
+  %2 = load i32, ptr %b, align 4
+  tail call void asm sideeffect "# multiple m output, rm input: pressure\0A\09# $0 $1 $2 $3 $4 $5 $6", "=*m,=*m,=*m,=*m,=*m,rm,rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %foo, ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %c, ptr nonnull elementtype(i32) %d, ptr nonnull elementtype(i32) %e, i32 %1, i32 %2)
+  tail call void @g(i64 noundef %asmresult, i64 noundef %asmresult1, i64 noundef %asmresult2, i64 noundef %asmresult3, i64 noundef %asmresult4, i64 noundef %asmresult5, i64 noundef %asmresult6, i64 noundef %asmresult7, i64 noundef %asmresult8, i64 noundef %asmresult9, i64 noundef %asmresult10, i64 noundef %asmresult11, i64 noundef %asmresult12, i64 noundef %asmresult13, i64 noundef %asmresult14)
+  %3 = load i32, ptr %foo, align 4
+  ret i32 %3
 }
+
+declare void @g(i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef)

>From b490ca837d62674be2138faf626a52521e6cad8f Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Fri, 6 Feb 2026 21:03:20 -0800
Subject: [PATCH 29/29] Use opt and update_test_checks.py

---
 .../CodeGen/X86/inline-asm-prepare-memory.ll  | 47 ++++++++++++-------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll b/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll
index ea4c90aaecafe..3cd664ab08754 100644
--- a/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll
+++ b/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll
@@ -1,21 +1,30 @@
-; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -stop-after=inline-asm-prepare < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=x86_64-unknown-linux-gnu -inline-asm-prepare < %s | FileCheck %s
 
 define void @test1(i32 %x) {
-; CHECK-LABEL: @test1
-; CHECK:         %asm_mem = alloca i32
-; CHECK-NEXT:    store i32 %x, ptr %asm_mem
-; CHECK-NEXT:    %0 = call i32 asm sideeffect "mov $1, $0", "=r,rm,~{dirflag},~{fpsr},~{flags}"(ptr %asm_mem)
+; CHECK-LABEL: define void @test1(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ASM_MEM:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[X]], ptr [[ASM_MEM]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 asm sideeffect "mov $1, $0", "=r,rm,~{dirflag},~{fpsr},~{flags}"(ptr [[ASM_MEM]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %0 = call i32 asm sideeffect "mov $1, $0", "=r,rm,~{dirflag},~{fpsr},~{flags}"(i32 %x)
   ret void
 }
 
 define void @test2(ptr %p) {
-; CHECK-LABEL: @test2
-; CHECK:         %asm_mem = alloca i32
-; CHECK-NEXT:    call void asm sideeffect "mov $1, $0", "=*rm,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %asm_mem)
-; CHECK-NEXT:    %[[VAL1:.*]] = load i32, ptr %asm_mem
-; CHECK-NEXT:    store i32 %[[VAL1]], ptr %p
+; CHECK-LABEL: define void @test2(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ASM_MEM:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    call void asm sideeffect "mov $1, $0", "=*rm,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) [[ASM_MEM]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ASM_MEM]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P]], align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   %0 = call i32 asm sideeffect "mov $1, $0", "=rm,~{dirflag},~{fpsr},~{flags}"()
   store i32 %0, ptr %p
@@ -23,13 +32,17 @@ entry:
 }
 
 define void @test3(ptr %x_ptr) {
-; CHECK-LABEL: @test3
-; CHECK:         %asm_mem = alloca i32
-; CHECK-NEXT:    %x = load i32, ptr %x_ptr
-; CHECK-NEXT:    store i32 %x, ptr %asm_mem
-; CHECK-NEXT:    call void asm sideeffect "inc $0", "=*rm,0,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %asm_mem, ptr %asm_mem)
-; CHECK-NEXT:    %[[VAL2:.*]] = load i32, ptr %asm_mem
-; CHECK-NEXT:    store i32 %[[VAL2]], ptr %x_ptr
+; CHECK-LABEL: define void @test3(
+; CHECK-SAME: ptr [[X_PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ASM_MEM:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[X:%.*]] = load i32, ptr [[X_PTR]], align 4
+; CHECK-NEXT:    store i32 [[X]], ptr [[ASM_MEM]], align 4
+; CHECK-NEXT:    call void asm sideeffect "inc $0", "=*rm,0,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) [[ASM_MEM]], ptr [[ASM_MEM]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ASM_MEM]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[X_PTR]], align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   %x = load i32, ptr %x_ptr
   %0 = call i32 asm sideeffect "inc $0", "=rm,0,~{dirflag},~{fpsr},~{flags}"(i32 %x)



More information about the cfe-commits mailing list