[llvm] ed41945 - [WebAssembly] Fix call unwind mismatches

Sat Feb 6 07:07:47 PST 2021

Author: Heejin Ahn
Date: 2021-02-06T07:07:04-08:00
New Revision: ed41945faadab27036d368cda9223dc3cb6eb840

URL: https://github.com/llvm/llvm-project/commit/ed41945faadab27036d368cda9223dc3cb6eb840
DIFF: https://github.com/llvm/llvm-project/commit/ed41945faadab27036d368cda9223dc3cb6eb840.diff

LOG: [WebAssembly] Fix call unwind mismatches

This adds the `delegate` instruction and uses it to fix unwind destination
mismatches created by marker placement in CFGStackify.

There are two kinds of unwind destination mismatches:
- Mismatches caused by throwing instructions (here we call it "call
  unwind mismatches", even though `throw` and `rethrow` can also cause
  mismatches)
- Mismatches caused by `catch`es, in case a foreign exception is not
  caught by the nearest `catch` and the next outer `catch` is not the
  catch it should unwind to. This kind of mismatch didn't exist in the
  previous version of the spec, because in the previous spec `catch` was
  effectively `catch_all`, catching all exceptions.

This implements routines to fix the first kind of unwind mismatches,
which we call "call unwind mismatches". The second mismatch (catch
unwind mismatches) will be fixed in a later CL.

This also reenables all previously disabled tests in cfg-stackify-eh.ll
and updates FileCheck lines to match the new spec. Two tests were
deleted because they specifically tested the way we fixed unwind
mismatches before using `exnref`s and branches, which we don't do
anymore.

Reviewed By: tlively

Differential Revision: https://reviews.llvm.org/D94048

Added: 
    

Modified: 
    llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
    llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
    llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
    llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
    llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
index ee40b6d2adf7..1df3574e703a 100644

--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
@@ -93,36 +93,41 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, uint64_t Address,
     case WebAssembly::LOOP:
     case WebAssembly::LOOP_S:
       printAnnotation(OS, "label" + utostr(ControlFlowCounter) + ':');
-      ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, true));
+      ControlFlowStack.push_back(std::make_pair(ControlFlowCounter, true));
+      DelegateStack.push_back(ControlFlowCounter++);
       return;
 
     case WebAssembly::BLOCK:
     case WebAssembly::BLOCK_S:
-      ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false));
+      ControlFlowStack.push_back(std::make_pair(ControlFlowCounter, false));
+      DelegateStack.push_back(ControlFlowCounter++);
       return;
 
     case WebAssembly::TRY:
     case WebAssembly::TRY_S:
       ControlFlowStack.push_back(std::make_pair(ControlFlowCounter, false));
-      EHPadStack.push_back(ControlFlowCounter++);
+      EHPadStack.push_back(ControlFlowCounter);
+      DelegateStack.push_back(ControlFlowCounter++);
       return;
 
     case WebAssembly::END_LOOP:
     case WebAssembly::END_LOOP_S:
-      if (ControlFlowStack.empty()) {
+      if (ControlFlowStack.empty() || DelegateStack.empty()) {
         printAnnotation(OS, "End marker mismatch!");
       } else {
         ControlFlowStack.pop_back();
+        DelegateStack.pop_back();
       }
       return;
 
     case WebAssembly::END_BLOCK:
     case WebAssembly::END_BLOCK_S:
-      if (ControlFlowStack.empty()) {
+      if (ControlFlowStack.empty() || DelegateStack.empty()) {
         printAnnotation(OS, "End marker mismatch!");
       } else {
         printAnnotation(
             OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':');
+        DelegateStack.pop_back();
       }
       return;
 
@@ -140,10 +145,11 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, uint64_t Address,
     case WebAssembly::CATCH_S:
     case WebAssembly::CATCH_ALL:
     case WebAssembly::CATCH_ALL_S:
-      if (EHPadStack.empty()) {
+      if (EHPadStack.empty() || DelegateStack.empty()) {
         printAnnotation(OS, "try-catch mismatch!");
       } else {
         printAnnotation(OS, "catch" + utostr(EHPadStack.pop_back_val()) + ':');
+        DelegateStack.pop_back();
       }
       return;
 
@@ -157,6 +163,33 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, uint64_t Address,
         printAnnotation(OS, "down to catch" + utostr(EHPadStack.back()));
       }
       return;
+
+    case WebAssembly::DELEGATE:
+    case WebAssembly::DELEGATE_S:
+      if (ControlFlowStack.empty() || EHPadStack.empty() ||
+          DelegateStack.empty()) {
+        printAnnotation(OS, "try-delegate mismatch!");
+      } else {
+        // 'delegate' is
+        // 1. A marker for the end of block label
+        // 2. A destination for throwing instructions
+        // 3. An instruction that itself rethrows to another 'catch'
+        assert(ControlFlowStack.back().first == EHPadStack.back() &&
+               EHPadStack.back() == DelegateStack.back());
+        std::string Label = "label/catch" +
+                            utostr(ControlFlowStack.pop_back_val().first) +
+                            ": ";
+        EHPadStack.pop_back();
+        DelegateStack.pop_back();
+        uint64_t Depth = MI->getOperand(0).getImm();
+        if (Depth >= DelegateStack.size()) {
+          Label += "to caller";
+        } else {
+          Label += "down to catch" + utostr(DelegateStack.rbegin()[Depth]);
+        }
+        printAnnotation(OS, Label);
+      }
+      return;
     }
 
     // Annotate any control flow label references.

diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
index 2ed6d562acff..e0306bb49e91 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
@@ -27,6 +27,11 @@ class WebAssemblyInstPrinter final : public MCInstPrinter {
   uint64_t ControlFlowCounter = 0;
   SmallVector<std::pair<uint64_t, bool>, 4> ControlFlowStack;
   SmallVector<uint64_t, 4> EHPadStack;
+  // 'delegate' can target any block-like structure, but in case the target is
+  // 'try', it rethrows to the corresponding 'catch'. Because it can target all
+  // blocks but with a slightly different semantics with branches, we need a
+  // separate stack for 'delegate'.
+  SmallVector<uint64_t, 4> DelegateStack;
 
 public:
   WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index a8e0c3efea0e..d2a3292a2805 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -38,7 +38,7 @@ using WebAssembly::SortRegionInfo;
 
 #define DEBUG_TYPE "wasm-cfg-stackify"
 
-STATISTIC(NumUnwindMismatches, "Number of EH pad unwind mismatches found");
+STATISTIC(NumCallUnwindMismatches, "Number of call unwind mismatches found");
 
 namespace {
 class WebAssemblyCFGStackify final : public MachineFunctionPass {
@@ -68,24 +68,35 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass {
   void placeBlockMarker(MachineBasicBlock &MBB);
   void placeLoopMarker(MachineBasicBlock &MBB);
   void placeTryMarker(MachineBasicBlock &MBB);
+
+  // Exception handling related functions
+  bool fixCallUnwindMismatches(MachineFunction &MF);
+  bool fixCatchUnwindMismatches(MachineFunction &MF);
+  void addTryDelegate(MachineInstr *RangeBegin, MachineInstr *RangeEnd,
+                      MachineBasicBlock *DelegateDest);
+  void recalculateScopeTops(MachineFunction &MF);
   void removeUnnecessaryInstrs(MachineFunction &MF);
-  bool fixUnwindMismatches(MachineFunction &MF);
+
+  // Wrap-up
+  unsigned getDepth(const SmallVectorImpl<const MachineBasicBlock *> &Stack,
+                    const MachineBasicBlock *MBB);
   void rewriteDepthImmediates(MachineFunction &MF);
   void fixEndsAtEndOfFunction(MachineFunction &MF);
+  void cleanupFunctionData(MachineFunction &MF);
 
-  // For each BLOCK|LOOP|TRY, the corresponding END_(BLOCK|LOOP|TRY).
+  // For each BLOCK|LOOP|TRY, the corresponding END_(BLOCK|LOOP|TRY) or DELEGATE
+  // (in case of TRY).
   DenseMap<const MachineInstr *, MachineInstr *> BeginToEnd;
-  // For each END_(BLOCK|LOOP|TRY), the corresponding BLOCK|LOOP|TRY.
+  // For each END_(BLOCK|LOOP|TRY) or DELEGATE, the corresponding
+  // BLOCK|LOOP|TRY.
   DenseMap<const MachineInstr *, MachineInstr *> EndToBegin;
   // <TRY marker, EH pad> map
   DenseMap<const MachineInstr *, MachineBasicBlock *> TryToEHPad;
   // <EH pad, TRY marker> map
   DenseMap<const MachineBasicBlock *, MachineInstr *> EHPadToTry;
 
-  // There can be an appendix block at the end of each function, shared for:
-  // - creating a correct signature for fallthrough returns
-  // - target for rethrows that need to unwind to the caller, but are trapped
-  //   inside another try/catch
+  // We need an appendix block to place 'end_loop' or 'end_try' marker when the
+  // loop / exception bottom block is the last block in a function
   MachineBasicBlock *AppendixBB = nullptr;
   MachineBasicBlock *getAppendixBlock(MachineFunction &MF) {
     if (!AppendixBB) {
@@ -97,6 +108,19 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass {
     return AppendixBB;
   }
 
+  // Before running rewriteDepthImmediates function, 'delegate' has a BB as its
+  // destination operand. getFakeCallerBlock() returns a fake BB that will be
+  // used for the operand when 'delegate' needs to rethrow to the caller. This
+  // will be rewritten as an immediate value that is the number of block depths
+  // + 1 in rewriteDepthImmediates, and this fake BB will be removed at the end
+  // of the pass.
+  MachineBasicBlock *FakeCallerBB = nullptr;
+  MachineBasicBlock *getFakeCallerBlock(MachineFunction &MF) {
+    if (!FakeCallerBB)
+      FakeCallerBB = MF.CreateMachineBasicBlock();
+    return FakeCallerBB;
+  }
+
   // Helper functions to register / unregister scope information created by
   // marker instructions.
   void registerScope(MachineInstr *Begin, MachineInstr *End);
@@ -189,6 +213,7 @@ void WebAssemblyCFGStackify::registerScope(MachineInstr *Begin,
   EndToBegin[End] = Begin;
 }
 
+// When 'End' is not an 'end_try' but 'delegate', EHPad is nullptr.
 void WebAssemblyCFGStackify::registerTryScope(MachineInstr *Begin,
                                               MachineInstr *End,
                                               MachineBasicBlock *EHPad) {
@@ -675,6 +700,8 @@ void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) {
     while (Cont->isEHPad()) {
       MachineInstr *Try = EHPadToTry[Cont];
       MachineInstr *EndTry = BeginToEnd[Try];
+      // We started from an EH pad, so the end marker cannot be a delegate
+      assert(EndTry->getOpcode() != WebAssembly::DELEGATE);
       Cont = EndTry->getParent();
     }
 
@@ -719,8 +746,10 @@ void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) {
     for (auto &MI : MBB) {
       if (MI.getOpcode() != WebAssembly::TRY)
         continue;
-
       MachineInstr *Try = &MI, *EndTry = BeginToEnd[Try];
+      if (EndTry->getOpcode() == WebAssembly::DELEGATE)
+        continue;
+
       MachineBasicBlock *TryBB = Try->getParent();
       MachineBasicBlock *Cont = EndTry->getParent();
       int64_t RetType = Try->getOperand(0).getImm();
@@ -763,12 +792,8 @@ static unsigned getCopyOpcode(const TargetRegisterClass *RC) {
 
 // When MBB is split into MBB and Split, we should unstackify defs in MBB that
 // have their uses in Split.
-// FIXME This function will be used when fixing unwind mismatches, but the old
-// version of that function was removed for the moment and the new version has
-// not yet been added. So 'LLVM_ATTRIBUTE_UNUSED' is added to suppress the
-// warning. Remove the attribute after the new functionality is added.
-LLVM_ATTRIBUTE_UNUSED static void
-unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB, MachineBasicBlock &Split) {
+static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB,
+                                         MachineBasicBlock &Split) {
   MachineFunction &MF = *MBB.getParent();
   const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
   auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
@@ -826,14 +851,409 @@ unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB, MachineBasicBlock &Split) {
   }
 }
 
-bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
-  // TODO Implement this
+// Wrap the given range of instruction with try-delegate. RangeBegin and
+// RangeEnd are inclusive.
+void WebAssemblyCFGStackify::addTryDelegate(MachineInstr *RangeBegin,
+                                            MachineInstr *RangeEnd,
+                                            MachineBasicBlock *DelegateDest) {
+  auto *BeginBB = RangeBegin->getParent();
+  auto *EndBB = RangeEnd->getParent();
+  MachineFunction &MF = *BeginBB->getParent();
+  const auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+  // Local expression tree before the first call of this range should go
+  // after the nested TRY.
+  SmallPtrSet<const MachineInstr *, 4> AfterSet;
+  AfterSet.insert(RangeBegin);
+  for (auto I = MachineBasicBlock::iterator(RangeBegin), E = BeginBB->begin();
+       I != E; --I) {
+    if (std::prev(I)->isDebugInstr() || std::prev(I)->isPosition())
+      continue;
+    if (WebAssembly::isChild(*std::prev(I), MFI))
+      AfterSet.insert(&*std::prev(I));
+    else
+      break;
+  }
+
+  // Create the nested try instruction.
+  auto TryPos = getLatestInsertPos(
+      BeginBB, SmallPtrSet<const MachineInstr *, 4>(), AfterSet);
+  MachineInstr *Try = BuildMI(*BeginBB, TryPos, RangeBegin->getDebugLoc(),
+                              TII.get(WebAssembly::TRY))
+                          .addImm(int64_t(WebAssembly::BlockType::Void));
+
+  // Create a BB to insert the 'delegate' instruction.
+  MachineBasicBlock *DelegateBB = MF.CreateMachineBasicBlock();
+  // If the destination of 'delegate' is not the caller, adds the destination to
+  // the BB's successors.
+  if (DelegateDest != FakeCallerBB)
+    DelegateBB->addSuccessor(DelegateDest);
+
+  auto SplitPos = std::next(RangeEnd->getIterator());
+  if (SplitPos == EndBB->end()) {
+    // If the range's end instruction is at the end of the BB, insert the new
+    // delegate BB after the current BB.
+    MF.insert(std::next(EndBB->getIterator()), DelegateBB);
+    EndBB->addSuccessor(DelegateBB);
+
+  } else {
+    // If the range's end instruction is in the middle of the BB, we split the
+    // BB into two and insert the delegate BB in between.
+    // - Before:
+    // bb:
+    //   range_end
+    //   other_insts
+    //
+    // - After:
+    // pre_bb: (previous 'bb')
+    //   range_end
+    // delegate_bb: (new)
+    //   delegate
+    // post_bb: (new)
+    //   other_insts
+    MachineBasicBlock *PreBB = EndBB;
+    MachineBasicBlock *PostBB = MF.CreateMachineBasicBlock();
+    MF.insert(std::next(PreBB->getIterator()), PostBB);
+    MF.insert(std::next(PreBB->getIterator()), DelegateBB);
+    PostBB->splice(PostBB->end(), PreBB, SplitPos, PreBB->end());
+    PostBB->transferSuccessors(PreBB);
+    unstackifyVRegsUsedInSplitBB(*PreBB, *PostBB);
+    PreBB->addSuccessor(DelegateBB);
+    PreBB->addSuccessor(PostBB);
+  }
+
+  // Add 'delegate' instruction in the delegate BB created above.
+  MachineInstr *Delegate = BuildMI(DelegateBB, RangeEnd->getDebugLoc(),
+                                   TII.get(WebAssembly::DELEGATE))
+                               .addMBB(DelegateDest);
+  registerTryScope(Try, Delegate, nullptr);
+}
+
+bool WebAssemblyCFGStackify::fixCallUnwindMismatches(MachineFunction &MF) {
+  // Linearizing the control flow by placing TRY / END_TRY markers can create
+  // mismatches in unwind destinations for throwing instructions, such as calls.
+  //
+  // We use the 'delegate' instruction to fix the unwind mismatches. 'delegate'
+  // instruction delegates an exception to an outer 'catch'. It can target not
+  // only 'catch' but all block-like structures including another 'delegate',
+  // but with slightly different semantics than branches. When it targets a
+  // 'catch', it will delegate the exception to that catch. It is being
+  // discussed how to define the semantics when 'delegate''s target is a non-try
+  // block: it will either be a validation failure or it will target the next
+  // outer try-catch. But anyway our LLVM backend currently does not generate
+  // such code. The example below illustrates where the 'delegate' instruction
+  // in the middle will delegate the exception to, depending on the value of N.
+  // try
+  //   try
+  //     block
+  //       try
+  //         try
+  //           call @foo
+  //         delegate N    ;; Where will this delegate to?
+  //       catch           ;; N == 0
+  //       end
+  //     end               ;; N == 1 (invalid; will not be generated)
+  //   delegate            ;; N == 2
+  // catch                 ;; N == 3
+  // end
+  //                       ;; N == 4 (to caller)
+
+  // 1. When an instruction may throw, but the EH pad it will unwind to can be
+  //    different from the original CFG.
+  //
+  // Example: we have the following CFG:
+  // bb0:
+  //   call @foo    ; if it throws, unwind to bb2
+  // bb1:
+  //   call @bar    ; if it throws, unwind to bb3
+  // bb2 (ehpad):
+  //   catch
+  //   ...
+  // bb3 (ehpad)
+  //   catch
+  //   ...
+  //
+  // And the CFG is sorted in this order. Then after placing TRY markers, it
+  // will look like: (BB markers are omitted)
+  // try
+  //   try
+  //     call @foo
+  //     call @bar   ;; if it throws, unwind to bb3
+  //   catch         ;; ehpad (bb2)
+  //     ...
+  //   end_try
+  // catch           ;; ehpad (bb3)
+  //   ...
+  // end_try
+  //
+  // Now if bar() throws, it is going to end up in bb2, not bb3, where it
+  // is supposed to end up. We solve this problem by wrapping the mismatching
+  // call with an inner try-delegate that rethrows the exception to the right
+  // 'catch'.
+  //
+  //
+  // try
+  //   try
+  //     call @foo
+  //     try               ;; (new)
+  //       call @bar
+  //     delegate 1 (bb3)  ;; (new)
+  //   catch               ;; ehpad (bb2)
+  //     ...
+  //   end_try
+  // catch                 ;; ehpad (bb3)
+  //   ...
+  // end_try
+  //
+  // ---
+  // 2. The same as 1, but in this case an instruction unwinds to a caller
+  //    function and not another EH pad.
+  //
+  // Example: we have the following CFG:
+  // bb0:
+  //   call @foo       ; if it throws, unwind to bb2
+  // bb1:
+  //   call @bar       ; if it throws, unwind to caller
+  // bb2 (ehpad):
+  //   catch
+  //   ...
+  //
+  // And the CFG is sorted in this order. Then after placing TRY markers, it
+  // will look like:
+  // try
+  //   call @foo
+  //   call @bar     ;; if it throws, unwind to caller
+  // catch           ;; ehpad (bb2)
+  //   ...
+  // end_try
+  //
+  // Now if bar() throws, it is going to end up in bb2, when it is supposed to
+  // throw up to the caller. We solve this problem in the same way, but in this
+  // case 'delegate's immediate argument is the number of block depths + 1,
+  // which means it rethrows to the caller.
+  // try
+  //   call @foo
+  //   try                  ;; (new)
+  //     call @bar
+  //   delegate 1 (caller)  ;; (new)
+  // catch                  ;; ehpad (bb2)
+  //   ...
+  // end_try
+  //
+  // Before rewriteDepthImmediates, delegate's argument is a BB. In case of the
+  // caller, it will take a fake BB generated by getFakeCallerBlock(), which
+  // will be converted to a correct immediate argument later.
+  //
+  // In case there are multiple calls in a BB that may throw to the caller, they
+  // can be wrapped together in one nested try-delegate scope. (In 1, this
+  // couldn't happen, because may-throwing instruction there had an unwind
+  // destination, i.e., it was an invoke before, and there could be only one
+  // invoke within a BB.)
+
+  SmallVector<const MachineBasicBlock *, 8> EHPadStack;
+  // Range of instructions to be wrapped in a new nested try/catch. A range
+  // exists in a single BB and does not span multiple BBs.
+  using TryRange = std::pair<MachineInstr *, MachineInstr *>;
+  // In original CFG, <unwind destination BB, a vector of try ranges>
+  DenseMap<MachineBasicBlock *, SmallVector<TryRange, 4>> UnwindDestToTryRanges;
+
+  // Gather possibly throwing calls (i.e., previously invokes) whose current
+  // unwind destination is not the same as the original CFG. (Case 1)
+
+  for (auto &MBB : reverse(MF)) {
+    bool SeenThrowableInstInBB = false;
+    for (auto &MI : reverse(MBB)) {
+      if (MI.getOpcode() == WebAssembly::TRY)
+        EHPadStack.pop_back();
+      else if (WebAssembly::isCatch(MI.getOpcode()))
+        EHPadStack.push_back(MI.getParent());
+
+      // In this loop we only gather calls that have an EH pad to unwind. So
+      // there will be at most 1 such call (= invoke) in a BB, so after we've
+      // seen one, we can skip the rest of BB. Also if MBB has no EH pad
+      // successor or MI does not throw, this is not an invoke.
+      if (SeenThrowableInstInBB || !MBB.hasEHPadSuccessor() ||
+          !WebAssembly::mayThrow(MI))
+        continue;
+      SeenThrowableInstInBB = true;
+
+      // If the EH pad on the stack top is where this instruction should unwind
+      // next, we're good.
+      MachineBasicBlock *UnwindDest = getFakeCallerBlock(MF);
+      for (auto *Succ : MBB.successors()) {
+        // Even though semantically a BB can have multiple successors in case an
+        // exception is not caught by a catchpad, in our backend implementation
+        // it is guaranteed that a BB can have at most one EH pad successor. For
+        // details, refer to comments in findWasmUnwindDestinations function in
+        // SelectionDAGBuilder.cpp.
+        if (Succ->isEHPad()) {
+          UnwindDest = Succ;
+          break;
+        }
+      }
+      if (EHPadStack.back() == UnwindDest)
+        continue;
+
+      // Include EH_LABELs in the range before and after the invoke
+      MachineInstr *RangeBegin = &MI, *RangeEnd = &MI;
+      if (RangeBegin->getIterator() != MBB.begin() &&
+          std::prev(RangeBegin->getIterator())->isEHLabel())
+        RangeBegin = &*std::prev(RangeBegin->getIterator());
+      if (std::next(RangeEnd->getIterator()) != MBB.end() &&
+          std::next(RangeEnd->getIterator())->isEHLabel())
+        RangeEnd = &*std::next(RangeEnd->getIterator());
+
+      // If not, record the range.
+      UnwindDestToTryRanges[UnwindDest].push_back(
+          TryRange(RangeBegin, RangeEnd));
+      LLVM_DEBUG(dbgs() << "- Call unwind mismatch: MBB = " << MBB.getName()
+                        << "\nCall = " << MI
+                        << "\nOriginal dest = " << UnwindDest->getName()
+                        << "  Current dest = " << EHPadStack.back()->getName()
+                        << "\n\n");
+    }
+  }
+
+  assert(EHPadStack.empty());
+
+  // Gather possibly throwing calls that are supposed to unwind up to the caller
+  // if they throw, but currently unwind to an incorrect destination. Unlike the
+  // loop above, there can be multiple calls within a BB that unwind to the
+  // caller, which we should group together in a range. (Case 2)
+
+  MachineInstr *RangeBegin = nullptr, *RangeEnd = nullptr; // inclusive
+
+  // Record the range.
+  auto RecordCallerMismatchRange = [&](const MachineBasicBlock *CurrentDest) {
+    UnwindDestToTryRanges[getFakeCallerBlock(MF)].push_back(
+        TryRange(RangeBegin, RangeEnd));
+    LLVM_DEBUG(dbgs() << "- Call unwind mismatch: MBB = "
+                      << RangeBegin->getParent()->getName()
+                      << "\nRange begin = " << *RangeBegin
+                      << "Range end = " << *RangeEnd
+                      << "\nOriginal dest = caller  Current dest = "
+                      << CurrentDest->getName() << "\n\n");
+    RangeBegin = RangeEnd = nullptr; // Reset range pointers
+  };
+
+  for (auto &MBB : reverse(MF)) {
+    bool SeenThrowableInstInBB = false;
+    for (auto &MI : reverse(MBB)) {
+      if (MI.getOpcode() == WebAssembly::TRY)
+        EHPadStack.pop_back();
+      else if (WebAssembly::isCatch(MI.getOpcode()))
+        EHPadStack.push_back(MI.getParent());
+      bool MayThrow = WebAssembly::mayThrow(MI);
+
+      // If MBB has an EH pad successor and this is the last instruction that
+      // may throw, this instruction unwinds to the EH pad and not to the
+      // caller.
+      if (MBB.hasEHPadSuccessor() && MayThrow && !SeenThrowableInstInBB) {
+        SeenThrowableInstInBB = true;
+        continue;
+      }
+
+      // We wrap up the current range when we see a marker even if we haven't
+      // finished a BB.
+      if (RangeEnd && WebAssembly::isMarker(MI.getOpcode())) {
+        RecordCallerMismatchRange(EHPadStack.back());
+        continue;
+      }
+
+      // If EHPadStack is empty, that means it correctly unwinds to the caller
+      // if it throws, so we're good. If MI does not throw, we're good too.
+      if (EHPadStack.empty() || !MayThrow)
+        continue;
+
+      // We found an instruction that unwinds to the caller but currently has an
+      // incorrect unwind destination. Create a new range or increment the
+      // currently existing range.
+      if (!RangeEnd)
+        RangeBegin = RangeEnd = &MI;
+      else
+        RangeBegin = &MI;
+    }
+
+    if (RangeEnd)
+      RecordCallerMismatchRange(EHPadStack.back());
+  }
+
+  assert(EHPadStack.empty());
+
+  // We don't have any unwind destination mismatches to resolve.
+  if (UnwindDestToTryRanges.empty())
+    return false;
+
+  // Now we fix the mismatches by wrapping calls with inner try-delegates.
+  for (auto &P : UnwindDestToTryRanges) {
+    NumCallUnwindMismatches += P.second.size();
+    MachineBasicBlock *UnwindDest = P.first;
+    auto &TryRanges = P.second;
+
+    for (auto Range : TryRanges) {
+      MachineInstr *RangeBegin = nullptr, *RangeEnd = nullptr;
+      std::tie(RangeBegin, RangeEnd) = Range;
+      auto *MBB = RangeBegin->getParent();
+
+      // If this BB has an EH pad successor, i.e., ends with an 'invoke', now we
+      // are going to wrap the invoke with try-delegate, making the 'delegate'
+      // BB the new successor instead, so remove the EH pad successor here. The
+      // BB may not have an EH pad successor if calls in this BB throw to the
+      // caller.
+      MachineBasicBlock *EHPad = nullptr;
+      for (auto *Succ : MBB->successors()) {
+        if (Succ->isEHPad()) {
+          EHPad = Succ;
+          break;
+        }
+      }
+      if (EHPad)
+        MBB->removeSuccessor(EHPad);
+
+      addTryDelegate(RangeBegin, RangeEnd, UnwindDest);
+    }
+  }
+
+  return true;
+}
+
+bool WebAssemblyCFGStackify::fixCatchUnwindMismatches(MachineFunction &MF) {
+  // TODO implement
   return false;
 }
 
-static unsigned
-getDepth(const SmallVectorImpl<const MachineBasicBlock *> &Stack,
-         const MachineBasicBlock *MBB) {
+void WebAssemblyCFGStackify::recalculateScopeTops(MachineFunction &MF) {
+  // Renumber BBs and recalculate ScopeTop info because new BBs might have been
+  // created and inserted during fixing unwind mismatches.
+  MF.RenumberBlocks();
+  ScopeTops.clear();
+  ScopeTops.resize(MF.getNumBlockIDs());
+  for (auto &MBB : reverse(MF)) {
+    for (auto &MI : reverse(MBB)) {
+      if (ScopeTops[MBB.getNumber()])
+        break;
+      switch (MI.getOpcode()) {
+      case WebAssembly::END_BLOCK:
+      case WebAssembly::END_LOOP:
+      case WebAssembly::END_TRY:
+      case WebAssembly::DELEGATE:
+        updateScopeTops(EndToBegin[&MI]->getParent(), &MBB);
+        break;
+      case WebAssembly::CATCH:
+      case WebAssembly::CATCH_ALL:
+        updateScopeTops(EHPadToTry[&MBB]->getParent(), &MBB);
+        break;
+      }
+    }
+  }
+}
+
+unsigned WebAssemblyCFGStackify::getDepth(
+    const SmallVectorImpl<const MachineBasicBlock *> &Stack,
+    const MachineBasicBlock *MBB) {
+  if (MBB == FakeCallerBB)
+    return Stack.size();
   unsigned Depth = 0;
   for (auto X : reverse(Stack)) {
     if (X == MBB)
@@ -937,13 +1357,18 @@ void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) {
   }
   // Fix mismatches in unwind destinations induced by linearizing the code.
   if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm &&
-      MF.getFunction().hasPersonalityFn())
-    fixUnwindMismatches(MF);
+      MF.getFunction().hasPersonalityFn()) {
+    bool Changed = fixCallUnwindMismatches(MF);
+    Changed |= fixCatchUnwindMismatches(MF);
+    if (Changed)
+      recalculateScopeTops(MF);
+  }
 }
 
 void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
   // Now rewrite references to basic blocks to be depth immediates.
   SmallVector<const MachineBasicBlock *, 8> Stack;
+  SmallVector<const MachineBasicBlock *, 8> DelegateStack;
   for (auto &MBB : reverse(MF)) {
     for (auto I = MBB.rbegin(), E = MBB.rend(); I != E; ++I) {
       MachineInstr &MI = *I;
@@ -954,20 +1379,34 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
                    MBB.getNumber() &&
                "Block/try marker should be balanced");
         Stack.pop_back();
+        DelegateStack.pop_back();
         break;
 
       case WebAssembly::LOOP:
         assert(Stack.back() == &MBB && "Loop top should be balanced");
         Stack.pop_back();
+        DelegateStack.pop_back();
         break;
 
       case WebAssembly::END_BLOCK:
+        Stack.push_back(&MBB);
+        DelegateStack.push_back(&MBB);
+        break;
+
       case WebAssembly::END_TRY:
+        // We handle DELEGATE in the default level, because DELEGATE has
+        // immediate operands to rewrite.
         Stack.push_back(&MBB);
         break;
 
       case WebAssembly::END_LOOP:
         Stack.push_back(EndToBegin[&MI]->getParent());
+        DelegateStack.push_back(EndToBegin[&MI]->getParent());
+        break;
+
+      case WebAssembly::CATCH:
+      case WebAssembly::CATCH_ALL:
+        DelegateStack.push_back(&MBB);
         break;
 
       default:
@@ -977,11 +1416,21 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
           while (MI.getNumOperands() > 0)
             MI.RemoveOperand(MI.getNumOperands() - 1);
           for (auto MO : Ops) {
-            if (MO.isMBB())
-              MO = MachineOperand::CreateImm(getDepth(Stack, MO.getMBB()));
+            if (MO.isMBB()) {
+              if (MI.getOpcode() == WebAssembly::DELEGATE)
+                MO = MachineOperand::CreateImm(
+                    getDepth(DelegateStack, MO.getMBB()));
+              else
+                MO = MachineOperand::CreateImm(getDepth(Stack, MO.getMBB()));
+            }
             MI.addOperand(MF, MO);
           }
         }
+
+        if (MI.getOpcode() == WebAssembly::DELEGATE) {
+          Stack.push_back(&MBB);
+          DelegateStack.push_back(&MBB);
+        }
         break;
       }
     }
@@ -989,13 +1438,18 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
   assert(Stack.empty() && "Control flow should be balanced");
 }
 
+void WebAssemblyCFGStackify::cleanupFunctionData(MachineFunction &MF) {
+  if (FakeCallerBB)
+    MF.DeleteMachineBasicBlock(FakeCallerBB);
+  AppendixBB = FakeCallerBB = nullptr;
+}
+
 void WebAssemblyCFGStackify::releaseMemory() {
   ScopeTops.clear();
   BeginToEnd.clear();
   EndToBegin.clear();
   TryToEHPad.clear();
   EHPadToTry.clear();
-  AppendixBB = nullptr;
 }
 
 bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
@@ -1031,6 +1485,8 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
            .isOSBinFormatELF())
     appendEndToFunction(MF, TII);
 
+  cleanupFunctionData(MF);
+
   MF.getInfo<WebAssemblyFunctionInfo>()->setCFGStackified();
   return true;
 }

diff  --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index 702560bea100..c687fa2dad3f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -156,6 +156,10 @@ defm CATCH : I<(outs I32:$dst), (ins event_op:$tag),
 defm CATCH_ALL : NRI<(outs), (ins), [], "catch_all", 0x05>;
 }
 
+// Delegating an exception: delegate
+let isTerminator = 1, hasCtrlDep = 1, hasSideEffects = 1 in
+defm DELEGATE : NRI<(outs), (ins bb_op:$dst), [], "delegate \t $dst", 0x18>;
+
 // Pseudo instructions: cleanupret / catchret
 let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
     isPseudo = 1, isEHScopeReturn = 1 in {

diff  --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll
index 209aaea2aaf6..b9d62c6f5ad6 100644
--- a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll
+++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll
@@ -1,11 +1,9 @@
 ; REQUIRES: asserts
-; TODO Reenable disabled lines after updating the backend to the new spec
 ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -exception-model=wasm -mattr=+exception-handling | FileCheck %s
 ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -exception-model=wasm -mattr=+exception-handling
 ; RUN: llc < %s -O0 -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -verify-machineinstrs -exception-model=wasm -mattr=+exception-handling | FileCheck %s --check-prefix=NOOPT
-; R UN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -exception-model=wasm -mattr=+exception-handling -wasm-disable-ehpad-sort | FileCheck %s --check-prefix=NOSORT
-; R UN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -exception-model=wasm -mattr=+exception-handling -wasm-disable-ehpad-sort | FileCheck %s --check-prefix=NOSORT-LOCALS
-; R UN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -exception-model=wasm -mattr=+exception-handling -wasm-disable-ehpad-sort -stats 2>&1 | FileCheck %s --check-prefix=NOSORT-STAT
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -exception-model=wasm -mattr=+exception-handling -wasm-disable-ehpad-sort -stats 2>&1 | FileCheck %s --check-prefix=NOSORT
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -exception-model=wasm -mattr=+exception-handling -wasm-disable-ehpad-sort | FileCheck %s --check-prefix=NOSORT-LOCALS
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
@@ -381,31 +379,24 @@ try.cont:                                         ; preds = %catch.start, %loop
 ; destination mismatches. And we use -wasm-disable-ehpad-sort to create maximum
 ; number of mismatches in several tests below.
 
-; 'call bar''s original unwind destination was 'C1', but after control flow
-; linearization, its unwind destination incorrectly becomes 'C0'. We fix this by
-; wrapping the call with a nested try/catch/end_try and branching to the right
-; destination (L0).
+; 'call bar''s original unwind destination was 'C0', but after control flow
+; linearization, its unwind destination incorrectly becomes 'C1'. We fix this by
+; wrapping the call with a nested try-delegate that targets 'C0'.
 
 ; NOSORT-LABEL: test5
-; NOSORT:   block
+; NOSORT: try
+; NOSORT:   try
+; NOSORT:     call  foo
+; --- try-delegate starts (call unwind mismatch)
 ; NOSORT:     try
-; NOSORT:       try
-; NOSORT:         call      foo
-; --- Nested try/catch/end_try starts
-; NOSORT:         try
-; NOSORT:           call      bar
-; NOSORT:         catch     $drop=
-; NOSORT:           br        2                        # 2: down to label[[L0:[0-9]+]]
-; NOSORT:         end_try
-; --- Nested try/catch/end_try ends
-; NOSORT:         br        2                          # 2: down to label[[L1:[0-9]+]]
-; NOSORT:       catch     $drop=                       # catch[[C0:[0-9]+]]:
-; NOSORT:         br        2                          # 2: down to label[[L1]]
-; NOSORT:       end_try
-; NOSORT:     catch     $drop=                         # catch[[C1:[0-9]+]]:
-; NOSORT:     end_try                                  # label[[L0]]:
-; NOSORT:   end_block                                  # label[[L1]]:
-; NOSORT:   return
+; NOSORT:       call  bar
+; NOSORT:     delegate    1     # label/catch{{[0-9]+}}: down to catch[[C0:[0-9]+]]
+; --- try-delegate ends (call unwind mismatch)
+; NOSORT:   catch   {{.*}}      # catch[[C1:[0-9]+]]:
+; NOSORT:   end_try
+; NOSORT: catch   {{.*}}        # catch[[C0]]:
+; NOSORT: end_try
+; NOSORT: return
 
 define void @test5() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
 bb0:
@@ -438,31 +429,29 @@ try.cont:                                         ; preds = %catch.start1, %catc
   ret void
 }
 
-; Two 'call bar''s original unwind destination was the caller, but after control
-; flow linearization, their unwind destination incorrectly becomes 'C0'. We fix
-; this by wrapping the call with a nested try/catch/end_try and branching to the
-; right destination (L0), from which we rethrow the exception to the caller.
+; 'call bar' and 'call baz''s original unwind destination was the caller, but
+; after control flow linearization, their unwind destination incorrectly becomes
+; 'C0'. We fix this by wrapping the calls with a nested try-delegate that
+; rethrows exceptions to the caller.
 
 ; And the return value of 'baz' should NOT be stackified because the BB is split
 ; during fixing unwind mismatches.
 
 ; NOSORT-LABEL: test6
+; NOSORT: try
+; NOSORT:   call  foo
+; --- try-delegate starts (call unwind mismatch)
 ; NOSORT:   try
-; NOSORT:     call      foo
-; --- Nested try/catch/end_try starts
-; NOSORT:     try
-; NOSORT:       call      bar
-; NOSORT:       call      ${{[0-9]+}}=, baz
-; NOSORT-NOT:   call      $push{{.*}}=, baz
-; NOSORT:     catch     $[[REG:[0-9]+]]=
-; NOSORT:       br        1                            # 1: down to label[[L0:[0-9]+]]
-; NOSORT:     end_try
-; --- Nested try/catch/end_try ends
-; NOSORT:     return
-; NOSORT:   catch     $drop=                           # catch[[C0:[0-9]+]]:
-; NOSORT:     return
-; NOSORT:   end_try                                    # label[[L0]]:
-; NOSORT:   rethrow   $[[REG]]                         # to caller
+; NOSORT:     call  bar
+; NOSORT:     call  $[[RET:[0-9]+]]=, baz
+; NOSORT-NOT: call  $push{{.*}}=, baz
+; NOSORT:   delegate    1                     # label/catch{{[0-9]+}}: to caller
+; --- try-delegate ends (call unwind mismatch)
+; NOSORT:   call  nothrow, $[[RET]]
+; NOSORT:   return
+; NOSORT: catch   {{.*}}                      # catch[[C0:[0-9]+]]:
+; NOSORT:   return
+; NOSORT: end_try
 
 define void @test6() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
 bb0:
@@ -488,34 +477,86 @@ try.cont:                                         ; preds = %catch.start0
   ret void
 }
 
+; The same as test5, but we have one more call 'call @foo' in bb1 which unwinds
+; to the caller. In this case bb1 has two call unwind mismatches: 'call @foo'
+; unwinds to the caller and 'call @bar' unwinds to catch C0.
+
+; NOSORT-LABEL: test7
+; NOSORT: try
+; NOSORT:   try
+; NOSORT:     call  foo
+; --- try-delegate starts (call unwind mismatch)
+; NOSORT:     try
+; NOSORT:       call  foo
+; NOSORT:     delegate    2     # label/catch{{[0-9]+}}: to caller
+; --- try-delegate ends (call unwind mismatch)
+; --- try-delegate starts (call unwind mismatch)
+; NOSORT:     try
+; NOSORT:       call  bar
+; NOSORT:     delegate    1     # label/catch{{[0-9]+}}: down to catch[[C0:[0-9]+]]
+; --- try-delegate ends (call unwind mismatch)
+; NOSORT:   catch   {{.*}}      # catch[[C1:[0-9]+]]:
+; NOSORT:   end_try
+; NOSORT: catch   {{.*}}        # catch[[C0]]:
+; NOSORT: end_try
+; NOSORT: return
+
+define void @test7() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
+bb0:
+  invoke void @foo()
+          to label %bb1 unwind label %catch.dispatch0
+
+bb1:                                              ; preds = %bb0
+  call void @foo()
+  invoke void @bar()
+          to label %try.cont unwind label %catch.dispatch1
+
+catch.dispatch0:                                  ; preds = %bb0
+  %0 = catchswitch within none [label %catch.start0] unwind to caller
+
+catch.start0:                                     ; preds = %catch.dispatch0
+  %1 = catchpad within %0 [i8* null]
+  %2 = call i8* @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  catchret from %1 to label %try.cont
+
+catch.dispatch1:                                  ; preds = %bb1
+  %4 = catchswitch within none [label %catch.start1] unwind to caller
+
+catch.start1:                                     ; preds = %catch.dispatch1
+  %5 = catchpad within %4 [i8* null]
+  %6 = call i8* @llvm.wasm.get.exception(token %5)
+  %7 = call i32 @llvm.wasm.get.ehselector(token %5)
+  catchret from %5 to label %try.cont
+
+try.cont:                                         ; preds = %catch.start1, %catch.start0, %bb1
+  ret void
+}
+
 ; Similar situation as @test6. Here 'call @qux''s original unwind destination
 ; was the caller, but after control flow linearization, their unwind destination
 ; incorrectly becomes 'C0' within the function. We fix this by wrapping the call
-; with a nested try/catch/end_try and branching to the right destination, from
-; which we rethrow the exception to the caller.
+; with a nested try-delegate that rethrows the exception to the caller.
 
 ; Because 'call @qux' pops an argument pushed by 'i32.const 5' from stack, the
 ; nested 'try' should be placed before `i32.const 5', not between 'i32.const 5'
 ; and 'call @qux'.
 
-; NOSORT-LABEL: test7
+; NOSORT-LABEL: test8
+; NOSORT: try       i32
+; NOSORT:   call  foo
+; --- try-delegate starts (call unwind mismatch)
 ; NOSORT:   try
-; NOSORT:     call      foo
-; --- Nested try/catch/end_try starts
-; NOSORT:     try
-; NOSORT-NEXT:  i32.const $push{{[0-9]+}}=, 5
-; NOSORT-NEXT:  call      ${{[0-9]+}}=, qux
-; NOSORT:     catch     $[[REG:[0-9]+]]=
-; NOSORT:       br        1                            # 1: down to label[[L0:[0-9]+]]
-; NOSORT:     end_try
-; --- Nested try/catch/end_try ends
-; NOSORT:     return
-; NOSORT:   catch     $drop=                           # catch[[C0:[0-9]+]]:
-; NOSORT:     return
-; NOSORT:   end_try                                    # label[[L0]]:
-; NOSORT:   rethrow   $[[REG]]                         # to caller
-
-define i32 @test7() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
+; NOSORT:     i32.const  $push{{[0-9]+}}=, 5
+; NOSORT:     call  ${{[0-9]+}}=, qux
+; NOSORT:   delegate    1                     # label/catch{{[0-9]+}}: to caller
+; --- try-delegate ends (call unwind mismatch)
+; NOSORT:   return
+; NOSORT: catch   {{.*}}                      # catch[[C0:[0-9]+]]:
+; NOSORT:   return
+; NOSORT: end_try
+
+define i32 @test8() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
 bb0:
   invoke void @foo()
           to label %bb1 unwind label %catch.dispatch0
@@ -538,10 +579,10 @@ try.cont:                                         ; preds = %catch.start0
 }
 
 ; Tests the case when TEE stackifies a register in RegStackify but it gets
-; unstackified in fixUnwindMismatches in CFGStackify.
+; unstackified in fixCallUnwindMismatches in CFGStackify.
 
-; NOSORT-LOCALS-LABEL: test8
-define void @test8(i32 %x) personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
+; NOSORT-LOCALS-LABEL: test9
+define void @test9(i32 %x) personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
 bb0:
   invoke void @foo()
           to label %bb1 unwind label %catch.dispatch0
@@ -551,7 +592,7 @@ bb1:                                              ; preds = %bb0
   ; This %addr is used in multiple places, so tee is introduced in RegStackify,
   ; which stackifies the use of %addr in store instruction. A tee has two dest
   ; registers, the first of which is stackified and the second is not.
-  ; But when we introduce a nested try-catch in fixUnwindMismatches in
+  ; But when we introduce a nested try-delegate in fixCallUnwindMismatches in
   ; CFGStackify, it is possible that we end up unstackifying the first dest
   ; register. In that case, we convert that tee into a copy.
   %addr = inttoptr i32 %t to i32*
@@ -577,62 +618,6 @@ try.cont:                                         ; preds = %catch.start0
   ret void
 }
 
-; If not for the unwind destination mismatch, the LOOP marker here would have an
-; i32 signature. But because we add a rethrow instruction at the end of the
-; appendix block, now the LOOP marker does not have a signature (= has a void
-; signature). Here the two calls two 'bar' are supposed to throw up to the
-; caller, but incorrectly unwind to 'C0' after linearizing the CFG.
-
-; NOSORT-LABEL: test9
-; NOSORT: block
-; NOSORT-NOT: loop      i32
-; NOSORT:   loop                                       # label[[L0:[0-9]+]]:
-; NOSORT:     try
-; NOSORT:       call      foo
-; --- Nested try/catch/end_try starts
-; NOSORT:       try
-; NOSORT:         call      bar
-; NOSORT:         call      bar
-; NOSORT:       catch     $[[REG:[0-9]+]]=
-; NOSORT:         br        1                          # 1: down to label[[L1:[0-9]+]]
-; NOSORT:       end_try
-; --- Nested try/catch/end_try ends
-; NOSORT:       return    {{.*}}
-; NOSORT:     catch     $drop=                         # catch[[C0:[0-9]+]]:
-; NOSORT:       br        1                            # 1: up to label[[L0]]
-; NOSORT:     end_try                                  # label[[L1]]:
-; NOSORT:   end_loop
-; NOSORT: end_block
-; NOSORT: rethrow   $[[REG]]                           # to caller
-
-define i32 @test9(i32* %p) personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
-entry:
-  store volatile i32 0, i32* %p
-  br label %loop
-
-loop:                                             ; preds = %try.cont, %entry
-  store volatile i32 1, i32* %p
-  invoke void @foo()
-          to label %bb unwind label %catch.dispatch
-
-bb:                                               ; preds = %loop
-  call void @bar()
-  call void @bar()
-  ret i32 0
-
-catch.dispatch:                                   ; preds = %loop
-  %0 = catchswitch within none [label %catch.start] unwind to caller
-
-catch.start:                                      ; preds = %catch.dispatch
-  %1 = catchpad within %0 [i8* null]
-  %2 = call i8* @llvm.wasm.get.exception(token %1)
-  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
-  catchret from %1 to label %try.cont
-
-try.cont:                                         ; preds = %catch.start
-  br label %loop
-}
-
 ; When we have both kinds of EH pad unwind mismatches:
 ; - A may-throw instruction unwinds to an incorrect EH pad after linearizing the
 ;   CFG, when it is supposed to unwind to another EH pad.
@@ -640,48 +625,28 @@ try.cont:                                         ; preds = %catch.start
 ;   CFG, when it is supposed to unwind to the caller.
 
 ; NOSORT-LABEL: test10
-; NOSORT: block
-; NOSORT:   block
+; NOSORT: try
+; NOSORT:   try
+; NOSORT:     call  foo
+; --- try-delegate starts (call unwind mismatch)
 ; NOSORT:     try
-; NOSORT:       try
-; NOSORT:         call      foo
-; --- Nested try/catch/end_try starts
-; NOSORT:         try
-; NOSORT:           call      bar
-; NOSORT:         catch     $[[REG0:[0-9]+]]=
-; NOSORT:           br        2                        # 2: down to label[[L0:[0-9]+]]
-; NOSORT:         end_try
-; --- Nested try/catch/end_try ends
-; NOSORT:         br        2                          # 2: down to label[[L1:[0-9]+]]
-; NOSORT:       catch     {{.*}}
-; NOSORT:         block     i32
-; NOSORT:           br_on_exn   0, {{.*}}              # 0: down to label[[L2:[0-9]+]]
-; --- Nested try/catch/end_try starts
-; NOSORT:           try
-; NOSORT:             rethrow   0                      # down to catch[[C0:[0-9]+]]
-; NOSORT:           catch     $[[REG1:[0-9]+]]=        # catch[[C0]]:
-; NOSORT:             br        5                      # 5: down to label[[L3:[0-9]+]]
-; NOSORT:           end_try
-; --- Nested try/catch/end_try ends
-; NOSORT:         end_block                            # label[[L2]]:
-; NOSORT:         call      $drop=, __cxa_begin_catch
-; --- Nested try/catch/end_try starts
-; NOSORT:         try
-; NOSORT:           call      __cxa_end_catch
-; NOSORT:         catch     $[[REG1]]=
-; NOSORT:           br        4                        # 4: down to label[[L3]]
-; NOSORT:         end_try
-; --- Nested try/catch/end_try ends
-; NOSORT:         br        2                          # 2: down to label[[L1]]
-; NOSORT:       end_try
-; NOSORT:     catch     $[[REG0]]=
-; NOSORT:     end_try                                  # label[[L0]]:
-; NOSORT:     call      $drop=, __cxa_begin_catch
-; NOSORT:     call      __cxa_end_catch
-; NOSORT:   end_block                                  # label[[L1]]:
-; NOSORT:   return
-; NOSORT: end_block                                    # label[[L3]]:
-; NOSORT: rethrow   $[[REG1]]                          # to caller
+; NOSORT:       call  bar
+; NOSORT:     delegate    1            # label/catch{{[0-9]+}}: down to catch[[C0:[0-9]+]]
+; --- try-delegate ends (call unwind mismatch)
+; NOSORT:   catch
+; NOSORT:     call  {{.*}} __cxa_begin_catch
+; --- try-delegate starts (call unwind mismatch)
+; NOSORT:     try
+; NOSORT:       call  __cxa_end_catch
+; NOSORT:     delegate    1            # label/catch{{[0-9]+}}: to caller
+; --- try-delegate ends (call unwind mismatch)
+; NOSORT:   end_try
+; NOSORT: catch  {{.*}}                # catch[[C0]]:
+; NOSORT:   call  {{.*}} __cxa_begin_catch
+; NOSORT:   call  __cxa_end_catch
+; NOSORT: end_try
+; NOSORT: return
+
 define void @test10() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
 bb0:
   invoke void @foo()
@@ -724,6 +689,7 @@ try.cont:                                         ; preds = %catch.start1, %catc
 ; (before 'cont' is sorted) and there should not be any unwind destination
 ; mismatches in CFGStackify.
 
+; NOOPT-LABEL: test11
 ; NOOPT: block
 ; NOOPT:   try
 ; NOOPT:     call      foo
@@ -774,8 +740,8 @@ if.end:                                           ; preds = %cont, %catch.start,
 ; NOSORT:   call {{.*}} memmove
 ; NOSORT:   call {{.*}} memset
 ; NOSORT:   return
-; NOSORT: catch
-; NOSORT:   rethrow
+; NOSORT: catch_all
+; NOSORT:   rethrow 0
 ; NOSORT: end_try
 define void @test12(i8* %a, i8* %b) personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
 entry:
@@ -891,49 +857,6 @@ terminate7:                                       ; preds = %ehcleanup
   unreachable
 }
 
-; We don't need to call placeBlockMarker after fixUnwindMismatches unless the
-; destination is the appendix BB at the very end. This should not crash.
-define void @test16(i32* %p, i32 %a, i32 %b) personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
-entry:
-  br label %loop
-
-loop:                                             ; preds = %try.cont, %entry
-  invoke void @foo()
-          to label %bb0 unwind label %catch.dispatch0
-
-bb0:                                              ; preds = %loop
-  %cmp = icmp ne i32 %a, %b
-  br i1 %cmp, label %bb1, label %last
-
-bb1:                                              ; preds = %bb0
-  invoke void @bar()
-          to label %try.cont unwind label %catch.dispatch1
-
-catch.dispatch0:                                  ; preds = %loop
-  %0 = catchswitch within none [label %catch.start0] unwind to caller
-
-catch.start0:                                     ; preds = %catch.dispatch0
-  %1 = catchpad within %0 [i8* null]
-  %2 = call i8* @llvm.wasm.get.exception(token %1)
-  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
-  catchret from %1 to label %try.cont
-
-catch.dispatch1:                                  ; preds = %bb1
-  %4 = catchswitch within none [label %catch.start1] unwind to caller
-
-catch.start1:                                     ; preds = %catch.dispatch1
-  %5 = catchpad within %4 [i8* null]
-  %6 = call i8* @llvm.wasm.get.exception(token %5)
-  %7 = call i32 @llvm.wasm.get.ehselector(token %5)
-  catchret from %5 to label %try.cont
-
-try.cont:                                         ; preds = %catch.start1, %catch.start0, %bb1
-  br label %loop
-
-last:                                             ; preds = %bb0
-  ret void
-}
-
 ; Tests if CFGStackify's removeUnnecessaryInstrs() removes unnecessary branches
 ; correctly. The code is in the form below, where 'br' is unnecessary because
 ; after running the 'try' body the control flow will fall through to bb2 anyway.
@@ -947,8 +870,8 @@ last:                                             ; preds = %bb0
 ;     ...
 ; bb2:            <- Continuation BB
 ;   end
-; CHECK-LABEL: test17
-define void @test17(i32 %n) personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
+; CHECK-LABEL: test15
+define void @test15(i32 %n) personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
 entry:
   invoke void @foo()
           to label %for.body unwind label %catch.dispatch
@@ -986,7 +909,7 @@ try.cont:                                         ; preds = %catch.start, %for.e
 }
 
 ; void foo();
-; void test18() {
+; void test16() {
 ;   try {
 ;     foo();
 ;     try {
@@ -1016,8 +939,8 @@ try.cont:                                         ; preds = %catch.start, %for.e
 ; bb3:            <- Continuation BB
 ;   end
 ;
-; CHECK-LABEL: test18
-define void @test18() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
+; CHECK-LABEL: test16
+define void @test16() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
 ; CHECK: call foo
 entry:
   invoke void @foo()
@@ -1064,12 +987,12 @@ invoke.cont2:                                     ; preds = %catch.start
 ; path back to the loop header), and is placed after the loop latch block
 ; 'invoke.cont' intentionally. This tests if 'end_loop' marker is placed
 ; correctly not right after 'invoke.cont' part but after 'ehcleanup' part,
-; NOSORT-LABEL: test19
+; NOSORT-LABEL: test17
 ; NOSORT: loop
 ; NOSORT: try
 ; NOSORT: end_try
 ; NOSORT: end_loop
-define void @test19(i32 %n) personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
+define void @test17(i32 %n) personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
 entry:
   br label %while.cond
 
@@ -1111,17 +1034,17 @@ while.end:                                        ; preds = %while.body, %while.
 ; very end of a function, CFGStackify's fixEndsAtEndOfFunction function fixes
 ; the corresponding block/loop/try's type to match the function's return type.
 ; But when a `try`'s type is fixed, we should also check `end` instructions
-; before its corresponding `catch`, because both `try` and `catch` body should
-; satisfy the return type requirements.
+; before its corresponding `catch_all`, because both `try` and `catch_all` body
+; should satisfy the return type requirements.
 
-; NOSORT-LABEL: test20
+; NOSORT-LABEL: test18
 ; NOSORT: try i32
 ; NOSORT: loop i32
 ; NOSORT: end_loop
-; NOSORT: catch
+; NOSORT: catch_all
 ; NOSORT: end_try
 ; NOSORT-NEXT: end_function
-define i32 @test20(i32 %n) personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
+define i32 @test18(i32 %n) personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
 entry:
   %t = alloca %class.Object, align 1
   br label %for.cond
@@ -1154,9 +1077,8 @@ ehcleanup:                                        ; preds = %if.then
   cleanupret from %0 unwind to caller
 }
 
-
 ; Check if the unwind destination mismatch stats are correct
-; NOSORT-STAT: 17 wasm-cfg-stackify    - Number of EH pad unwind mismatches found
+; NOSORT: 18 wasm-cfg-stackify    - Number of call unwind mismatches found
 
 declare void @foo()
 declare void @bar()